From c2724cf8281e9700ac0b8a340d6ebc98d459bdb0 Mon Sep 17 00:00:00 2001 From: Lucas-Wye Date: Fri, 9 Aug 2024 08:21:21 +0000 Subject: [PATCH] [rtl] support zvk --- 1.patch | 1560 +++++++++++++++++ configgen/generated/blastoise.json | 5 +- configgen/generated/machamp.json | 5 +- configgen/generated/psyduck.json | 19 +- configgen/generated/sandslash.json | 5 +- configgen/src/Main.scala | 13 +- t1/src/Bundles.scala | 3 +- t1/src/Lane.scala | 108 +- t1/src/LaneZvk.scala | 47 + t1/src/T1.scala | 20 +- t1/src/VectorFunctionUnit.scala | 6 +- t1/src/decoder/Decoder.scala | 25 +- t1/src/decoder/InstructionDocumentation.scala | 26 + t1/src/decoder/T1DecodePattern.scala | 1 + t1/src/decoder/attribute/isItype.scala | 7 + t1/src/decoder/attribute/isUnsigned0.scala | 26 + t1/src/decoder/attribute/isUnsigned1.scala | 26 + t1/src/decoder/attribute/isVtype.scala | 16 + t1/src/decoder/attribute/isZvk.scala | 56 + t1/src/decoder/attribute/zvkUop.scala | 80 + t1/src/laneStage/LaneExecutionBridge.scala | 4 +- t1/src/laneStage/LaneStage1.scala | 255 ++- t1/src/laneStage/LaneStage3.scala | 16 +- t1/src/laneStage/SlotTokenManager.scala | 3 + t1/src/laneStage/ZvkCrossReadUnit.scala | 133 ++ t1/src/vrf/VRF.scala | 22 + 26 files changed, 2431 insertions(+), 56 deletions(-) create mode 100644 1.patch create mode 100644 t1/src/LaneZvk.scala create mode 100644 t1/src/decoder/attribute/isZvk.scala create mode 100644 t1/src/decoder/attribute/zvkUop.scala create mode 100644 t1/src/laneStage/ZvkCrossReadUnit.scala diff --git a/1.patch b/1.patch new file mode 100644 index 0000000000..462697d507 --- /dev/null +++ b/1.patch @@ -0,0 +1,1560 @@ +commit 010d0079d3d3ae079a91eb0dda817d777d4c0d55 +Author: Lucas-Wye +Date: Fri Aug 9 08:21:21 2024 +0000 + + [rtl] support zvk + +diff --git a/configgen/generated/blastoise.json b/configgen/generated/blastoise.json +index 290ef86c..241ea7ef 100644 +--- a/configgen/generated/blastoise.json ++++ b/configgen/generated/blastoise.json +@@ -167,8 +167,9 @@ + ] + ] + ], 
+- "zvbbModuleParameters": [] ++ "zvbbModuleParameters": [], ++ "zvkModuleParameters": [] + } + }, + "generator": "org.chipsalliance.t1.rtl.T1" +-} +\ No newline at end of file ++} +diff --git a/configgen/generated/machamp.json b/configgen/generated/machamp.json +index ceeaf5e5..144ce49f 100644 +--- a/configgen/generated/machamp.json ++++ b/configgen/generated/machamp.json +@@ -151,8 +151,9 @@ + ] + ], + "floatModuleParameters": [], +- "zvbbModuleParameters": [] ++ "zvbbModuleParameters": [], ++ "zvkModuleParameters": [] + } + }, + "generator": "org.chipsalliance.t1.rtl.T1" +-} +\ No newline at end of file ++} +diff --git a/configgen/generated/psyduck.json b/configgen/generated/psyduck.json +index 04a2f357..9efbc04e 100644 +--- a/configgen/generated/psyduck.json ++++ b/configgen/generated/psyduck.json +@@ -184,8 +184,25 @@ + 3 + ] + ] ++ ], ++ "zvkModuleParameters": [ ++ [ ++ { ++ "parameter": { ++ "datapathWidth": 32, ++ "latency": 3 ++ }, ++ "generator": "org.chipsalliance.t1.rtl.LaneZvk" ++ }, ++ [ ++ 0, ++ 1, ++ 2, ++ 3 ++ ] ++ ] + ] + } + }, + "generator": "org.chipsalliance.t1.rtl.T1" +-} +\ No newline at end of file ++} +diff --git a/configgen/generated/sandslash.json b/configgen/generated/sandslash.json +index 688085fe..08d52748 100644 +--- a/configgen/generated/sandslash.json ++++ b/configgen/generated/sandslash.json +@@ -151,8 +151,9 @@ + ] + ], + "floatModuleParameters": [], +- "zvbbModuleParameters": [] ++ "zvbbModuleParameters": [], ++ "zvkModuleParameters": [] + } + }, + "generator": "org.chipsalliance.t1.rtl.T1" +-} +\ No newline at end of file ++} +diff --git a/configgen/src/Main.scala b/configgen/src/Main.scala +index 88e3bc32..357707f3 100644 +--- a/configgen/src/Main.scala ++++ b/configgen/src/Main.scala +@@ -100,7 +100,8 @@ object Main { + Seq(0, 1, 2, 3))), + floatModuleParameters = + Seq((SerializableModuleGenerator(classOf[LaneFloat], LaneFloatParam(32, 3)), Seq(0, 1, 2, 3))), +- zvbbModuleParameters = Seq() ++ zvbbModuleParameters = Seq(), 
++ zvkModuleParameters = Seq(), + ) + ) + if (doEmit) param.emit(targetFile) +@@ -151,7 +152,9 @@ object Main { + floatModuleParameters = + Seq((SerializableModuleGenerator(classOf[LaneFloat], LaneFloatParam(32, 3)), Seq(0, 1, 2, 3))), + zvbbModuleParameters = +- Seq((SerializableModuleGenerator(classOf[LaneZvbb], LaneZvbbParam(32, 3)), Seq(0, 1, 2, 3))) ++ Seq((SerializableModuleGenerator(classOf[LaneZvbb], LaneZvbbParam(32, 3)), Seq(0, 1, 2, 3))), ++ zvkModuleParameters = ++ Seq((SerializableModuleGenerator(classOf[LaneZvk], LaneZvkParam(32, 3)), Seq(0, 1, 2, 3))), + ) + ) + if (doEmit) param.emit(targetFile) +@@ -201,7 +204,8 @@ object Main { + ), + Seq(0, 1, 2, 3))), + floatModuleParameters = Seq(), +- zvbbModuleParameters = Seq() // TODO ++ zvbbModuleParameters = Seq(), ++ zvkModuleParameters = Seq(), + ) + ) + if (doEmit) param.emit(targetFile) +@@ -251,7 +255,8 @@ object Main { + ), + Seq(0, 1, 2, 3))), + floatModuleParameters = Seq(), +- zvbbModuleParameters = Seq() // TODO ++ zvbbModuleParameters = Seq(), ++ zvkModuleParameters = Seq(), + ) + ) + if (doEmit) param.emit(targetFile) +diff --git a/t1/src/Bundles.scala b/t1/src/Bundles.scala +index bb8a36f4..3f21b066 100644 +--- a/t1/src/Bundles.scala ++++ b/t1/src/Bundles.scala +@@ -619,6 +619,7 @@ class ExecutionUnitRecord(parameter: LaneParameter)(isLastSlot: Boolean) extends + val executeIndex: Bool = Bool() + val source: Vec[UInt] = Vec(3, UInt(parameter.datapathWidth.W)) + val crossReadSource: Option[UInt] = Option.when(isLastSlot)(UInt((parameter.datapathWidth * 2).W)) ++ val zvkCrossReadSource: Option[UInt] = Option.when(isLastSlot && parameter.zvkEnable)(UInt((parameter.datapathWidth * 2).W)) + /** groupCounter need use to update `Lane.maskFormatResultForGroup` */ + val groupCounter: UInt = UInt(parameter.groupNumberBits.W) + val sSendResponse: Option[Bool] = Option.when(isLastSlot)(Bool()) +@@ -722,4 +723,4 @@ class T1Retire(xLen: Int) extends Bundle { + val rd: ValidIO[T1RdRetire] = Valid(new 
T1RdRetire(xLen)) + val csr: ValidIO[T1CSRRetire] = Valid(new T1CSRRetire) + val mem: ValidIO[EmptyBundle] = Valid(new EmptyBundle) +-} +\ No newline at end of file ++} +diff --git a/t1/src/Lane.scala b/t1/src/Lane.scala +index 59a7eb8a..34afb1bc 100644 +--- a/t1/src/Lane.scala ++++ b/t1/src/Lane.scala +@@ -81,16 +81,18 @@ object LaneParameter { + * TODO: cover the queue full. + */ + case class LaneParameter( +- vLen: Int, +- datapathWidth: Int, +- laneNumber: Int, +- chainingSize: Int, +- crossLaneVRFWriteEscapeQueueSize: Int, +- fpuEnable: Boolean, +- portFactor: Int, +- vrfRamType: RamType, +- decoderParam: DecoderParam, +- vfuInstantiateParameter: VFUInstantiateParameter) ++ vLen: Int, ++ datapathWidth: Int, ++ laneNumber: Int, ++ chainingSize: Int, ++ crossLaneVRFWriteEscapeQueueSize: Int, ++ crossLaneVRFWriteEscapeZvkQueueSize: Int, ++ fpuEnable: Boolean, ++ zvkEnable: Boolean, ++ portFactor: Int, ++ vrfRamType: RamType, ++ decoderParam: DecoderParam, ++ vfuInstantiateParameter: VFUInstantiateParameter) + extends SerializableModuleParameter { + + /** 1 in MSB for instruction order. */ +@@ -132,7 +134,7 @@ case class LaneParameter( + * + * for each number in table below, it represent a [[datapathWidth]] + * {{{ +- * lane0 | lane1 | ... | lane8 ++ * lane0 | lane1 | ... | lane7 + * offset0 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 + * offset1 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 + * offset2 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 +@@ -210,8 +212,26 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ + * TODO: benchmark the usecase for tuning the Ring Bus width. + * find a real world case for using `narrow` and `widen` aggressively. 
+ */ ++ // 0: 0.0 - 0.1 ++ // 1: 0.2 - 0.3 ++ // 2: 0.4 - 0.5 ++ // 3: 0.6 - 0.7 ++ // 4: 1.0 - 1.1 ++ // 5: 1.2 - 1.3 ++ // 6: 1.4 - 1.5 ++ // 7: 1.6 - 1.7 ++ ++ // 0: 0.0 - 0.1 - 0.2 - 0.3 ++ // 1: 0.4 - 0.5 - 0.6 - 0.7 ++ // 2: 1.0 - 1.1 - 1.2 - 1.3 ++ // 3: 1.4 - 1.5 - 1.6 - 1.7 ++ // 4: 2.0 - 2.1 - 2.2 - 2.3 ++ // 5: 2.4 - 2.5 - 2.6 - 2.7 ++ // 6: 3.0 - 3.1 - 3.2 - 3.3 ++ // 7: 3.4 - 3.5 - 3.6 - 3.7 + @public + val readBusPort: Vec[RingPort[ReadBusData]] = IO(Vec(2, new RingPort(new ReadBusData(parameter)))) ++ val readBusPort4: Option[Vec[RingPort[ReadBusData]]] = Option.when(parameter.zvkEnable)(IO(Vec(4, new RingPort(new ReadBusData(parameter))))) + + /** VRF Write Interface. + * only used for `narrow` an `widen` +@@ -220,6 +240,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ + */ + @public + val writeBusPort: Vec[RingPort[WriteBusData]] = IO(Vec(2, new RingPort(new WriteBusData(parameter)))) ++ val writeBusPort4: Option[Vec[RingPort[WriteBusData]]] = Option.when(parameter.zvkEnable)(IO(Vec(4, new RingPort(new WriteBusData(parameter))))) + + /** request from [[T1.decode]] to [[Lane]]. */ + @public +@@ -320,6 +341,9 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ + + // TODO: remove + dontTouch(writeBusPort) ++ if(parameter.zvkEnable) { ++ dontTouch(writeBusPort4.get) ++ } + + /** VRF instantces. 
*/ + val vrf: Instance[VRF] = Instantiate(new VRF(parameter.vrfParam)) +@@ -440,8 +464,12 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ + val readCheckRequestVec: Vec[VRFReadRequest] = Wire(Vec(parameter.chainingSize * 3 + 2, + new VRFReadRequest(parameter.vrfParam.regNumBits, parameter.vrfOffsetBits, parameter.instructionIndexBits) + )) ++ val zvkReadCheckRequestVec: Vec[VRFReadRequest] = Wire(Vec(parameter.chainingSize * 3 + 4, ++ new VRFReadRequest(parameter.vrfParam.regNumBits, parameter.vrfOffsetBits, parameter.instructionIndexBits) ++ )) + + val readCheckResult: Vec[Bool] = Wire(Vec(parameter.chainingSize * 3 + 2, Bool())) ++ val zvkReadCheckResult: Vec[Bool] = Wire(Vec(parameter.chainingSize * 3 + 4, Bool())) + + /** signal used for prohibiting slots to access VRF. + * a slot will become inactive when: +@@ -465,7 +493,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ + val slotCanShift: Vec[Bool] = Wire(Vec(parameter.chainingSize, Bool())) + + /** Which data group is waiting for the result of the cross-lane read */ +- val readBusDequeueGroup: UInt = Wire(UInt(parameter.groupNumberBits.W)) ++ val readBusDequeueGroup: UInt = Wire(UInt(parameter.groupNumberBits.W)) // TODO: readBusDequeueGroup is currently unused + + /** enqueue valid for execution unit */ + val executeEnqueueValid: Vec[Bool] = Wire(Vec(parameter.chainingSize, Bool())) +@@ -514,6 +542,18 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ + pipe = true + ) + )) ++ val crossLaneWriteQueue4: Seq[Queue[VRFWriteRequest]] = Seq.tabulate(4)(i => Module( ++ new Queue( ++ new VRFWriteRequest( ++ parameter.vrfParam.regNumBits, ++ parameter.vrfOffsetBits, ++ parameter.instructionIndexBits, ++ parameter.datapathWidth ++ ), ++ parameter.crossLaneVRFWriteEscapeZvkQueueSize, ++ pipe = true ++ ) ++ )) + val maskedWriteUnit: Instance[MaskedWrite] = Instantiate(new MaskedWrite(parameter)) + val 
tokenManager: Instance[SlotTokenManager] = Instantiate(new SlotTokenManager(parameter)) + slotControl.zipWithIndex.foreach { +@@ -661,9 +701,17 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ + readCheckRequestVec((parameter.chainingSize - index - 1) * 3 + portIndex) := stage1.vrfCheckRequest(portIndex) + stage1.checkResult(portIndex) := readCheckResult((parameter.chainingSize - index - 1) * 3 + portIndex) + } ++ val zvkCheckSize = if (isLastSlot && parameter.zvkEnable) 7 else 5 ++ if(parameter.zvkEnable) { ++ Seq.tabulate(checkSize){ portIndex => ++ zvkReadCheckRequestVec((parameter.chainingSize - index - 1) * 3 + portIndex) := stage1.zvkVrfCheckRequest(portIndex) ++ } ++ stage1.zvkCheckResult(portIndex) := zvkReadCheckResult((parameter.chainingSize - index - 1) * 3 + portIndex) ++ } + // connect cross read bus + if(isLastSlot) { + val tokenSize = parameter.crossLaneVRFWriteEscapeQueueSize ++ val zvKTokenSize = parameter.crossLaneVRFWriteEscapeZvkQueueSize + readBusPort.zipWithIndex.foreach {case (readPort, portIndex) => + // tx + val tokenReg = RegInit(0.U(log2Ceil(tokenSize + 1).W)) +@@ -685,6 +733,27 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ + // dequeue to cross read unit + stage1.readBusDequeue.get(portIndex) <> queue.io.deq + } ++ readBusPort4.get.zipWithIndex.foreach {case (readPort, portIndex) => ++ // tx ++ val tokenReg = RegInit(0.U(log2Ceil(zvKTokenSize + 1).W)) ++ val tokenReady: Bool = tokenReg =/= zvKTokenSize.U ++ stage1.readBusRequest4.get(portIndex).ready := tokenReady ++ readPort.deq.valid := stage1.readBusRequest4.get(portIndex).valid && tokenReady ++ readPort.deq.bits := stage1.readBusRequest4.get(portIndex).bits ++ val tokenUpdate = Mux(readPort.deq.valid, 1.U, -1.S(tokenReg.getWidth.W).asUInt) ++ when(readPort.deq.valid ^ readPort.deqRelease) { ++ tokenReg := tokenReg + tokenUpdate ++ } ++ // rx ++ // rx queue ++ val queue = Module(new 
Queue(chiselTypeOf(readPort.deq.bits), zvKTokenSize, pipe=true)) ++ queue.io.enq.valid := readPort.enq.valid ++ queue.io.enq.bits := readPort.enq.bits ++ readPort.enqRelease := queue.io.deq.fire ++ assert(queue.io.enq.ready || !readPort.enq.valid) ++ // dequeue to cross read unit ++ stage1.readBusDequeue4.get(portIndex) <> queue.io.deq ++ } + + // cross write + writeBusPort.zipWithIndex.foreach {case (writePort, portIndex) => +@@ -694,6 +763,19 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ + writePort.deq.bits := stage3.crossWritePort.get(portIndex).bits + stage3.crossWritePort.get(portIndex).ready := tokenReady + ++ // update token ++ val tokenUpdate = Mux(writePort.deq.valid, 1.U, -1.S(tokenReg.getWidth.W).asUInt) ++ when(writePort.deq.valid ^ writePort.deqRelease) { ++ tokenReg := tokenReg + tokenUpdate ++ } ++ } ++ writeBusPort4.get.zipWithIndex.foreach {case (writePort, portIndex) => ++ val tokenReg = RegInit(0.U(log2Ceil(zvKTokenSize + 1).W)) ++ val tokenReady: Bool = tokenReg =/= zvKTokenSize.U ++ writePort.deq.valid := stage3.crossWritePort4.get(portIndex).valid && tokenReady ++ writePort.deq.bits := stage3.crossWritePort4.get(portIndex).bits ++ stage3.crossWritePort4.get(portIndex).ready := tokenReady ++ + // update token + val tokenUpdate = Mux(writePort.deq.valid, 1.U, -1.S(tokenReg.getWidth.W).asUInt) + when(writePort.deq.valid ^ writePort.deqRelease) { +@@ -892,7 +974,9 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ + } + + vrf.readCheck.zip(readCheckRequestVec).foreach{case (sink, source) => sink := source} ++ vrf.zvkReadCheck.zip(zvkReadCheckRequestVec).foreach{case (sink, source) => sink := source} + readCheckResult.zip(vrf.readCheckResult).foreach{case (sink, source) => sink := source} ++ zvkReadCheckResult.zip(vrf.zvkReadCheckResult).foreach{case (sink, source) => sink := source} + + allVrfWriteAfterCheck.zipWithIndex.foreach { case (req, i) => + val check = 
vrf.writeAllow(i) +diff --git a/t1/src/LaneZvk.scala b/t1/src/LaneZvk.scala +new file mode 100644 +index 00000000..7764c680 +--- /dev/null ++++ b/t1/src/LaneZvk.scala +@@ -0,0 +1,47 @@ ++// SPDX-License-Identifier: Apache-2.0 ++// SPDX-FileCopyrightText: 2022 Jiuyang Liu ++ ++package org.chipsalliance.t1.rtl ++ ++import chisel3.experimental.hierarchy.instantiable ++import chisel3._ ++import chisel3.experimental.{SerializableModule, SerializableModuleParameter} ++import chisel3.util._ ++import org.chipsalliance.t1.rtl.decoder.{BoolField, Decoder} ++ ++object LaneZvkParam { ++ implicit def rw: upickle.default.ReadWriter[LaneZvkParam] = upickle.default.macroRW ++} ++ ++case class LaneZvkParam(datapathWidth: Int, latency: Int) extends VFUParameter with SerializableModuleParameter { ++ val inputBundle = new LaneZvkRequest(datapathWidth) ++ val decodeField: BoolField = Decoder.zvbb ++ val outputBundle = new LaneZvkResponse(datapathWidth) ++ override val NeedSplit: Boolean = false ++} ++ ++class LaneZvkRequest(datapathWidth: Int) extends VFUPipeBundle { ++ val src = Vec(3, UInt(datapathWidth.W)) ++ val opcode = UInt(4.W) ++ val vSew = UInt(2.W) ++ val shifterSize = UInt(log2Ceil(datapathWidth).W) ++} ++ ++class LaneZvkResponse(datapathWidth: Int) extends VFUPipeBundle { ++ val data = UInt(datapathWidth.W) ++} ++ ++@instantiable ++class LaneZvk(val parameter: LaneZvkParam) ++ extends VFUModule(parameter) with SerializableModule[LaneZvkParam]{ ++ val response: LaneZvkResponse = Wire(new LaneZvkResponse(parameter.datapathWidth)) ++ val request : LaneZvkRequest = connectIO(response).asTypeOf(parameter.inputBundle) ++ ++ val zvbbSrc: UInt = request.src(1) // vs2 ++ val zvbbRs: UInt = request.src(0) // vs1 or rs1 ++ val vSew: UInt = UIntToOH(request.vSew) // sew = 0, 1, 2 ++ ++ response.data := Mux1H(UIntToOH(request.opcode), Seq( ++ )) ++} ++ +diff --git a/t1/src/T1.scala b/t1/src/T1.scala +index 5a36bb04..0d20680e 100644 +--- a/t1/src/T1.scala ++++ b/t1/src/T1.scala +@@ 
-122,6 +122,14 @@ case class T1Parameter( + instruction => instruction.instructionSet.name match { + case "rv_v" => true + case "rv_zvbb" => if (zvbbEnable) true else false ++ // Zvk ++ case "rv_zvkg" => if (zvkEnable) true else false ++ // case "rv_zvkn" => if (zvkEnable) true else false // TODO: no implementations for SEW=64 ++ case "rv_zvkned" => if (zvkEnable) true else false ++ case "rv_zvknha" => if (zvkEnable) true else false ++ // case "rv_zvknhb" => if (zvkEnable) true else false // TODO: no implementations for SEW=64 ++ case "rv_zvksed" => if (zvkEnable) true else false ++ case "rv_zvksh" => if (zvkEnable) true else false + case _ => false + }} ++ + t1customInstructions.map(_.instruction) +@@ -143,15 +151,19 @@ case class T1Parameter( + /** TODO: configure it. */ + val instructionQueueSize: Int = 4 + +- /** crosslane write token size */ ++ /** crosslane write token size, unclear how many would be good */ + val vrfWriteQueueSize: Int = 4 ++ val vrfWriteZvkQueueSize: Int = 8 + + /** does t1 has floating datapath? */ + val fpuEnable: Boolean = extensions.contains("Zve32f") + +- /** support of zvbb */ ++ /** support of Zvbb */ + lazy val zvbbEnable: Boolean = extensions.contains("Zvbb") + ++ /** support of Zvk */ ++ lazy val zvkEnable: Boolean = extensions.contains("Zvk") ++ + /** how many chaining does T1 support, this is not a parameter yet. */ + val chainingSize: Int = 4 + +@@ -225,7 +237,7 @@ case class T1Parameter( + // and the values are their respective delays. + val crossLaneConnectCycles: Seq[Seq[Int]] = Seq.tabulate(laneNumber)(_ => Seq(1, 1)) + +- val decoderParam: DecoderParam = DecoderParam(fpuEnable, zvbbEnable, allInstructions) ++ val decoderParam: DecoderParam = DecoderParam(fpuEnable, zvbbEnable, zvkEnable, allInstructions) + + /** paraemter for AXI4. 
*/ + val axi4BundleParameter: AXI4BundleParameter = AXI4BundleParameter( +@@ -261,7 +273,9 @@ case class T1Parameter( + laneNumber = laneNumber, + chainingSize = chainingSize, + crossLaneVRFWriteEscapeQueueSize = vrfWriteQueueSize, ++ crossLaneVRFWriteEscapeZvkQueueSize = vrfWriteZvkQueueSize, + fpuEnable = fpuEnable, ++ zvkEnable = zvkEnable, + portFactor = vrfBankSize, + vrfRamType = vrfRamType, + decoderParam = decoderParam, +diff --git a/t1/src/VectorFunctionUnit.scala b/t1/src/VectorFunctionUnit.scala +index cf06a66a..cee52525 100644 +--- a/t1/src/VectorFunctionUnit.scala ++++ b/t1/src/VectorFunctionUnit.scala +@@ -106,7 +106,8 @@ case class VFUInstantiateParameter( + divfpModuleParameters: Seq[(SerializableModuleGenerator[LaneDivFP, LaneDivFPParam], Seq[Int])], + otherModuleParameters: Seq[(SerializableModuleGenerator[OtherUnit, OtherUnitParam], Seq[Int])], + floatModuleParameters: Seq[(SerializableModuleGenerator[LaneFloat, LaneFloatParam], Seq[Int])], +- zvbbModuleParameters: Seq[(SerializableModuleGenerator[LaneZvbb, LaneZvbbParam], Seq[Int])] ++ zvbbModuleParameters: Seq[(SerializableModuleGenerator[LaneZvbb, LaneZvbbParam], Seq[Int])], ++ zvkModuleParameters: Seq[(SerializableModuleGenerator[LaneZvk, LaneZvkParam], Seq[Int])], + ) { + val genVec: Seq[(SerializableModuleGenerator[_ <: VFUModule, _ <: VFUParameter], Seq[Int])] = + logicModuleParameters ++ +@@ -117,7 +118,8 @@ case class VFUInstantiateParameter( + divfpModuleParameters ++ + otherModuleParameters ++ + floatModuleParameters ++ +- zvbbModuleParameters ++ zvbbModuleParameters ++ ++ zvkModuleParameters + genVec.foreach { + case (_, connect) => + connect.foreach(connectIndex => require(connectIndex < slotCount)) +diff --git a/t1/src/decoder/Decoder.scala b/t1/src/decoder/Decoder.scala +index 3a029938..112e34f7 100644 +--- a/t1/src/decoder/Decoder.scala ++++ b/t1/src/decoder/Decoder.scala +@@ -13,7 +13,7 @@ import org.chipsalliance.t1.rtl.decoder.attribute._ + object DecoderParam { + implicit def 
rwP: upickle.default.ReadWriter[DecoderParam] = upickle.default.macroRW + } +-case class DecoderParam(fpuEnable: Boolean, zvbbEnable: Boolean, allInstructions: Seq[Instruction]) ++case class DecoderParam(fpuEnable: Boolean, zvbbEnable: Boolean, zvkEnable: Boolean, allInstructions: Seq[Instruction]) + + trait T1DecodeFiled[D <: Data] extends DecodeField[T1DecodePattern, D] with FieldName + +@@ -225,6 +225,10 @@ object Decoder { + override def getTriState(pattern: T1DecodePattern): TriState = pattern.isZvbb.value + } + ++ object zvk extends BoolField { ++ override def getTriState(pattern: T1DecodePattern): TriState = pattern.isZvk.value ++ } ++ + object topUop extends T1TopUopField { + override def genTable(pattern: T1DecodePattern): BitPat = pattern.topUop.value match { + case _: TopT0.type => BitPat("b000") +@@ -345,6 +349,19 @@ object Decoder { + case _: zvbbUop8.type => BitPat("b1000") // andn + case _ => BitPat.dontCare(4) + } ++ case zvkCase: ZvkUOPType => ++ zvkCase match { ++ case _: zvkUop0.type => BitPat("b0000") // ++ case _: zvkUop1.type => BitPat("b0001") // ++ case _: zvkUop2.type => BitPat("b0010") // ++ case _: zvkUop3.type => BitPat("b0011") // ++ case _: zvkUop4.type => BitPat("b0100") // ++ case _: zvkUop5.type => BitPat("b0101") // ++ case _: zvkUop6.type => BitPat("b0110") // ++ case _: zvkUop7.type => BitPat("b0111") // ++ case _: zvkUop8.type => BitPat("b1000") // ++ case _ => BitPat.dontCare(4) ++ } + case _ => BitPat.dontCare(4) + } + } +@@ -422,6 +439,12 @@ object Decoder { + zvbb, + ) + else Seq() ++ } ++ { ++ if (param.zvkEnable) ++ Seq( ++ zvk, ++ ) ++ else Seq() + } + def allDecodePattern(param: DecoderParam): Seq[T1DecodePattern] = param.allInstructions.map(T1DecodePattern(_, param)).toSeq.sortBy(_.instruction.name) + +diff --git a/t1/src/decoder/InstructionDocumentation.scala b/t1/src/decoder/InstructionDocumentation.scala +index 86c5a7e3..b506c61c 100644 +--- a/t1/src/decoder/InstructionDocumentation.scala ++++ 
b/t1/src/decoder/InstructionDocumentation.scala +@@ -439,5 +439,31 @@ case class InstructionDocumentation(instruction: Instruction, param: DecoderPara + case "vwsll.vv" => "TODO!" + case "vwsll.vx" => "TODO!" + case "vwsll.vi" => "TODO!" ++ // rv_zvkg ++ case "vghsh.vv" => "TODO!" ++ case "vgmul.vv" => "TODO!" ++ // rv_zvkned ++ case "vaesdf.vv" => "TODO!" ++ case "vaesdf.vs" => "TODO!" ++ case "vaesdm.vv" => "TODO!" ++ case "vaesdm.vs" => "TODO!" ++ case "vaesef.vv" => "TODO!" ++ case "vaesef.vs" => "TODO!" ++ case "vaesem.vv" => "TODO!" ++ case "vaesem.vs" => "TODO!" ++ case "vaesz.vs" => "TODO!" ++ case "vaeskf1.vi" => "TODO!" ++ case "vaeskf2.vi" => "TODO!" ++ // rv_zvknha ++ case "vsha2ms.vv" => "TODO!" ++ case "vsha2ch.vv" => "TODO!" ++ case "vsha2cl.vv" => "TODO!" ++ // rv_zvksed ++ case "vsm4k.vi" => "TODO!" ++ case "vsm4r.vv" => "TODO!" ++ case "vsm4r.vs" => "TODO!" ++ // rv_zvksh ++ case "vsm3c.vi" => "TODO!" ++ case "vsm3me.vv" => "TODO!" + } + } +diff --git a/t1/src/decoder/T1DecodePattern.scala b/t1/src/decoder/T1DecodePattern.scala +index 5c7d1073..3b7d9b3a 100644 +--- a/t1/src/decoder/T1DecodePattern.scala ++++ b/t1/src/decoder/T1DecodePattern.scala +@@ -108,6 +108,7 @@ case class T1DecodePattern(instruction: Instruction, param: DecoderParam) extend + def isVwmacc: isVwmacc = attribute.isVwmacc(this) + def isWidenreduce: isWidenreduce = attribute.isWidenreduce(this) + def isZvbb: isZvbb = attribute.isZvbb(this) ++ def isZvk: isZvk = attribute.isZvk(this) + def fpExecutionType: FpExecutionType.Type = attribute.FpExecutionType(this) + def topUop: TopUop = attribute.TopUop(this) + def decoderUop: DecoderUop = attribute.DecoderUop(this) +diff --git a/t1/src/decoder/attribute/isItype.scala b/t1/src/decoder/attribute/isItype.scala +index 5ba9baf2..c3db6475 100644 +--- a/t1/src/decoder/attribute/isItype.scala ++++ b/t1/src/decoder/attribute/isItype.scala +@@ -54,6 +54,13 @@ object isItype { + // rv_zvbb + "vror.vi", + "vwsll.vi", ++ // rv_zvkned ++ 
"vaeskf1.vi", ++ "vaeskf2.vi", ++ // rv_zvksed ++ "vsm4k.vi", ++ // rv_zvksh ++ "vsm3c.vi", + ) + allMatched.contains(t1DecodePattern.instruction.name) + } +diff --git a/t1/src/decoder/attribute/isUnsigned0.scala b/t1/src/decoder/attribute/isUnsigned0.scala +index fb041c3c..29b2bf9d 100644 +--- a/t1/src/decoder/attribute/isUnsigned0.scala ++++ b/t1/src/decoder/attribute/isUnsigned0.scala +@@ -146,6 +146,32 @@ object isUnsigned0 { + "vwsll.vv", + "vwsll.vx", + "vwsll.vi", ++ // rv_zvkg ++ "vghsh.vv", ++ "vgmul.vv", ++ // rv_zvkned ++ "vaesdf.vv", ++ "vaesdf.vs", ++ "vaesdm.vv", ++ "vaesdm.vs", ++ "vaesef.vv", ++ "vaesef.vs", ++ "vaesem.vv", ++ "vaesem.vs", ++ "vaesz.vs", ++ "vaeskf1.vi", ++ "vaeskf2.vi", ++ // rv_zvknha ++ "vsha2ms.vv", ++ "vsha2ch.vv", ++ "vsha2cl.vv", ++ // rv_zvksed ++ "vsm4k.vi", ++ "vsm4r.vv", ++ "vsm4r.vs", ++ // rv_zvksh ++ "vsm3c.vi", ++ "vsm3me.vv", + ) + allMatched.contains(t1DecodePattern.instruction.name) + } +diff --git a/t1/src/decoder/attribute/isUnsigned1.scala b/t1/src/decoder/attribute/isUnsigned1.scala +index cf4f517a..595ecfba 100644 +--- a/t1/src/decoder/attribute/isUnsigned1.scala ++++ b/t1/src/decoder/attribute/isUnsigned1.scala +@@ -118,6 +118,32 @@ object isUnsigned1 { + "vwsll.vv", + "vwsll.vx", + "vwsll.vi", ++ // rv_zvkg ++ "vghsh.vv", ++ "vgmul.vv", ++ // rv_zvkned ++ "vaesdf.vv", ++ "vaesdf.vs", ++ "vaesdm.vv", ++ "vaesdm.vs", ++ "vaesef.vv", ++ "vaesef.vs", ++ "vaesem.vv", ++ "vaesem.vs", ++ "vaesz.vs", ++ "vaeskf1.vi", ++ "vaeskf2.vi", ++ // rv_zvknha ++ "vsha2ms.vv", ++ "vsha2ch.vv", ++ "vsha2cl.vv", ++ // rv_zvksed ++ "vsm4k.vi", ++ "vsm4r.vv", ++ "vsm4r.vs", ++ // rv_zvksh ++ "vsm3c.vi", ++ "vsm3me.vv", + ) + allMatched.contains(t1DecodePattern.instruction.name) + } +diff --git a/t1/src/decoder/attribute/isVtype.scala b/t1/src/decoder/attribute/isVtype.scala +index 7649d715..0ecb480e 100644 +--- a/t1/src/decoder/attribute/isVtype.scala ++++ b/t1/src/decoder/attribute/isVtype.scala +@@ -186,6 +186,22 @@ object 
isVtype { + "vrol.vv", + "vror.vv", + "vwsll.vv", ++ // rv_zvkg ++ "vghsh.vv", ++ "vgmul.vv", ++ // rv_zvkned ++ "vaesdf.vv", ++ "vaesdm.vv", ++ "vaesef.vv", ++ "vaesem.vv", ++ // rv_zvknha ++ "vsha2ms.vv", ++ "vsha2ch.vv", ++ "vsha2cl.vv", ++ // rv_zvksed ++ "vsm4r.vv", ++ // rv_zvksh ++ "vsm3me.vv", + ) + allMatched.contains(t1DecodePattern.instruction.name) + } +diff --git a/t1/src/decoder/attribute/isZvk.scala b/t1/src/decoder/attribute/isZvk.scala +new file mode 100644 +index 00000000..45942377 +--- /dev/null ++++ b/t1/src/decoder/attribute/isZvk.scala +@@ -0,0 +1,56 @@ ++// SPDX-License-Identifier: Apache-2.0 ++// SPDX-FileCopyrightText: 2022 Jiuyang Liu ++ ++package org.chipsalliance.t1.rtl.decoder.attribute ++ ++import org.chipsalliance.t1.rtl.decoder.T1DecodePattern ++ ++object isZvk { ++ def apply(t1DecodePattern: T1DecodePattern): isZvk = ++ Seq( ++ y _ -> Y, ++ n _ -> N, ++ dc _ -> DC ++ ).collectFirst { ++ case (fn, tri) if fn(t1DecodePattern) => isZvk(tri) ++ }.get ++ ++ def y(t1DecodePattern: T1DecodePattern): Boolean = { ++ val allMatched = if(t1DecodePattern.param.zvkEnable) Seq( ++ "vghsh.vv", ++ "vgmul.vv", ++ "vaesdf.vv", ++ "vaesdf.vs", ++ "vaesdm.vv", ++ "vaesdm.vs", ++ "vaesef.vv", ++ "vaesef.vs", ++ "vaesem.vv", ++ "vaesem.vs", ++ "vaesz.vs", ++ "vaeskf1.vi", ++ "vaeskf2.vi", ++ "vsha2ms.vv", ++ "vsha2ch.vv", ++ "vsha2cl.vv", ++ "vsm4k.vi", ++ "vsm4r.vv", ++ "vsm4r.vs", ++ "vsm3c.vi", ++ "vsm3me.vv", ++ ) else Seq() ++ allMatched.contains(t1DecodePattern.instruction.name) ++ } ++ def n(t1DecodePattern: T1DecodePattern): Boolean = { ++ val allMatched = t1DecodePattern.param.allInstructions.filter(i => ++ !(y(t1DecodePattern) || dc(t1DecodePattern)) ++ ) ++ allMatched.contains(t1DecodePattern.instruction) ++ } ++ ++ def dc(t1DecodePattern: T1DecodePattern): Boolean = false ++} ++ ++case class isZvk(value: TriState) extends BooleanDecodeAttribute { ++ override val description: String = "goes to [[org.chipsalliance.t1.rtl.LaneZvk]]." 
++} +diff --git a/t1/src/decoder/attribute/zvkUop.scala b/t1/src/decoder/attribute/zvkUop.scala +new file mode 100644 +index 00000000..6194e323 +--- /dev/null ++++ b/t1/src/decoder/attribute/zvkUop.scala +@@ -0,0 +1,80 @@ ++// SPDX-License-Identifier: Apache-2.0 ++// SPDX-FileCopyrightText: 2022 Jiuyang Liu ++ ++package org.chipsalliance.t1.rtl.decoder.attribute ++ ++import org.chipsalliance.t1.rtl.decoder.T1DecodePattern ++ ++trait ZvkUOPType extends Uop ++object zvkUop0 extends ZvkUOPType // ++object zvkUop1 extends ZvkUOPType // ++object zvkUop2 extends ZvkUOPType // ++object zvkUop3 extends ZvkUOPType // ++object zvkUop4 extends ZvkUOPType // ++object zvkUop5 extends ZvkUOPType // ++object zvkUop6 extends ZvkUOPType // ++object zvkUop7 extends ZvkUOPType // ++object zvkUop8 extends ZvkUOPType // ++ ++object ZvkUOP { ++ def apply(t1DecodePattern: T1DecodePattern): Uop = { ++ Seq( ++ t0 _ -> zvkUop0, ++ t1 _ -> zvkUop1, ++ t2 _ -> zvkUop2, ++ t3 _ -> zvkUop3, ++ t4 _ -> zvkUop4, ++ t5 _ -> zvkUop5, ++ t6 _ -> zvkUop6, ++ t7 _ -> zvkUop7, ++ t8 _ -> zvkUop8, ++ ).collectFirst { ++ case (fn, tpe) if fn(t1DecodePattern) => tpe ++ }.getOrElse(UopDC) ++ } ++ def t0(t1DecodePattern: T1DecodePattern): Boolean = { ++ val allMatched: Seq[String] = Seq( ++ ) ++ allMatched.contains(t1DecodePattern.instruction.name) ++ } ++ def t1(t1DecodePattern: T1DecodePattern): Boolean = { ++ val allMatched: Seq[String] = Seq( ++ ) ++ allMatched.contains(t1DecodePattern.instruction.name) ++ } ++ def t2(t1DecodePattern: T1DecodePattern): Boolean = { ++ val allMatched: Seq[String] = Seq( ++ ) ++ allMatched.contains(t1DecodePattern.instruction.name) ++ } ++ def t3(t1DecodePattern: T1DecodePattern): Boolean = { ++ val allMatched: Seq[String] = Seq( ++ ) ++ allMatched.contains(t1DecodePattern.instruction.name) ++ } ++ def t4(t1DecodePattern: T1DecodePattern): Boolean = { ++ val allMatched: Seq[String] = Seq( ++ ) ++ allMatched.contains(t1DecodePattern.instruction.name) ++ } ++ def 
t5(t1DecodePattern: T1DecodePattern): Boolean = { ++ val allMatched: Seq[String] = Seq( ++ ) ++ allMatched.contains(t1DecodePattern.instruction.name) ++ } ++ def t6(t1DecodePattern: T1DecodePattern): Boolean = { ++ val allMatched: Seq[String] = Seq( ++ ) ++ allMatched.contains(t1DecodePattern.instruction.name) ++ } ++ def t7(t1DecodePattern: T1DecodePattern): Boolean = { ++ val allMatched: Seq[String] = Seq( ++ ) ++ allMatched.contains(t1DecodePattern.instruction.name) ++ } ++ def t8(t1DecodePattern: T1DecodePattern): Boolean = { ++ val allMatched: Seq[String] = Seq( ++ ) ++ allMatched.contains(t1DecodePattern.instruction.name) ++ } ++} +diff --git a/t1/src/laneStage/LaneExecutionBridge.scala b/t1/src/laneStage/LaneExecutionBridge.scala +index 3a58046f..a1bff5f5 100644 +--- a/t1/src/laneStage/LaneExecutionBridge.scala ++++ b/t1/src/laneStage/LaneExecutionBridge.scala +@@ -13,6 +13,7 @@ import org.chipsalliance.t1.rtl.decoder.Decoder + class LaneExecuteRequest(parameter: LaneParameter, isLastSlot: Boolean) extends Bundle { + val src: Vec[UInt] = Vec(3, UInt(parameter.datapathWidth.W)) + val crossReadSource: Option[UInt] = Option.when(isLastSlot)(UInt((parameter.datapathWidth * 2).W)) ++ val zvkCrossReadSource: Option[UInt] = Option.when(isLastSlot)(UInt((parameter.datapathWidth * 4).W)) + val bordersForMaskLogic: Bool = Bool() + val mask: UInt = UInt((parameter.datapathWidth / 8).W) + val maskForFilter: UInt = UInt((parameter.datapathWidth / 8).W) +@@ -145,6 +146,7 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd + executionRecord.maskForFilter := enqueue.bits.maskForFilter + executionRecord.source := enqueue.bits.src + executionRecord.crossReadSource.foreach(_ := enqueue.bits.crossReadSource.get) ++ executionRecord.zvkCrossReadSource.foreach(_ := enqueue.bits.zvkCrossReadSource.get) + executionRecord.sSendResponse.foreach(_ := enqueue.bits.sSendResponse.get) + executionRecord.groupCounter := enqueue.bits.groupCounter + 
executionRecord.decodeResult := enqueue.bits.decodeResult +@@ -158,7 +160,7 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd + * it can be vd or src2. + */ + val doubleCollapse: Option[UInt] = Option.when(isLastSlot) { +- val cutCrossReadData: Vec[UInt] = cutUInt(executionRecord.crossReadSource.get, parameter.datapathWidth) ++ val cutCrossReadData: Vec[UInt] = cutUInt(executionRecord.crossReadSource.get, parameter.datapathWidth) // TODO: zvkCrossReadSource + Mux(executionRecord.executeIndex, cutCrossReadData(1), cutCrossReadData(0)) + } + +diff --git a/t1/src/laneStage/LaneStage1.scala b/t1/src/laneStage/LaneStage1.scala +index f44826e7..b5ff484f 100644 +--- a/t1/src/laneStage/LaneStage1.scala ++++ b/t1/src/laneStage/LaneStage1.scala +@@ -9,7 +9,7 @@ import chisel3.probe.{Probe, ProbeValue, define} + import chisel3.util._ + import chisel3.util.experimental.decode.DecodeBundle + import org.chipsalliance.t1.rtl.decoder.Decoder +-import org.chipsalliance.t1.rtl.lane.{CrossReadUnit, LaneState, VrfReadPipe} ++import org.chipsalliance.t1.rtl.lane.{CrossReadUnit, ZvkCrossReadUnit, LaneState, VrfReadPipe} + + class LaneStage1Enqueue(parameter: LaneParameter, isLastSlot: Boolean) extends Bundle { + val groupCounter: UInt = UInt(parameter.groupNumberBits.W) +@@ -46,6 +46,7 @@ class LaneStage1Dequeue(parameter: LaneParameter, isLastSlot: Boolean) extends B + // read result + val src: Vec[UInt] = Vec(3, UInt(parameter.datapathWidth.W)) + val crossReadSource: Option[UInt] = Option.when(isLastSlot)(UInt((parameter.datapathWidth * 2).W)) ++ val zvkCrossReadSource: Option[UInt] = Option.when(isLastSlot)(UInt((parameter.datapathWidth * 4).W)) + + // pipe state + // for exe stage +@@ -83,8 +84,14 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { + @public + val vrfCheckRequest: Vec[VRFReadRequest] = IO(Vec(readCheckSize, Output(readRequestType))) + ++ val zvkReadCheckSize: Int = if(isLastSlot && 
parameter.zvkEnable) 7 else 3 ++ @public ++ val zvkVrfCheckRequest: Vec[VRFReadRequest] = IO(Vec(zvkReadCheckSize, Output(readRequestType))) ++ + @public + val checkResult: Vec[Bool] = IO(Vec(readCheckSize, Input(Bool()))) ++ @public ++ val zvkCheckResult: Vec[Bool] = IO(Vec(zvkReadCheckSize, Input(Bool()))) + + /** VRF read result for each slot, + * 3 is for [[source1]] [[source2]] [[source3]] +@@ -96,11 +103,19 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { + val readBusDequeue: Option[Vec[DecoupledIO[ReadBusData]]] = Option.when(isLastSlot)(IO( + Vec(2, Flipped(Decoupled(new ReadBusData(parameter: LaneParameter)))) + )) ++ @public ++ val readBusDequeue4: Option[Vec[DecoupledIO[ReadBusData]]] = Option.when(isLastSlot & parameter.zvkEnable)(IO( ++ Vec(4, Flipped(Decoupled(new ReadBusData(parameter: LaneParameter)))) ++ )) + + @public + val readBusRequest: Option[Vec[DecoupledIO[ReadBusData]]] = + Option.when(isLastSlot)(IO(Vec(2, Decoupled(new ReadBusData(parameter))))) + ++ @public ++ val readBusRequest4: Option[Vec[DecoupledIO[ReadBusData]]] = ++ Option.when(isLastSlot & parameter.zvkEnable)(IO(Vec(4, Decoupled(new ReadBusData(parameter))))) ++ + val groupCounter: UInt = enqueue.bits.groupCounter + + // todo: param +@@ -124,12 +139,28 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { + Option.when(isLastSlot)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) + val queueAfterCheckMSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) ++ val queueAfterCheckZvkLSBLSB: Option[Queue[VRFReadQueueEntry]] = ++ Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) ++ val queueAfterCheckZvkLSBMSB: Option[Queue[VRFReadQueueEntry]] = ++ Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, 
readRequestQueueSizeAfterCheck))) ++ val queueAfterCheckZvkMSBLSB: Option[Queue[VRFReadQueueEntry]] = ++ Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) ++ val queueAfterCheckZvkMSBMSB: Option[Queue[VRFReadQueueEntry]] = ++ Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) + + // read request queue for cross read lsb & msb + val queueBeforeCheckLSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) + val queueBeforeCheckMSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) ++ val queueBeforeCheckZvkLSBLSB: Option[Queue[VRFReadQueueEntry]] = ++ Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) ++ val queueBeforeCheckZvkLSBMSB: Option[Queue[VRFReadQueueEntry]] = ++ Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) ++ val queueBeforeCheckZvkMSBLSB: Option[Queue[VRFReadQueueEntry]] = ++ Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) ++ val queueBeforeCheckZvkMSBMSB: Option[Queue[VRFReadQueueEntry]] = ++ Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) + + // pipe from enqueue + val pipeQueue: Queue[LaneStage1Enqueue] = +@@ -147,11 +178,33 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { + val afterCheckQueueVec: Seq[Queue[VRFReadQueueEntry]] = + Seq(queueAfterCheck1, queueAfterCheck2, queueAfterCheckVd) ++ + queueAfterCheckLSB ++ queueAfterCheckMSB +- val allReadQueueReady: Bool = beforeCheckQueueVec.map(_.io.enq.ready).reduce(_ && _) ++ val beforeCheckZvkQueueVec: 
Seq[Queue[VRFReadQueueEntry]] = ++ Seq(queueBeforeCheck1, queueBeforeCheck2, queueBeforeCheckVd) ++ ++ queueBeforeCheckZvkLSBLSB ++ queueBeforeCheckZvkLSBMSB ++ ++ queueBeforeCheckZvkMSBLSB ++ queueBeforeCheckZvkMSBMSB ++ val afterCheckZvkQueueVec: Seq[Queue[VRFReadQueueEntry]] = ++ Seq(queueAfterCheck1, queueAfterCheck2, queueAfterCheckVd) ++ ++ queueAfterCheckZvkLSBLSB ++ queueAfterCheckZvkLSBMSB ++ ++ queueAfterCheckZvkMSBLSB ++ queueAfterCheckZvkMSBMSB ++ val allReadQueueReady: Bool = { ++ val ready = beforeCheckQueueVec.map(_.io.enq.ready).reduce(_ && _) ++ if(parameter.zvkEnable) { ++ val zvkReady = beforeCheckZvkQueueVec.map(_.io.enq.ready).reduce(_ && _) ++ Mux(enqueue.bits.decodeResult(Decoder.crossRead && Decoder.zvk), zvkReady, ready) ++ } else { ++ ready ++ } ++ } + beforeCheckQueueVec.foreach{ q => + q.io.enq.bits.instructionIndex := enqueue.bits.instructionIndex + q.io.enq.bits.groupIndex := enqueue.bits.groupCounter + } ++ if(parameter.zvkEnable) { ++ beforeCheckZvkQueueVec.foreach{ q => ++ q.io.enq.bits.instructionIndex := enqueue.bits.instructionIndex ++ q.io.enq.bits.groupIndex := enqueue.bits.groupCounter ++ } ++ } + + enqueue.ready := allReadQueueReady && pipeQueue.io.enq.ready + +@@ -162,6 +215,14 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { + after.io.enq.valid := before.io.deq.valid && checkResult(i) + after.io.enq.bits := before.io.deq.bits + } ++ if(parameter.zvkEnable) { ++ beforeCheckZvkQueueVec.zip(afterCheckZvkQueueVec).zipWithIndex.foreach { case ((before, after), i) => ++ zvkVrfCheckRequest(i) := before.io.deq.bits ++ before.io.deq.ready := after.io.enq.ready && zvkCheckResult(i) ++ after.io.enq.valid := before.io.deq.valid && zvkCheckResult(i) ++ after.io.enq.bits := before.io.deq.bits ++ } ++ } + // request enqueue + queueBeforeCheck1.io.enq.valid := enqueue.fire && enqueue.bits.decodeResult(Decoder.vtype) && !enqueue.bits.skipRead + queueBeforeCheck2.io.enq.valid := enqueue.fire && 
!enqueue.bits.skipRead +@@ -169,6 +230,11 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { + (queueBeforeCheckLSB ++ queueBeforeCheckMSB).foreach { q => + q.io.enq.valid := enqueue.valid && allReadQueueReady && enqueue.bits.decodeResult(Decoder.crossRead) + } ++ if(parameter.zvkEnable) { ++ (queueBeforeCheckZvkLSBLSB ++ queueBeforeCheckZvkLSBMSB ++ queueBeforeCheckZvkMSBLSB ++ queueBeforeCheckZvkMSBMSB).foreach { q => ++ q.io.enq.valid := enqueue.valid && allReadQueueReady && enqueue.bits.decodeResult(Decoder.crossRead && Decoder.zvk) ++ } ++ } + + // calculate vs + queueBeforeCheck1.io.enq.bits.vs := Mux( +@@ -223,6 +289,30 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { + q.io.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B + } + ++ if(parameter.zvkEnable) { ++ queueBeforeCheckZvkLSBLSB.foreach { ++ q.io.enq.bits.vs := enqueue.bits.vs2 + groupCounter(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1) ++ q.io.enq.bits.readSource := 1.U ++ q.io.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B ++ } ++ queueBeforeCheckZvkLSBMSB.foreach { ++ q.io.enq.bits.vs := enqueue.bits.vs2 + groupCounter(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1) ++ q.io.enq.bits.readSource := 1.U ++ q.io.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B ++ } ++ queueBeforeCheckZvkMSBLSB.foreach { ++ q.io.enq.bits.vs := enqueue.bits.vs2 + groupCounter(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1) ++ q.io.enq.bits.readSource := 1.U ++ q.io.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B ++ } ++ queueBeforeCheckZvkMSBMSB.foreach { ++ q.io.enq.bits.vs := enqueue.bits.vs2 + groupCounter(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1) ++ q.io.enq.bits.readSource := 1.U ++ q.io.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B ++ } ++ } ++ ++ + // read pipe + 
val readPipe0: Instance[VrfReadPipe] = Instantiate(new VrfReadPipe(parameter, arbitrate = false)) + val readPipe1: Instance[VrfReadPipe] = Instantiate(new VrfReadPipe(parameter, arbitrate = isLastSlot)) +@@ -240,6 +330,10 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { + // cross lane queue + val dataQueueLSB = Option.when(isLastSlot)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) + val dataQueueMSB = Option.when(isLastSlot)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) ++ val dataQueueZvkLSBLSB = Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) ++ val dataQueueZvkLSBMSB = Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) // TODO ++ val dataQueueZvkMSBLSB = Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) // TODO ++ val dataQueueZvkMSBMSB = Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) + + val dataQueueNotFull2: Bool = { + val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) +@@ -268,31 +362,71 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { + blockingHandshake(readPipe2.enqueue, queueAfterCheckVd.io.deq, dataQueueNotFullVd) + + // contender for cross read +- readPipe1.contender.zip(queueAfterCheckLSB).foreach { case (port, queue) => +- val dataQueueNotFullLSB: Bool = { +- val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) +- val doEnq = queue.io.deq.fire +- val doDeq = dataQueueLSB.get.io.deq.fire +- val countChange = Mux(doEnq, 1.U, -1.S(log2Ceil(dataQueueSize + 1).W).asUInt) +- when(doEnq ^ doDeq) { +- counterReg := counterReg + countChange ++ if(parameter.zvkEnable) { ++ readPipe1.contender.zip(queueAfterCheckLSB).zip(queueAfterCheckZvkLSBLSB).foreach { case (port, queue, 
zvkQueue) => ++ val dataQueueNotFullLSB: Bool = { ++ val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) ++ val doEnq = queue.io.deq.fire || zvkQueue..io.deq.fire ++ val doDeq = Mux(Decoder.zvk, dataQueueZvkLSBLSB.get.io.deq.fire, dataQueueLSB.get.io.deq.fire) ++ val countChange = Mux(doEnq, 1.U, -1.S(log2Ceil(dataQueueSize + 1).W).asUInt) ++ when(doEnq ^ doDeq) { ++ counterReg := counterReg + countChange ++ } ++ !counterReg(log2Ceil(dataQueueSize)) ++ } ++ Mux( ++ Decoder.zvk, ++ blockingHandshake(port, zvkQueue.io.deq, dataQueueNotFullLSB), ++ blockingHandshake(port, queue.io.deq, dataQueueNotFullLSB) ++ ) ++ } ++ } else { ++ readPipe1.contender.zip(queueAfterCheckLSB).foreach { case (port, queue) => ++ val dataQueueNotFullLSB: Bool = { ++ val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) ++ val doEnq = queue.io.deq.fire ++ val doDeq = dataQueueLSB.get.io.deq.fire ++ val countChange = Mux(doEnq, 1.U, -1.S(log2Ceil(dataQueueSize + 1).W).asUInt) ++ when(doEnq ^ doDeq) { ++ counterReg := counterReg + countChange ++ } ++ !counterReg(log2Ceil(dataQueueSize)) + } +- !counterReg(log2Ceil(dataQueueSize)) ++ blockingHandshake(port, queue.io.deq, dataQueueNotFullLSB) + } +- blockingHandshake(port, queue.io.deq, dataQueueNotFullLSB) + } +- readPipe2.contender.zip(queueAfterCheckMSB).foreach { case (port, queue) => +- val dataQueueNotFullMSB: Bool = { +- val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) +- val doEnq = queue.io.deq.fire +- val doDeq = dataQueueMSB.get.io.deq.fire +- val countChange = Mux(doEnq, 1.U, -1.S(log2Ceil(dataQueueSize + 1).W).asUInt) +- when(doEnq ^ doDeq) { +- counterReg := counterReg + countChange ++ if(parameter.zvkEnable) { ++ readPipe2.contender.zip(queueAfterCheckMSB).zip(queueAfterCheckZvkMSBMSB).foreach { case (port, queue, zvkQueue) => ++ val dataQueueNotFullMSB: Bool = { ++ val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) ++ val doEnq = queue.io.deq.fire || zvkQueue.io.deq.fire ++ val doDeq = 
Mux(Decoder.zvk, dataQueueZvkMSBMSB.get.io.deq.fire, dataQueueMSB.get.io.deq.fire) ++ val countChange = Mux(doEnq, 1.U, -1.S(log2Ceil(dataQueueSize + 1).W).asUInt) ++ when(doEnq ^ doDeq) { ++ counterReg := counterReg + countChange ++ } ++ !counterReg(log2Ceil(dataQueueSize)) ++ } ++ Mux( ++ Decoder.zvk, ++ blockingHandshake(port, zvkQueue.io.deq, dataQueueNotFullMSB), ++ blockingHandshake(port, queue.io.deq, dataQueueNotFullMSB) ++ ) ++ } ++ } else { ++ readPipe2.contender.zip(queueAfterCheckMSB).foreach { case (port, queue) => ++ val dataQueueNotFullMSB: Bool = { ++ val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) ++ val doEnq = queue.io.deq.fire ++ val doDeq = dataQueueMSB.get.io.deq.fire ++ val countChange = Mux(doEnq, 1.U, -1.S(log2Ceil(dataQueueSize + 1).W).asUInt) ++ when(doEnq ^ doDeq) { ++ counterReg := counterReg + countChange ++ } ++ !counterReg(log2Ceil(dataQueueSize)) + } +- !counterReg(log2Ceil(dataQueueSize)) ++ blockingHandshake(port, queue.io.deq, dataQueueNotFullMSB) + } +- blockingHandshake(port, queue.io.deq, dataQueueNotFullMSB) + } + + // data: pipe <-> queue +@@ -301,11 +435,18 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { + dataQueueVs2.io.enq <> readPipe1.dequeue + // pipe1 <> dataQueueLSB + dataQueueLSB.zip(readPipe1.contenderDequeue).foreach { case (sink, source) => sink.io.enq <> source } ++ if(parameter.zvkEnable) { ++ dataQueueZvkLSBLSB.zip(readPipe1.contenderDequeue).foreach { case (sink, source) => sink.io.enq <> source } ++ } ++ + + // pipe2 <-> dataQueueVd + dataQueueVd.io.enq <> readPipe2.dequeue + // pipe2 <-> dataQueueMSB + dataQueueMSB.zip(readPipe2.contenderDequeue).foreach { case (sink, source) => sink.io.enq <> source } ++ if(parameter.zvkEnable) { ++ dataQueueZvkMSBMSB.zip(readPipe2.contenderDequeue).foreach { case (sink, source) => sink.io.enq <> source } ++ } + } else { + dataQueueVs2.io.enq <> readPipe1.dequeue + dataQueueVd.io.enq <> readPipe2.dequeue +@@ -316,6 +457,12 
@@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { + Option.when(isLastSlot)(Module(new Queue(UInt((parameter.datapathWidth * 2).W), 1))) + val crossReadStageFree: Option[Bool] = Option.when(isLastSlot)(Wire(Bool())) + val crossReadUnitOp: Option[Instance[CrossReadUnit]] = Option.when(isLastSlot)(Instantiate(new CrossReadUnit(parameter))) ++ ++ val zvkCrossReadResultQueue: Option[Queue[UInt]] = ++ Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(UInt((parameter.datapathWidth * 4).W), 1))) ++ val zvkCrossReadStageFree: Option[Bool] = Option.when(isLastSlot && parameter.zvkEnable)(Wire(Bool())) ++ val zvkCrossReadUnitOp: Option[Instance[ZvkCrossReadUnit]] = Option.when(isLastSlot && parameter.zvkEnable)(Instantiate(new ZvkCrossReadUnit(parameter))) ++ + if (isLastSlot) { + val dataGroupQueue: Queue[UInt] = + Module( +@@ -327,6 +474,7 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { + // todo: need pipe ? + val laneIndexReg = RegInit(enqueue.bits.laneIndex) + val crossReadUnit = crossReadUnitOp.get ++ + crossReadUnit.dataInputLSB <> dataQueueLSB.get.io.deq + crossReadUnit.dataInputMSB <> dataQueueMSB.get.io.deq + crossReadUnit.laneIndex := laneIndexReg +@@ -341,7 +489,34 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { + assert(dataGroupQueue.io.enq.ready || !dataGroupQueue.io.enq.valid) + dataGroupQueue.io.enq.bits := enqueue.bits.groupCounter + dataGroupQueue.io.deq.ready := crossReadUnit.dataInputLSB.fire +- dequeue.bits.readBusDequeueGroup.get := crossReadUnitOp.get.currentGroup ++ dequeue.bits.readBusDequeueGroup.get := crossReadUnitOp.get.currentGroup // TODO: readBusDequeueGroup is currently unused ++ ++ if(parameter.zvkEnable) { ++ val zvkDataGroupQueue: Queue[UInt] = ++ Module( ++ new Queue( ++ UInt(parameter.groupNumberBits.W), ++ readRequestQueueSizeBeforeCheck + readRequestQueueSizeBeforeCheck + dataQueueSize + 2 ++ ) ++ ) ++ val 
zvkCrossReadUnit = zvkCrossReadUnitOp.get ++ zvkCrossReadUnit.dataInputLSBLSB <> dataQueueZvkLSBLSB.get.io.deq ++ zvkCrossReadUnit.dataInputLSBMSB <> dataQueueZvkLSBMSB.get.io.deq ++ zvkCrossReadUnit.dataInputMSBLSB <> dataQueueZvkMSBLSB.get.io.deq ++ zvkCrossReadUnit.dataInputMSBMSB <> dataQueueZvkMSBMSB.get.io.deq ++ zvkCrossReadUnit.laneIndex := laneIndexReg ++ zvkCrossReadUnit.dataGroup := zvkDataGroupQueue.io.deq.bits ++ readBusRequest4.get.zip(zvkCrossReadUnit.readBusRequest).foreach { case (sink, source) => sink <> source} ++ zvkCrossReadUnit.readBusDequeue.get.zip(readBusDequeue4.get).foreach { case (sink, source) => sink <> source} ++ zvkCrossReadResultQueue.get.io.enq <> zvkCrossReadUnit.crossReadDequeue ++ zvkCrossReadStageFree.get := zvkCrossReadUnit.crossReadStageFree ++ ++ // data group ++ zvkDataGroupQueue.io.enq.valid := enqueue.fire && enqueue.bits.decodeResult(Decoder.crossRead) ++ assert(zvkDataGroupQueue.io.enq.ready || !zvkDataGroupQueue.io.enq.valid) ++ zvkDataGroupQueue.io.enq.bits := enqueue.bits.groupCounter ++ zvkDataGroupQueue.io.deq.ready := zvkCrossReadUnit.dataInputLSBLSB.fire ++ } + } + + val source1Select: UInt = Mux( +@@ -353,6 +528,7 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { + dequeue.bits.groupCounter := pipeQueue.io.deq.bits.groupCounter + dequeue.bits.src := VecInit(Seq(source1Select, dataQueueVs2.io.deq.bits, dataQueueVd.io.deq.bits)) + dequeue.bits.crossReadSource.foreach(_ := crossReadResultQueue.get.io.deq.bits) ++ dequeue.bits.zvkCrossReadSource.foreach(_ := zvkCrossReadResultQueue.get.io.deq.bits) + dequeue.bits.sSendResponse.foreach(_ := pipeQueue.io.deq.bits.sSendResponse.get) + dequeue.bits.decodeResult := pipeQueue.io.deq.bits.decodeResult + dequeue.bits.vSew1H := pipeQueue.io.deq.bits.vSew1H +@@ -375,6 +551,8 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { + dataQueueVd.io.deq.valid || (pipeQueue.io.deq.bits.decodeResult(Decoder.sReadVD)) + 
) ++ + crossReadResultQueue.map(_.io.deq.valid || !pipeQueue.io.deq.bits.decodeResult(Decoder.crossRead)) ++ ++ ++ if(parameter.zvkEnable) zvkCrossReadResultQueue.map(_.io.deq.valid || !pipeQueue.io.deq.bits.decodeResult(Decoder.crossRead && Decoder.zvk)) else Seq() + val allDataQueueValid: Bool = VecInit(dataQueueValidVec).asUInt.andR + dequeue.valid := allDataQueueValid && pipeQueue.io.deq.valid + dataQueueVs1.ready := allDataQueueValid && dequeue.ready && pipeQueue.io.deq.bits.decodeResult(Decoder.vtype) +@@ -382,6 +560,9 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { + dataQueueVd.io.deq.ready := + allDataQueueValid && dequeue.ready && !pipeQueue.io.deq.bits.decodeResult(Decoder.sReadVD) + crossReadResultQueue.foreach(_.io.deq.ready := allDataQueueValid && dequeue.ready && pipeQueue.io.deq.bits.decodeResult(Decoder.crossRead)) ++ if(parameter.zvkEnable) { ++ zvkCrossReadResultQueue.foreach(_.io.deq.ready := allDataQueueValid && dequeue.ready && pipeQueue.io.deq.bits.decodeResult(Decoder.crossRead && Decoder.zvk)) ++ } + stageValid := pipeQueue.io.deq.valid + val stageFinish = !stageValid + +@@ -400,10 +581,26 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { + @public + val sSendCrossReadResultMSBProbe = Option.when(isLastSlot)(IO(Output(Probe(Bool())))) + @public ++ val sSendZvkCrossReadResultLSBLSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) ++ @public ++ val sSendZvkCrossReadResultLSBMSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) ++ @public ++ val sSendZvkCrossReadResultMSBLSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) ++ @public ++ val sSendZvkCrossReadResultMSBMSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) ++ @public + val wCrossReadLSBProbe = Option.when(isLastSlot)(IO(Output(Probe(Bool())))) + @public + val wCrossReadMSBProbe = 
Option.when(isLastSlot)(IO(Output(Probe(Bool())))) + @public ++ val wZvkCrossReadLSBLSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) ++ @public ++ val wZvkCrossReadLSBMSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) ++ @public ++ val wZvkCrossReadMSBLSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) ++ @public ++ val wZvkCrossReadMSBMSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) ++ @public + val vrfReadRequestProbe: Seq[(Bool, Bool)] = Seq.fill(3)((IO(Output(Probe(Bool()))),IO(Output(Probe(Bool()))))) + + +@@ -416,8 +613,19 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { + readFinishProbe.foreach(p => define(p, ProbeValue(dataQueueVs2.io.deq.valid))) + sSendCrossReadResultLSBProbe.foreach(p => define(p, ProbeValue(crossReadUnitOp.get.crossWriteState.sSendCrossReadResultLSB))) + sSendCrossReadResultMSBProbe.foreach(p => define(p, ProbeValue(crossReadUnitOp.get.crossWriteState.sSendCrossReadResultMSB))) ++ println(sSendCrossReadResultMSBProbe) + wCrossReadLSBProbe.foreach(p => define(p, ProbeValue(crossReadUnitOp.get.crossWriteState.wCrossReadLSB))) + wCrossReadMSBProbe.foreach(p => define(p, ProbeValue(crossReadUnitOp.get.crossWriteState.wCrossReadMSB))) ++ if (parameter.zvkEnable) { ++ sSendZvkCrossReadResultLSBLSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.sSendCrossReadResult(0)))) ++ sSendZvkCrossReadResultLSBMSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.sSendCrossReadResult(1)))) ++ sSendZvkCrossReadResultMSBLSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.sSendCrossReadResult(2)))) ++ sSendZvkCrossReadResultMSBMSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.sSendCrossReadResult(3)))) ++ wZvkCrossReadLSBLSBProbe.foreach(p => define(p, 
ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.wCrossRead(0)))) ++ wZvkCrossReadLSBMSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.wCrossRead(1)))) ++ wZvkCrossReadMSBLSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.wCrossRead(2)))) ++ wZvkCrossReadMSBMSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.wCrossRead(3)))) ++ } + } + + vrfReadRequestProbe.zipWithIndex.foreach { case((ready, valid), i) => +diff --git a/t1/src/laneStage/LaneStage3.scala b/t1/src/laneStage/LaneStage3.scala +index 8ccc6fbc..8e8e273b 100644 +--- a/t1/src/laneStage/LaneStage3.scala ++++ b/t1/src/laneStage/LaneStage3.scala +@@ -56,6 +56,9 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { + @public + val crossWritePort: Option[Vec[DecoupledIO[WriteBusData]]] = + Option.when(isLastSlot)(IO(Vec(2, Decoupled(new WriteBusData(parameter))))) ++ @public ++ val crossWritePort4: Option[Vec[DecoupledIO[WriteBusData]]] = ++ Option.when(isLastSlot & parameter.zvkEnable)(IO(Vec(4, Decoupled(new WriteBusData(parameter))))) + + val stageValidReg: Option[Bool] = Option.when(isLastSlot) (RegInit(false.B)) + +@@ -107,6 +110,16 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { + sendState(index) := true.B + } + } ++ crossWritePort4.get.zipWithIndex.foreach { case (port, index) => ++ port.valid := stageValidReg.get && !sendState(index) ++ port.bits.mask := pipeEnqueue.get.mask(2 * index + 1, 2 * index) ++ port.bits.data := pipeEnqueue.get.crossWriteData(index) ++ port.bits.counter := pipeEnqueue.get.groupCounter ++ port.bits.instructionIndex := pipeEnqueue.get.instructionIndex ++ when(port.fire) { ++ sendState(index) := true.B ++ } ++ } // TODO + // scheduler synchronization + val schedulerFinish: Bool = (sSendResponse ++ wResponseFeedback).reduce(_ && _) + +@@ -185,4 +198,4 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) 
extends Module { + vrfWriteRequest <> vrfWriteQueue.io.deq + vrfWriteRequest.bits.offset := vrfPtrReplica.io.deq.bits + vrfWriteRequest.valid := vrfPtrReplica.io.deq.valid +-} +\ No newline at end of file ++} +diff --git a/t1/src/laneStage/SlotTokenManager.scala b/t1/src/laneStage/SlotTokenManager.scala +index ae8bb531..a9619641 100644 +--- a/t1/src/laneStage/SlotTokenManager.scala ++++ b/t1/src/laneStage/SlotTokenManager.scala +@@ -73,6 +73,9 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { + @public + val crossWriteReports: Vec[ValidIO[UInt]] = IO(Vec(2, Flipped(Valid(UInt(parameter.instructionIndexBits.W))))) + ++ @public ++ val crossWriteReports4: Vec[ValidIO[UInt]] = IO(Vec(4, Flipped(Valid(UInt(parameter.instructionIndexBits.W))))) ++ + @public + val responseReport: ValidIO[UInt] = IO(Flipped(Valid(UInt(parameter.instructionIndexBits.W)))) + +diff --git a/t1/src/laneStage/ZvkCrossReadUnit.scala b/t1/src/laneStage/ZvkCrossReadUnit.scala +new file mode 100644 +index 00000000..bb3490d7 +--- /dev/null ++++ b/t1/src/laneStage/ZvkCrossReadUnit.scala +@@ -0,0 +1,133 @@ ++// SPDX-License-Identifier: Apache-2.0 ++// SPDX-FileCopyrightText: 2022 Jiuyang Liu ++ ++package org.chipsalliance.t1.rtl.lane ++ ++import chisel3._ ++import chisel3.experimental.hierarchy.{instantiable, public} ++import chisel3.util._ ++import org.chipsalliance.t1.rtl.{LaneParameter, ReadBusData} ++ ++class ZvkCrossReadState extends Bundle { ++ val sSendCrossReadResult: Seq[Bool] = Seq.fill(4)(Bool()) ++ val wCrossRead: Seq[Bool] = Seq.fill(4)(Bool()) ++} ++ ++@instantiable ++class ZvkCrossReadUnit(parameter: LaneParameter) extends Module { ++ @public ++ val dataInputLSBLSB: DecoupledIO[UInt] = IO(Flipped(Decoupled(UInt(parameter.datapathWidth.W)))) ++ @public ++ val dataInputLSBMSB: DecoupledIO[UInt] = IO(Flipped(Decoupled(UInt(parameter.datapathWidth.W)))) ++ @public ++ val dataInputMSBLSB: DecoupledIO[UInt] = IO(Flipped(Decoupled(UInt(parameter.datapathWidth.W)))) ++ @public 
++ val dataInputMSBMSB: DecoupledIO[UInt] = IO(Flipped(Decoupled(UInt(parameter.datapathWidth.W)))) ++ @public ++ val laneIndex: UInt = IO(Input(UInt(parameter.laneNumberBits.W))) ++ @public ++ val dataGroup: UInt = IO(Input(UInt(parameter.groupNumberBits.W))) ++ @public ++ val currentGroup: UInt = IO(Output(UInt(parameter.groupNumberBits.W))) ++ ++ @public ++ val readBusDequeue: Option[Vec[DecoupledIO[ReadBusData]]] = ++ Option.when(parameter.zvkEnable)( ++ IO( ++ Vec(4, Flipped(Decoupled(new ReadBusData(parameter: LaneParameter))) ++ )) ++ ) ++ @public ++ val readBusRequest: Option[Vec[DecoupledIO[ReadBusData]]] = ++ Option.when(parameter.zvkEnable)(IO(Vec(4, Decoupled(new ReadBusData(parameter))))) ++ ++ @public ++ val crossReadDequeue: DecoupledIO[UInt] = IO(Decoupled(UInt((parameter.datapathWidth * 4).W))) ++ @public ++ val crossReadStageFree: Bool = IO(Output(Bool())) ++ @public ++ val crossWriteState = IO(Output(new ZvkCrossReadState)) ++ ++ val stageValid: Bool = RegInit(false.B) ++ val sSendCrossReadResultLSBLSB, sSendCrossReadResultMSBLSB, wCrossReadLSBLSB, wCrossReadMSBLSB = RegInit(true.B) ++ val sSendCrossReadResultLSBMSB, sSendCrossReadResultMSBMSB, wCrossReadLSBMSB, wCrossReadMSBMSB = RegInit(true.B) ++ val stateVec: Seq[Bool] = Seq( ++ sSendCrossReadResultLSBLSB, ++ sSendCrossReadResultLSBMSB, ++ sSendCrossReadResultMSBLSB, ++ sSendCrossReadResultMSBMSB, ++ wCrossReadLSBLSB, ++ wCrossReadLSBMSB, ++ wCrossReadMSBLSB, ++ wCrossReadMSBMSB, ++ ) ++ val sendDataVec: Vec[UInt] = RegInit(VecInit(Seq.fill(4)(0.U(parameter.datapathWidth.W)))) ++ val groupCounter: UInt = RegInit(0.U(parameter.groupNumberBits.W)) ++ val receiveDataVec: Vec[UInt] = RegInit(VecInit(Seq.fill(4)(0.U(parameter.datapathWidth.W)))) ++ val sendState = Seq( ++ sSendCrossReadResultLSBLSB, ++ sSendCrossReadResultLSBMSB, ++ sSendCrossReadResultMSBLSB, ++ sSendCrossReadResultMSBMSB, ++ ) ++ val receiveState = Seq( ++ wCrossReadLSBLSB, ++ wCrossReadLSBMSB, ++ wCrossReadMSBLSB, ++ 
wCrossReadMSBMSB, ++ ) ++ ++ readBusRequest.get.zipWithIndex.foreach { case (port, index) => ++ port.valid := stageValid && !sendState(index) ++ port.bits.data := sendDataVec(index) ++ when(port.fire) { sendState(index) := true.B} ++ } ++ ++ readBusDequeue.get.zipWithIndex.foreach { case (port, index) => ++ when(port.fire) { ++ receiveState(index) := true.B ++ receiveDataVec(index) := port.bits.data ++ } ++ port.ready := !receiveState(index) ++ } ++ val allStateReady: Bool = stateVec.reduce(_ && _) ++ val stageReady: Bool = !stageValid || (allStateReady && crossReadDequeue.ready) ++ val allSourceValid: Bool = Seq( ++ dataInputLSBLSB.valid, ++ dataInputLSBMSB.valid, ++ dataInputMSBLSB.valid, ++ dataInputMSBMSB.valid, ++ ).reduce(_ && _) ++ val enqueueFire: Bool = stageReady && allSourceValid ++ dataInputLSBLSB.ready := allSourceValid && stageReady ++ dataInputLSBMSB.ready := allSourceValid && stageReady ++ dataInputMSBLSB.ready := allSourceValid && stageReady ++ dataInputMSBMSB.ready := allSourceValid && stageReady ++ ++ when(enqueueFire ^ crossReadDequeue.fire) { ++ stageValid := enqueueFire ++ } ++ when(enqueueFire) { ++ stateVec.foreach(_ := false.B) ++ sendDataVec := VecInit(Seq( ++ dataInputLSBLSB.bits, ++ dataInputLSBMSB.bits, ++ dataInputMSBLSB.bits, ++ dataInputMSBMSB.bits, ++ )) ++ groupCounter := dataGroup ++ } ++ currentGroup := groupCounter ++ crossReadDequeue.bits := receiveDataVec.asUInt ++ crossReadDequeue.valid := allStateReady && stageValid ++ crossReadStageFree := (!stageValid) && stateVec.reduce(_ && _) ++ ++ crossWriteState.sSendCrossReadResult(0) := sSendCrossReadResultLSBLSB ++ crossWriteState.sSendCrossReadResult(1) := sSendCrossReadResultLSBMSB ++ crossWriteState.sSendCrossReadResult(2) := sSendCrossReadResultMSBLSB ++ crossWriteState.sSendCrossReadResult(3) := sSendCrossReadResultMSBMSB ++ crossWriteState.wCrossRead(0) := wCrossReadLSBLSB ++ crossWriteState.wCrossRead(1) := wCrossReadLSBMSB ++ crossWriteState.wCrossRead(2) := 
wCrossReadMSBLSB ++ crossWriteState.wCrossRead(3) := wCrossReadMSBMSB ++} +diff --git a/t1/src/vrf/VRF.scala b/t1/src/vrf/VRF.scala +index 724ef637..6c04fc6d 100644 +--- a/t1/src/vrf/VRF.scala ++++ b/t1/src/vrf/VRF.scala +@@ -150,9 +150,15 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar + val readCheck: Vec[VRFReadRequest] = IO(Vec(parameter.chainingSize * 3 + 2, Input( + new VRFReadRequest(parameter.regNumBits, parameter.vrfOffsetBits, parameter.instructionIndexBits) + ))) ++ @public ++ val zvkReadCheck: Vec[VRFReadRequest] = IO(Vec(parameter.chainingSize * 3 + 4, Input( ++ new VRFReadRequest(parameter.regNumBits, parameter.vrfOffsetBits, parameter.instructionIndexBits) ++ ))) + + @public + val readCheckResult: Vec[Bool] = IO(Vec(parameter.chainingSize * 3 + 2, Output(Bool()))) ++ @public ++ val zvkReadCheckResult: Vec[Bool] = IO(Vec(parameter.chainingSize * 3 + 4, Output(Bool()))) + + /** VRF read results. */ + @public +@@ -273,6 +279,22 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar + checkModule.checkResult + }.reduce(_ && _) + } ++ zvkReadCheck.zip(zvkReadCheckResult).foreach { case (req, res) => ++ val recordSelect = chainingRecord ++ // 先找到自的record ++ val readRecord = ++ Mux1H(recordSelect.map(_.bits.instIndex === req.instructionIndex), recordSelect.map(_.bits)) ++ res := ++ recordSelect.zip(recordValidVec).zipWithIndex.map { ++ case ((r, f), recordIndex) => ++ val checkModule = Instantiate(new ChainingCheck(parameter)) ++ checkModule.read := req ++ checkModule.readRecord := readRecord ++ checkModule.record := r ++ checkModule.recordValid := f ++ checkModule.checkResult ++ }.reduce(_ && _) ++ } + + val checkSize: Int = readRequests.size + val (firstOccupied, secondOccupied) = readRequests.zipWithIndex.foldLeft( diff --git a/configgen/generated/blastoise.json b/configgen/generated/blastoise.json index 290ef86c16..241ea7efff 100644 --- a/configgen/generated/blastoise.json +++ 
b/configgen/generated/blastoise.json @@ -167,8 +167,9 @@ ] ] ], - "zvbbModuleParameters": [] + "zvbbModuleParameters": [], + "zvkModuleParameters": [] } }, "generator": "org.chipsalliance.t1.rtl.T1" -} \ No newline at end of file +} diff --git a/configgen/generated/machamp.json b/configgen/generated/machamp.json index ceeaf5e59d..144ce49fa0 100644 --- a/configgen/generated/machamp.json +++ b/configgen/generated/machamp.json @@ -151,8 +151,9 @@ ] ], "floatModuleParameters": [], - "zvbbModuleParameters": [] + "zvbbModuleParameters": [], + "zvkModuleParameters": [] } }, "generator": "org.chipsalliance.t1.rtl.T1" -} \ No newline at end of file +} diff --git a/configgen/generated/psyduck.json b/configgen/generated/psyduck.json index 04a2f3572a..9efbc04e46 100644 --- a/configgen/generated/psyduck.json +++ b/configgen/generated/psyduck.json @@ -184,8 +184,25 @@ 3 ] ] + ], + "zvkModuleParameters": [ + [ + { + "parameter": { + "datapathWidth": 32, + "latency": 3 + }, + "generator": "org.chipsalliance.t1.rtl.LaneZvk" + }, + [ + 0, + 1, + 2, + 3 + ] + ] ] } }, "generator": "org.chipsalliance.t1.rtl.T1" -} \ No newline at end of file +} diff --git a/configgen/generated/sandslash.json b/configgen/generated/sandslash.json index 688085fe1f..08d52748a3 100644 --- a/configgen/generated/sandslash.json +++ b/configgen/generated/sandslash.json @@ -151,8 +151,9 @@ ] ], "floatModuleParameters": [], - "zvbbModuleParameters": [] + "zvbbModuleParameters": [], + "zvkModuleParameters": [] } }, "generator": "org.chipsalliance.t1.rtl.T1" -} \ No newline at end of file +} diff --git a/configgen/src/Main.scala b/configgen/src/Main.scala index 88e3bc3268..357707f3ac 100644 --- a/configgen/src/Main.scala +++ b/configgen/src/Main.scala @@ -100,7 +100,8 @@ object Main { Seq(0, 1, 2, 3))), floatModuleParameters = Seq((SerializableModuleGenerator(classOf[LaneFloat], LaneFloatParam(32, 3)), Seq(0, 1, 2, 3))), - zvbbModuleParameters = Seq() + zvbbModuleParameters = Seq(), + zvkModuleParameters = Seq(), 
) ) if (doEmit) param.emit(targetFile) @@ -151,7 +152,9 @@ object Main { floatModuleParameters = Seq((SerializableModuleGenerator(classOf[LaneFloat], LaneFloatParam(32, 3)), Seq(0, 1, 2, 3))), zvbbModuleParameters = - Seq((SerializableModuleGenerator(classOf[LaneZvbb], LaneZvbbParam(32, 3)), Seq(0, 1, 2, 3))) + Seq((SerializableModuleGenerator(classOf[LaneZvbb], LaneZvbbParam(32, 3)), Seq(0, 1, 2, 3))), + zvkModuleParameters = + Seq((SerializableModuleGenerator(classOf[LaneZvk], LaneZvkParam(32, 3)), Seq(0, 1, 2, 3))), ) ) if (doEmit) param.emit(targetFile) @@ -201,7 +204,8 @@ object Main { ), Seq(0, 1, 2, 3))), floatModuleParameters = Seq(), - zvbbModuleParameters = Seq() // TODO + zvbbModuleParameters = Seq(), + zvkModuleParameters = Seq(), ) ) if (doEmit) param.emit(targetFile) @@ -251,7 +255,8 @@ object Main { ), Seq(0, 1, 2, 3))), floatModuleParameters = Seq(), - zvbbModuleParameters = Seq() // TODO + zvbbModuleParameters = Seq(), + zvkModuleParameters = Seq(), ) ) if (doEmit) param.emit(targetFile) diff --git a/t1/src/Bundles.scala b/t1/src/Bundles.scala index bb8a36f413..3f21b0664d 100644 --- a/t1/src/Bundles.scala +++ b/t1/src/Bundles.scala @@ -619,6 +619,7 @@ class ExecutionUnitRecord(parameter: LaneParameter)(isLastSlot: Boolean) extends val executeIndex: Bool = Bool() val source: Vec[UInt] = Vec(3, UInt(parameter.datapathWidth.W)) val crossReadSource: Option[UInt] = Option.when(isLastSlot)(UInt((parameter.datapathWidth * 2).W)) + val zvkCrossReadSource: Option[UInt] = Option.when(isLastSlot && parameter.zvkEnable)(UInt((parameter.datapathWidth * 4).W)) // 4 datapath words, same width as LaneExecuteRequest.zvkCrossReadSource, so the bridge's := does not truncate /** groupCounter need use to update `Lane.maskFormatResultForGroup` */ val groupCounter: UInt = UInt(parameter.groupNumberBits.W) val sSendResponse: Option[Bool] = Option.when(isLastSlot)(Bool()) @@ -722,4 +723,4 @@ class T1Retire(xLen: Int) extends Bundle { val rd: ValidIO[T1RdRetire] = Valid(new T1RdRetire(xLen)) val csr: ValidIO[T1CSRRetire] = Valid(new T1CSRRetire) val mem: ValidIO[EmptyBundle] = 
Valid(new EmptyBundle) -} \ No newline at end of file +} diff --git a/t1/src/Lane.scala b/t1/src/Lane.scala index 59a7eb8a66..90ea5e4c6a 100644 --- a/t1/src/Lane.scala +++ b/t1/src/Lane.scala @@ -81,16 +81,18 @@ object LaneParameter { * TODO: cover the queue full. */ case class LaneParameter( - vLen: Int, - datapathWidth: Int, - laneNumber: Int, - chainingSize: Int, - crossLaneVRFWriteEscapeQueueSize: Int, - fpuEnable: Boolean, - portFactor: Int, - vrfRamType: RamType, - decoderParam: DecoderParam, - vfuInstantiateParameter: VFUInstantiateParameter) + vLen: Int, + datapathWidth: Int, + laneNumber: Int, + chainingSize: Int, + crossLaneVRFWriteEscapeQueueSize: Int, + crossLaneVRFWriteEscapeZvkQueueSize: Int, + fpuEnable: Boolean, + zvkEnable: Boolean, + portFactor: Int, + vrfRamType: RamType, + decoderParam: DecoderParam, + vfuInstantiateParameter: VFUInstantiateParameter) extends SerializableModuleParameter { /** 1 in MSB for instruction order. */ @@ -132,7 +134,7 @@ case class LaneParameter( * * for each number in table below, it represent a [[datapathWidth]] * {{{ - * lane0 | lane1 | ... | lane8 + * lane0 | lane1 | ... | lane7 * offset0 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 * offset1 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 * offset2 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 @@ -210,8 +212,26 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ * TODO: benchmark the usecase for tuning the Ring Bus width. * find a real world case for using `narrow` and `widen` aggressively. 
*/ + // 0: 0.0 - 0.1 + // 1: 0.2 - 0.3 + // 2: 0.4 - 0.5 + // 3: 0.6 - 0.7 + // 4: 1.0 - 1.1 + // 5: 1.2 - 1.3 + // 6: 1.4 - 1.5 + // 7: 1.6 - 1.7 + + // 0: 0.0 - 0.1 - 0.2 - 0.3 + // 1: 0.4 - 0.5 - 0.6 - 0.7 + // 2: 1.0 - 1.1 - 1.2 - 1.3 + // 3: 1.4 - 1.5 - 1.6 - 1.7 + // 4: 2.0 - 2.1 - 2.2 - 2.3 + // 5: 2.4 - 2.5 - 2.6 - 2.7 + // 6: 3.0 - 3.1 - 3.2 - 3.3 + // 7: 3.4 - 3.5 - 3.6 - 3.7 @public val readBusPort: Vec[RingPort[ReadBusData]] = IO(Vec(2, new RingPort(new ReadBusData(parameter)))) + val readBusPort4: Option[Vec[RingPort[ReadBusData]]] = Option.when(parameter.zvkEnable)(IO(Vec(4, new RingPort(new ReadBusData(parameter))))) /** VRF Write Interface. * only used for `narrow` an `widen` @@ -220,6 +240,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ */ @public val writeBusPort: Vec[RingPort[WriteBusData]] = IO(Vec(2, new RingPort(new WriteBusData(parameter)))) + val writeBusPort4: Option[Vec[RingPort[WriteBusData]]] = Option.when(parameter.zvkEnable)(IO(Vec(4, new RingPort(new WriteBusData(parameter))))) /** request from [[T1.decode]] to [[Lane]]. */ @public @@ -320,6 +341,9 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // TODO: remove dontTouch(writeBusPort) + if(parameter.zvkEnable) { + dontTouch(writeBusPort4.get) + } /** VRF instantces. 
*/ val vrf: Instance[VRF] = Instantiate(new VRF(parameter.vrfParam)) @@ -440,8 +464,12 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ val readCheckRequestVec: Vec[VRFReadRequest] = Wire(Vec(parameter.chainingSize * 3 + 2, new VRFReadRequest(parameter.vrfParam.regNumBits, parameter.vrfOffsetBits, parameter.instructionIndexBits) )) + val zvkReadCheckRequestVec: Vec[VRFReadRequest] = Wire(Vec(parameter.chainingSize * 3 + 4, + new VRFReadRequest(parameter.vrfParam.regNumBits, parameter.vrfOffsetBits, parameter.instructionIndexBits) + )) val readCheckResult: Vec[Bool] = Wire(Vec(parameter.chainingSize * 3 + 2, Bool())) + val zvkReadCheckResult: Vec[Bool] = Wire(Vec(parameter.chainingSize * 3 + 4, Bool())) /** signal used for prohibiting slots to access VRF. * a slot will become inactive when: @@ -465,7 +493,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ val slotCanShift: Vec[Bool] = Wire(Vec(parameter.chainingSize, Bool())) /** Which data group is waiting for the result of the cross-lane read */ - val readBusDequeueGroup: UInt = Wire(UInt(parameter.groupNumberBits.W)) + val readBusDequeueGroup: UInt = Wire(UInt(parameter.groupNumberBits.W)) // TODO: readBusDequeueGroup is currently unused /** enqueue valid for execution unit */ val executeEnqueueValid: Vec[Bool] = Wire(Vec(parameter.chainingSize, Bool())) @@ -514,6 +542,18 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ pipe = true ) )) + val crossLaneWriteQueue4: Seq[Queue[VRFWriteRequest]] = Seq.tabulate(4)(i => Module( + new Queue( + new VRFWriteRequest( + parameter.vrfParam.regNumBits, + parameter.vrfOffsetBits, + parameter.instructionIndexBits, + parameter.datapathWidth + ), + parameter.crossLaneVRFWriteEscapeZvkQueueSize, + pipe = true + ) + )) val maskedWriteUnit: Instance[MaskedWrite] = Instantiate(new MaskedWrite(parameter)) val tokenManager: Instance[SlotTokenManager] = Instantiate(new 
SlotTokenManager(parameter)) slotControl.zipWithIndex.foreach { @@ -661,9 +701,17 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ readCheckRequestVec((parameter.chainingSize - index - 1) * 3 + portIndex) := stage1.vrfCheckRequest(portIndex) stage1.checkResult(portIndex) := readCheckResult((parameter.chainingSize - index - 1) * 3 + portIndex) } + val zvkCheckSize = if (isLastSlot && parameter.zvkEnable) 7 else 3 // must equal the size of stage1.zvkVrfCheckRequest (LaneStage1.zvkReadCheckSize) + if(parameter.zvkEnable) { + Seq.tabulate(zvkCheckSize){ portIndex => + zvkReadCheckRequestVec((parameter.chainingSize - index - 1) * 3 + portIndex) := stage1.zvkVrfCheckRequest(portIndex) + stage1.zvkCheckResult(portIndex) := zvkReadCheckResult((parameter.chainingSize - index - 1) * 3 + portIndex) + } + } // connect cross read bus if(isLastSlot) { val tokenSize = parameter.crossLaneVRFWriteEscapeQueueSize + val zvKTokenSize = parameter.crossLaneVRFWriteEscapeZvkQueueSize readBusPort.zipWithIndex.foreach {case (readPort, portIndex) => // tx val tokenReg = RegInit(0.U(log2Ceil(tokenSize + 1).W)) @@ -685,6 +733,27 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // dequeue to cross read unit stage1.readBusDequeue.get(portIndex) <> queue.io.deq } + readBusPort4.toSeq.flatten.zipWithIndex.foreach {case (readPort, portIndex) => // readBusPort4 is None when Zvk is disabled: iterate zero ports instead of calling .get + // tx + val tokenReg = RegInit(0.U(log2Ceil(zvKTokenSize + 1).W)) + val tokenReady: Bool = tokenReg =/= zvKTokenSize.U + stage1.readBusRequest4.get(portIndex).ready := tokenReady + readPort.deq.valid := stage1.readBusRequest4.get(portIndex).valid && tokenReady + readPort.deq.bits := stage1.readBusRequest4.get(portIndex).bits + val tokenUpdate = Mux(readPort.deq.valid, 1.U, -1.S(tokenReg.getWidth.W).asUInt) + when(readPort.deq.valid ^ readPort.deqRelease) { + tokenReg := tokenReg + tokenUpdate + } + // rx + // rx queue + val queue = Module(new Queue(chiselTypeOf(readPort.deq.bits), zvKTokenSize, pipe=true)) + queue.io.enq.valid := readPort.enq.valid + queue.io.enq.bits := 
readPort.enq.bits + readPort.enqRelease := queue.io.deq.fire + assert(queue.io.enq.ready || !readPort.enq.valid) + // dequeue to cross read unit + stage1.readBusDequeue4.get(portIndex) <> queue.io.deq + } // cross write writeBusPort.zipWithIndex.foreach {case (writePort, portIndex) => @@ -694,6 +763,19 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ writePort.deq.bits := stage3.crossWritePort.get(portIndex).bits stage3.crossWritePort.get(portIndex).ready := tokenReady + // update token + val tokenUpdate = Mux(writePort.deq.valid, 1.U, -1.S(tokenReg.getWidth.W).asUInt) + when(writePort.deq.valid ^ writePort.deqRelease) { + tokenReg := tokenReg + tokenUpdate + } + } + writeBusPort4.toSeq.flatten.zipWithIndex.foreach {case (writePort, portIndex) => // writeBusPort4 is None when Zvk is disabled: iterate zero ports instead of calling .get + val tokenReg = RegInit(0.U(log2Ceil(zvKTokenSize + 1).W)) + val tokenReady: Bool = tokenReg =/= zvKTokenSize.U + writePort.deq.valid := stage3.crossWritePort4.get(portIndex).valid && tokenReady + writePort.deq.bits := stage3.crossWritePort4.get(portIndex).bits + stage3.crossWritePort4.get(portIndex).ready := tokenReady + // update token val tokenUpdate = Mux(writePort.deq.valid, 1.U, -1.S(tokenReg.getWidth.W).asUInt) when(writePort.deq.valid ^ writePort.deqRelease) { @@ -892,7 +974,9 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ } vrf.readCheck.zip(readCheckRequestVec).foreach{case (sink, source) => sink := source} + vrf.zvkReadCheck.zip(zvkReadCheckRequestVec).foreach{case (sink, source) => sink := source} readCheckResult.zip(vrf.readCheckResult).foreach{case (sink, source) => sink := source} + zvkReadCheckResult.zip(vrf.zvkReadCheckResult).foreach{case (sink, source) => sink := source} allVrfWriteAfterCheck.zipWithIndex.foreach { case (req, i) => val check = vrf.writeAllow(i) diff --git a/t1/src/LaneZvk.scala b/t1/src/LaneZvk.scala new file mode 100644 index 0000000000..7764c680aa --- /dev/null +++ b/t1/src/LaneZvk.scala @@ -0,0 +1,47 @@ +// 
SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl + +import chisel3.experimental.hierarchy.instantiable +import chisel3._ +import chisel3.experimental.{SerializableModule, SerializableModuleParameter} +import chisel3.util._ +import org.chipsalliance.t1.rtl.decoder.{BoolField, Decoder} + +object LaneZvkParam { + implicit def rw: upickle.default.ReadWriter[LaneZvkParam] = upickle.default.macroRW +} + +case class LaneZvkParam(datapathWidth: Int, latency: Int) extends VFUParameter with SerializableModuleParameter { + val inputBundle = new LaneZvkRequest(datapathWidth) + val decodeField: BoolField = Decoder.zvk // select this unit on the Zvk decode bit (was Decoder.zvbb, copied from LaneZvbb) + val outputBundle = new LaneZvkResponse(datapathWidth) + override val NeedSplit: Boolean = false +} + +class LaneZvkRequest(datapathWidth: Int) extends VFUPipeBundle { + val src = Vec(3, UInt(datapathWidth.W)) + val opcode = UInt(4.W) + val vSew = UInt(2.W) + val shifterSize = UInt(log2Ceil(datapathWidth).W) +} + +class LaneZvkResponse(datapathWidth: Int) extends VFUPipeBundle { + val data = UInt(datapathWidth.W) +} + +@instantiable +class LaneZvk(val parameter: LaneZvkParam) + extends VFUModule(parameter) with SerializableModule[LaneZvkParam]{ + val response: LaneZvkResponse = Wire(new LaneZvkResponse(parameter.datapathWidth)) + val request : LaneZvkRequest = connectIO(response).asTypeOf(parameter.inputBundle) + + val zvbbSrc: UInt = request.src(1) // vs2 + val zvbbRs: UInt = request.src(0) // vs1 or rs1 + val vSew: UInt = UIntToOH(request.vSew) // sew = 0, 1, 2 + + response.data := Mux1H(UIntToOH(request.opcode), Seq( // NOTE(review): placeholder - the input list is empty; Mux1H presumably needs at least one entry to elaborate, confirm before enabling Zvk + )) +} + diff --git a/t1/src/T1.scala b/t1/src/T1.scala index 5a36bb040b..0d20680e09 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -122,6 +122,14 @@ case class T1Parameter( instruction => instruction.instructionSet.name match { case "rv_v" => true case "rv_zvbb" => if (zvbbEnable) true else false + // Zvk + case "rv_zvkg" => if (zvkEnable) true else false + // case 
"rv_zvkn" => if (zvkEnable) true else false // TODO: no implementations for SEW=64 + case "rv_zvkned" => if (zvkEnable) true else false + case "rv_zvknha" => if (zvkEnable) true else false + // case "rv_zvknhb" => if (zvkEnable) true else false // TODO: no implementations for SEW=64 + case "rv_zvksed" => if (zvkEnable) true else false + case "rv_zvksh" => if (zvkEnable) true else false case _ => false }} ++ t1customInstructions.map(_.instruction) @@ -143,15 +151,19 @@ case class T1Parameter( /** TODO: configure it. */ val instructionQueueSize: Int = 4 - /** crosslane write token size */ + /** crosslane write token size, unclear how many would be good */ val vrfWriteQueueSize: Int = 4 + val vrfWriteZvkQueueSize: Int = 8 /** does t1 has floating datapath? */ val fpuEnable: Boolean = extensions.contains("Zve32f") - /** support of zvbb */ + /** support of Zvbb */ lazy val zvbbEnable: Boolean = extensions.contains("Zvbb") + /** support of Zvk */ + lazy val zvkEnable: Boolean = extensions.contains("Zvk") + /** how many chaining does T1 support, this is not a parameter yet. */ val chainingSize: Int = 4 @@ -225,7 +237,7 @@ case class T1Parameter( // and the values are their respective delays. val crossLaneConnectCycles: Seq[Seq[Int]] = Seq.tabulate(laneNumber)(_ => Seq(1, 1)) - val decoderParam: DecoderParam = DecoderParam(fpuEnable, zvbbEnable, allInstructions) + val decoderParam: DecoderParam = DecoderParam(fpuEnable, zvbbEnable, zvkEnable, allInstructions) /** paraemter for AXI4. 
*/ val axi4BundleParameter: AXI4BundleParameter = AXI4BundleParameter( @@ -261,7 +273,9 @@ case class T1Parameter( laneNumber = laneNumber, chainingSize = chainingSize, crossLaneVRFWriteEscapeQueueSize = vrfWriteQueueSize, + crossLaneVRFWriteEscapeZvkQueueSize = vrfWriteZvkQueueSize, fpuEnable = fpuEnable, + zvkEnable = zvkEnable, portFactor = vrfBankSize, vrfRamType = vrfRamType, decoderParam = decoderParam, diff --git a/t1/src/VectorFunctionUnit.scala b/t1/src/VectorFunctionUnit.scala index cf06a66afe..cee5252566 100644 --- a/t1/src/VectorFunctionUnit.scala +++ b/t1/src/VectorFunctionUnit.scala @@ -106,7 +106,8 @@ case class VFUInstantiateParameter( divfpModuleParameters: Seq[(SerializableModuleGenerator[LaneDivFP, LaneDivFPParam], Seq[Int])], otherModuleParameters: Seq[(SerializableModuleGenerator[OtherUnit, OtherUnitParam], Seq[Int])], floatModuleParameters: Seq[(SerializableModuleGenerator[LaneFloat, LaneFloatParam], Seq[Int])], - zvbbModuleParameters: Seq[(SerializableModuleGenerator[LaneZvbb, LaneZvbbParam], Seq[Int])] + zvbbModuleParameters: Seq[(SerializableModuleGenerator[LaneZvbb, LaneZvbbParam], Seq[Int])], + zvkModuleParameters: Seq[(SerializableModuleGenerator[LaneZvk, LaneZvkParam], Seq[Int])], ) { val genVec: Seq[(SerializableModuleGenerator[_ <: VFUModule, _ <: VFUParameter], Seq[Int])] = logicModuleParameters ++ @@ -117,7 +118,8 @@ case class VFUInstantiateParameter( divfpModuleParameters ++ otherModuleParameters ++ floatModuleParameters ++ - zvbbModuleParameters + zvbbModuleParameters ++ + zvkModuleParameters genVec.foreach { case (_, connect) => connect.foreach(connectIndex => require(connectIndex < slotCount)) diff --git a/t1/src/decoder/Decoder.scala b/t1/src/decoder/Decoder.scala index 3a02993891..112e34f741 100644 --- a/t1/src/decoder/Decoder.scala +++ b/t1/src/decoder/Decoder.scala @@ -13,7 +13,7 @@ import org.chipsalliance.t1.rtl.decoder.attribute._ object DecoderParam { implicit def rwP: upickle.default.ReadWriter[DecoderParam] = 
upickle.default.macroRW } -case class DecoderParam(fpuEnable: Boolean, zvbbEnable: Boolean, allInstructions: Seq[Instruction]) +case class DecoderParam(fpuEnable: Boolean, zvbbEnable: Boolean, zvkEnable: Boolean, allInstructions: Seq[Instruction]) trait T1DecodeFiled[D <: Data] extends DecodeField[T1DecodePattern, D] with FieldName @@ -225,6 +225,10 @@ object Decoder { override def getTriState(pattern: T1DecodePattern): TriState = pattern.isZvbb.value } + object zvk extends BoolField { + override def getTriState(pattern: T1DecodePattern): TriState = pattern.isZvk.value + } + object topUop extends T1TopUopField { override def genTable(pattern: T1DecodePattern): BitPat = pattern.topUop.value match { case _: TopT0.type => BitPat("b000") @@ -345,6 +349,19 @@ object Decoder { case _: zvbbUop8.type => BitPat("b1000") // andn case _ => BitPat.dontCare(4) } + case zvkCase: ZvkUOPType => + zvkCase match { + case _: zvkUop0.type => BitPat("b0000") // + case _: zvkUop1.type => BitPat("b0001") // + case _: zvkUop2.type => BitPat("b0010") // + case _: zvkUop3.type => BitPat("b0011") // + case _: zvkUop4.type => BitPat("b0100") // + case _: zvkUop5.type => BitPat("b0101") // + case _: zvkUop6.type => BitPat("b0110") // + case _: zvkUop7.type => BitPat("b0111") // + case _: zvkUop8.type => BitPat("b1000") // + case _ => BitPat.dontCare(4) + } case _ => BitPat.dontCare(4) } } @@ -422,6 +439,12 @@ object Decoder { zvbb, ) else Seq() + } ++ { + if (param.zvkEnable) + Seq( + zvk, + ) + else Seq() } def allDecodePattern(param: DecoderParam): Seq[T1DecodePattern] = param.allInstructions.map(T1DecodePattern(_, param)).toSeq.sortBy(_.instruction.name) diff --git a/t1/src/decoder/InstructionDocumentation.scala b/t1/src/decoder/InstructionDocumentation.scala index 86c5a7e358..b506c61c39 100644 --- a/t1/src/decoder/InstructionDocumentation.scala +++ b/t1/src/decoder/InstructionDocumentation.scala @@ -439,5 +439,31 @@ case class InstructionDocumentation(instruction: Instruction, param: 
DecoderPara case "vwsll.vv" => "TODO!" case "vwsll.vx" => "TODO!" case "vwsll.vi" => "TODO!" + // rv_zvkg + case "vghsh.vv" => "TODO!" + case "vgmul.vv" => "TODO!" + // rv_zvkned + case "vaesdf.vv" => "TODO!" + case "vaesdf.vs" => "TODO!" + case "vaesdm.vv" => "TODO!" + case "vaesdm.vs" => "TODO!" + case "vaesef.vv" => "TODO!" + case "vaesef.vs" => "TODO!" + case "vaesem.vv" => "TODO!" + case "vaesem.vs" => "TODO!" + case "vaesz.vs" => "TODO!" + case "vaeskf1.vi" => "TODO!" + case "vaeskf2.vi" => "TODO!" + // rv_zvknha + case "vsha2ms.vv" => "TODO!" + case "vsha2ch.vv" => "TODO!" + case "vsha2cl.vv" => "TODO!" + // rv_zvksed + case "vsm4k.vi" => "TODO!" + case "vsm4r.vv" => "TODO!" + case "vsm4r.vs" => "TODO!" + // rv_zvksh + case "vsm3c.vi" => "TODO!" + case "vsm3me.vv" => "TODO!" } } diff --git a/t1/src/decoder/T1DecodePattern.scala b/t1/src/decoder/T1DecodePattern.scala index 5c7d107339..3b7d9b3a7e 100644 --- a/t1/src/decoder/T1DecodePattern.scala +++ b/t1/src/decoder/T1DecodePattern.scala @@ -108,6 +108,7 @@ case class T1DecodePattern(instruction: Instruction, param: DecoderParam) extend def isVwmacc: isVwmacc = attribute.isVwmacc(this) def isWidenreduce: isWidenreduce = attribute.isWidenreduce(this) def isZvbb: isZvbb = attribute.isZvbb(this) + def isZvk: isZvk = attribute.isZvk(this) def fpExecutionType: FpExecutionType.Type = attribute.FpExecutionType(this) def topUop: TopUop = attribute.TopUop(this) def decoderUop: DecoderUop = attribute.DecoderUop(this) diff --git a/t1/src/decoder/attribute/isItype.scala b/t1/src/decoder/attribute/isItype.scala index 5ba9baf2ea..c3db647503 100644 --- a/t1/src/decoder/attribute/isItype.scala +++ b/t1/src/decoder/attribute/isItype.scala @@ -54,6 +54,13 @@ object isItype { // rv_zvbb "vror.vi", "vwsll.vi", + // rv_zvkned + "vaeskf1.vi", + "vaeskf2.vi", + // rv_zvksed + "vsm4k.vi", + // rv_zvksh + "vsm3c.vi", ) allMatched.contains(t1DecodePattern.instruction.name) } diff --git a/t1/src/decoder/attribute/isUnsigned0.scala 
b/t1/src/decoder/attribute/isUnsigned0.scala index fb041c3c78..29b2bf9d3a 100644 --- a/t1/src/decoder/attribute/isUnsigned0.scala +++ b/t1/src/decoder/attribute/isUnsigned0.scala @@ -146,6 +146,32 @@ object isUnsigned0 { "vwsll.vv", "vwsll.vx", "vwsll.vi", + // rv_zvkg + "vghsh.vv", + "vgmul.vv", + // rv_zvkned + "vaesdf.vv", + "vaesdf.vs", + "vaesdm.vv", + "vaesdm.vs", + "vaesef.vv", + "vaesef.vs", + "vaesem.vv", + "vaesem.vs", + "vaesz.vs", + "vaeskf1.vi", + "vaeskf2.vi", + // rv_zvknha + "vsha2ms.vv", + "vsha2ch.vv", + "vsha2cl.vv", + // rv_zvksed + "vsm4k.vi", + "vsm4r.vv", + "vsm4r.vs", + // rv_zvksh + "vsm3c.vi", + "vsm3me.vv", ) allMatched.contains(t1DecodePattern.instruction.name) } diff --git a/t1/src/decoder/attribute/isUnsigned1.scala b/t1/src/decoder/attribute/isUnsigned1.scala index cf4f517a03..595ecfbabb 100644 --- a/t1/src/decoder/attribute/isUnsigned1.scala +++ b/t1/src/decoder/attribute/isUnsigned1.scala @@ -118,6 +118,32 @@ object isUnsigned1 { "vwsll.vv", "vwsll.vx", "vwsll.vi", + // rv_zvkg + "vghsh.vv", + "vgmul.vv", + // rv_zvkned + "vaesdf.vv", + "vaesdf.vs", + "vaesdm.vv", + "vaesdm.vs", + "vaesef.vv", + "vaesef.vs", + "vaesem.vv", + "vaesem.vs", + "vaesz.vs", + "vaeskf1.vi", + "vaeskf2.vi", + // rv_zvknha + "vsha2ms.vv", + "vsha2ch.vv", + "vsha2cl.vv", + // rv_zvksed + "vsm4k.vi", + "vsm4r.vv", + "vsm4r.vs", + // rv_zvksh + "vsm3c.vi", + "vsm3me.vv", ) allMatched.contains(t1DecodePattern.instruction.name) } diff --git a/t1/src/decoder/attribute/isVtype.scala b/t1/src/decoder/attribute/isVtype.scala index 7649d715a2..0ecb480e55 100644 --- a/t1/src/decoder/attribute/isVtype.scala +++ b/t1/src/decoder/attribute/isVtype.scala @@ -186,6 +186,22 @@ object isVtype { "vrol.vv", "vror.vv", "vwsll.vv", + // rv_zvkg + "vghsh.vv", + "vgmul.vv", + // rv_zvkned + "vaesdf.vv", + "vaesdm.vv", + "vaesef.vv", + "vaesem.vv", + // rv_zvknha + "vsha2ms.vv", + "vsha2ch.vv", + "vsha2cl.vv", + // rv_zvksed + "vsm4r.vv", + // rv_zvksh + "vsm3me.vv", ) 
allMatched.contains(t1DecodePattern.instruction.name) } diff --git a/t1/src/decoder/attribute/isZvk.scala b/t1/src/decoder/attribute/isZvk.scala new file mode 100644 index 0000000000..459423778c --- /dev/null +++ b/t1/src/decoder/attribute/isZvk.scala @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl.decoder.attribute + +import org.chipsalliance.t1.rtl.decoder.T1DecodePattern + +object isZvk { + def apply(t1DecodePattern: T1DecodePattern): isZvk = + Seq( + y _ -> Y, + n _ -> N, + dc _ -> DC + ).collectFirst { + case (fn, tri) if fn(t1DecodePattern) => isZvk(tri) + }.get + + def y(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched = if(t1DecodePattern.param.zvkEnable) Seq( + "vghsh.vv", + "vgmul.vv", + "vaesdf.vv", + "vaesdf.vs", + "vaesdm.vv", + "vaesdm.vs", + "vaesef.vv", + "vaesef.vs", + "vaesem.vv", + "vaesem.vs", + "vaesz.vs", + "vaeskf1.vi", + "vaeskf2.vi", + "vsha2ms.vv", + "vsha2ch.vv", + "vsha2cl.vv", + "vsm4k.vi", + "vsm4r.vv", + "vsm4r.vs", + "vsm3c.vi", + "vsm3me.vv", + ) else Seq() + allMatched.contains(t1DecodePattern.instruction.name) + } + def n(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched = t1DecodePattern.param.allInstructions.filter(i => + !(y(t1DecodePattern) || dc(t1DecodePattern)) + ) + allMatched.contains(t1DecodePattern.instruction) + } + + def dc(t1DecodePattern: T1DecodePattern): Boolean = false +} + +case class isZvk(value: TriState) extends BooleanDecodeAttribute { + override val description: String = "goes to [[org.chipsalliance.t1.rtl.LaneZvk]]." 
+} diff --git a/t1/src/decoder/attribute/zvkUop.scala b/t1/src/decoder/attribute/zvkUop.scala new file mode 100644 index 0000000000..6194e32344 --- /dev/null +++ b/t1/src/decoder/attribute/zvkUop.scala @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl.decoder.attribute + +import org.chipsalliance.t1.rtl.decoder.T1DecodePattern + +trait ZvkUOPType extends Uop +object zvkUop0 extends ZvkUOPType // +object zvkUop1 extends ZvkUOPType // +object zvkUop2 extends ZvkUOPType // +object zvkUop3 extends ZvkUOPType // +object zvkUop4 extends ZvkUOPType // +object zvkUop5 extends ZvkUOPType // +object zvkUop6 extends ZvkUOPType // +object zvkUop7 extends ZvkUOPType // +object zvkUop8 extends ZvkUOPType // + +object ZvkUOP { + def apply(t1DecodePattern: T1DecodePattern): Uop = { + Seq( + t0 _ -> zvkUop0, + t1 _ -> zvkUop1, + t2 _ -> zvkUop2, + t3 _ -> zvkUop3, + t4 _ -> zvkUop4, + t5 _ -> zvkUop5, + t6 _ -> zvkUop6, + t7 _ -> zvkUop7, + t8 _ -> zvkUop8, + ).collectFirst { + case (fn, tpe) if fn(t1DecodePattern) => tpe + }.getOrElse(UopDC) + } + def t0(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t1(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t2(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t3(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t4(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t5(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: 
Seq[String] = Seq( + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t6(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t7(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t8(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + ) + allMatched.contains(t1DecodePattern.instruction.name) + } +} diff --git a/t1/src/laneStage/LaneExecutionBridge.scala b/t1/src/laneStage/LaneExecutionBridge.scala index 3a58046f3a..a1bff5f545 100644 --- a/t1/src/laneStage/LaneExecutionBridge.scala +++ b/t1/src/laneStage/LaneExecutionBridge.scala @@ -13,6 +13,7 @@ import org.chipsalliance.t1.rtl.decoder.Decoder class LaneExecuteRequest(parameter: LaneParameter, isLastSlot: Boolean) extends Bundle { val src: Vec[UInt] = Vec(3, UInt(parameter.datapathWidth.W)) val crossReadSource: Option[UInt] = Option.when(isLastSlot)(UInt((parameter.datapathWidth * 2).W)) + val zvkCrossReadSource: Option[UInt] = Option.when(isLastSlot)(UInt((parameter.datapathWidth * 4).W)) val bordersForMaskLogic: Bool = Bool() val mask: UInt = UInt((parameter.datapathWidth / 8).W) val maskForFilter: UInt = UInt((parameter.datapathWidth / 8).W) @@ -145,6 +146,7 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd executionRecord.maskForFilter := enqueue.bits.maskForFilter executionRecord.source := enqueue.bits.src executionRecord.crossReadSource.foreach(_ := enqueue.bits.crossReadSource.get) + executionRecord.zvkCrossReadSource.foreach(_ := enqueue.bits.zvkCrossReadSource.get) executionRecord.sSendResponse.foreach(_ := enqueue.bits.sSendResponse.get) executionRecord.groupCounter := enqueue.bits.groupCounter executionRecord.decodeResult := enqueue.bits.decodeResult @@ -158,7 +160,7 @@ class LaneExecutionBridge(parameter: 
LaneParameter, isLastSlot: Boolean, slotInd * it can be vd or src2. */ val doubleCollapse: Option[UInt] = Option.when(isLastSlot) { - val cutCrossReadData: Vec[UInt] = cutUInt(executionRecord.crossReadSource.get, parameter.datapathWidth) + val cutCrossReadData: Vec[UInt] = cutUInt(executionRecord.crossReadSource.get, parameter.datapathWidth) // TODO: zvkCrossReadSource Mux(executionRecord.executeIndex, cutCrossReadData(1), cutCrossReadData(0)) } diff --git a/t1/src/laneStage/LaneStage1.scala b/t1/src/laneStage/LaneStage1.scala index f44826e795..2707c9a94b 100644 --- a/t1/src/laneStage/LaneStage1.scala +++ b/t1/src/laneStage/LaneStage1.scala @@ -9,7 +9,7 @@ import chisel3.probe.{Probe, ProbeValue, define} import chisel3.util._ import chisel3.util.experimental.decode.DecodeBundle import org.chipsalliance.t1.rtl.decoder.Decoder -import org.chipsalliance.t1.rtl.lane.{CrossReadUnit, LaneState, VrfReadPipe} +import org.chipsalliance.t1.rtl.lane.{CrossReadUnit, ZvkCrossReadUnit, LaneState, VrfReadPipe} class LaneStage1Enqueue(parameter: LaneParameter, isLastSlot: Boolean) extends Bundle { val groupCounter: UInt = UInt(parameter.groupNumberBits.W) @@ -46,6 +46,7 @@ class LaneStage1Dequeue(parameter: LaneParameter, isLastSlot: Boolean) extends B // read result val src: Vec[UInt] = Vec(3, UInt(parameter.datapathWidth.W)) val crossReadSource: Option[UInt] = Option.when(isLastSlot)(UInt((parameter.datapathWidth * 2).W)) + val zvkCrossReadSource: Option[UInt] = Option.when(isLastSlot)(UInt((parameter.datapathWidth * 4).W)) // pipe state // for exe stage @@ -83,8 +84,14 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { @public val vrfCheckRequest: Vec[VRFReadRequest] = IO(Vec(readCheckSize, Output(readRequestType))) + val zvkReadCheckSize: Int = if(isLastSlot && parameter.zvkEnable) 7 else 3 + @public + val zvkVrfCheckRequest: Vec[VRFReadRequest] = IO(Vec(zvkReadCheckSize, Output(readRequestType))) + @public val checkResult: Vec[Bool] = 
IO(Vec(readCheckSize, Input(Bool()))) + @public + val zvkCheckResult: Vec[Bool] = IO(Vec(zvkReadCheckSize, Input(Bool()))) /** VRF read result for each slot, * 3 is for [[source1]] [[source2]] [[source3]] @@ -96,11 +103,19 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { val readBusDequeue: Option[Vec[DecoupledIO[ReadBusData]]] = Option.when(isLastSlot)(IO( Vec(2, Flipped(Decoupled(new ReadBusData(parameter: LaneParameter)))) )) + @public + val readBusDequeue4: Option[Vec[DecoupledIO[ReadBusData]]] = Option.when(isLastSlot & parameter.zvkEnable)(IO( + Vec(4, Flipped(Decoupled(new ReadBusData(parameter: LaneParameter)))) + )) @public val readBusRequest: Option[Vec[DecoupledIO[ReadBusData]]] = Option.when(isLastSlot)(IO(Vec(2, Decoupled(new ReadBusData(parameter))))) + @public + val readBusRequest4: Option[Vec[DecoupledIO[ReadBusData]]] = + Option.when(isLastSlot & parameter.zvkEnable)(IO(Vec(4, Decoupled(new ReadBusData(parameter))))) + val groupCounter: UInt = enqueue.bits.groupCounter // todo: param @@ -124,12 +139,28 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { Option.when(isLastSlot)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) val queueAfterCheckMSB: Option[Queue[VRFReadQueueEntry]] = Option.when(isLastSlot)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) + val queueAfterCheckZvkLSBLSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) + val queueAfterCheckZvkLSBMSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) + val queueAfterCheckZvkMSBLSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) + val queueAfterCheckZvkMSBMSB: 
Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) // read request queue for cross read lsb & msb val queueBeforeCheckLSB: Option[Queue[VRFReadQueueEntry]] = Option.when(isLastSlot)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) val queueBeforeCheckMSB: Option[Queue[VRFReadQueueEntry]] = Option.when(isLastSlot)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) + val queueBeforeCheckZvkLSBLSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) + val queueBeforeCheckZvkLSBMSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) + val queueBeforeCheckZvkMSBLSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) + val queueBeforeCheckZvkMSBMSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) // pipe from enqueue val pipeQueue: Queue[LaneStage1Enqueue] = @@ -147,11 +178,33 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { val afterCheckQueueVec: Seq[Queue[VRFReadQueueEntry]] = Seq(queueAfterCheck1, queueAfterCheck2, queueAfterCheckVd) ++ queueAfterCheckLSB ++ queueAfterCheckMSB - val allReadQueueReady: Bool = beforeCheckQueueVec.map(_.io.enq.ready).reduce(_ && _) + val beforeCheckZvkQueueVec: Seq[Queue[VRFReadQueueEntry]] = + Seq(queueBeforeCheck1, queueBeforeCheck2, queueBeforeCheckVd) ++ + queueBeforeCheckZvkLSBLSB ++ queueBeforeCheckZvkLSBMSB ++ + queueBeforeCheckZvkMSBLSB ++ queueBeforeCheckZvkMSBMSB + val afterCheckZvkQueueVec: Seq[Queue[VRFReadQueueEntry]] = + Seq(queueAfterCheck1, 
queueAfterCheck2, queueAfterCheckVd) ++ + queueAfterCheckZvkLSBLSB ++ queueAfterCheckZvkLSBMSB ++ + queueAfterCheckZvkMSBLSB ++ queueAfterCheckZvkMSBMSB + val allReadQueueReady: Bool = { + val ready = beforeCheckQueueVec.map(_.io.enq.ready).reduce(_ && _) + if(parameter.zvkEnable) { + val zvkReady = beforeCheckZvkQueueVec.map(_.io.enq.ready).reduce(_ && _) + Mux(enqueue.bits.decodeResult(Decoder.crossRead) & enqueue.bits.decodeResult(Decoder.zvk), zvkReady, ready) + } else { + ready + } + } beforeCheckQueueVec.foreach{ q => q.io.enq.bits.instructionIndex := enqueue.bits.instructionIndex q.io.enq.bits.groupIndex := enqueue.bits.groupCounter } + if(parameter.zvkEnable) { + beforeCheckZvkQueueVec.foreach{ q => + q.io.enq.bits.instructionIndex := enqueue.bits.instructionIndex + q.io.enq.bits.groupIndex := enqueue.bits.groupCounter + } + } enqueue.ready := allReadQueueReady && pipeQueue.io.enq.ready @@ -162,6 +215,14 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { after.io.enq.valid := before.io.deq.valid && checkResult(i) after.io.enq.bits := before.io.deq.bits } + if(parameter.zvkEnable) { + beforeCheckZvkQueueVec.zip(afterCheckZvkQueueVec).zipWithIndex.foreach { case ((before, after), i) => + zvkVrfCheckRequest(i) := before.io.deq.bits + before.io.deq.ready := after.io.enq.ready && zvkCheckResult(i) + after.io.enq.valid := before.io.deq.valid && zvkCheckResult(i) + after.io.enq.bits := before.io.deq.bits + } + } // request enqueue queueBeforeCheck1.io.enq.valid := enqueue.fire && enqueue.bits.decodeResult(Decoder.vtype) && !enqueue.bits.skipRead queueBeforeCheck2.io.enq.valid := enqueue.fire && !enqueue.bits.skipRead @@ -169,6 +230,11 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { (queueBeforeCheckLSB ++ queueBeforeCheckMSB).foreach { q => q.io.enq.valid := enqueue.valid && allReadQueueReady && enqueue.bits.decodeResult(Decoder.crossRead) } + if(parameter.zvkEnable) { + 
(queueBeforeCheckZvkLSBLSB ++ queueBeforeCheckZvkLSBMSB ++ queueBeforeCheckZvkMSBLSB ++ queueBeforeCheckZvkMSBMSB).foreach { q => + q.io.enq.valid := enqueue.valid && allReadQueueReady && enqueue.bits.decodeResult(Decoder.crossRead) && enqueue.bits.decodeResult(Decoder.zvk) + } + } // calculate vs queueBeforeCheck1.io.enq.bits.vs := Mux( @@ -223,6 +289,30 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { q.io.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B } + if(parameter.zvkEnable) { + queueBeforeCheckZvkLSBLSB.foreach { q => + q.io.enq.bits.vs := enqueue.bits.vs2 + groupCounter(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1) + q.io.enq.bits.readSource := 1.U + q.io.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B + } + queueBeforeCheckZvkLSBMSB.foreach { q => + q.io.enq.bits.vs := enqueue.bits.vs2 + groupCounter(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1) + q.io.enq.bits.readSource := 1.U + q.io.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B + } + queueBeforeCheckZvkMSBLSB.foreach { q => + q.io.enq.bits.vs := enqueue.bits.vs2 + groupCounter(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1) + q.io.enq.bits.readSource := 1.U + q.io.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B + } + queueBeforeCheckZvkMSBMSB.foreach { q => + q.io.enq.bits.vs := enqueue.bits.vs2 + groupCounter(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1) + q.io.enq.bits.readSource := 1.U + q.io.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B + } + } + + // read pipe val readPipe0: Instance[VrfReadPipe] = Instantiate(new VrfReadPipe(parameter, arbitrate = false)) val readPipe1: Instance[VrfReadPipe] = Instantiate(new VrfReadPipe(parameter, arbitrate = isLastSlot)) @@ -240,6 +330,10 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { // cross lane queue 
val dataQueueLSB = Option.when(isLastSlot)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) val dataQueueMSB = Option.when(isLastSlot)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) + val dataQueueZvkLSBLSB = Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) + val dataQueueZvkLSBMSB = Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) // TODO + val dataQueueZvkMSBLSB = Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) // TODO + val dataQueueZvkMSBMSB = Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) val dataQueueNotFull2: Bool = { val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) @@ -268,31 +362,71 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { blockingHandshake(readPipe2.enqueue, queueAfterCheckVd.io.deq, dataQueueNotFullVd) // contender for cross read - readPipe1.contender.zip(queueAfterCheckLSB).foreach { case (port, queue) => - val dataQueueNotFullLSB: Bool = { - val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) - val doEnq = queue.io.deq.fire - val doDeq = dataQueueLSB.get.io.deq.fire - val countChange = Mux(doEnq, 1.U, -1.S(log2Ceil(dataQueueSize + 1).W).asUInt) - when(doEnq ^ doDeq) { - counterReg := counterReg + countChange + if(parameter.zvkEnable) { + readPipe1.contender.zip(queueAfterCheckLSB).zip(queueAfterCheckZvkLSBLSB).foreach { case (port, queue, zvkQueue) => + val dataQueueNotFullLSB: Bool = { + val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) + val doEnq = queue.io.deq.fire || zvkQueue.io.deq.fire + val doDeq = Mux(Decoder.zvk, dataQueueZvkLSBLSB.get.io.deq.fire, dataQueueLSB.get.io.deq.fire) + val countChange = Mux(doEnq, 1.U, -1.S(log2Ceil(dataQueueSize + 1).W).asUInt) + when(doEnq ^ 
doDeq) { + counterReg := counterReg + countChange + } + !counterReg(log2Ceil(dataQueueSize)) + } + Mux( + Decoder.zvk, + blockingHandshake(port, zvkQueue.io.deq, dataQueueNotFullLSB), + blockingHandshake(port, queue.io.deq, dataQueueNotFullLSB) + ) + } + } else { + readPipe1.contender.zip(queueAfterCheckLSB).foreach { case (port, queue) => + val dataQueueNotFullLSB: Bool = { + val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) + val doEnq = queue.io.deq.fire + val doDeq = dataQueueLSB.get.io.deq.fire + val countChange = Mux(doEnq, 1.U, -1.S(log2Ceil(dataQueueSize + 1).W).asUInt) + when(doEnq ^ doDeq) { + counterReg := counterReg + countChange + } + !counterReg(log2Ceil(dataQueueSize)) } - !counterReg(log2Ceil(dataQueueSize)) + blockingHandshake(port, queue.io.deq, dataQueueNotFullLSB) } - blockingHandshake(port, queue.io.deq, dataQueueNotFullLSB) } - readPipe2.contender.zip(queueAfterCheckMSB).foreach { case (port, queue) => - val dataQueueNotFullMSB: Bool = { - val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) - val doEnq = queue.io.deq.fire - val doDeq = dataQueueMSB.get.io.deq.fire - val countChange = Mux(doEnq, 1.U, -1.S(log2Ceil(dataQueueSize + 1).W).asUInt) - when(doEnq ^ doDeq) { - counterReg := counterReg + countChange + if(parameter.zvkEnable) { + readPipe2.contender.zip(queueAfterCheckMSB).zip(queueAfterCheckZvkMSBMSB).foreach { case (port, queue, zvkQueue) => + val dataQueueNotFullMSB: Bool = { + val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) + val doEnq = queue.io.deq.fire || zvkQueue.io.deq.fire + val doDeq = Mux(Decoder.zvk, dataQueueZvkMSBMSB.get.io.deq.fire, dataQueueMSB.get.io.deq.fire) + val countChange = Mux(doEnq, 1.U, -1.S(log2Ceil(dataQueueSize + 1).W).asUInt) + when(doEnq ^ doDeq) { + counterReg := counterReg + countChange + } + !counterReg(log2Ceil(dataQueueSize)) + } + Mux( + Decoder.zvk, + blockingHandshake(port, zvkQueue.io.deq, dataQueueNotFullMSB), + blockingHandshake(port, queue.io.deq, 
dataQueueNotFullMSB) + ) + } + } else { + readPipe2.contender.zip(queueAfterCheckMSB).foreach { case (port, queue) => + val dataQueueNotFullMSB: Bool = { + val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) + val doEnq = queue.io.deq.fire + val doDeq = dataQueueMSB.get.io.deq.fire + val countChange = Mux(doEnq, 1.U, -1.S(log2Ceil(dataQueueSize + 1).W).asUInt) + when(doEnq ^ doDeq) { + counterReg := counterReg + countChange + } + !counterReg(log2Ceil(dataQueueSize)) } - !counterReg(log2Ceil(dataQueueSize)) + blockingHandshake(port, queue.io.deq, dataQueueNotFullMSB) } - blockingHandshake(port, queue.io.deq, dataQueueNotFullMSB) } // data: pipe <-> queue @@ -301,11 +435,18 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { dataQueueVs2.io.enq <> readPipe1.dequeue // pipe1 <> dataQueueLSB dataQueueLSB.zip(readPipe1.contenderDequeue).foreach { case (sink, source) => sink.io.enq <> source } + if(parameter.zvkEnable) { + dataQueueZvkLSBLSB.zip(readPipe1.contenderDequeue).foreach { case (sink, source) => sink.io.enq <> source } + } + // pipe2 <-> dataQueueVd dataQueueVd.io.enq <> readPipe2.dequeue // pipe2 <-> dataQueueMSB dataQueueMSB.zip(readPipe2.contenderDequeue).foreach { case (sink, source) => sink.io.enq <> source } + if(parameter.zvkEnable) { + dataQueueZvkMSBMSB.zip(readPipe2.contenderDequeue).foreach { case (sink, source) => sink.io.enq <> source } + } } else { dataQueueVs2.io.enq <> readPipe1.dequeue dataQueueVd.io.enq <> readPipe2.dequeue @@ -316,6 +457,12 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { Option.when(isLastSlot)(Module(new Queue(UInt((parameter.datapathWidth * 2).W), 1))) val crossReadStageFree: Option[Bool] = Option.when(isLastSlot)(Wire(Bool())) val crossReadUnitOp: Option[Instance[CrossReadUnit]] = Option.when(isLastSlot)(Instantiate(new CrossReadUnit(parameter))) + + val zvkCrossReadResultQueue: Option[Queue[UInt]] = + Option.when(isLastSlot && 
parameter.zvkEnable)(Module(new Queue(UInt((parameter.datapathWidth * 4).W), 1))) + val zvkCrossReadStageFree: Option[Bool] = Option.when(isLastSlot && parameter.zvkEnable)(Wire(Bool())) + val zvkCrossReadUnitOp: Option[Instance[ZvkCrossReadUnit]] = Option.when(isLastSlot && parameter.zvkEnable)(Instantiate(new ZvkCrossReadUnit(parameter))) + if (isLastSlot) { val dataGroupQueue: Queue[UInt] = Module( @@ -327,6 +474,7 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { // todo: need pipe ? val laneIndexReg = RegInit(enqueue.bits.laneIndex) val crossReadUnit = crossReadUnitOp.get + crossReadUnit.dataInputLSB <> dataQueueLSB.get.io.deq crossReadUnit.dataInputMSB <> dataQueueMSB.get.io.deq crossReadUnit.laneIndex := laneIndexReg @@ -341,7 +489,34 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { assert(dataGroupQueue.io.enq.ready || !dataGroupQueue.io.enq.valid) dataGroupQueue.io.enq.bits := enqueue.bits.groupCounter dataGroupQueue.io.deq.ready := crossReadUnit.dataInputLSB.fire - dequeue.bits.readBusDequeueGroup.get := crossReadUnitOp.get.currentGroup + dequeue.bits.readBusDequeueGroup.get := crossReadUnitOp.get.currentGroup // TODO: readBusDequeueGroup is currently unused + + if(parameter.zvkEnable) { + val zvkDataGroupQueue: Queue[UInt] = + Module( + new Queue( + UInt(parameter.groupNumberBits.W), + readRequestQueueSizeBeforeCheck + readRequestQueueSizeBeforeCheck + dataQueueSize + 2 + ) + ) + val zvkCrossReadUnit = zvkCrossReadUnitOp.get + zvkCrossReadUnit.dataInputLSBLSB <> dataQueueZvkLSBLSB.get.io.deq + zvkCrossReadUnit.dataInputLSBMSB <> dataQueueZvkLSBMSB.get.io.deq + zvkCrossReadUnit.dataInputMSBLSB <> dataQueueZvkMSBLSB.get.io.deq + zvkCrossReadUnit.dataInputMSBMSB <> dataQueueZvkMSBMSB.get.io.deq + zvkCrossReadUnit.laneIndex := laneIndexReg + zvkCrossReadUnit.dataGroup := zvkDataGroupQueue.io.deq.bits + readBusRequest4.get.zip(zvkCrossReadUnit.readBusRequest).foreach { case (sink, source) => 
sink <> source} + zvkCrossReadUnit.readBusDequeue.get.zip(readBusDequeue4.get).foreach { case (sink, source) => sink <> source} + zvkCrossReadResultQueue.get.io.enq <> zvkCrossReadUnit.crossReadDequeue + zvkCrossReadStageFree.get := zvkCrossReadUnit.crossReadStageFree + + // data group + zvkDataGroupQueue.io.enq.valid := enqueue.fire && enqueue.bits.decodeResult(Decoder.crossRead) && enqueue.bits.decodeResult(Decoder.zvk) /* entries are only dequeued by zvkCrossReadUnit, so only zvk cross reads may allocate one */ + assert(zvkDataGroupQueue.io.enq.ready || !zvkDataGroupQueue.io.enq.valid) + zvkDataGroupQueue.io.enq.bits := enqueue.bits.groupCounter + zvkDataGroupQueue.io.deq.ready := zvkCrossReadUnit.dataInputLSBLSB.fire + } } val source1Select: UInt = Mux( @@ -353,6 +528,7 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { dequeue.bits.groupCounter := pipeQueue.io.deq.bits.groupCounter dequeue.bits.src := VecInit(Seq(source1Select, dataQueueVs2.io.deq.bits, dataQueueVd.io.deq.bits)) dequeue.bits.crossReadSource.foreach(_ := crossReadResultQueue.get.io.deq.bits) + dequeue.bits.zvkCrossReadSource.foreach(_ := zvkCrossReadResultQueue.fold(0.U)(_.io.deq.bits)) /* the result queue is absent when zvk is disabled; drive zero instead of calling .get on None */ dequeue.bits.sSendResponse.foreach(_ := pipeQueue.io.deq.bits.sSendResponse.get) dequeue.bits.decodeResult := pipeQueue.io.deq.bits.decodeResult dequeue.bits.vSew1H := pipeQueue.io.deq.bits.vSew1H @@ -374,7 +550,8 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { dataQueueVs2.io.deq.valid || pipeQueue.io.deq.bits.skipRead, dataQueueVd.io.deq.valid || (pipeQueue.io.deq.bits.decodeResult(Decoder.sReadVD)) ) ++ - crossReadResultQueue.map(_.io.deq.valid || !pipeQueue.io.deq.bits.decodeResult(Decoder.crossRead)) + crossReadResultQueue.map(_.io.deq.valid || !pipeQueue.io.deq.bits.decodeResult(Decoder.crossRead)) ++ + zvkCrossReadResultQueue.map(_.io.deq.valid || !(pipeQueue.io.deq.bits.decodeResult(Decoder.crossRead) && pipeQueue.io.deq.bits.decodeResult(Decoder.zvk))) /* wait for zvk data only when this is a zvk cross read (matches the enqueue and ready conditions); the Option is empty when zvk is disabled */ val allDataQueueValid: Bool = VecInit(dataQueueValidVec).asUInt.andR dequeue.valid := 
allDataQueueValid && pipeQueue.io.deq.valid dataQueueVs1.ready := allDataQueueValid && dequeue.ready && pipeQueue.io.deq.bits.decodeResult(Decoder.vtype) @@ -382,6 +559,9 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { dataQueueVd.io.deq.ready := allDataQueueValid && dequeue.ready && !pipeQueue.io.deq.bits.decodeResult(Decoder.sReadVD) crossReadResultQueue.foreach(_.io.deq.ready := allDataQueueValid && dequeue.ready && pipeQueue.io.deq.bits.decodeResult(Decoder.crossRead)) + if(parameter.zvkEnable) { + zvkCrossReadResultQueue.foreach(_.io.deq.ready := allDataQueueValid && dequeue.ready && (pipeQueue.io.deq.bits.decodeResult(Decoder.crossRead) && pipeQueue.io.deq.bits.decodeResult(Decoder.zvk))) + } stageValid := pipeQueue.io.deq.valid val stageFinish = !stageValid @@ -400,10 +580,26 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { @public val sSendCrossReadResultMSBProbe = Option.when(isLastSlot)(IO(Output(Probe(Bool())))) @public + val sSendZvkCrossReadResultLSBLSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) + @public + val sSendZvkCrossReadResultLSBMSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) + @public + val sSendZvkCrossReadResultMSBLSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) + @public + val sSendZvkCrossReadResultMSBMSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) + @public val wCrossReadLSBProbe = Option.when(isLastSlot)(IO(Output(Probe(Bool())))) @public val wCrossReadMSBProbe = Option.when(isLastSlot)(IO(Output(Probe(Bool())))) @public + val wZvkCrossReadLSBLSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) + @public + val wZvkCrossReadLSBMSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) + @public + val wZvkCrossReadMSBLSBProbe = Option.when(isLastSlot && 
parameter.zvkEnable)(IO(Output(Probe(Bool())))) + @public + val wZvkCrossReadMSBMSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) + @public val vrfReadRequestProbe: Seq[(Bool, Bool)] = Seq.fill(3)((IO(Output(Probe(Bool()))),IO(Output(Probe(Bool()))))) @@ -416,8 +612,18 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { readFinishProbe.foreach(p => define(p, ProbeValue(dataQueueVs2.io.deq.valid))) sSendCrossReadResultLSBProbe.foreach(p => define(p, ProbeValue(crossReadUnitOp.get.crossWriteState.sSendCrossReadResultLSB))) sSendCrossReadResultMSBProbe.foreach(p => define(p, ProbeValue(crossReadUnitOp.get.crossWriteState.sSendCrossReadResultMSB))) wCrossReadLSBProbe.foreach(p => define(p, ProbeValue(crossReadUnitOp.get.crossWriteState.wCrossReadLSB))) wCrossReadMSBProbe.foreach(p => define(p, ProbeValue(crossReadUnitOp.get.crossWriteState.wCrossReadMSB))) + if (parameter.zvkEnable) { /* zvk probes: state index 0..3 is exported as LSBLSB, LSBMSB, MSBLSB, MSBMSB */ + sSendZvkCrossReadResultLSBLSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.sSendCrossReadResult(0)))) + sSendZvkCrossReadResultLSBMSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.sSendCrossReadResult(1)))) + sSendZvkCrossReadResultMSBLSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.sSendCrossReadResult(2)))) + sSendZvkCrossReadResultMSBMSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.sSendCrossReadResult(3)))) + wZvkCrossReadLSBLSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.wCrossRead(0)))) + wZvkCrossReadLSBMSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.wCrossRead(1)))) + wZvkCrossReadMSBLSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.wCrossRead(2)))) + wZvkCrossReadMSBMSBProbe.foreach(p => define(p, 
ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.wCrossRead(3)))) + } } vrfReadRequestProbe.zipWithIndex.foreach { case((ready, valid), i) => diff --git a/t1/src/laneStage/LaneStage3.scala b/t1/src/laneStage/LaneStage3.scala index 8ccc6fbc11..eee9ad3be1 100644 --- a/t1/src/laneStage/LaneStage3.scala +++ b/t1/src/laneStage/LaneStage3.scala @@ -17,6 +17,7 @@ class LaneStage3Enqueue(parameter: LaneParameter, isLastSlot: Boolean) extends B val mask: UInt = UInt((parameter.datapathWidth/8).W) val ffoIndex: UInt = UInt(log2Ceil(parameter.vLen / 8).W) val crossWriteData: Vec[UInt] = Vec(2, UInt(parameter.datapathWidth.W)) + val zvkCrossWriteData: Vec[UInt] = Vec(4, UInt(parameter.datapathWidth.W)) val sSendResponse: Bool = Bool() val ffoSuccess: Bool = Bool() val fpReduceValid: Option[Bool] = Option.when(parameter.fpuEnable && isLastSlot)(Bool()) @@ -56,6 +57,9 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { @public val crossWritePort: Option[Vec[DecoupledIO[WriteBusData]]] = Option.when(isLastSlot)(IO(Vec(2, Decoupled(new WriteBusData(parameter))))) + @public + val crossWritePort4: Option[Vec[DecoupledIO[WriteBusData]]] = + Option.when(isLastSlot & parameter.zvkEnable)(IO(Vec(4, Decoupled(new WriteBusData(parameter))))) val stageValidReg: Option[Bool] = Option.when(isLastSlot) (RegInit(false.B)) @@ -107,6 +111,16 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { sendState(index) := true.B } } + crossWritePort4.foreach(_.zipWithIndex.foreach { case (port, index) => /* the port only exists when isLastSlot && zvkEnable, so iterate the Option instead of calling .get. NOTE(review): sendState and the mask slice are indexed for 4 ports here - confirm both are sized for index 0..3 */ + port.valid := stageValidReg.get && !sendState(index) + port.bits.mask := pipeEnqueue.get.mask(4 * index + 1, 4 * index) + port.bits.data := pipeEnqueue.get.zvkCrossWriteData(index) + port.bits.counter := pipeEnqueue.get.groupCounter + port.bits.instructionIndex := pipeEnqueue.get.instructionIndex + when(port.fire) { + sendState(index) := true.B + } + }) // scheduler synchronization val schedulerFinish: Bool = (sSendResponse ++ 
wResponseFeedback).reduce(_ && _) @@ -185,4 +199,4 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { vrfWriteRequest <> vrfWriteQueue.io.deq vrfWriteRequest.bits.offset := vrfPtrReplica.io.deq.bits vrfWriteRequest.valid := vrfPtrReplica.io.deq.valid -} \ No newline at end of file +} diff --git a/t1/src/laneStage/SlotTokenManager.scala b/t1/src/laneStage/SlotTokenManager.scala index ae8bb531eb..a961964192 100644 --- a/t1/src/laneStage/SlotTokenManager.scala +++ b/t1/src/laneStage/SlotTokenManager.scala @@ -73,6 +73,9 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { @public val crossWriteReports: Vec[ValidIO[UInt]] = IO(Vec(2, Flipped(Valid(UInt(parameter.instructionIndexBits.W))))) + @public + val crossWriteReports4: Vec[ValidIO[UInt]] = IO(Vec(4, Flipped(Valid(UInt(parameter.instructionIndexBits.W))))) + @public val responseReport: ValidIO[UInt] = IO(Flipped(Valid(UInt(parameter.instructionIndexBits.W)))) diff --git a/t1/src/laneStage/ZvkCrossReadUnit.scala b/t1/src/laneStage/ZvkCrossReadUnit.scala new file mode 100644 index 0000000000..bb3490d725 --- /dev/null +++ b/t1/src/laneStage/ZvkCrossReadUnit.scala @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl.lane + +import chisel3._ +import chisel3.experimental.hierarchy.{instantiable, public} +import chisel3.util._ +import org.chipsalliance.t1.rtl.{LaneParameter, ReadBusData} + +class ZvkCrossReadState extends Bundle { /* state flags exported by ZvkCrossReadUnit; index 0..3 = LSBLSB, LSBMSB, MSBLSB, MSBMSB */ + val sSendCrossReadResult: Vec[Bool] = Vec(4, Bool()) /* Vec rather than Seq: Seq members are not usable as Bundle elements */ + val wCrossRead: Vec[Bool] = Vec(4, Bool()) /* one flag per incoming read-bus port */ +} + +@instantiable +class ZvkCrossReadUnit(parameter: LaneParameter) extends Module { + @public + val dataInputLSBLSB: DecoupledIO[UInt] = IO(Flipped(Decoupled(UInt(parameter.datapathWidth.W)))) + @public + val dataInputLSBMSB: DecoupledIO[UInt] = IO(Flipped(Decoupled(UInt(parameter.datapathWidth.W)))) + @public + val dataInputMSBLSB: DecoupledIO[UInt] 
= IO(Flipped(Decoupled(UInt(parameter.datapathWidth.W)))) + @public + val dataInputMSBMSB: DecoupledIO[UInt] = IO(Flipped(Decoupled(UInt(parameter.datapathWidth.W)))) + @public + val laneIndex: UInt = IO(Input(UInt(parameter.laneNumberBits.W))) + @public + val dataGroup: UInt = IO(Input(UInt(parameter.groupNumberBits.W))) + @public + val currentGroup: UInt = IO(Output(UInt(parameter.groupNumberBits.W))) + + @public + val readBusDequeue: Option[Vec[DecoupledIO[ReadBusData]]] = + Option.when(parameter.zvkEnable)( + IO( + Vec(4, Flipped(Decoupled(new ReadBusData(parameter: LaneParameter))) + )) + ) + @public + val readBusRequest: Option[Vec[DecoupledIO[ReadBusData]]] = + Option.when(parameter.zvkEnable)(IO(Vec(4, Decoupled(new ReadBusData(parameter))))) + + @public + val crossReadDequeue: DecoupledIO[UInt] = IO(Decoupled(UInt((parameter.datapathWidth * 4).W))) + @public + val crossReadStageFree: Bool = IO(Output(Bool())) + @public + val crossWriteState = IO(Output(new ZvkCrossReadState)) + + val stageValid: Bool = RegInit(false.B) + val sSendCrossReadResultLSBLSB, sSendCrossReadResultMSBLSB, wCrossReadLSBLSB, wCrossReadMSBLSB = RegInit(true.B) + val sSendCrossReadResultLSBMSB, sSendCrossReadResultMSBMSB, wCrossReadLSBMSB, wCrossReadMSBMSB = RegInit(true.B) + val stateVec: Seq[Bool] = Seq( + sSendCrossReadResultLSBLSB, + sSendCrossReadResultLSBMSB, + sSendCrossReadResultMSBLSB, + sSendCrossReadResultMSBMSB, + wCrossReadLSBLSB, + wCrossReadLSBMSB, + wCrossReadMSBLSB, + wCrossReadMSBMSB, + ) + val sendDataVec: Vec[UInt] = RegInit(VecInit(Seq.fill(4)(0.U(parameter.datapathWidth.W)))) + val groupCounter: UInt = RegInit(0.U(parameter.groupNumberBits.W)) + val receiveDataVec: Vec[UInt] = RegInit(VecInit(Seq.fill(4)(0.U(parameter.datapathWidth.W)))) + val sendState = Seq( + sSendCrossReadResultLSBLSB, + sSendCrossReadResultLSBMSB, + sSendCrossReadResultMSBLSB, + sSendCrossReadResultMSBMSB, + ) + val receiveState = Seq( + wCrossReadLSBLSB, + wCrossReadLSBMSB, + 
wCrossReadMSBLSB, + wCrossReadMSBMSB, + ) + + readBusRequest.get.zipWithIndex.foreach { case (port, index) => + port.valid := stageValid && !sendState(index) + port.bits.data := sendDataVec(index) + when(port.fire) { sendState(index) := true.B} + } + + readBusDequeue.get.zipWithIndex.foreach { case (port, index) => + when(port.fire) { + receiveState(index) := true.B + receiveDataVec(index) := port.bits.data + } + port.ready := !receiveState(index) + } + val allStateReady: Bool = stateVec.reduce(_ && _) + val stageReady: Bool = !stageValid || (allStateReady && crossReadDequeue.ready) + val allSourceValid: Bool = Seq( + dataInputLSBLSB.valid, + dataInputLSBMSB.valid, + dataInputMSBLSB.valid, + dataInputMSBMSB.valid, + ).reduce(_ && _) + val enqueueFire: Bool = stageReady && allSourceValid + dataInputLSBLSB.ready := allSourceValid && stageReady + dataInputLSBMSB.ready := allSourceValid && stageReady + dataInputMSBLSB.ready := allSourceValid && stageReady + dataInputMSBMSB.ready := allSourceValid && stageReady + + when(enqueueFire ^ crossReadDequeue.fire) { + stageValid := enqueueFire + } + when(enqueueFire) { + stateVec.foreach(_ := false.B) + sendDataVec := VecInit(Seq( + dataInputLSBLSB.bits, + dataInputLSBMSB.bits, + dataInputMSBLSB.bits, + dataInputMSBMSB.bits, + )) + groupCounter := dataGroup + } + currentGroup := groupCounter + crossReadDequeue.bits := receiveDataVec.asUInt + crossReadDequeue.valid := allStateReady && stageValid + crossReadStageFree := (!stageValid) && stateVec.reduce(_ && _) + + crossWriteState.sSendCrossReadResult(0) := sSendCrossReadResultLSBLSB + crossWriteState.sSendCrossReadResult(1) := sSendCrossReadResultLSBMSB + crossWriteState.sSendCrossReadResult(2) := sSendCrossReadResultMSBLSB + crossWriteState.sSendCrossReadResult(3) := sSendCrossReadResultMSBMSB + crossWriteState.wCrossRead(0) := wCrossReadLSBLSB + crossWriteState.wCrossRead(1) := wCrossReadLSBMSB + crossWriteState.wCrossRead(2) := wCrossReadMSBLSB + crossWriteState.wCrossRead(3) 
:= wCrossReadMSBMSB +} diff --git a/t1/src/vrf/VRF.scala b/t1/src/vrf/VRF.scala index 724ef63723..6c04fc6d57 100644 --- a/t1/src/vrf/VRF.scala +++ b/t1/src/vrf/VRF.scala @@ -150,9 +150,15 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar val readCheck: Vec[VRFReadRequest] = IO(Vec(parameter.chainingSize * 3 + 2, Input( new VRFReadRequest(parameter.regNumBits, parameter.vrfOffsetBits, parameter.instructionIndexBits) ))) + @public + val zvkReadCheck: Vec[VRFReadRequest] = IO(Vec(parameter.chainingSize * 3 + 4, Input( + new VRFReadRequest(parameter.regNumBits, parameter.vrfOffsetBits, parameter.instructionIndexBits) + ))) @public val readCheckResult: Vec[Bool] = IO(Vec(parameter.chainingSize * 3 + 2, Output(Bool()))) + @public + val zvkReadCheckResult: Vec[Bool] = IO(Vec(parameter.chainingSize * 3 + 4, Output(Bool()))) /** VRF read results. */ @public @@ -273,6 +279,22 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar checkModule.checkResult }.reduce(_ && _) } + zvkReadCheck.zip(zvkReadCheckResult).foreach { case (req, res) => + val recordSelect = chainingRecord + // first, select the chaining record whose instIndex matches this read request's instructionIndex + val readRecord = + Mux1H(recordSelect.map(_.bits.instIndex === req.instructionIndex), recordSelect.map(_.bits)) + res := + recordSelect.zip(recordValidVec).zipWithIndex.map { + case ((r, f), recordIndex) => + val checkModule = Instantiate(new ChainingCheck(parameter)) + checkModule.read := req + checkModule.readRecord := readRecord + checkModule.record := r + checkModule.recordValid := f + checkModule.checkResult + }.reduce(_ && _) + } val checkSize: Int = readRequests.size val (firstOccupied, secondOccupied) = readRequests.zipWithIndex.foldLeft(