Skip to content

Commit

Permalink
[rtl] Use token to manage the vrf status of instructions.(wip)
Browse files Browse the repository at this point in the history
  • Loading branch information
qinjun-li committed May 29, 2024
1 parent 4261534 commit 203c59f
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 139 deletions.
23 changes: 9 additions & 14 deletions t1/src/Bundles.scala
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,6 @@ class InstructionRecord(instructionIndexWidth: Int) extends Bundle {
*/
val instructionIndex: UInt = UInt(instructionIndexWidth.W)

/** whether the instruction needs to wait for the write queue to clear.
* Instructions with `widen` use the cross-lane write channel,
* but the lane regards the instruction as finished once the data is sent to the ring,
* so this bit is needed to record whether the ring has been cleared.
*/
val needWaitWriteQueueClear: Bool = Bool()

/** whether instruction is load store.
* it should tell scalar core if this is a load store unit.
*/
Expand Down Expand Up @@ -470,6 +463,14 @@ class LSUWriteCheck(regNumBits: Int, offsetBits: Int, instructionIndexSize: Int)
val instructionIndex: UInt = UInt(instructionIndexSize.W)
}

/** Per-instruction status flags, carried as the `state` field of [[VRFWriteReport]].
*
* NOTE(review): the polarity of each flag (set = condition already met vs. set = still waiting)
* is not established by this bundle alone; the notes below only record how Lane initialises
* each flag when it issues an instruction write report.
*/
class VRFInstructionState extends Bundle {
// Lane initialises this to `!loadStore`, i.e. it starts set for non-load/store instructions.
// presumably means "store finished" — confirm against the VRF logic that updates it.
val stFinish: Bool = Bool()
// execute finish, wait for write queue clear.
// Lane initialises this to `!(loadStore && !store)`: only loads start with it cleared,
// because a load has to wait for its data in the LSU write queue.
val wWriteQueueClear: Bool = Bool()
// Lane initialises this to `!laneRequest.valid`.
// presumably tracks whether the lane's last report for the instruction has been seen — TODO confirm.
val wLaneLastReport: Bool = Bool()
// Lane initialises this to `false.B`.
// presumably tracks whether the instruction has fully drained from the lane — TODO confirm.
val wLaneClear: Bool = Bool()
}

class VRFWriteReport(param: VRFParam) extends Bundle {
// 8 reg/group; which group?
val vd: ValidIO[UInt] = Valid(UInt(param.regNumBits.W))
Expand All @@ -482,22 +483,16 @@ class VRFWriteReport(param: VRFParam) extends Bundle {
val crossWrite: Bool = Bool()
// instruction will cross read
val crossRead: Bool = Bool()
val stFinish: Bool = Bool()
// index type lsu
val indexType: Bool = Bool()
// execute finish, wait for write queue clear
val wWriteQueueClear: Bool = Bool()
// wait cross write bus clear
val wBusClear = Bool()
// wait cross write queue clear
val wQueueClear = Bool()
// multiply-add
val ma: Bool = Bool()
// slow instruction (mask unit)
val slow: Bool = Bool()
// which elements will be accessed (written, or read by a store)
// true: No access or access has been completed
val elementMask: UInt = UInt(param.elementSize.W)
val state = new VRFInstructionState
}

/** For decode, the instruction has to be registered for one cycle at the entrance; this is the information that must be saved. */
Expand Down
118 changes: 31 additions & 87 deletions t1/src/Lane.scala
Original file line number Diff line number Diff line change
Expand Up @@ -289,10 +289,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
@public
val loadDataInLSUWriteQueue: UInt = IO(Input(UInt(parameter.chainingSize.W)))

/** for RaW, VRF should wait for cross write bus to be empty. */
@public
val dataInCrossBus: UInt = IO(Input(UInt(parameter.chainingSize.W)))

/** How many dataPath will be written by the instruction in this lane */
@public
val writeCount: UInt =
Expand All @@ -303,8 +299,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
val writeReadyForLsu: Bool = IO(Output(Bool()))
@public
val vrfReadyToStore: Bool = IO(Output(Bool()))
@public
val crossWriteDataInSlot: UInt = IO(Output(UInt(parameter.chainingSize.W)))

@public
val probe: LaneProbe = IO(Output(Probe(new LaneProbe(parameter.chainingSize))))
Expand Down Expand Up @@ -349,9 +343,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
val maskIndexVec: Vec[UInt] =
RegInit(VecInit(Seq.fill(parameter.chainingSize)(0.U(log2Ceil(parameter.maskGroupWidth).W))))

/** pipe state for slot */
val pipeFinishVec: Vec[Bool] = RegInit(VecInit(Seq.fill(parameter.chainingSize)(false.B)))

/** the find first one index register in this lane. */
val ffoIndexReg: UInt = RegInit(0.U(log2Ceil(parameter.vLen / 8).W))

Expand Down Expand Up @@ -488,11 +479,9 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[

val slot0EnqueueFire: Bool = Wire(Bool())

/** asserted when an instruction is finished in the slot. */
val instructionFinishedVec: Vec[UInt] = Wire(Vec(parameter.chainingSize, UInt(parameter.chainingSize.W)))

/** any cross lane write data in slot */
val crossWriteDataInSlotVec: Vec[UInt] = Wire(Vec(parameter.chainingSize, UInt(parameter.chainingSize.W)))
/** asserted when an instruction is valid in the slot. */
val instructionValid: UInt = Wire(UInt(parameter.chainingSize.W))
val instructionValidNext: UInt = RegNext(instructionValid, 0.U)

/** asserted when an instruction will not use the mask unit */
val instructionUnrelatedMaskUnitVec: Vec[UInt] = Wire(Vec(parameter.chainingSize, UInt(parameter.chainingSize.W)))
Expand All @@ -514,9 +503,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
))
val maskedWriteUnit: Instance[MaskedWrite] = Instantiate(new MaskedWrite(parameter))
val tokenManager: Instance[SlotTokenManager] = Instantiate(new SlotTokenManager(parameter))
val dataInPipeQueue: UInt = Wire(UInt(parameter.chainingSize.W))
// data in allVrfWriteAfterCheck
val dataInAfterCheck: UInt = Wire(UInt(parameter.chainingSize.W))
slotControl.zipWithIndex.foreach {
case (record, index) =>
val decodeResult: DecodeBundle = record.laneRequest.decodeResult
Expand Down Expand Up @@ -550,21 +536,19 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
// register for s0 enqueue, it will move with the slot
// 'maskGroupCountVec' 'maskIndexVec' 'pipeFinishVec'

// pipe clear
val pipeClear: Bool = Wire(Bool())

if (isLastSlot) {
slotActive(index) := slotOccupied(index) && !pipeFinishVec(index)
// todo: Reach vfu
slotActive(index) := slotOccupied(index)
} else {
slotActive(index) := slotOccupied(index) && !pipeFinishVec(index) && !slotShiftValid(index) &&
slotActive(index) := slotOccupied(index) && !slotShiftValid(index) &&
!(decodeResult(Decoder.crossRead) || decodeResult(Decoder.crossWrite) || decodeResult(Decoder.widenReduce)) &&
decodeResult(Decoder.scheduler)
}

if(isLastSlot) {
slotCanShift(index) := pipeClear && pipeFinishVec(index)
slotCanShift(index) := !slotOccupied(index)
} else {
slotCanShift(index) := pipeClear
slotCanShift(index) := true.B
}

val laneState: LaneState = Wire(new LaneState(parameter))
Expand Down Expand Up @@ -612,7 +596,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
// todo: handle all elements in first group are masked
maskIndexVec(index) := stage0.updateLaneState.maskIndex
when(stage0.updateLaneState.outOfExecutionRange) {
pipeFinishVec(index) := true.B
slotOccupied(index) := false.B
}
}

Expand All @@ -636,17 +620,8 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
val instructionIndex1H: UInt = UIntToOH(
record.laneRequest.instructionIndex(parameter.instructionIndexBits - 2, 0)
)
instructionFinishedVec(index) := 0.U
instructionUnrelatedMaskUnitVec(index) :=
Mux(decodeResult(Decoder.maskUnit) && decodeResult(Decoder.readOnly), 0.U, instructionIndex1H)
val dataInWritePipe: Bool =
ohCheck(maskedWriteUnit.maskedWrite1H, record.laneRequest.instructionIndex, parameter.chainingSize) |
ohCheck(dataInPipeQueue, record.laneRequest.instructionIndex, parameter.chainingSize) |
ohCheck(dataInAfterCheck, record.laneRequest.instructionIndex, parameter.chainingSize)
when(slotOccupied(index) && pipeClear && pipeFinishVec(index) && !dataInWritePipe) {
slotOccupied(index) := false.B
instructionFinishedVec(index) := instructionIndex1H
}

// stage 1: read stage
stage1.enqueue.valid := stage0.dequeue.valid
Expand Down Expand Up @@ -714,12 +689,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
}
}

crossWriteDataInSlotVec(index) := Mux(
(pipeClear & !slotOccupied(index)) || !decodeResult(Decoder.crossWrite),
0.U,
indexToOH(record.laneRequest.instructionIndex, parameter.chainingSize)
)

stage2.enqueue.valid := stage1.dequeue.valid && executionUnit.enqueue.ready
stage1.dequeue.ready := stage2.enqueue.ready && executionUnit.enqueue.ready
executionUnit.enqueue.valid := stage1.dequeue.valid && stage2.enqueue.ready
Expand Down Expand Up @@ -814,7 +783,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
vrfWriteArbiter(index).bits := stage3.vrfWriteRequest.bits
stage3.vrfWriteRequest.ready := vrfWriteArbiter(index).ready

pipeClear := !Seq(stage0.stageValid, stage1.stageValid, stage2.stageValid, stage3.stageValid, dataInWritePipe).reduce(_ || _)
tokenManager.enqReports(index) := stage0.tokenReport

// probes
Expand All @@ -823,7 +791,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
probeWire.slots(index).changingMaskSet := record.mask.valid || !record.laneRequest.mask
probeWire.slots(index).slotActive := slotActive(index)
probeWire.slots(index).slotOccupied := slotOccupied(index)
probeWire.slots(index).pipeFinish := pipeFinishVec(index)
probeWire.slots(index).pipeFinish := !slotOccupied(index)
probeWire.slots(index).slotShiftValid := slotShiftValid(index)
probeWire.slots(index).decodeResultIsCrossReadOrWrite := decodeResult(Decoder.crossRead) || decodeResult(Decoder.crossWrite)
probeWire.slots(index).decodeResultIsScheduler := decodeResult(Decoder.scheduler)
Expand Down Expand Up @@ -873,11 +841,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
// It’s been a long time since I selected it. Need pipe
val queueBeforeMaskWrite: Queue[VRFWriteRequest] =
Module(new Queue(chiselTypeOf(maskedWriteUnit.enqueue.bits), entries = 1, pipe = true))
dataInPipeQueue := Mux(
queueBeforeMaskWrite.io.deq.valid,
indexToOH(queueBeforeMaskWrite.io.deq.bits.instructionIndex, parameter.chainingSize),
0.U
)
val writeSelect: UInt = Wire(UInt((parameter.chainingSize + 3).W))
val writeCavitation: UInt = VecInit(allVrfWriteAfterCheck.map(_.mask === 0.U)).asUInt

Expand Down Expand Up @@ -919,25 +882,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
vrf.readCheck.zip(readCheckRequestVec).foreach{case (sink, source) => sink := source}
readCheckResult.zip(vrf.readCheckResult).foreach{case (sink, source) => sink := source}

dataInAfterCheck := allVrfWriteAfterCheck.zipWithIndex.map { case (req, i) =>
val check = vrf.writeAllow(i)
val enqReady = check && (!afterCheckValid(i) || afterCheckDequeueReady(i))
val enqFire = enqReady && allVrfWrite(i).valid
allVrfWrite(i).ready := enqReady
when(enqFire) {
req := allVrfWrite(i).bits
}
val deqFire = afterCheckDequeueFire(i)
when(deqFire ^ enqFire) {
afterCheckValid(i) := enqFire
}
Mux(
afterCheckValid(i),
indexToOH(req.instructionIndex, parameter.chainingSize),
0.U
)
}.reduce(_ | _)

// Arbiter
writeSelect := ffo(VecInit(afterCheckValid).asUInt & (~writeCavitation).asUInt)
afterCheckDequeueReady.zipWithIndex.foreach { case (p, i) =>
Expand Down Expand Up @@ -1099,7 +1043,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
slotControl(slotIndex) := entranceControl
maskGroupCountVec(slotIndex) := 0.U(parameter.maskGroupSizeBits.W)
maskIndexVec(slotIndex) := 0.U(log2Ceil(parameter.maskGroupWidth).W)
pipeFinishVec(slotIndex) := false.B
slotOccupied(slotIndex) := true.B
}
enqueueFire
} else {
Expand All @@ -1110,18 +1054,12 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
slotControl(slotIndex) := slotControl(slotIndex + 1)
maskGroupCountVec(slotIndex) := maskGroupCountVec(slotIndex + 1)
maskIndexVec(slotIndex) := maskIndexVec(slotIndex + 1)
pipeFinishVec(slotIndex) := pipeFinishVec(slotIndex + 1)
slotOccupied(slotIndex) := slotOccupied(slotIndex + 1)
}
enqueueFire
}
}

val slotDequeueFire: Seq[Bool] = (slotCanShift.head && slotOccupied.head) +: slotEnqueueFire
Seq.tabulate(parameter.chainingSize) { slotIndex =>
when(slotEnqueueFire(slotIndex) ^ slotDequeueFire(slotIndex)) {
slotOccupied(slotIndex) := slotEnqueueFire(slotIndex)
}
}
slot0EnqueueFire := slotEnqueueFire.head

// handshake
Expand All @@ -1148,10 +1086,12 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
vrf.instructionWriteReport.bits.st := laneRequest.bits.store
vrf.instructionWriteReport.bits.crossWrite := laneRequest.bits.decodeResult(Decoder.crossWrite)
vrf.instructionWriteReport.bits.crossRead := laneRequest.bits.decodeResult(Decoder.crossRead)
vrf.instructionWriteReport.bits.stFinish := false.B
vrf.instructionWriteReport.bits.wWriteQueueClear := false.B
vrf.instructionWriteReport.bits.wBusClear := false.B
vrf.instructionWriteReport.bits.wQueueClear := false.B
// init state
vrf.instructionWriteReport.bits.state.stFinish := !laneRequest.bits.loadStore
// a load needs to wait for its data in the LSU write queue to clear
vrf.instructionWriteReport.bits.state.wWriteQueueClear := !(laneRequest.bits.loadStore && !laneRequest.bits.store)
vrf.instructionWriteReport.bits.state.wLaneLastReport := !laneRequest.valid
vrf.instructionWriteReport.bits.state.wLaneClear := false.B

val elementSizeForOneRegister: Int = parameter.vLen / parameter.datapathWidth / parameter.laneNumber
val nrMask: UInt = VecInit(Seq.tabulate(8){ i =>
Expand Down Expand Up @@ -1188,27 +1128,31 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
vrf.instructionWriteReport.bits.elementMask := shifterMask

// clear record by instructionFinished
vrf.instructionLastReport := lsuLastReport | (instructionFinished & instructionUnrelatedMaskUnitVec.reduce(_ | _))
vrf.instructionLastReport := instructionFinished
vrf.lsuLastReport := lsuLastReport
vrf.lsuMaskGroupChange := lsuMaskGroupChange
vrf.loadDataInLSUWriteQueue := loadDataInLSUWriteQueue
vrf.dataInCrossBus := dataInCrossBus
vrf.dataInWriteQueue :=
crossLaneWriteQueue.map(q => Mux(q.io.deq.valid, indexToOH(q.io.deq.bits.instructionIndex, parameter.chainingSize), 0.U)).reduce(_ | _)|
Mux(topWriteQueue.valid, indexToOH(topWriteQueue.bits.instructionIndex, parameter.chainingSize), 0.U) |
maskedWriteUnit.maskedWrite1H | dataInPipeQueue | dataInAfterCheck
instructionFinished := instructionFinishedVec.reduce(_ | _)
crossWriteDataInSlot := crossWriteDataInSlotVec.reduce(_ | _) | dataInPipeQueue |
maskedWriteUnit.maskedWrite1H | dataInAfterCheck
vrf.dataInLane := instructionValid
instructionFinished := (~instructionValid).asUInt & instructionValidNext
writeReadyForLsu := vrf.writeReadyForLsu
vrfReadyToStore := vrf.vrfReadyToStore
tokenManager.crossWriteReports.zipWithIndex.foreach {case (rpt, rptIndex) =>
rpt.valid := afterCheckDequeueFire(parameter.chainingSize + 1 + rptIndex)
rpt.bits := allVrfWriteAfterCheck(parameter.chainingSize + 1 + rptIndex).instructionIndex
}
// todo: add mask unit write token
tokenManager.responseReport.valid := laneResponse.valid
tokenManager.responseReport.bits := laneResponse.bits.instructionIndex
tokenManager.responseFeedbackReport.valid := laneResponseFeedback.valid
tokenManager.responseFeedbackReport.bits := laneResponseFeedback.bits.instructionIndex
val instInSlot: UInt = slotControl.zip(slotOccupied).map { case (slotState, occupied) =>
Mux(
occupied,
indexToOH(slotState.laneRequest.instructionIndex, parameter.chainingSize),
0.U
)
}.reduce(_ | _)
instructionValid := tokenManager.instructionValid | instInSlot

// slot write
tokenManager.slotWriteReport.zipWithIndex.foreach {case (rpt, rptIndex) =>
Expand Down
19 changes: 7 additions & 12 deletions t1/src/T1.scala
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,6 @@ class T1(val parameter: T1Parameter) extends Module with SerializableModule[T1Pa
* - vd is v0
*/
val specialInstruction: Bool = decodeResult(Decoder.special) || requestReg.bits.vdIsV0
val dataInCrossBus = Wire(UInt(parameter.chainingSize.W))
val writeQueueClearVec = Wire(Vec(parameter.laneNumber, Bool()))
val writeQueueClear: Bool = !writeQueueClearVec.asUInt.orR

Expand Down Expand Up @@ -571,16 +570,13 @@ class T1(val parameter: T1Parameter) extends Module with SerializableModule[T1Pa
* this signal is used to update the `control.endTag`.
*/
val lsuFinished: Bool = ohCheck(lsu.lastReport, control.record.instructionIndex, parameter.chainingSize)
val busClear: Bool = !ohCheck(dataInCrossBus, control.record.instructionIndex, parameter.chainingSize)
// instruction is allocated to this slot.
when(instructionToSlotOH(index)) {
// instruction metadata
control.record.instructionIndex := requestReg.bits.instructionIndex
// TODO: remove
control.record.isLoadStore := isLoadStoreType
control.record.maskType := maskType
control.record.needWaitWriteQueueClear :=
requestReg.bits.decodeResult(Decoder.crossWrite) || requestReg.bits.decodeResult(Decoder.maskUnit)
// control signals
control.state.idle := false.B
control.state.wLast := false.B
Expand All @@ -593,7 +589,7 @@ class T1(val parameter: T1Parameter) extends Module with SerializableModule[T1Pa
// state machine starts here
.otherwise {
when(laneAndLSUFinish) {
control.state.wLast := !control.record.needWaitWriteQueueClear || (busClear && writeQueueClear)
control.state.wLast := true.B
}
// TODO: execute first, then commit
when(responseCounter === control.record.instructionIndex && response.fire) {
Expand Down Expand Up @@ -1500,7 +1496,6 @@ class T1(val parameter: T1Parameter) extends Module with SerializableModule[T1Pa

lane.lsuMaskGroupChange := lsu.lsuMaskGroupChange
lane.loadDataInLSUWriteQueue := lsu.dataInWriteQueue(index)
lane.dataInCrossBus := dataInCrossBus
// 2 + 3 = 5
val rowWith: Int = log2Ceil(parameter.datapathWidth / 8) + log2Ceil(parameter.laneNumber)
lane.writeCount :=
Expand Down Expand Up @@ -1551,8 +1546,8 @@ class T1(val parameter: T1Parameter) extends Module with SerializableModule[T1Pa
lsu.vrfReadyToStore := VecInit(laneVec.map(_.vrfReadyToStore)).asUInt.andR

// connect the ring between lanes
dataInCrossBus := parameter.crossLaneConnectCycles.zipWithIndex.map { case (cycles, index) =>
cycles.zipWithIndex.map { case (cycle, portIndex) =>
parameter.crossLaneConnectCycles.zipWithIndex.foreach { case (cycles, index) =>
cycles.zipWithIndex.foreach { case (cycle, portIndex) =>
// read source <=> write sink
val readSourceIndex = (2 * index + portIndex) % parameter.laneNumber
val readSourcePort = (2 * index + portIndex) / parameter.laneNumber
Expand All @@ -1574,10 +1569,10 @@ class T1(val parameter: T1Parameter) extends Module with SerializableModule[T1Pa
0.U.asTypeOf(new EmptyBundle),
cycle
).valid
connectWithShifter(cycle, id = Some((a: WriteBusData) => a.instructionIndex))(laneVec(index).writeBusPort(portIndex).deq,
laneVec(readSourceIndex).writeBusPort(readSourcePort).enq).get | laneVec(index).crossWriteDataInSlot
}.reduce(_ | _)
}.reduce(_ | _)
connectWithShifter(cycle)(laneVec(index).writeBusPort(portIndex).deq,
laneVec(readSourceIndex).writeBusPort(readSourcePort).enq).get
}
}

memoryPorts.zip(lsu.tlPort).foreach {
case (source, sink) =>
Expand Down
Loading

0 comments on commit 203c59f

Please sign in to comment.