Merge remote-tracking branch 'origin/dev' into tlb-pipe
jerryz123 committed Mar 18, 2021
2 parents b120818 + b631f0b commit 9200402
Showing 36 changed files with 2,829 additions and 608 deletions.
2 changes: 1 addition & 1 deletion .circleci/build-toolchains.sh
@@ -28,5 +28,5 @@ if [ ! -d "$HOME/$1-install" ]; then
cd $HOME

# init all submodules including the tools (doesn't use CI_MAKE_PROC due to mem. constraints)
CHIPYARD_DIR="$LOCAL_CHIPYARD_DIR" NPROC=$CI_MAKE_PROC $LOCAL_CHIPYARD_DIR/scripts/build-toolchains.sh esp-tools
CHIPYARD_DIR="$LOCAL_CHIPYARD_DIR" NPROC=$CI_MAKE_NPROC $LOCAL_CHIPYARD_DIR/scripts/build-toolchains.sh esp-tools
fi
2 changes: 1 addition & 1 deletion .circleci/defaults.sh
@@ -14,7 +14,7 @@
#############

# make parallelism
CI_MAKE_NPROC=8
CI_MAKE_NPROC=4
LOCAL_MAKE_NPROC=$CI_MAKE_NPROC

# verilator version
2 changes: 1 addition & 1 deletion CHIPYARD.hash
@@ -1 +1 @@
939e3a9f94d5bfef9671f49c37cd3acd5fc26128
1e2f778a6705033d67ccbcc932e66083e4646f15
16 changes: 14 additions & 2 deletions README.md
@@ -18,7 +18,7 @@ Gemmini is implemented as a RoCC accelerator with non-standard RISC-V custom ins

At the heart of the accelerator lies a systolic array which performs matrix multiplications. By default, the matrix multiplication supports both _output-stationary_ and _weight-stationary_ dataflows, which programmers can pick between at runtime. However, the dataflow can also be hardened at elaboration time.
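
As a rough illustration of an elaboration-time dataflow parameter, here is a minimal Scala sketch; the names (`Dataflow`, `MeshConfig`) are hypothetical and not necessarily the generator's actual configuration API:

```scala
// Hypothetical parameter sketch: BOTH keeps the dataflow programmable at
// runtime, while OS/WS hardens it at elaboration time.
object Dataflow extends Enumeration {
  val OS, WS, BOTH = Value // output-stationary, weight-stationary, or both
}

case class MeshConfig(dataflow: Dataflow.Value = Dataflow.BOTH)

object DataflowSketch {
  def main(args: Array[String]): Unit = {
    val wsOnly = MeshConfig(dataflow = Dataflow.WS) // weight-stationary only
    println(wsOnly)
  }
}
```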

The systolic array's inputs and outputs are stored in an explicitly managed scratchpad, made up of banked SRAMs. A DMA engine facilitates the tranfer of data between main memory and the scratchpad.
The systolic array's inputs and outputs are stored in an explicitly managed scratchpad, made up of banked SRAMs. A DMA engine facilitates the transfer of data between main memory and the scratchpad.

Because weight-stationary dataflows require an accumulator outside the systolic array, we add a final SRAM bank, equipped with adder units, which can be conceptually considered an extension of the scratchpad memory space. The systolic array can store results to any address in the accumulator, and can also read new inputs from any address in the accumulator. The DMA engine can also transfer data directly between the accumulator and main memory, which is often necessary to load in biases.
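
As a small software model of the accumulate-on-write behaviour described above (illustrative only; the hardware spreads the read-modify-write over multiple cycles), a write either overwrites a row or adds to it element-wise:

```scala
// Software model of an accumulator bank: writes can overwrite or accumulate,
// and reads can target any row.
class AccumulatorModel(entries: Int, rowLen: Int) {
  private val mem = Array.fill(entries, rowLen)(0)

  def write(addr: Int, data: Array[Int], accumulate: Boolean): Unit = {
    mem(addr) =
      if (accumulate) mem(addr).zip(data).map { case (old, d) => old + d }
      else data.clone()
  }

  def read(addr: Int): Array[Int] = mem(addr).clone()
}
```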

@@ -75,7 +75,7 @@ The ``software`` directory of the generator includes the aforementioned library
The Gemmini generator generates a C header file based on the generator parameters. This header file gets compiled together with the matrix multiplication library to tune library performance. The generated header file can be found under ``software/gemmini-rocc-tests/include/gemmini_params.h``
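
As a loose sketch of this flow (the parameter names and values here are illustrative, not the generator's actual ones), such a header can be emitted directly from the Scala parameters:

```scala
// Illustrative only: emit a C header with #defines derived from generator
// parameters, in the spirit of the generated gemmini_params.h.
object HeaderSketch {
  case class Params(dim: Int, spBankRows: Int, accRows: Int)

  def emit(p: Params): String =
    s"""#ifndef GEMMINI_PARAMS_H
       |#define GEMMINI_PARAMS_H
       |#define DIM ${p.dim}
       |#define SP_BANK_ROWS ${p.spBankRows}
       |#define ACC_ROWS ${p.accRows}
       |#endif
       |""".stripMargin

  def main(args: Array[String]): Unit =
    println(emit(Params(dim = 16, spBankRows = 4096, accRows = 1024)))
}
```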

Gemmini can also be used to run ONNX-specified neural-networks through a port of Microsoft's ONNX-Runtime framework. The port is included as the [onnxruntime-riscv](https://github.com/pranav-prakash/onnxruntime-riscv) repository submoduled in the `software` directory.
To start using ONNX-Runtime, run `git submodule update --init --recursive software/onnxruntime-riscv`, and read the documentation at [here](https://github.com/pranav-prakash/onnxruntime-riscv/blob/systolic/systolic_runner/docs).
To start using ONNX-Runtime, run `git submodule update --init --recursive software/onnxruntime-riscv`, and read the documentation [here](https://github.com/pranav-prakash/onnxruntime-riscv/blob/systolic/systolic_runner/docs).

## Build and Run Gemmini Tests

@@ -317,3 +317,15 @@ This section describes an additional set of RoCC instructions that configure and
### `COMPUTE_CISC` runs a complete hardware tiling sequence with the configured A, B, C, D, M, N, K, RPT_BIAS values
**Format:** `compute_cisc`
- `funct` = 17
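
As a decode-side illustration, a RoCC-style decoder might flag this instruction as shown below; only the `funct` value of 17 comes from the table above, while the module and signal names are hypothetical:

```scala
import chisel3._

// Hypothetical decode sketch: raise computeCisc when a valid command arrives
// with funct == 17, the value documented for COMPUTE_CISC above.
class CiscDecodeSketch extends Module {
  val io = IO(new Bundle {
    val cmdValid    = Input(Bool())
    val funct       = Input(UInt(7.W))
    val computeCisc = Output(Bool())
  })
  io.computeCisc := io.cmdValid && io.funct === 17.U
}
```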

# Citing Gemmini
If Gemmini helps you in your academic research, you are encouraged to cite our paper. Here is an example bibtex:
```
@article{genc2019gemmini,
title={Gemmini: An Agile Systolic Array Generator Enabling Systematic Evaluations of Deep-Learning Architectures},
author={Genc, Hasan and Haj-Ali, Ameer and Iyer, Vighnesh and Amid, Alon and Mao, Howard and Wright, John and Schmidt, Colin and Zhao, Jerry and Ou, Albert and Banister, Max and Shao, Yakun Sophia and Nikolic, Borivoje and Stoica, Ion and Asanovic, Krste},
journal={arXiv preprint arXiv:1911.09925},
year={2019}
}
```

2 changes: 1 addition & 1 deletion SPIKE.hash
@@ -1 +1 @@
3db7a449d97bf40a101ef541089054e6af59d7df
bc3222e351cdd645b6fd2605fd9611e3bc0d9cae
217 changes: 150 additions & 67 deletions src/main/scala/gemmini/AccumulatorMem.scala
@@ -17,19 +17,21 @@ class AccumulatorReadReq[T <: Data](n: Int, shift_width: Int, scale_t: T) extend
override def cloneType: this.type = new AccumulatorReadReq(n, shift_width, scale_t.cloneType).asInstanceOf[this.type]
}

class AccumulatorReadResp[T <: Data: Arithmetic](rdataType: Vec[Vec[T]], fullDataType: Vec[Vec[T]]) extends Bundle {
val data = rdataType.cloneType
val full_data = fullDataType.cloneType
class AccumulatorReadResp[T <: Data: Arithmetic, U <: Data](fullDataType: Vec[Vec[T]], scale_t: U, shift_width: Int) extends Bundle {
val data = fullDataType.cloneType
val fromDMA = Bool()

override def cloneType: this.type = new AccumulatorReadResp(rdataType.cloneType, fullDataType.cloneType).asInstanceOf[this.type]
val scale = scale_t.cloneType
val relu6_shift = UInt(shift_width.W)
val act = UInt(2.W)
val acc_bank_id = UInt(2.W) // TODO don't hardcode
override def cloneType: this.type = new AccumulatorReadResp(fullDataType.cloneType, scale_t, shift_width).asInstanceOf[this.type]
}

class AccumulatorReadIO[T <: Data: Arithmetic, U <: Data](n: Int, shift_width: Int, rdataType: Vec[Vec[T]], fullDataType: Vec[Vec[T]], scale_t: U) extends Bundle {
val req = Decoupled(new AccumulatorReadReq(n, shift_width, scale_t))
val resp = Flipped(Decoupled(new AccumulatorReadResp(rdataType.cloneType, fullDataType.cloneType)))
class AccumulatorReadIO[T <: Data: Arithmetic, U <: Data](n: Int, shift_width: Int, fullDataType: Vec[Vec[T]], scale_t: U) extends Bundle {
val req = Decoupled(new AccumulatorReadReq[U](n, shift_width, scale_t))
val resp = Flipped(Decoupled(new AccumulatorReadResp[T, U](fullDataType, scale_t, shift_width)))

override def cloneType: this.type = new AccumulatorReadIO(n, shift_width, rdataType.cloneType, fullDataType.cloneType, scale_t.cloneType).asInstanceOf[this.type]
override def cloneType: this.type = new AccumulatorReadIO(n, shift_width, fullDataType.cloneType, scale_t.cloneType).asInstanceOf[this.type]
}

class AccumulatorWriteReq[T <: Data: Arithmetic](n: Int, t: Vec[Vec[T]]) extends Bundle {
@@ -42,16 +44,19 @@ class AccumulatorWriteReq[T <: Data: Arithmetic](n: Int, t: Vec[Vec[T]]) extends
override def cloneType: this.type = new AccumulatorWriteReq(n, t).asInstanceOf[this.type]
}

class AccumulatorMemIO [T <: Data: Arithmetic, U <: Data](n: Int, t: Vec[Vec[T]], rdata: Vec[Vec[T]], scale_t: U) extends Bundle {
val read = Flipped(new AccumulatorReadIO(n, log2Ceil(t.head.head.getWidth), rdata, t, scale_t))
class AccumulatorMemIO [T <: Data: Arithmetic, U <: Data](n: Int, t: Vec[Vec[T]], scale_t: U) extends Bundle {
val read = Flipped(new AccumulatorReadIO(n, log2Ceil(t.head.head.getWidth), t, scale_t))
// val write = Flipped(new AccumulatorWriteIO(n, t))
val write = Flipped(Decoupled(new AccumulatorWriteReq(n, t)))

override def cloneType: this.type = new AccumulatorMemIO(n, t, rdata, scale_t).asInstanceOf[this.type]
override def cloneType: this.type = new AccumulatorMemIO(n, t, scale_t).asInstanceOf[this.type]
}

class AccumulatorMem[T <: Data, U <: Data](n: Int, t: Vec[Vec[T]], rdataType: Vec[Vec[T]], mem_pipeline: Int, scale_args: ScaleArguments[T, U], read_small_data: Boolean, read_full_data: Boolean)
(implicit ev: Arithmetic[T]) extends Module {
class AccumulatorMem[T <: Data, U <: Data](
n: Int, t: Vec[Vec[T]], scale_args: ScaleArguments[T, U],
acc_singleported: Boolean, num_acc_sub_banks: Int
)
(implicit ev: Arithmetic[T]) extends Module {
// TODO Do writes in this module work with matrices of size 2? If we try to read from an address right after writing
// to it, then we might not get the written data. We might need some kind of cooldown counter after addresses in the
// accumulator have been written to for configurations with such small matrices
@@ -64,9 +69,8 @@ class AccumulatorMem[T <: Data, U <: Data](n: Int, t: Vec[Vec[T]], rdataType: Ve
import ev._

// TODO unify this with TwoPortSyncMemIO
val io = IO(new AccumulatorMemIO(n, t, rdataType, scale_args.multiplicand_t))
val io = IO(new AccumulatorMemIO(n, t, scale_args.multiplicand_t))

val mem = TwoPortSyncMem(n, t, t.getWidth / 8) // TODO We assume byte-alignment here. Use aligned_to instead

// For any write operation, we spend 2 cycles reading the existing address out, buffering it in a register, and then
// accumulating on top of it (if necessary)
@@ -75,83 +79,162 @@ class AccumulatorMem[T <: Data, U <: Data](n: Int, t: Vec[Vec[T]], rdataType: Ve
val acc_buf = ShiftRegister(io.write.bits.acc, 2)
val mask_buf = ShiftRegister(io.write.bits.mask, 2)
val w_buf_valid = ShiftRegister(io.write.fire(), 2)

val w_sum = VecInit((RegNext(mem.io.rdata) zip wdata_buf).map { case (rv, wv) =>
val acc_rdata = Wire(t)
acc_rdata := DontCare
val read_rdata = Wire(t)
read_rdata := DontCare
val block_read_req = WireInit(false.B)
val w_sum = VecInit((RegNext(acc_rdata) zip wdata_buf).map { case (rv, wv) =>
VecInit((rv zip wv).map(t => t._1 + t._2))
})

mem.io.waddr := waddr_buf
mem.io.wen := w_buf_valid
mem.io.wdata := Mux(acc_buf, w_sum, wdata_buf)
mem.io.mask := mask_buf

mem.io.raddr := Mux(io.write.fire() && io.write.bits.acc, io.write.bits.addr, io.read.req.bits.addr)
mem.io.ren := io.read.req.fire() || (io.write.fire() && io.write.bits.acc)

class PipelinedRdataAndActT extends Bundle {
val data = mem.io.rdata.cloneType
val full_data = mem.io.rdata.cloneType
val scale = io.read.req.bits.scale.cloneType
val relu6_shift = io.read.req.bits.relu6_shift.cloneType
val act = io.read.req.bits.act.cloneType
val fromDMA = io.read.req.bits.fromDMA.cloneType
if (!acc_singleported) {
val mem = TwoPortSyncMem(n, t, t.getWidth / 8) // TODO We assume byte-alignment here. Use aligned_to instead
mem.io.waddr := waddr_buf
mem.io.wen := w_buf_valid
mem.io.wdata := Mux(acc_buf, w_sum, wdata_buf)
mem.io.mask := mask_buf
acc_rdata := mem.io.rdata
read_rdata := mem.io.rdata
mem.io.raddr := Mux(io.write.fire() && io.write.bits.acc, io.write.bits.addr, io.read.req.bits.addr)
mem.io.ren := io.read.req.fire() || (io.write.fire() && io.write.bits.acc)
} else {
val mask_len = t.getWidth / 8
val mask_elem = UInt((t.getWidth / mask_len).W)
val reads = Wire(Vec(2, Decoupled(UInt())))
reads(0).valid := io.write.valid && io.write.bits.acc
reads(0).bits := io.write.bits.addr
reads(0).ready := true.B
reads(1).valid := io.read.req.valid
reads(1).bits := io.read.req.bits.addr
reads(1).ready := true.B
block_read_req := !reads(1).ready
for (i <- 0 until num_acc_sub_banks) {
def isThisBank(addr: UInt) = addr(log2Ceil(num_acc_sub_banks)-1,0) === i.U
def getBankIdx(addr: UInt) = addr >> log2Ceil(num_acc_sub_banks)
val mem = SyncReadMem(n / num_acc_sub_banks, Vec(mask_len, mask_elem))

val ren = WireInit(false.B)
val raddr = WireInit(getBankIdx(reads(0).bits))
val nEntries = 3
// Writes coming 2 cycles after read leads to bad bank behavior
// Add another buffer here
class W_Q_Entry[T <: Data](mask_len: Int, mask_elem: T) extends Bundle {
val valid = Bool()
val data = Vec(mask_len, mask_elem)
val mask = Vec(mask_len, Bool())
val addr = UInt(log2Ceil(n/num_acc_sub_banks).W)
override def cloneType: this.type = new W_Q_Entry(mask_len, mask_elem).asInstanceOf[this.type]
}
val w_q = Reg(Vec(nEntries, new W_Q_Entry(mask_len, mask_elem)))
for (e <- w_q) {
when (e.valid) {
assert(!(
io.write.valid && io.write.bits.acc &&
isThisBank(io.write.bits.addr) && getBankIdx(io.write.bits.addr) === e.addr &&
((io.write.bits.mask.asUInt & e.mask.asUInt) =/= 0.U)
))
when (io.read.req.valid && isThisBank(io.read.req.bits.addr) && getBankIdx(io.read.req.bits.addr) === e.addr) {
reads(1).ready := false.B
}
}
}
val w_q_head = RegInit(1.U(nEntries.W))
val w_q_tail = RegInit(1.U(nEntries.W))
when (reset.asBool) {
w_q.foreach(_.valid := false.B)
}
val wen = WireInit(false.B)
val wdata = Mux1H(w_q_head.asBools, w_q.map(_.data))
val wmask = Mux1H(w_q_head.asBools, w_q.map(_.mask))
val waddr = Mux1H(w_q_head.asBools, w_q.map(_.addr))
when (wen) {
w_q_head := w_q_head << 1 | w_q_head(nEntries-1)
for (i <- 0 until nEntries) {
when (w_q_head(i)) {
w_q(i).valid := false.B
}
}
}

when (w_buf_valid && isThisBank(waddr_buf)) {
assert(!((w_q_tail.asBools zip w_q.map(_.valid)).map({ case (h,v) => h && v }).reduce(_||_)))
w_q_tail := w_q_tail << 1 | w_q_tail(nEntries-1)
for (i <- 0 until nEntries) {
when (w_q_tail(i)) {
w_q(i).valid := true.B
w_q(i).data := Mux(acc_buf, w_sum, wdata_buf).asTypeOf(Vec(mask_len, mask_elem))
w_q(i).mask := mask_buf
w_q(i).addr := getBankIdx(waddr_buf)
}
}

}
val bank_rdata = mem.read(raddr, ren && !wen).asTypeOf(t)
when (RegNext(ren && reads(0).valid && isThisBank(reads(0).bits))) {
acc_rdata := bank_rdata
} .elsewhen (RegNext(ren)) {
read_rdata := bank_rdata
}
when (wen) {
mem.write(waddr, wdata, wmask)
}
// Three requestors, 1 slot
// Priority is incoming reads for RMW > writes from RMW > incoming reads
when (reads(0).valid && isThisBank(reads(0).bits)) {
ren := true.B
when (isThisBank(reads(1).bits)) {
reads(1).ready := false.B
}
} .elsewhen ((w_q_head.asBools zip w_q.map(_.valid)).map({ case (h,v) => h && v }).reduce(_||_)) {
wen := true.B
when (isThisBank(reads(1).bits)) {
reads(1).ready := false.B
}
} .otherwise {
ren := isThisBank(reads(1).bits)
raddr := getBankIdx(reads(1).bits)
}
}
}

val q = Module(new Queue(new PipelinedRdataAndActT, 1, true, true))
q.io.enq.bits.data := mem.io.rdata
q.io.enq.bits.full_data := mem.io.rdata
val q = Module(new Queue(new AccumulatorReadResp(t, scale_args.multiplicand_t, log2Ceil(t.head.head.getWidth)), 1, true, true))
q.io.enq.bits.data := read_rdata
q.io.enq.bits.scale := RegNext(io.read.req.bits.scale)
q.io.enq.bits.relu6_shift := RegNext(io.read.req.bits.relu6_shift)
q.io.enq.bits.act := RegNext(io.read.req.bits.act)
q.io.enq.bits.fromDMA := RegNext(io.read.req.bits.fromDMA)
q.io.enq.bits.acc_bank_id := DontCare
q.io.enq.valid := RegNext(io.read.req.fire())

val p = Pipeline(q.io.deq, mem_pipeline, Seq.fill(mem_pipeline)((x: PipelinedRdataAndActT) => x) :+ {
x: PipelinedRdataAndActT =>
val activated_rdata = VecInit(x.data.map(v => VecInit(v.map { e =>
// val e_scaled = e >> x.shift
val e_scaled = scale_args.scale_func(e, x.scale)
val e_clipped = e_scaled.clippedToWidthOf(rdataType.head.head)
val e_act = MuxCase(e_clipped, Seq(
(x.act === Activation.RELU) -> e_clipped.relu,
(x.act === Activation.RELU6) -> e_clipped.relu6(x.relu6_shift)))

e_act
})))
val p = q.io.deq

val result = WireInit(x)
result.data := activated_rdata
io.read.resp.bits.data := p.bits.data
io.read.resp.bits.fromDMA := p.bits.fromDMA
io.read.resp.bits.relu6_shift := p.bits.relu6_shift
io.read.resp.bits.act := p.bits.act
io.read.resp.bits.scale := p.bits.scale
io.read.resp.bits.acc_bank_id := DontCare // This is set in Scratchpad
io.read.resp.valid := p.valid
p.ready := io.read.resp.ready

result
})

val q_will_be_empty = (q.io.count +& q.io.enq.fire()) - q.io.deq.fire() === 0.U
io.read.req.ready := q_will_be_empty && (
// Make sure we aren't accumulating, which would take over both ports
!(io.write.fire() && io.write.bits.acc) &&
// Make sure we aren't reading something that is still being written
!(RegNext(io.write.fire()) && RegNext(io.write.bits.addr) === io.read.req.bits.addr) &&
!(w_buf_valid && waddr_buf === io.read.req.bits.addr)
)
io.read.resp.bits.data := p.bits.data
io.read.resp.bits.full_data := p.bits.full_data
io.read.resp.bits.fromDMA := p.bits.fromDMA
io.read.resp.valid := p.valid
p.ready := io.read.resp.ready
!(w_buf_valid && waddr_buf === io.read.req.bits.addr) &&
!block_read_req
)

if (read_small_data)
io.read.resp.bits.data := p.bits.data
else
io.read.resp.bits.data := 0.U.asTypeOf(p.bits.data) // TODO make this DontCare instead

if (read_full_data)
io.read.resp.bits.full_data := p.bits.full_data
else
io.read.resp.bits.full_data := 0.U.asTypeOf(q.io.enq.bits.full_data) // TODO make this DontCare instead

// io.write.current_waddr.valid := mem.io.wen
// io.write.current_waddr.bits := mem.io.waddr
io.write.ready := !io.write.bits.acc || (!(io.write.bits.addr === mem.io.waddr && mem.io.wen) &&
io.write.ready := !io.write.bits.acc || (!(io.write.bits.addr === waddr_buf && w_buf_valid) &&
!(io.write.bits.addr === RegNext(io.write.bits.addr) && RegNext(io.write.fire())))

// assert(!(io.read.req.valid && io.write.en && io.write.acc), "reading and accumulating simultaneously is not supported")
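
For readers following the new single-ported path above, here is a small software model of the sub-bank address split that `isThisBank`/`getBankIdx` implement; the bank count is an arbitrary example value, not a Gemmini default:

```scala
// Interleaved banking model: low-order address bits pick the sub-bank,
// the remaining bits index a row inside that sub-bank.
object SubBankSketch {
  def main(args: Array[String]): Unit = {
    val numAccSubBanks = 4 // example value; must be a power of two
    val bankBits = Integer.numberOfTrailingZeros(numAccSubBanks)

    def whichBank(addr: Int): Int = addr & (numAccSubBanks - 1)
    def bankIdx(addr: Int): Int = addr >> bankBits

    for (addr <- 0 until 8)
      println(s"addr=$addr -> sub-bank ${whichBank(addr)}, row ${bankIdx(addr)}")
  }
}
```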
