Skip to content

Commit

Permalink
Vector LSU (#453)
Browse files Browse the repository at this point in the history
* Rewrite downloader to use BufferedReqResp

* Move BufferedRespReq from downloader to executor.

* Present vrf interface in VectorExecutor.

* Added checking of LS width in frontend.

* Added handle_load

* Start writing LSU test.

* Next part of tests.

* Some fixes.

* VectorLSUTests works.

* Fix failing tests. All tests passed.

* Add connections between LSUs and vector unit.

* Added dependencies.

* Fix tests.

* Change in handling LSU reserved signal.

* Fix vector core tests.

* Added decoder test for checking vector memory instructions.

* Fix in scheduler. Test kill the canary.

* Fix vector asm test. All tests are passing.

* Bytes tests passed. All tests are passing.

* Fix needed regs vm

* Longer byte test.

* Add docstrings.

* Lint.

* Fix typos.

* Add vector core to synthesis script.

* Remove combinational loop to avoid problems with synthesis.

* Changes in VRF.

* Some optimisations.

* BasicFifo optimisations.

* Remove MemoryBank from VRF.

* Remove vrf fifo_req.

* VRF. Reduce fifo_write number.

* Cut critical path in ALU.

* Replace FlexibleAdder with FlexibleElementwiseFunction.

* Add buffer on vector executor input.

* Remove LMUL translation.

* Substitute fifos with registers.

* Change BasicFifo to Registers. VRF

* Try to cut critical path in VectorRegister

* Bump number of lines to 2

* Change number of lines back to 1 for test purposes.

* Preparations for benchmarking.

* Fix downloader test.

* Fix vector tests. All unit tests passed.

* Fix benchmarks.

* Add new benchmarks. All unit tests passed.

* Lint

* Increase serializer depth. Rewrite benchmarks.

---------

Co-authored-by: Lekcyjna <[email protected]>
  • Loading branch information
lekcyjna123 and Lekcyjna authored Sep 29, 2023
1 parent 40bbfcf commit ddb21b5
Show file tree
Hide file tree
Showing 61 changed files with 2,399 additions and 418 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,16 @@ venv/
ENV/
env.bak/
venv.bak/
.ycm_extra_conf.py

# Verilog files
*.v

# Waveform dumps
*.vcd
*.gtkw
*.fst
*.fst.hier

# Tests outputs
test/__traces__
Expand Down
35 changes: 33 additions & 2 deletions coreblocks/frontend/decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,18 @@ class Encoding:
OpType.V_CONTROL: [
Encoding(Opcode.OP_V, Funct3.OPCFG),
],
OpType.V_MEMORY: [],
OpType.V_LOAD: [
Encoding(Opcode.LOAD_FP, Funct3.VMEM8),
Encoding(Opcode.LOAD_FP, Funct3.VMEM16),
Encoding(Opcode.LOAD_FP, Funct3.VMEM32),
Encoding(Opcode.LOAD_FP, Funct3.VMEM64),
],
OpType.V_STORE: [
Encoding(Opcode.STORE_FP, Funct3.VMEM8),
Encoding(Opcode.STORE_FP, Funct3.VMEM16),
Encoding(Opcode.STORE_FP, Funct3.VMEM32),
Encoding(Opcode.STORE_FP, Funct3.VMEM64),
],
}


Expand Down Expand Up @@ -519,7 +530,7 @@ def elaborate(self, platform):
m.d.comb += instruction_type.eq(InstrType.I)
with m.Case(Opcode.LUI, Opcode.AUIPC):
m.d.comb += instruction_type.eq(InstrType.U)
with m.Case(Opcode.OP, Opcode.OP_V):
with m.Case(Opcode.OP, Opcode.OP_V, Opcode.LOAD_FP, Opcode.STORE_FP):
m.d.comb += instruction_type.eq(InstrType.R)
with m.Case(Opcode.JAL):
m.d.comb += instruction_type.eq(InstrType.J)
Expand Down Expand Up @@ -685,6 +696,26 @@ def elaborate(self, platform):
self.rs2_type.eq(RegisterType.X),
self.rd_type.eq(RegisterType.X),
]
with m.If((self.opcode == Opcode.STORE_FP) | (self.opcode == Opcode.LOAD_FP)):
m.d.comb += [
self.rs1_type.eq(RegisterType.X),
self.rd_type.eq(RegisterType.V),
]
mop = Signal(2)
m.d.comb += self._extract(26, mop)
with m.Switch(mop):
with m.Case(0):
# unit stride
m.d.comb += self.rs2_v.eq(0)
with m.Case(1):
# indexed unordered
m.d.comb += self.rs2_type.eq(RegisterType.V)
with m.Case(2):
# stride
m.d.comb += self.rs2_type.eq(RegisterType.X)
with m.Case(3):
# indexed ordered
m.d.comb += self.rs2_type.eq(RegisterType.V)

# Instruction simplification

Expand Down
27 changes: 15 additions & 12 deletions coreblocks/fu/alu.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from amaranth import *

from coreblocks.transactions import *
from coreblocks.transactions.lib import FIFO
from coreblocks.transactions.lib import FIFO, Register

from coreblocks.params import OpType, Funct3, Funct7, GenParams, FuncUnitLayouts, FunctionalComponentParams
from coreblocks.utils import HasElaborate, OneHotSwitch
Expand Down Expand Up @@ -220,22 +220,25 @@ def elaborate(self, platform):
m = TModule()

m.submodules.alu = alu = Alu(self.gen_params, alu_fn=self.alu_fn)
m.submodules.fifo = fifo = FIFO(self.gen_params.get(FuncUnitLayouts).accept, 2)
m.submodules.fifo_in = fifo_in = Register(self.gen_params.get(FuncUnitLayouts).issue)
m.submodules.fifo_out = fifo_out = FIFO(self.gen_params.get(FuncUnitLayouts).accept, 2)
m.submodules.decoder = decoder = self.alu_fn.get_decoder(self.gen_params)

@def_method(m, self.accept)
def _():
return fifo.read(m)
self.accept.proxy(m, fifo_out.read)

@def_method(m, self.issue)
def _(arg):
m.d.comb += decoder.exec_fn.eq(arg.exec_fn)
m.d.comb += alu.fn.eq(decoder.decode_fn)
with Transaction().body(m):
arg = fifo_in.read(m)
m.d.top_comb += decoder.exec_fn.eq(arg.exec_fn)
m.d.top_comb += alu.fn.eq(decoder.decode_fn)

m.d.top_comb += alu.in1.eq(arg.s1_val)
m.d.top_comb += alu.in2.eq(Mux(arg.imm, arg.imm, arg.s2_val))

m.d.comb += alu.in1.eq(arg.s1_val)
m.d.comb += alu.in2.eq(Mux(arg.imm, arg.imm, arg.s2_val))
fifo_out.write(m, rob_id=arg.rob_id, result=alu.out, rp_dst=arg.rp_dst, exception=0)

fifo.write(m, rob_id=arg.rob_id, result=alu.out, rp_dst=arg.rp_dst, exception=0)
@def_method(m, self.issue)
def _(arg):
fifo_in.write(m, arg)

return m

Expand Down
68 changes: 67 additions & 1 deletion coreblocks/fu/vector_unit/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from amaranth import *
from coreblocks.params import *
from coreblocks.utils import *
from coreblocks.transactions import *

__all__ = ["expand_mask"]
__all__ = ["expand_mask", "load_store_width_to_eew_circ", "elem_mask_to_byte_mask"]


def expand_mask(v_params, mask: Value) -> Value:
Expand All @@ -18,3 +21,66 @@ def expand_mask(v_params, mask: Value) -> Value:
The mask which should be expanded.
"""
return Cat(Mux(mask[i], 0xFF, 0x00) for i in range(v_params.bytes_in_elen))


def elem_mask_to_byte_mask(m: TModule, v_params: VectorParameters, elem_mask: Value, eew: Value):
    """Generate a circuit that converts a mask from the elem format to the byte format.

    In the elem format only the first `k` bits are valid, where `k = ELEN/EEW`,
    and each bit tells whether the corresponding element is valid. The byte
    format always has `ELEN//8` bits and each bit tells whether the
    corresponding byte is valid.

    Parameters
    ----------
    m : TModule
        Module to connect the circuit to.
    v_params : VectorParameters
        Vector unit configuration.
    elem_mask : Value
        Mask in elem format to be converted.
    eew : Value(EEW)
        The EEW for which the `elem_mask` was generated.

    Returns
    -------
    Mask in byte format.
    """
    result = Signal(v_params.bytes_in_elen)

    def widen(width):
        # Number of leading `elem_mask` bits that are meaningful for this width.
        valid_elems = v_params.elen // eew_to_bits(width)
        # Each element bit is repeated `2 ** int(width)` times; presumably the
        # integer value of an EEW member is log2 of its byte count - confirm
        # against the EEW definition.
        bytes_per_elem = 2 ** int(width)
        return Cat([Repl(flag, bytes_per_elem) for flag in elem_mask[:valid_elems]])

    # One Case per possible EEW; each drives `result` with the matching expansion.
    with m.Switch(eew):
        for width in EEW:
            with m.Case(width):
                m.d.av_comb += result.eq(widen(width))

    return result


def load_store_width_to_eew_circ(m: ModuleLike, width: Value) -> Signal:
    """Generate a converter from vector load/store width to EEW.

    The `width` field (which is simply a funct3) of a vector load/store
    instruction encoding is decoded into the corresponding data EEW.

    Parameters
    ----------
    m : ModuleLike
        Module to connect the circuit to.
    width : Value
        Vector load/store width to decode.

    Returns
    -------
    The EEW of data on which load/store is operating.
    """
    eew = Signal(EEW)

    # constants taken from RISC-V V extension specification
    explicit_encodings = (
        (0, EEW.w8),
        (5, EEW.w16),
        (6, EEW.w32),
    )

    with m.Switch(width):
        for encoding, decoded in explicit_encodings:
            with m.Case(encoding):
                m.d.comb += eew.eq(decoded)
        # Default case: every encoding not listed above decodes to 64 bits.
        with m.Case():
            m.d.comb += eew.eq(EEW.w64)

    return eew
47 changes: 42 additions & 5 deletions coreblocks/fu/vector_unit/v_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,24 @@ class VectorBackend(Elaboratable):
The method to insert instructions from the vector frontend.
initialise_regs : list[Method]
List with one method for each register, to initialise it on allocation.
vrf_write : list[Method]
List with one method for each register bank, to write data into it.
vrf_read_req : list[Method]
List with one method for each register bank, to request data to be read from it.
vrf_read_resp : list[Method]
List with one method for each register bank, to read requested data.
v_update : Method
The method to call to indicate that a vector register is ready.
scoreboard_get_dirty : Method
The method to check if the register is already ready.
scoreboard_set_dirty : Method
The method for setting the dirty bit for the register to indicate that it's not ready
and that there are no results yet.
"""

def __init__(self, gen_params: GenParams, announce: Method, report_end: Method):
def __init__(
self, gen_params: GenParams, announce: Method, report_end: Method, v_update_methods: list[Method] = []
):
"""
Parameters
----------
Expand All @@ -54,41 +69,59 @@ def __init__(self, gen_params: GenParams, announce: Method, report_end: Method):
scalar core.
report_end : Method
Used to report the end of instruction execution to `VectorRetirement`.
v_update_methods : list[Method]
Methods to be called with vector register updates.
"""
self.gen_params = gen_params
self.v_params = self.gen_params.v_params
self.announce = announce
self.report_end = report_end
self.v_update_methods = v_update_methods

self.layouts = VectorBackendLayouts(self.gen_params)
self.vvrs_layouts = VectorVRSLayout(self.gen_params, rs_entries_bits=self.v_params.vvrs_entries_bits)
self.vreg_layout = VectorRegisterBankLayouts(self.gen_params)
self.alu_layouts = VectorAluLayouts(self.gen_params)
self.vrf_layout = VRFFragmentLayouts(self.gen_params)
self.scoreboard_layout = ScoreboardLayouts(self.v_params.vrp_count)

self.put_instr = Method(i=self.layouts.vvrs_in)
self.initialise_regs = [Method(i=self.vreg_layout.initialise) for _ in range(self.v_params.vrp_count)]
self.report_mult = Method(i=self.layouts.ender_report_mult)
self.vrf_write = [Method(i=self.vrf_layout.write) for _ in range(self.v_params.register_bank_count)]
self.vrf_read_req = [Method(i=self.vrf_layout.read_req) for _ in range(self.v_params.register_bank_count)]
self.vrf_read_resp = [Method(o=self.vrf_layout.read_resp_o) for _ in range(self.v_params.register_bank_count)]
self.scoreboard_get_dirty = Method(
i=self.scoreboard_layout.get_dirty_in, o=self.scoreboard_layout.get_dirty_out
)
self.scoreboard_set_dirty = Method(i=self.scoreboard_layout.set_dirty_in)
self.v_update = Method(i=self.vvrs_layouts.update_in)

def elaborate(self, platform) -> TModule:
m = TModule()

m.submodules.ready_scoreboard = ready_scoreboard = Scoreboard(
self.v_params.vrp_count, superscalarity=4, data_forward=False
self.v_params.vrp_count, superscalarity=5, data_forward=False
)
m.submodules.vvrs = vvrs = VVRS(self.gen_params, self.v_params.vvrs_entries)
m.submodules.insert_to_vvrs = insert_to_vvrs = VectorInsertToVVRS(
self.gen_params,
vvrs.select,
vvrs.insert,
ready_scoreboard.get_dirty_list,
ready_scoreboard.get_dirty_list[:4],
ready_scoreboard.set_dirty_list[0],
)
self.scoreboard_get_dirty.proxy(m, ready_scoreboard.get_dirty_list[4])
self.scoreboard_set_dirty.proxy(m, ready_scoreboard.set_dirty_list[1])

self.put_instr.proxy(m, insert_to_vvrs.issue)

m.submodules.update_product = update_product = MethodProduct([vvrs.update, insert_to_vvrs.update])
m.submodules.update_product = update_product = MethodProduct(
[vvrs.update, insert_to_vvrs.update] + self.v_update_methods
)
self.v_update.proxy(m, update_product.method)
m.submodules.ender = ender = VectorExecutionEnder(
self.gen_params, self.announce, update_product.method, ready_scoreboard.set_dirty_list[1], self.report_end
self.gen_params, self.announce, self.v_update, ready_scoreboard.set_dirty_list[2], self.report_end
)
self.report_mult.proxy(m, ender.report_mult)
executors = [
Expand Down Expand Up @@ -117,6 +150,10 @@ def elaborate(self, platform) -> TModule:
init_banks_list = [executor.initialise_regs[i] for executor in executors]
connect_init_banks_list.append(MethodProduct(init_banks_list))
self.initialise_regs[i].proxy(m, connect_init_banks_list[-1].method)
for i, executor in enumerate(executors):
self.vrf_write[i].proxy(m, executor.write_vrf)
self.vrf_read_req[i].proxy(m, executor.read_req)
self.vrf_read_resp[i].proxy(m, executor.read_resp)
m.submodules.connect_init_banks = ModuleConnector(*connect_init_banks_list)

return m
Loading

0 comments on commit ddb21b5

Please sign in to comment.