From ddb21b50c9cbc46d154436b3a48a8cdbe323ceeb Mon Sep 17 00:00:00 2001 From: lekcyjna123 <34948061+lekcyjna123@users.noreply.github.com> Date: Fri, 29 Sep 2023 14:58:19 +0200 Subject: [PATCH] Vector LSU (#453) * Rewrite downloader to use BufferedReqResp * Move BufferedRespReq from downloader to executor. * Present vrf interface in VectorExecutor. * Added checking of LS width in frontend. * Added handle_load * Start writing LSU test. * Next part of tests. * Some fixes. * VectorLSUTests works. * Fix failing tests. All tests passed. * Add connections between LSUs and vector unit. * Added dependencies. * Fix tests. * Change in handling LSU reserved signal. * Fix vector core tests. * Added decoder test for checking vector memory instructions. * Fix in scheduler. Test kill the canary. * Fix vector asm test. All tests are passing. * Bytes tests passed. All tests are passing. * Fix needed regs vm * Longer byte test. * Add docstrings. * Lint. * Fix typos. * Add vector core to synthesize script. * Remove combinational loop to avoid problems with synthesis. * Changes in VRF. * Some optimisations. * BasicFifo optimisations. * Remove MemoryBank from VRF. * Remove vrf fifo_req. * VRF. Reduce fifo_write number. * Cut critical path in ALU. * Replace FlexibleAdder with FlexibleElementwiseFunction. * Add buffer on vector executor input. * Remove LMUL translation. * Substitute fifos with registers. * Change BasicFifo to Registers. VRF * Try to cut critical path in VectorRegister * Bump number of lines to 2 * Change number of lines back to 1 for test purposes. * Preparations for benchmarking. * Fix downloader test. * Fix vector tests. All unit tests passed. * Fix benchmarks. * Add new benchmarks. All unit tests passed. * Lint * Increase serializer depth. Rewrite benchmarks. 
--------- Co-authored-by: Lekcyjna <309016@uwr.edu.pl> --- .gitignore | 3 + coreblocks/frontend/decoder.py | 35 +- coreblocks/fu/alu.py | 27 +- coreblocks/fu/vector_unit/utils.py | 68 ++- coreblocks/fu/vector_unit/v_backend.py | 47 +- coreblocks/fu/vector_unit/v_core.py | 49 +- .../fu/vector_unit/v_elems_downloader.py | 27 +- coreblocks/fu/vector_unit/v_executor.py | 32 +- coreblocks/fu/vector_unit/v_frontend.py | 11 +- .../fu/vector_unit/v_input_verification.py | 9 +- coreblocks/fu/vector_unit/v_layouts.py | 40 +- coreblocks/fu/vector_unit/v_len_getter.py | 1 + coreblocks/fu/vector_unit/v_mask_extractor.py | 32 -- coreblocks/fu/vector_unit/v_needed_regs.py | 4 +- coreblocks/fu/vector_unit/v_register.py | 70 ++- coreblocks/fu/vector_unit/v_status.py | 4 +- coreblocks/fu/vector_unit/v_translator.py | 15 +- coreblocks/fu/vector_unit/vector_alu.py | 8 +- coreblocks/fu/vector_unit/vrf.py | 54 ++- coreblocks/lsu/__init__.py | 2 + coreblocks/lsu/dummyLsu.py | 20 +- coreblocks/lsu/vector_lsu.py | 418 ++++++++++++++++++ coreblocks/params/configurations.py | 23 +- coreblocks/params/isa.py | 30 +- coreblocks/params/keys.py | 35 ++ coreblocks/params/optypes.py | 6 +- coreblocks/scheduler/scheduler.py | 25 +- coreblocks/stages/backend.py | 4 +- coreblocks/structs_common/scoreboard.py | 17 +- coreblocks/transactions/core.py | 2 +- coreblocks/transactions/lib.py | 240 +++++++--- coreblocks/utils/fifo.py | 24 +- docker/riscv-toolchain.Dockerfile | 2 +- scripts/gen_verilog.py | 22 +- scripts/run_benchmarks.py | 45 +- scripts/synthesize.py | 1 + test/asm/vector.asm | 46 ++ test/asm/vector_bytes.asm | 55 +++ test/common.py | 54 ++- test/external/embench/Makefile | 5 +- .../board_config/coreblocks-sim/board.cfg | 7 +- .../coreblocks-sim/boardsupport.c | 22 +- test/external/embench/common/link.ld | 4 +- .../vadd-lot-of-scalars/add-lot-of-scalars.c | 76 ++++ .../coreblocks_benchmarks/vadd-mem/vadd-mem.c | 72 +++ .../embench/coreblocks_benchmarks/vadd/vadd.c | 69 +++ 
.../embench/coreblocks_benchmarks/vmem/vmem.c | 73 +++ test/frontend/test_decoder.py | 377 ++++++++++++++++ test/fu/vector_unit/common.py | 10 +- test/fu/vector_unit/test_v_core.py | 30 ++ .../fu/vector_unit/test_v_elems_downloader.py | 18 +- test/fu/vector_unit/test_v_frontend.py | 50 +-- .../test_v_instruction_verification.py | 27 +- test/fu/vector_unit/test_v_status.py | 2 +- test/fu/vector_unit/test_vrf.py | 78 ++-- test/lsu/test_vector_lsu.py | 226 ++++++++++ test/regression/benchmark.py | 6 +- test/regression/pysim.py | 8 +- test/structs_common/test_scoreboard.py | 16 +- test/test_core.py | 31 +- test/utils/test_fifo.py | 3 +- 61 files changed, 2399 insertions(+), 418 deletions(-) create mode 100644 coreblocks/lsu/__init__.py create mode 100644 coreblocks/lsu/vector_lsu.py create mode 100644 test/asm/vector.asm create mode 100644 test/asm/vector_bytes.asm create mode 100644 test/external/embench/coreblocks_benchmarks/vadd-lot-of-scalars/add-lot-of-scalars.c create mode 100644 test/external/embench/coreblocks_benchmarks/vadd-mem/vadd-mem.c create mode 100644 test/external/embench/coreblocks_benchmarks/vadd/vadd.c create mode 100644 test/external/embench/coreblocks_benchmarks/vmem/vmem.c create mode 100644 test/lsu/test_vector_lsu.py diff --git a/.gitignore b/.gitignore index b545b086c..361578340 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ venv/ ENV/ env.bak/ venv.bak/ +.ycm_extra_conf.py # Verilog files *.v @@ -18,6 +19,8 @@ venv.bak/ # Waveform dumps *.vcd *.gtkw +*.fst +*.fst.hier # Tests outputs test/__traces__ diff --git a/coreblocks/frontend/decoder.py b/coreblocks/frontend/decoder.py index 06ab5c436..767105715 100644 --- a/coreblocks/frontend/decoder.py +++ b/coreblocks/frontend/decoder.py @@ -344,7 +344,18 @@ class Encoding: OpType.V_CONTROL: [ Encoding(Opcode.OP_V, Funct3.OPCFG), ], - OpType.V_MEMORY: [], + OpType.V_LOAD: [ + Encoding(Opcode.LOAD_FP, Funct3.VMEM8), + Encoding(Opcode.LOAD_FP, Funct3.VMEM16), + Encoding(Opcode.LOAD_FP, 
Funct3.VMEM32), + Encoding(Opcode.LOAD_FP, Funct3.VMEM64), + ], + OpType.V_STORE: [ + Encoding(Opcode.STORE_FP, Funct3.VMEM8), + Encoding(Opcode.STORE_FP, Funct3.VMEM16), + Encoding(Opcode.STORE_FP, Funct3.VMEM32), + Encoding(Opcode.STORE_FP, Funct3.VMEM64), + ], } @@ -519,7 +530,7 @@ def elaborate(self, platform): m.d.comb += instruction_type.eq(InstrType.I) with m.Case(Opcode.LUI, Opcode.AUIPC): m.d.comb += instruction_type.eq(InstrType.U) - with m.Case(Opcode.OP, Opcode.OP_V): + with m.Case(Opcode.OP, Opcode.OP_V, Opcode.LOAD_FP, Opcode.STORE_FP): m.d.comb += instruction_type.eq(InstrType.R) with m.Case(Opcode.JAL): m.d.comb += instruction_type.eq(InstrType.J) @@ -685,6 +696,26 @@ def elaborate(self, platform): self.rs2_type.eq(RegisterType.X), self.rd_type.eq(RegisterType.X), ] + with m.If((self.opcode == Opcode.STORE_FP) | (self.opcode == Opcode.LOAD_FP)): + m.d.comb += [ + self.rs1_type.eq(RegisterType.X), + self.rd_type.eq(RegisterType.V), + ] + mop = Signal(2) + m.d.comb += self._extract(26, mop) + with m.Switch(mop): + with m.Case(0): + # unit stride + m.d.comb += self.rs2_v.eq(0) + with m.Case(1): + # indexed unordered + m.d.comb += self.rs2_type.eq(RegisterType.V) + with m.Case(2): + # stride + m.d.comb += self.rs2_type.eq(RegisterType.X) + with m.Case(3): + # indexed ordered + m.d.comb += self.rs2_type.eq(RegisterType.V) # Instruction simplification diff --git a/coreblocks/fu/alu.py b/coreblocks/fu/alu.py index 450f8b649..1d18f3cf4 100644 --- a/coreblocks/fu/alu.py +++ b/coreblocks/fu/alu.py @@ -2,7 +2,7 @@ from amaranth import * from coreblocks.transactions import * -from coreblocks.transactions.lib import FIFO +from coreblocks.transactions.lib import FIFO, Register from coreblocks.params import OpType, Funct3, Funct7, GenParams, FuncUnitLayouts, FunctionalComponentParams from coreblocks.utils import HasElaborate, OneHotSwitch @@ -220,22 +220,25 @@ def elaborate(self, platform): m = TModule() m.submodules.alu = alu = Alu(self.gen_params, 
alu_fn=self.alu_fn) - m.submodules.fifo = fifo = FIFO(self.gen_params.get(FuncUnitLayouts).accept, 2) + m.submodules.fifo_in = fifo_in = Register(self.gen_params.get(FuncUnitLayouts).issue) + m.submodules.fifo_out = fifo_out = FIFO(self.gen_params.get(FuncUnitLayouts).accept, 2) m.submodules.decoder = decoder = self.alu_fn.get_decoder(self.gen_params) - @def_method(m, self.accept) - def _(): - return fifo.read(m) + self.accept.proxy(m, fifo_out.read) - @def_method(m, self.issue) - def _(arg): - m.d.comb += decoder.exec_fn.eq(arg.exec_fn) - m.d.comb += alu.fn.eq(decoder.decode_fn) + with Transaction().body(m): + arg = fifo_in.read(m) + m.d.top_comb += decoder.exec_fn.eq(arg.exec_fn) + m.d.top_comb += alu.fn.eq(decoder.decode_fn) + + m.d.top_comb += alu.in1.eq(arg.s1_val) + m.d.top_comb += alu.in2.eq(Mux(arg.imm, arg.imm, arg.s2_val)) - m.d.comb += alu.in1.eq(arg.s1_val) - m.d.comb += alu.in2.eq(Mux(arg.imm, arg.imm, arg.s2_val)) + fifo_out.write(m, rob_id=arg.rob_id, result=alu.out, rp_dst=arg.rp_dst, exception=0) - fifo.write(m, rob_id=arg.rob_id, result=alu.out, rp_dst=arg.rp_dst, exception=0) + @def_method(m, self.issue) + def _(arg): + fifo_in.write(m, arg) return m diff --git a/coreblocks/fu/vector_unit/utils.py b/coreblocks/fu/vector_unit/utils.py index be4a6c80e..c40ee91da 100644 --- a/coreblocks/fu/vector_unit/utils.py +++ b/coreblocks/fu/vector_unit/utils.py @@ -1,6 +1,9 @@ from amaranth import * +from coreblocks.params import * +from coreblocks.utils import * +from coreblocks.transactions import * -__all__ = ["expand_mask"] +__all__ = ["expand_mask", "load_store_width_to_eew_circ", "elem_mask_to_byte_mask"] def expand_mask(v_params, mask: Value) -> Value: @@ -18,3 +21,66 @@ def expand_mask(v_params, mask: Value) -> Value: The mask which should be expanded. 
""" return Cat(Mux(mask[i], 0xFF, 0x00) for i in range(v_params.bytes_in_elen)) + + +def elem_mask_to_byte_mask(m: TModule, v_params: VectorParameters, elem_mask: Value, eew: Value): + """Generate a circuit to convert the mask from the elem format to the byte format. + + The elem format always has valid first `k` bits where `k = ELEN/EEW` and each + bit describes whether an element is valid or not. The byte format has always `ELEN//8` + bits and each bit represents wheter a byte is valid or not. + + Parameters + ---------- + m : TModule + Module to connect the circuit to. + v_params : VectorParameters + Vector unit configuration. + elem_mask : Value + Mask in elem format to be converted. + eew : Value(EEW) + The EEW for which the `elem_mask` was generated. + + Returns + ------- + Mask in byte format. + """ + result = Signal(v_params.bytes_in_elen) + with m.Switch(eew): + for eew_iter in EEW: + with m.Case(eew_iter): + m.d.av_comb += result.eq( + Cat([Repl(bit, 2 ** int(eew_iter)) for bit in elem_mask[: v_params.elen // eew_to_bits(eew_iter)]]) + ) + return result + + +def load_store_width_to_eew_circ(m: ModuleLike, width: Value) -> Signal: + """Generate a converter from vector load/store width to EEW. + + This function decodes `width` (which is simply a funct3) from vector + load/store instruction encodings and converts it to the corresponding data EEW. + + Parameters + ---------- + m : TModule + Module to connect the circuit to. + width : Value + Vector load/store width to decode. + + Returns + ------- + The EEW of data on which load/store is operating. 
+ """ + eew = Signal(EEW) + with m.Switch(width): + # constants taken from RISC-V V extension specification + with m.Case(0): + m.d.comb += eew.eq(EEW.w8) + with m.Case(5): + m.d.comb += eew.eq(EEW.w16) + with m.Case(6): + m.d.comb += eew.eq(EEW.w32) + with m.Case(): + m.d.comb += eew.eq(EEW.w64) + return eew diff --git a/coreblocks/fu/vector_unit/v_backend.py b/coreblocks/fu/vector_unit/v_backend.py index bd844a62a..c6df8e299 100644 --- a/coreblocks/fu/vector_unit/v_backend.py +++ b/coreblocks/fu/vector_unit/v_backend.py @@ -41,9 +41,24 @@ class VectorBackend(Elaboratable): The method to insert instructions from the vector frontend. initialise_regs : list[Method] List with one method for each register, to initialise it on allocation. + vrf_write : list[Method] + List with one method for each register bank, to write data into it. + vrf_read_req : list[Method] + List with one method for each register bank, to request data to be read from it. + vrf_read_resp : list[Method] + List with one method for each register bank, to read requested data. + v_update : Method + The method to call to indicate that a vector register is ready. + scoreboard_get_dirty : Method + The method to check if the register is already ready. + scoreboard_set_dirty : Method + The method for setting the dirty bit for the register to indicate that it's not ready + and that there are no results yet. """ - def __init__(self, gen_params: GenParams, announce: Method, report_end: Method): + def __init__( + self, gen_params: GenParams, announce: Method, report_end: Method, v_update_methods: list[Method] = [] + ): """ Parameters ---------- @@ -54,41 +69,59 @@ def __init__(self, gen_params: GenParams, announce: Method, report_end: Method): scalar core. report_end : Method Used to report the end of instruction execution to `VectorRetirement`. + v_update_methods : list[Method] + Methods to be called with vector register updates. 
""" self.gen_params = gen_params self.v_params = self.gen_params.v_params self.announce = announce self.report_end = report_end + self.v_update_methods = v_update_methods self.layouts = VectorBackendLayouts(self.gen_params) self.vvrs_layouts = VectorVRSLayout(self.gen_params, rs_entries_bits=self.v_params.vvrs_entries_bits) self.vreg_layout = VectorRegisterBankLayouts(self.gen_params) self.alu_layouts = VectorAluLayouts(self.gen_params) + self.vrf_layout = VRFFragmentLayouts(self.gen_params) + self.scoreboard_layout = ScoreboardLayouts(self.v_params.vrp_count) self.put_instr = Method(i=self.layouts.vvrs_in) self.initialise_regs = [Method(i=self.vreg_layout.initialise) for _ in range(self.v_params.vrp_count)] self.report_mult = Method(i=self.layouts.ender_report_mult) + self.vrf_write = [Method(i=self.vrf_layout.write) for _ in range(self.v_params.register_bank_count)] + self.vrf_read_req = [Method(i=self.vrf_layout.read_req) for _ in range(self.v_params.register_bank_count)] + self.vrf_read_resp = [Method(o=self.vrf_layout.read_resp_o) for _ in range(self.v_params.register_bank_count)] + self.scoreboard_get_dirty = Method( + i=self.scoreboard_layout.get_dirty_in, o=self.scoreboard_layout.get_dirty_out + ) + self.scoreboard_set_dirty = Method(i=self.scoreboard_layout.set_dirty_in) + self.v_update = Method(i=self.vvrs_layouts.update_in) def elaborate(self, platform) -> TModule: m = TModule() m.submodules.ready_scoreboard = ready_scoreboard = Scoreboard( - self.v_params.vrp_count, superscalarity=4, data_forward=False + self.v_params.vrp_count, superscalarity=5, data_forward=False ) m.submodules.vvrs = vvrs = VVRS(self.gen_params, self.v_params.vvrs_entries) m.submodules.insert_to_vvrs = insert_to_vvrs = VectorInsertToVVRS( self.gen_params, vvrs.select, vvrs.insert, - ready_scoreboard.get_dirty_list, + ready_scoreboard.get_dirty_list[:4], ready_scoreboard.set_dirty_list[0], ) + self.scoreboard_get_dirty.proxy(m, ready_scoreboard.get_dirty_list[4]) + 
self.scoreboard_set_dirty.proxy(m, ready_scoreboard.set_dirty_list[1]) self.put_instr.proxy(m, insert_to_vvrs.issue) - m.submodules.update_product = update_product = MethodProduct([vvrs.update, insert_to_vvrs.update]) + m.submodules.update_product = update_product = MethodProduct( + [vvrs.update, insert_to_vvrs.update] + self.v_update_methods + ) + self.v_update.proxy(m, update_product.method) m.submodules.ender = ender = VectorExecutionEnder( - self.gen_params, self.announce, update_product.method, ready_scoreboard.set_dirty_list[1], self.report_end + self.gen_params, self.announce, self.v_update, ready_scoreboard.set_dirty_list[2], self.report_end ) self.report_mult.proxy(m, ender.report_mult) executors = [ @@ -117,6 +150,10 @@ def elaborate(self, platform) -> TModule: init_banks_list = [executor.initialise_regs[i] for executor in executors] connect_init_banks_list.append(MethodProduct(init_banks_list)) self.initialise_regs[i].proxy(m, connect_init_banks_list[-1].method) + for i, executor in enumerate(executors): + self.vrf_write[i].proxy(m, executor.write_vrf) + self.vrf_read_req[i].proxy(m, executor.read_req) + self.vrf_read_resp[i].proxy(m, executor.read_resp) m.submodules.connect_init_banks = ModuleConnector(*connect_init_banks_list) return m diff --git a/coreblocks/fu/vector_unit/v_core.py b/coreblocks/fu/vector_unit/v_core.py index d96365289..bbc4d0eb6 100644 --- a/coreblocks/fu/vector_unit/v_core.py +++ b/coreblocks/fu/vector_unit/v_core.py @@ -55,6 +55,8 @@ def __init__(self, gen_params: GenParams): self.x_retirement_layouts = gen_params.get(RetirementLayouts) self.fu_layouts = gen_params.get(FuncUnitLayouts) self.v_frontend_layouts = VectorFrontendLayouts(self.gen_params) + self.vrf_layout = VRFFragmentLayouts(self.gen_params) + self.scoreboard_layout = ScoreboardLayouts(self.v_params.vrp_count) self.insert = Method(i=self.vxrs_layouts.insert_in) self.select = Method(o=self.vxrs_layouts.select_out) @@ -62,10 +64,31 @@ def __init__(self, gen_params: 
GenParams): self.precommit = Method(i=self.x_retirement_layouts.precommit) self.get_result = Method(o=self.fu_layouts.accept) + self.vrf_write = [ + Method(i=self.vrf_layout.write, name=f"vrf_write{i}") for i in range(self.v_params.register_bank_count) + ] + self.vrf_read_req = [ + Method(i=self.vrf_layout.read_req, name=f"vrf_read_req{i}") + for i in range(self.v_params.register_bank_count) + ] + self.vrf_read_resp = [ + Method(o=self.vrf_layout.read_resp_o, name=f"vrf_read_resp{i}") + for i in range(self.v_params.register_bank_count) + ] + self.scoreboard_get_dirty = Method( + i=self.scoreboard_layout.get_dirty_in, o=self.scoreboard_layout.get_dirty_out + ) + self.scoreboard_set_dirty = Method(i=self.scoreboard_layout.set_dirty_in) + + self.connections = self.gen_params.get(DependencyManager) + self.connections.add_dependency(VectorFrontendInsertKey(), self.insert) + self.connections.add_dependency(VectorVRFAccessKey(), (self.vrf_write, self.vrf_read_req, self.vrf_read_resp)) + self.connections.add_dependency(VectorScoreboardKey(), (self.scoreboard_get_dirty, self.scoreboard_set_dirty)) + def elaborate(self, platform) -> TModule: m = TModule() - rob_block_interrupts = self.gen_params.get(DependencyManager).get_dependency(ROBBlockInterruptsKey()) + rob_block_interrupts = self.connections.get_dependency(ROBBlockInterruptsKey()) v_freerf = SuperscalarFreeRF(self.v_params.vrp_count, 1, reset_state=2**self.v_params.vrl_count - 1) v_frat = FRAT(gen_params=self.gen_params, superscalarity=2, zero_init=False) @@ -74,32 +97,44 @@ def elaborate(self, platform) -> TModule: v_retirement = VectorRetirement( self.gen_params, self.v_params.vrp_count, v_rrat.commit, v_freerf.deallocates[0] ) - announcer = VectorAnnouncer(self.gen_params, 3) + announcer = VectorAnnouncer(self.gen_params, 4) + vlsu = self.connections.get_dependency(VectorLSUKey()) - backend = VectorBackend(self.gen_params, announcer.announce_list[0], v_retirement.report_end) + backend = 
VectorBackend(self.gen_params, announcer.announce_list[0], v_retirement.report_end, [vlsu.update_v]) fifo_to_vvrs = BasicFifo(self.v_frontend_layouts.instr_to_vvrs, 2) - fifo_to_mem = BasicFifo(self.v_frontend_layouts.instr_to_mem, 2) frontend = VectorFrontend( self.gen_params, rob_block_interrupts, announcer.announce_list[1], announcer.announce_list[2], - backend.report_mult, v_freerf.allocate, v_frat.get_rename_list[0], v_frat.get_rename_list[1], v_frat.set_rename_list[0], - fifo_to_mem.write, + vlsu.insert_v, fifo_to_vvrs.write, backend.initialise_regs, ) connect_data_to_vvrs = ConnectTrans(fifo_to_vvrs.read, backend.put_instr) + connect_mem_result = ConnectTrans(vlsu.get_result_v, announcer.announce_list[3]) + with Transaction(name="vlsu_get_result_v_trans").body(m): + data = vlsu.get_result_v(m) + announcer.announce_list[3](m, data) + v_retirement.report_end(m, rob_id=data.rob_id, rp_dst=data.rp_dst) + backend.v_update(m, tag=data.rp_dst, value=0) self.precommit.proxy(m, v_retirement.precommit) self.get_result.proxy(m, announcer.accept) self.insert.proxy(m, frontend.insert) self.select.proxy(m, frontend.select) self.update.proxy(m, frontend.update) + self.scoreboard_get_dirty.proxy(m, backend.scoreboard_get_dirty) + self.scoreboard_set_dirty.proxy(m, backend.scoreboard_set_dirty) + + for i in range(len(backend.vrf_write)): + self.vrf_write[i].proxy(m, backend.vrf_write[i]) + self.vrf_read_req[i].proxy(m, backend.vrf_read_req[i]) + self.vrf_read_resp[i].proxy(m, backend.vrf_read_resp[i]) m.submodules.v_freerf = v_freerf m.submodules.v_frat = v_frat @@ -108,9 +143,9 @@ def elaborate(self, platform) -> TModule: m.submodules.announcer = announcer m.submodules.backend = backend m.submodules.fifo_to_vvrs = fifo_to_vvrs - m.submodules.fifo_to_mem = fifo_to_mem m.submodules.frontend = frontend m.submodules.connect_data_to_vvrs = connect_data_to_vvrs + m.submodules.connect_mem_result = connect_mem_result return m diff --git 
a/coreblocks/fu/vector_unit/v_elems_downloader.py b/coreblocks/fu/vector_unit/v_elems_downloader.py index d709ec303..372e52816 100644 --- a/coreblocks/fu/vector_unit/v_elems_downloader.py +++ b/coreblocks/fu/vector_unit/v_elems_downloader.py @@ -41,17 +41,17 @@ def __init__( Core configuration. read_req_list : list[Method] List of methods used to send requests to the vector register file. - There should be at least 4 entries. + These methods should be already buffered. There should be at least 4 entries. read_resp_list : list[Method] - List of methods used to read the response to the requests + List of buffered methods used to read the response to the requests previously send to VRF. There should be at least 4 entries. send_to_fu : Method The method called to pass the downloaded data to the vector FU. """ self.gen_params = gen_params self.v_params = self.gen_params.v_params - self.read_req_list = [NotMethod(m) for m in read_req_list] - self.read_resp_list = [NotMethod(m) for m in read_resp_list] + self.read_req_list = read_req_list + self.read_resp_list = read_resp_list self.send_to_fu = send_to_fu self.layouts = VectorBackendLayouts(self.gen_params) @@ -82,22 +82,13 @@ def elaborate(self, platform): ) m.d.top_comb += Cat(uniqness_checker.input_valids).eq(needed_signals) - fifos_to_vrf = [FIFO(self.vrf_layout.read_req, 2) for _ in range(regs_number)] - m.submodules.fifos_to_vrf = ModuleConnector(*fifos_to_vrf) - - fifos_to_resp_in = [FIFO(self.vrf_layout.read_resp_i, 4) for _ in range(regs_number)] - m.submodules.fifos_to_resp_in = ModuleConnector(*fifos_to_resp_in) - barrier = Barrier(self.vrf_layout.read_resp_o, regs_number) m.submodules.barrier = barrier - m.submodules.connect_req = ModuleConnector( - *[ConnectTrans(fifos_to_vrf[i].read, self.read_req_list[i].method) for i in range(regs_number)] + # TODO Use barrier directly inside BufferedReqResp to reduce latency + m.submodules.connect_barrier = ModuleConnector( + *[ConnectTrans(self.read_resp_list[i], 
barrier.write_list[i]) for i in range(regs_number)] ) - for i in range(regs_number): - with Transaction().body(m): - arg = fifos_to_resp_in[i].read(m) - barrier.write_list[i](m, self.read_resp_list[i].method(m, arg)) with Transaction(name="downloader_request_trans").body(m, request=instr_valid & (req_counter != 0)): addr = Signal(range(self.v_params.elens_in_bank)) @@ -106,8 +97,7 @@ def elaborate(self, platform): with m.If(uniqness_checker.valids[i]): vrp_id = Signal(self.v_params.vrp_count_bits) m.d.comb += vrp_id.eq(instr[field_name]) - fifos_to_vrf[i].write(m, vrp_id=vrp_id, addr=addr) - fifos_to_resp_in[i].write(m, vrp_id=vrp_id) + self.read_req_list[i](m, vrp_id=vrp_id, addr=addr) m.d.sync += req_counter.eq(addr) data_to_fu = Record(self.layouts.downloader_data_out) @@ -128,6 +118,7 @@ def elaborate(self, platform): with m.If(resp_counter == 1): m.d.sync += instr_valid.eq(0) barrier.set_valids(m, valids=(2**regs_number - 1)) + with m.If(instr.elens_len == resp_counter): m.d.comb += data_to_fu.last_mask.eq(last_mask_saved) uniq = Signal(4, name="uniq") diff --git a/coreblocks/fu/vector_unit/v_executor.py b/coreblocks/fu/vector_unit/v_executor.py index 212336f9e..0f4f93f5d 100644 --- a/coreblocks/fu/vector_unit/v_executor.py +++ b/coreblocks/fu/vector_unit/v_executor.py @@ -41,6 +41,12 @@ class VectorExecutor(Elaboratable): initialise_regs : list[Method] A list of methods, one for each vector register, to initialise its content on vector register allocation. + write_vrf : Method + The method to write to the register bank associated with this executor. + read_req : Method + The method to request a read from the register bank associated with this executor. + read_resp : Method + The method used to read the response to the previous issued request. 
""" def __init__(self, gen_params: GenParams, fragment_index: int, end: Method): @@ -63,11 +69,15 @@ def __init__(self, gen_params: GenParams, fragment_index: int, end: Method): self.layouts = VectorBackendLayouts(self.gen_params) self.alu_layouts = VectorAluLayouts(self.gen_params) self.vreg_layout = VectorRegisterBankLayouts(self.gen_params) + self.vrf_layout = VRFFragmentLayouts(self.gen_params) self.issue = Method(i=self.layouts.executor_in) self.initialise_regs = [ Method(i=self.vreg_layout.initialise, name=f"initialise{i}") for i in range(self.v_params.vrp_count) ] + self.write_vrf = Method(i=self.vrf_layout.write) + self.read_req = Method(i=self.vrf_layout.read_req) + self.read_resp = Method(o=self.vrf_layout.read_resp_o) def elaborate(self, platform) -> TModule: m = TModule() @@ -91,11 +101,25 @@ def elaborate(self, platform) -> TModule: splitter = VectorExecutionDataSplitter( self.gen_params, fu_in_fifo.write, old_dst_fifo.write, mask_in_fifo.write ) - downloader = VectorElemsDownloader(self.gen_params, vrf.read_req, vrf.read_resp, splitter.issue) - uploader = VectorElemsUploader(self.gen_params, vrf.write, old_dst_fifo.read, mask_out_fifo.read, self.end) - issue_connect = Connect(self.layouts.executor_in) + serializers = [ + Serializer(port_count=2, serialized_req_method=vrf.read_req[i], serialized_resp_method=vrf.read_resp[i], depth = 6) + for i in range(vrf.read_ports_count) + ] + write_wrapper = def_one_caller_wrapper(vrf.write, self.write_vrf) + + downloader = VectorElemsDownloader( + self.gen_params, + [ser.serialize_in[0] for ser in serializers], + [ser.serialize_out[0] for ser in serializers], + splitter.issue, + ) + uploader = VectorElemsUploader(self.gen_params, self.write_vrf, old_dst_fifo.read, mask_out_fifo.read, self.end) + + issue_connect = Register(self.layouts.executor_in) self.issue.proxy(m, issue_connect.write) + self.read_req.proxy(m, serializers[2].serialize_in[1]) + self.read_resp.proxy(m, serializers[2].serialize_out[1]) 
connect_input_data = Transaction(name="connect_input_data") with connect_input_data.body(m, request=~end_pending_to_report): @@ -149,5 +173,7 @@ def elaborate(self, platform) -> TModule: m.submodules.connect_mask_extractor_in = connect_mask_extractor_in m.submodules.connect_alu_in = connect_alu_in m.submodules.connect_alu_out = connect_alu_out + m.submodules.serializers = ModuleConnector(*serializers) + m.submodules.write_wrapper = write_wrapper return m diff --git a/coreblocks/fu/vector_unit/v_frontend.py b/coreblocks/fu/vector_unit/v_frontend.py index 0a143f8f5..518b94338 100644 --- a/coreblocks/fu/vector_unit/v_frontend.py +++ b/coreblocks/fu/vector_unit/v_frontend.py @@ -55,7 +55,7 @@ def elaborate(self, platform) -> TModule: @def_method(m, self.issue) def _(arg): - with m.If(arg.exec_fn.op_type == OpType.V_MEMORY): + with m.If((arg.exec_fn.op_type == OpType.V_LOAD) | (arg.exec_fn.op_type == OpType.V_STORE)): rec_mem = Record(self.layouts.instr_to_mem) m.d.top_comb += assign(rec_mem, arg, fields=AssignType.COMMON) self.put_to_mem(m, rec_mem) @@ -98,7 +98,6 @@ def __init__( rob_block_interrupts: Method, announce: Method, announce2: Method, - announce_mult: Method, alloc_reg: Method, get_rename1_frat: Method, get_rename2_frat: Method, @@ -119,9 +118,6 @@ def __init__( illegal instructions. announce2 : Method The same as above. Second instance. - announce_mult : Method - Method to report to the vector announcement module the number of internal instructions - generated from an original vector instruction. alloc_reg : Method Allocate a new vector register. 
get_rename1_frat : Method @@ -143,7 +139,6 @@ def __init__( self.rob_block_interrupts = rob_block_interrupts self.announce = announce self.announce2 = announce2 - self.announce_mult = announce_mult self.alloc_reg = NotMethod(alloc_reg) self.get_rename1_frat = get_rename1_frat self.get_rename2_frat = get_rename2_frat @@ -187,9 +182,7 @@ def elaborate(self, platform) -> TModule: ) m.submodules.fifo_from_translator = fifo_from_translator = BasicFifo(self.layouts.translator_out, 2) - m.submodules.translator = translator = VectorTranslator( - self.gen_params, fifo_from_translator.write, self.announce_mult - ) + m.submodules.translator = translator = VectorTranslator(self.gen_params, fifo_from_translator.write) m.submodules.from_status_to_tranlator = ConnectTrans(fifo_from_v_status.read, translator.issue) diff --git a/coreblocks/fu/vector_unit/v_input_verification.py b/coreblocks/fu/vector_unit/v_input_verification.py index 98de28b1e..f8088bcc0 100644 --- a/coreblocks/fu/vector_unit/v_input_verification.py +++ b/coreblocks/fu/vector_unit/v_input_verification.py @@ -3,6 +3,7 @@ from coreblocks.transactions.lib import * from coreblocks.params import * from coreblocks.fu.vector_unit.v_layouts import * +from coreblocks.fu.vector_unit.utils import load_store_width_to_eew_circ from coreblocks.utils.fifo import BasicFifo __all__ = ["VectorInputVerificator"] @@ -91,7 +92,13 @@ def check_instr(self, m: TModule, instr) -> Value: with m.If(self.vstart != 0): m.d.comb += illegal_because_vstart.eq(1) - return illegal_because_vill | illegal_because_vstart + illegal_because_ls_width = Signal() + with m.If( + (load_store_width_to_eew_circ(m, instr.exec_fn.funct3) > bits_to_eew(self.v_params.elen)) + & ((instr.exec_fn.op_type == OpType.V_LOAD) | (instr.exec_fn.op_type == OpType.V_STORE)) + ): + m.d.comb += illegal_because_ls_width.eq(1) + return illegal_because_vill | illegal_because_vstart | illegal_because_ls_width def elaborate(self, platform): m = TModule() diff --git 
a/coreblocks/fu/vector_unit/v_layouts.py b/coreblocks/fu/vector_unit/v_layouts.py index 911fd09ab..083315bf3 100644 --- a/coreblocks/fu/vector_unit/v_layouts.py +++ b/coreblocks/fu/vector_unit/v_layouts.py @@ -7,6 +7,7 @@ class VectorRegisterBankLayouts: def __init__(self, gen_params: GenParams): v_params = gen_params.v_params + self.read_req = [("addr", range(v_params.elens_in_bank))] self.read_resp = [("data", v_params.elen)] self.write = [ @@ -24,7 +25,6 @@ def __init__(self, gen_params: GenParams): v_params = gen_params.v_params self.read_req = [("vrp_id", v_params.vrp_count_bits), ("addr", range(v_params.elens_in_bank))] - self.read_resp_i = [("vrp_id", v_params.vrp_count_bits)] self.read_resp_o = [("data", v_params.elen)] self.write = [ @@ -46,6 +46,7 @@ def __init__(self, gen_params: GenParams, *, rs_entries_bits: int): rs_interface.data_layout, fields={ "rp_s1", + "rp_s1_reg", "rp_s2", "rp_dst", "rob_id", @@ -104,6 +105,7 @@ def __init__(self, gen_params: GenParams): self.verification_in = self.verification_out = self.status_in = [ ("rp_s1", common.p_register_entry), + ("rp_s1_reg", gen_params.phys_regs_bits), ("rp_s2", common.p_register_entry), ("rp_dst", common.p_register_entry), ("rob_id", gen_params.rob_entries_bits), @@ -114,7 +116,7 @@ def __init__(self, gen_params: GenParams): ("imm2", gen_params.imm2_width), ] - self.status_out = self.translator_in = layout_difference(self.status_in, fields={"imm2"}) + [ + self.status_out = self.translator_in = layout_difference(self.status_in, fields={"imm2", "rp_s1_reg"}) + [ ("vtype", self.vtype) ] self.translator_inner = layout_difference(self.translator_in, fields={"imm"}) @@ -128,7 +130,6 @@ def __init__(self, gen_params: GenParams): self.translator_report_multiplier = [("mult", 4), ("rob_id", gen_params.rob_entries_bits)] self.instr_to_mem = [ - ("rp_s1", common.p_register_entry), ("rp_s2", common.p_register_entry), ("rp_dst", common.p_register_entry), ("rob_id", gen_params.rob_entries_bits), @@ -238,3 
+239,36 @@ def __init__(self, gen_params: GenParams): ("rob_id", gen_params.rob_entries_bits), ("rp_dst", common.p_register_entry), ] + + +class VectorLSULayouts: + def __init__(self, gen_params: GenParams): + common = gen_params.get(CommonLayouts) + common_vector = VectorCommonLayouts(gen_params) + retirement = gen_params.get(RetirementLayouts) + self.rs_entries_bits = 0 + + rs_interface = gen_params.get(RSInterfaceLayouts, rs_entries_bits=self.rs_entries_bits) + self.rs_data_layout = [ + ("rp_s2", common.p_register_entry), + ("rp_dst", common.p_register_entry), + ("rob_id", gen_params.rob_entries_bits), + ("exec_fn", common.exec_fn), + ("s1_val", gen_params.isa.xlen), + ("s2_val", gen_params.isa.xlen), + ("imm2", gen_params.imm2_width), + ("vtype", common_vector.vtype), + ("rp_s3", common.p_register_entry), + ("rp_v0", [("id", gen_params.phys_regs_bits)]), + ("rp_s2_rdy", 1), + ("rp_s3_rdy", 1), + ("rp_v0_rdy", 1), + ] + + self.rs_insert_in = [("rs_data", self.rs_data_layout), ("rs_entry_id", self.rs_entries_bits)] + + self.rs_select_out = rs_interface.select_out + + self.rs_update_in = rs_interface.update_in + + self.precommit = retirement.precommit diff --git a/coreblocks/fu/vector_unit/v_len_getter.py b/coreblocks/fu/vector_unit/v_len_getter.py index 511d3711c..74e7610ec 100644 --- a/coreblocks/fu/vector_unit/v_len_getter.py +++ b/coreblocks/fu/vector_unit/v_len_getter.py @@ -51,6 +51,7 @@ def elaborate(self, platform): elens_len = Signal(self.v_params.elens_in_bank_bits) last_mask = Signal(self.v_params.bytes_in_elen) + # TODO add support for tail undisturbed policy @def_method(m, self.issue) def _(arg): with m.Switch(arg.vtype.sew): diff --git a/coreblocks/fu/vector_unit/v_mask_extractor.py b/coreblocks/fu/vector_unit/v_mask_extractor.py index 8571d69e3..1a557fa78 100644 --- a/coreblocks/fu/vector_unit/v_mask_extractor.py +++ b/coreblocks/fu/vector_unit/v_mask_extractor.py @@ -7,38 +7,6 @@ __all__ = ["VectorMaskExtractor"] -def elem_mask_to_byte_mask(m: 
TModule, v_params: VectorParameters, elem_mask: Value, eew: Value): - """Generate a circuit to convert the mask from the elem format to the byte format. - - The elem format always has valid first `k` bits where `k = ELEN/EEW` and each - bit describes whether an element is valid or not. The byte format has always `ELEN//8` - bits and each bit represents wheter a byte is valid or not. - - Parameters - ---------- - m : TModule - Module to connect the circuit to. - v_params : VectorParameters - Vector unit configuration. - elem_mask : Value - Mask in elem format to be converted. - eew : Value(EEW) - The EEW for which the `elem_mask` was generated. - - Returns - ------- - Mask in byte format. - """ - result = Signal(v_params.bytes_in_elen) - with m.Switch(eew): - for eew_iter in EEW: - with m.Case(eew_iter): - m.d.av_comb += result.eq( - Cat([Repl(bit, 2 ** int(eew_iter)) for bit in elem_mask[: v_params.elen // eew_to_bits(eew_iter)]]) - ) - return result - - class VectorMaskExtractor(Elaboratable): """Module used to extract mask from the vector register entry diff --git a/coreblocks/fu/vector_unit/v_needed_regs.py b/coreblocks/fu/vector_unit/v_needed_regs.py index c1b8f4448..7af25f039 100644 --- a/coreblocks/fu/vector_unit/v_needed_regs.py +++ b/coreblocks/fu/vector_unit/v_needed_regs.py @@ -31,11 +31,11 @@ def elaborate(self, platform): @def_method(m, self.issue) def _(arg): m.d.top_comb += [ - rec_out.v0_needed.eq(arg.vtype.ma == 0), + rec_out.v0_needed.eq(arg.exec_fn.funct7[0] == 0), rec_out.s1_needed.eq(arg.rp_s1.type == RegisterType.V), rec_out.s2_needed.eq(1), # There is no support for three source instructions yet (interger multiply-add) - rec_out.s3_needed.eq(0), + rec_out.s3_needed.eq((arg.vtype.ma == 0) | (arg.vtype.ta == 0)), ] return rec_out diff --git a/coreblocks/fu/vector_unit/v_register.py b/coreblocks/fu/vector_unit/v_register.py index f16d22f3d..5de84d52d 100644 --- a/coreblocks/fu/vector_unit/v_register.py +++ 
b/coreblocks/fu/vector_unit/v_register.py @@ -1,6 +1,5 @@ from amaranth import * from coreblocks.transactions.core import * -from coreblocks.transactions.lib import MemoryBank from coreblocks.params import * from coreblocks.utils.fifo import BasicFifo from coreblocks.fu.vector_unit.v_layouts import VectorRegisterBankLayouts @@ -43,15 +42,11 @@ def __init__(self, *, gen_params: GenParams): self.layouts = VectorRegisterBankLayouts(self.gen_params) - self.bank = MemoryBank( - data_layout=self.layouts.read_resp, elem_count=self.v_params.elens_in_bank, granularity=8 - ) - # improvement: move to async memory self.byte_mask = Signal(self.v_params.bytes_in_vlen // self.v_params.register_bank_count) - self.read_req = Method.like(self.bank.read_req, name="read_req") - self.read_resp = Method.like(self.bank.read_resp, name="read_resp") + self.read_req = Method(i=self.layouts.read_req, name="read_req") + self.read_resp = Method(o=self.layouts.read_resp, name="read_resp") self.write = Method(i=self.layouts.write) self.write_scalar = Method() self.write_mask = Method() @@ -61,27 +56,62 @@ def __init__(self, *, gen_params: GenParams): def elaborate(self, platform) -> TModule: m = TModule() + resp_ready = Signal() + + data_mem = Memory(width=self.v_params.elen, depth=self.v_params.elens_in_bank) + # we have either bunch of writes or reads. 
Reads and writes cannot be sent interchangeably + # so we can have transparent=False + m.submodules.read_port = read_port = data_mem.read_port(transparent=False) + m.submodules.write_port = write_port = data_mem.write_port(granularity=8) + mask_forward = BasicFifo([("data", self.v_params.bytes_in_elen)], 2) m.submodules.mask_forward = mask_forward - m.submodules.bank = self.bank - @def_method(m, self.read_req) - def _(arg): - self.bank.read_req(m, arg) - mask_forward.write(m, data=self.byte_mask.word_select(arg.addr, self.v_params.bytes_in_elen)) - - @def_method(m, self.read_resp) - def _(): - out = self.bank.read_resp(m) + # @def_method(m, self.read_resp, resp_ready) + # def _(): + # mask = mask_forward.read(m) + # out_masked = Signal(self.v_params.elen) + # expanded_mask = ~expand_mask(self.v_params, mask.data) + # m.d.top_comb += out_masked.eq(read_port.data | expanded_mask) + # # Use enable signal to don't store last address in local register + # m.d.sync += resp_ready.eq(0) + # return {"data": out_masked} + # + # # Schedule before allow us to don't have a support memory for the previously read + # # data, so we optimise resource usage at the cost of critical path + # self.read_resp.schedule_before(self.read_req) + # @def_method(m, self.read_req, ~resp_ready | self.read_resp.run) + # def _(addr): + # m.d.top_comb += read_port.addr.eq(addr) + # m.d.comb += read_port.en.eq(1) + # m.d.sync += resp_ready.eq(1) + # mask_forward.write(m, data=self.byte_mask.word_select(addr, self.v_params.bytes_in_elen)) + + m.submodules.data_out_fifo = data_out_fifo = BasicFifo(self.layouts.read_resp, 2) + self.read_resp.proxy(m, data_out_fifo.read) + m.d.comb += read_port.en.eq(0) + + with Transaction().body(m): mask = mask_forward.read(m) - out_masked = Signal.like(out) + out_masked = Signal(self.v_params.elen) expanded_mask = ~expand_mask(self.v_params, mask.data) - m.d.top_comb += out_masked.eq(out | expanded_mask) - return {"data": out_masked} + m.d.top_comb += 
out_masked.eq(read_port.data | expanded_mask) + # Use enable signal to avoid storing last address in local register + m.d.sync += resp_ready.eq(0) + data_out_fifo.write(m, data=out_masked) + + @def_method(m, self.read_req, ~resp_ready | data_out_fifo.write.ready) + def _(addr): + m.d.top_comb += read_port.addr.eq(addr) + m.d.comb += read_port.en.eq(1) + m.d.sync += resp_ready.eq(1) + mask_forward.write(m, data=self.byte_mask.word_select(addr, self.v_params.bytes_in_elen)) @def_method(m, self.write) def _(addr, data, valid_mask): - self.bank.write(m, addr=addr, data=data, mask=valid_mask) + m.d.top_comb += write_port.addr.eq(addr) + m.d.top_comb += write_port.data.eq(data) + m.d.comb += write_port.en.eq(valid_mask) mask_part = self.byte_mask.word_select(addr, self.v_params.bytes_in_elen) m.d.sync += mask_part.eq(mask_part | valid_mask) diff --git a/coreblocks/fu/vector_unit/v_status.py b/coreblocks/fu/vector_unit/v_status.py index 058d498b2..132ba6bee 100644 --- a/coreblocks/fu/vector_unit/v_status.py +++ b/coreblocks/fu/vector_unit/v_status.py @@ -153,10 +153,10 @@ def process_vsetvl(self, m, instr): with m.Else(): m.d.sync += self.vtype.eq(new_vtype) - with m.If(valid_rs1 & instr.rp_s1.id.bool()): + with m.If(valid_rs1 & instr.rp_s1_reg.bool()): m.d.comb += avl.eq(instr.s1_val) - with m.If(instr.rp_dst.id.bool() | instr.rp_s1.id.bool()): + with m.If(instr.rp_dst.id.bool() | instr.rp_s1_reg.bool() | (instr.imm2[-2:] == 3)): m.d.sync += self.vl.eq(avl) self.retire(m, rob_id=instr.rob_id, exception=0, result=avl, rp_dst=instr.rp_dst) diff --git a/coreblocks/fu/vector_unit/v_translator.py b/coreblocks/fu/vector_unit/v_translator.py index 6ddc48af4..c18c96956 100644 --- a/coreblocks/fu/vector_unit/v_translator.py +++ b/coreblocks/fu/vector_unit/v_translator.py @@ -164,7 +164,10 @@ def elaborate(self, platform) -> TModule: def _(arg): rec = Record(self.layouts.translator_inner) m.d.comb += assign(rec, arg, fields=AssignType.COMMON) - with m.If((arg.exec_fn.funct3 == 
Funct3.OPIVI) & (arg.exec_fn.op_type != OpType.V_MEMORY)): + with m.If( + (arg.exec_fn.funct3 == Funct3.OPIVI) + & ((arg.exec_fn.op_type != OpType.V_LOAD) & (arg.exec_fn.op_type != OpType.V_STORE)) + ): m.d.comb += rec.s1_val.eq(arg.imm) m.d.comb += rec.rp_s1.type.eq(RegisterType.X) return rec @@ -188,7 +191,7 @@ class VectorTranslator(Elaboratable): Send an instruction to transform. """ - def __init__(self, gen_params: GenParams, put_instr: Method, retire_mult: Method): + def __init__(self, gen_params: GenParams, put_instr: Method): """ Parameters ---------- @@ -196,13 +199,9 @@ def __init__(self, gen_params: GenParams, put_instr: Method, retire_mult: Method Core configuration put_instr : Method The method used to pass the instruction to the next processing stage. - retire_mult : Method - The method used to report the number of internal instructions generated - from a programme instruction. """ self.gen_params = gen_params self.put_instr = put_instr - self.retire_mult = retire_mult self.layouts = VectorFrontendLayouts(self.gen_params) self.issue = Method(i=self.layouts.translator_in) @@ -211,13 +210,11 @@ def elaborate(self, platform) -> TModule: m = TModule() m.submodules.transl_rp3 = transl_rp3 = VectorTranslateRS3(self.gen_params, self.put_instr) - m.submodules.transl_lmul = transl_lmul = VectorTranslateLMUL(self.gen_params, transl_rp3.issue) m.submodules.transl_rewrite_imm = transl_rewrite_imm = VectorTranslateRewirteImm(self.gen_params) @def_method(m, self.issue) def _(arg): rewrited_imm = transl_rewrite_imm.issue(m, arg) - mult = transl_lmul.issue(m, rewrited_imm) - self.retire_mult(m, mult) + transl_rp3.issue(m, rewrited_imm) return m diff --git a/coreblocks/fu/vector_unit/vector_alu.py b/coreblocks/fu/vector_unit/vector_alu.py index 2e8735991..fb2824605 100644 --- a/coreblocks/fu/vector_unit/vector_alu.py +++ b/coreblocks/fu/vector_unit/vector_alu.py @@ -102,12 +102,12 @@ def create_flexible_elementwise_function(self, m: TModule, name: str, op: Callab def 
elaborate(self, platform): m = TModule() - out_width = bits_to_eew(self.v_params.elen) - m.submodules.decoder = decoder = self.fn.get_decoder(self.gen_params) - m.submodules.adder = adder = FlexibleAdder(out_width) m.d.top_comb += assign(decoder.exec_fn, self.exec_fn, fields=AssignType.ALL) + subtract = Signal() + adder = FlexibleElementwiseFunction(bits_to_eew(self.v_params.elen), (lambda x, y: x + Mux(subtract, -y, y))) + m.submodules.adder = adder m.d.top_comb += adder.eew.eq(self.eew) m.d.top_comb += adder.in1.eq(self.in1) m.d.top_comb += adder.in2.eq(self.in2) @@ -115,7 +115,7 @@ def elaborate(self, platform): with OneHotCase(VectorAluFn.Fn.ADD): m.d.comb += self.out.eq(adder.out_data) with OneHotCase(VectorAluFn.Fn.SUB): - m.d.comb += adder.subtract.eq(1) + m.d.comb += subtract.eq(1) m.d.comb += self.out.eq(adder.out_data) with OneHotCase(VectorAluFn.Fn.SRA): # Arithmetic right shift with m.Switch(self.eew): diff --git a/coreblocks/fu/vector_unit/vrf.py b/coreblocks/fu/vector_unit/vrf.py index 6ed499cf7..cf894bd0f 100644 --- a/coreblocks/fu/vector_unit/vrf.py +++ b/coreblocks/fu/vector_unit/vrf.py @@ -1,10 +1,12 @@ from amaranth import * +from amaranth.utils import * from coreblocks.transactions import * from coreblocks.transactions.lib import * from coreblocks.params import * from coreblocks.utils import * from coreblocks.fu.vector_unit.v_layouts import VRFFragmentLayouts from coreblocks.fu.vector_unit.v_register import VectorRegisterBank +from coreblocks.utils.fifo import * __all__ = ["VRFFragment"] @@ -43,9 +45,7 @@ def __init__(self, *, gen_params: GenParams): self.read_ports_count = 4 self.read_req = [Method(i=self.layout.read_req) for _ in range(self.read_ports_count)] - self.read_resp = [ - Method(i=self.layout.read_resp_i, o=self.layout.read_resp_o) for _ in range(self.read_ports_count) - ] + self.read_resp = [Method(o=self.layout.read_resp_o) for _ in range(self.read_ports_count)] self.write = Method(i=self.layout.write) self.regs = 
[VectorRegisterBank(gen_params=self.gen_params) for _ in range(self.v_params.vrp_count)] @@ -60,21 +60,47 @@ def elaborate(self, platform): m.submodules.regs = ModuleConnector(*self.regs) m.submodules.clear_product = self.clear_module + fifo_write = BasicFifo(self.layout.write, 2) + fifos_req_port = [BasicFifo(self.layout.read_req, 2) for i in range(self.read_ports_count)] + fifos_resp = [BasicFifo(self.regs[0].read_resp.data_out.layout, 2) for i in range(self.read_ports_count)] + fifos_resp_id = [ + BasicFifo([("port_id", log2_int(self.read_ports_count, False))], 3) for j in range(self.v_params.vrp_count) + ] + + m.submodules.fifo_write = fifo_write + m.submodules.fifos_resp_id = ModuleConnector(ModuleConnector(*fifos_resp_id)) + m.submodules.fifos_req_port = ModuleConnector(*fifos_req_port) + m.submodules.fifos_resp = ModuleConnector(*fifos_resp) + + for i in range(self.read_ports_count): + for j in range(self.v_params.vrp_count): + with Transaction().body(m, request=(fifos_req_port[i].head.vrp_id == j)): + arg = fifos_req_port[i].read(m) + self.regs[j].read_req(m, addr=arg.addr) + fifos_resp_id[j].write(m, port_id=i) + + for i in range(self.read_ports_count): + for j in range(self.v_params.vrp_count): + with Transaction().body(m, request=(fifos_resp_id[j].head == i)): + id = fifos_resp_id[j].read(m) # noqa: F841 + data = self.regs[j].read_resp(m) + fifos_resp[i].write(m, data) + + for j in range(self.v_params.vrp_count): + with Transaction().body(m, request=(fifo_write.head.vrp_id == j)): + arg = fifo_write.read(m) + self.regs[j].write(m, data=arg.data, addr=arg.addr, valid_mask=arg.valid_mask) + @def_method(m, self.write) - def _(vrp_id, addr, data, valid_mask): - for j in condition_switch(m, vrp_id, self.v_params.vrp_count, nonblocking=False): - self.regs[j].write(m, data=data, addr=addr, valid_mask=valid_mask) + def _(arg): + fifo_write.write(m, arg) @loop_def_method(m, self.read_req) - def _(_, vrp_id, addr): - for j in condition_switch(m, vrp_id, 
self.v_params.vrp_count, nonblocking=False): - self.regs[j].read_req(m, addr=addr) + def _(i, arg): + fifos_req_port[i].write(m, arg) @loop_def_method(m, self.read_resp) - def _(_, vrp_id): - out = Record(self.layout.read_resp_o) - for j in condition_switch(m, vrp_id, self.v_params.vrp_count, nonblocking=False): - m.d.comb += assign(out, self.regs[j].read_resp(m), fields=AssignType.ALL) - return out + def _(i): + return fifos_resp[i].read(m) return m diff --git a/coreblocks/lsu/__init__.py b/coreblocks/lsu/__init__.py new file mode 100644 index 000000000..817ac229b --- /dev/null +++ b/coreblocks/lsu/__init__.py @@ -0,0 +1,2 @@ +from coreblocks.lsu.dummyLsu import * # noqa: F401 +from coreblocks.lsu.vector_lsu import * # noqa: F401 diff --git a/coreblocks/lsu/dummyLsu.py b/coreblocks/lsu/dummyLsu.py index 36077ce7b..c7eff9573 100644 --- a/coreblocks/lsu/dummyLsu.py +++ b/coreblocks/lsu/dummyLsu.py @@ -210,21 +210,35 @@ def __init__(self, gen_params: GenParams, bus: WishboneMaster) -> None: self.get_result = Method(o=self.fu_layouts.accept) self.precommit = Method(i=self.lsu_layouts.precommit) + self._set_reserved = Method(i=[("reserved", 1)]) + self._get_reserved = Method(o=[("reserved", 1)]) + self.bus = bus + # TODO make vector and scalar LSU synchronisation cleaner + connections = self.gen_params.get(DependencyManager) + connections.add_dependency(LSUReservedKey(), (self._get_reserved, self._set_reserved)) def elaborate(self, platform): m = TModule() - reserved = Signal() # means that current_instr is reserved + reserved = Signal(name="reserved") # means that current_instr is reserved current_instr = Record(self.lsu_layouts.rs_data_layout + [("valid", 1)]) m.submodules.internal = internal = LSUDummyInternals(self.gen_params, self.bus, current_instr) result_ready = internal.result_ready | ((current_instr.exec_fn.op_type == OpType.FENCE) & current_instr.valid) + @def_method(m, self._set_reserved) + def _(arg): + m.d.sync += reserved.eq(arg.reserved) + + 
@def_method(m, self._get_reserved) + def _(): + return {"reserved": reserved} + @def_method(m, self.select, ~reserved) def _(): # We always return 0, because we have only one place in instruction storage. - m.d.sync += reserved.eq(1) + self._set_reserved(m, reserved=1) return {"rs_entry_id": 0} @def_method(m, self.insert) @@ -246,7 +260,7 @@ def _(): m.d.comb += internal.get_result_ack.eq(1) m.d.sync += current_instr.eq(0) - m.d.sync += reserved.eq(0) + self._set_reserved(m, reserved=0) return { "rob_id": current_instr.rob_id, diff --git a/coreblocks/lsu/vector_lsu.py b/coreblocks/lsu/vector_lsu.py new file mode 100644 index 000000000..0800ae56b --- /dev/null +++ b/coreblocks/lsu/vector_lsu.py @@ -0,0 +1,418 @@ +from amaranth import * +from amaranth.lib.coding import Decoder + +from coreblocks.transactions import Method, def_method, Transaction, TModule +from coreblocks.transactions.lib import * +from coreblocks.params import * +from coreblocks.peripherals.wishbone import WishboneMaster +from coreblocks.utils import assign, ModuleLike, AssignType +from coreblocks.utils.protocols import FuncBlock +from coreblocks.fu.vector_unit.v_layouts import * +from coreblocks.fu.vector_unit.utils import * + +__all__ = ["VectorLSU"] + + +class VectorLSUDummyInternals(Elaboratable): + """ + Internal implementation of `VectorLSU`. It handles loads and stores separately and + issues proper requests to memory/vector register file for as long as needed to process + all `vl` elements. + + Attributes + ---------- + get_result_ack : Signal, in + Instructs to clean the internal state after processing an instruction. + execute : Signal, in + Signals that side effects can be executed. + result_ready : Signal, out + Signals that `VectorLSUDummyInternals` ended instruction processing. + op_exception : Signal, out + Informs if there was an exception. 
+ """ + + def __init__( + self, + gen_params: GenParams, + bus: WishboneMaster, + current_instr: Record, + write_vrf: list[Method], + read_req_vrf: list[Method], + read_resp_vrf: list[Method], + ) -> None: + """ + Parameters + ---------- + gen_params : GenParams + Parameters to be used during processor generation. + bus : WishboneMaster + An instance of the Wishbone master for interfacing with the data memory. + current_instr : Record, in + Reference to signal containing instruction currently processed by LSU. + write_vrf : list[Method] + List with one method for each register bank to write into it. + read_req_vrf : list[Method] + List with one method for each register bank to send a read request to it. + read_resp_vrf : list[Method] + List with one method for each register bank to receive the response for the + previous send read request. + """ + self.gen_params = gen_params + self.v_params = self.gen_params.v_params + self.current_instr = current_instr + self.bus = bus + self.write_vrf = write_vrf + self.read_req_vrf = read_req_vrf + self.read_resp_vrf = read_resp_vrf + + self.dependency_manager = self.gen_params.get(DependencyManager) + self.report = self.dependency_manager.get_dependency(ExceptionReportKey()) + + self.get_result_ack = Signal() + self.result_ready = Signal() + self.execute = Signal() + self.op_exception = Signal() + + self.elems_counter = Signal(bits_for(self.gen_params.v_params.bytes_in_vlen)) + self.elen_counter = Signal(bits_for(self.gen_params.v_params.elens_in_vlen)) + self.bank_id = Signal(bits_for(self.v_params.register_bank_count)) + self.local_addr = Signal(log2_int(self.gen_params.v_params.elens_in_bank, False)) + + def calculate_addr(self, m: ModuleLike): + addr = Signal(self.gen_params.isa.xlen) + m.d.comb += addr.eq(self.current_instr.s1_val + (self.elen_counter << 2)) + return addr + + def prepare_bytes_mask(self, m: TModule, addr: Signal) -> Signal: + mask = Signal(self.v_params.bytes_in_elen) + elem_mask = 
Signal(self.v_params.bytes_in_elen) + m.submodules.binary_to_onehot = binary_to_onehot = Decoder(self.v_params.bytes_in_elen + 1) + diff = Signal.like(self.current_instr.vtype.vl) + m.d.top_comb += diff.eq(self.current_instr.vtype.vl - self.elems_counter) + m.d.top_comb += binary_to_onehot.i.eq(diff) + last = Signal() + with m.Switch(self.current_instr.vtype.sew): + for sew_iter in SEW: + if eew_to_bits(sew_iter) <= self.v_params.elen: + with m.Case(sew_iter): + m.d.av_comb += last.eq(diff < self.v_params.elen // eew_to_bits(sew_iter)) + m.d.av_comb += elem_mask.eq( + Mux( + last, + binary_to_onehot.o - 1, + 2 ** (self.v_params.elen // eew_to_bits(EEW.w8)) - 1, + ) + ) + m.d.top_comb += mask.eq(elem_mask_to_byte_mask(m, self.v_params, elem_mask, self.current_instr.vtype.sew)) + return mask + + def check_align(self, m: TModule, addr: Signal): + aligned = Signal() + # TODO Allow alignment to elements instead of alignment to ELEN + match self.v_params.elen: + case 64: + m.d.comb += aligned.eq(addr[0:3] == 0) + case 32: + m.d.comb += aligned.eq(addr[0:2] == 0) + case 16: + m.d.comb += aligned.eq(addr[0] == 0) + case 8: + m.d.comb += aligned.eq(1) + return aligned + + def counters_increase(self, m: TModule): + with m.Switch(self.current_instr.vtype.sew): + for sew_iter in SEW: + with m.Case(sew_iter): + m.d.sync += self.elems_counter.eq(self.elems_counter + self.v_params.elen // eew_to_bits(sew_iter)) + m.d.sync += self.elen_counter.eq(self.elen_counter + 1) + with m.If(self.local_addr + 1 == self.v_params.elens_in_bank): + m.d.sync += self.local_addr.eq(0) + m.d.sync += self.bank_id.eq(self.bank_id + 1) + with m.Else(): + m.d.sync += self.local_addr.eq(self.local_addr + 1) + + def handle_load(self, m: TModule, request: Value, bytes_mask, addr, restart: Value): + cast_dst_vrp_id = Signal(range(self.v_params.vrp_count)) + m.d.top_comb += cast_dst_vrp_id.eq(self.current_instr.rp_dst.id) + with m.FSM(): + with m.State("ReqFromMem"): + with m.If(restart): + m.d.sync += 
self.local_addr.eq(0) + m.d.sync += self.bank_id.eq(0) + m.d.sync += self.elems_counter.eq(0) + m.d.sync += self.elen_counter.eq(0) + with m.If((self.elems_counter >= self.current_instr.vtype.vl) & request & ~self.result_ready): + m.d.sync += self.op_exception.eq(0) + m.d.sync += self.result_ready.eq(1) + with m.Else(): + with Transaction(name="load_req_from_mem_trans").body(m, request=request & ~self.result_ready): + self.bus.request(m, addr=addr >> 2, we=0, sel=0, data=0) + m.next = "RespFromMem" + with m.State("RespFromMem"): + with Transaction(name="load_resp_from_mem_trans").body(m, request=request): + fetched = self.bus.result(m) + with m.If(fetched.err): + cause = ExceptionCause.LOAD_ACCESS_FAULT + self.report(m, rob_id=self.current_instr.rob_id, cause=cause) + m.d.sync += self.op_exception.eq(fetched.err) + m.d.sync += self.result_ready.eq(1) + with m.Else(): + with m.Switch(self.bank_id): + for i in range(self.v_params.register_bank_count): + with m.Case(i): + self.write_vrf[i]( + m, + addr=self.local_addr, + vrp_id=cast_dst_vrp_id, + valid_mask=bytes_mask, + data=fetched.data, + ) + self.counters_increase(m) + m.next = "ReqFromMem" + + def handle_store(self, m: TModule, request: Value, bytes_mask, addr, restart: Value): + cast_s3_vrp_id = Signal(range(self.v_params.vrp_count)) + m.d.top_comb += cast_s3_vrp_id.eq(self.current_instr.rp_s3.id) + with m.FSM(): + with m.State("ReqFromReg"): + with m.If(restart): + m.d.sync += self.local_addr.eq(0) + m.d.sync += self.bank_id.eq(0) + m.d.sync += self.elems_counter.eq(0) + m.d.sync += self.elen_counter.eq(0) + with m.If((self.elems_counter >= self.current_instr.vtype.vl) & request & ~self.result_ready): + m.d.sync += self.op_exception.eq(0) + m.d.sync += self.result_ready.eq(1) + with m.Else(): + with Transaction(name="store_req_reg_trans").body(m, request=request & ~self.result_ready): + for i in condition_switch(m, self.bank_id, self.v_params.register_bank_count): + self.read_req_vrf[i](m, addr=self.local_addr, 
vrp_id=cast_s3_vrp_id) + m.next = "RespFromReg" + with m.State("RespFromReg"): + with Transaction(name="store_resp_reg_trans").body(m, request=request): + resp = Record(self.read_resp_vrf[0].data_out.layout) + for i in condition_switch(m, self.bank_id, self.v_params.register_bank_count): + m.d.comb += assign(resp, self.read_resp_vrf[i](m), fields=AssignType.ALL) + self.bus.request(m, addr=addr >> 2, we=1, sel=bytes_mask, data=resp.data) + m.next = "RespFromMem" + with m.State("RespFromMem"): + with Transaction(name="store_resp_from_mem_trans").body(m, request=request): + fetched = self.bus.result(m) + with m.If(fetched.err): + cause = ExceptionCause.STORE_ACCESS_FAULT + self.report(m, rob_id=self.current_instr.rob_id, cause=cause) + m.d.sync += self.op_exception.eq(fetched.err) + m.d.sync += self.result_ready.eq(1) + with m.Else(): + self.counters_increase(m) + m.next = "ReqFromReg" + + def elaborate(self, platform): + m = TModule() + + instr_ready = ( + (self.current_instr.rp_s2_rdy == 1) + & (self.current_instr.rp_s3_rdy == 1) + & (self.current_instr.rp_v0_rdy == 1) + & self.current_instr.valid + & ~self.result_ready + ) + + is_load = self.current_instr.exec_fn.op_type == OpType.V_LOAD + + addr = self.calculate_addr(m) + aligned = self.check_align(m, addr) + bytes_mask = self.prepare_bytes_mask(m, addr) + + self.handle_load(m, instr_ready & is_load & aligned, bytes_mask, addr, self.get_result_ack) + self.handle_store(m, instr_ready & self.execute & aligned & ~is_load, bytes_mask, addr, self.get_result_ack) + with Transaction(name="miss_align_trans").body(m, request=instr_ready & (is_load | self.execute) & ~aligned): + m.d.sync += self.op_exception.eq(1) + m.d.sync += self.result_ready.eq(1) + + cause = Mux(is_load, ExceptionCause.LOAD_ADDRESS_MISALIGNED, ExceptionCause.STORE_ADDRESS_MISALIGNED) + self.report(m, rob_id=self.current_instr.rob_id, cause=cause) + + with m.If(self.get_result_ack): + m.d.sync += self.result_ready.eq(0) + m.d.sync += 
self.op_exception.eq(0) + + return m + + +class VectorLSU(FuncBlock, Elaboratable): + """A Load Store Unit for handling operations on vector registers. + + This module implements support for unit-stride vector memory operations. + At the moment it only supports ELEN=32 due to length dependencies between + `WishboneMaster`, `xlen` and `elen`. Additionally, each address must be + aligned to `elen` and alignment to `eew` may not be sufficient. + + The `VectorLSU` is implemented as a `FunctionalBlock` so that it does not block the selection + of the `VectorBlockComponent` if the `VectorLSU` is currently processing an instruction. + To illustrate the problem, consider the following program: + + .. highlight:: asm + .. code-block:: asm + + vle32.v v3, (x0) + vadd.vv v2, v1, v0 + + In this programme `vadd` is independent of `vle32`, so it can be executed in parallel, + but the `VectorLSU` selection has to be blocked after the selection of `vle32`, so that no other + memory instructions are inserted (the assumption of the scheduler is that selection + can be blocked, but insertion cannot). If `VectorLSU` were a part of `VectorBlockComponent`, then + selection of `vle32` would block selection of `vadd`, because in selection there is no + argument that can be used to do filtering, so `vadd` would be serialised. In the future + this should be refactored. + + Selection of this block will block selection of `DummyLSU` to serialise memory accesses. + + All `FunctionalBlock` methods in this block have vector versions, that should be used + by the `VectorCore`. Scalar versions are mostly stubs for interface compatibility. + + Attributes + ---------- + insert : Method + Proxy for the `VectorCoreBlock` insert. It uses the fact that in `VectorCoreBlock` + there is a `FifoRS` which ignores `register_entry_id`. + select : Method + Select this block to execute the instruction. This also blocks execution of `LSUDummy`. + Not ready if `LSUDummy` is currently running. 
+ update : Method + Stub + get_result : Method + Stub + insert_v : Method + The method for inserting memory instructions pre-processed by the VectorFrontend. + update_v : Method + Get the updates of the vector registers. + get_result_v : Method + Get the result of the memory instruction. + precommit : Method + Listen to precommit announcements. + """ + + def __init__(self, gen_params: GenParams) -> None: + self.gen_params = gen_params + self.v_params = self.gen_params.v_params + self.fu_layouts = gen_params.get(FuncUnitLayouts) + self.v_lsu_layouts = gen_params.get(VectorLSULayouts) + self.connections = self.gen_params.get(DependencyManager) + + self.vxrs_layouts = VectorXRSLayout( + self.gen_params, rs_entries_bits=log2_int(self.v_params.vxrs_entries, False) + ) + self.frontend_layouts = VectorFrontendLayouts(self.gen_params) + + self.insert = Method(i=self.vxrs_layouts.insert_in) + self.update = Method(i=self.vxrs_layouts.update_in) + self.select = Method(o=self.v_lsu_layouts.rs_select_out) + self.insert_v = Method(i=self.frontend_layouts.instr_to_mem) + self.update_v = Method(i=self.v_lsu_layouts.rs_update_in) + self.get_result = Method(o=self.fu_layouts.accept) + self.get_result_v = Method(o=self.fu_layouts.accept) + self.precommit = Method(i=self.v_lsu_layouts.precommit) + + if self.gen_params.isa.xlen != self.gen_params.v_params.elen or self.gen_params.v_params.elen != 32: + raise ValueError("Vector LSU don't support XLEN != ELEN != 32 yet.") + + def elaborate(self, platform): + m = TModule() + reserved = Signal() + current_instr = Record(self.v_lsu_layouts.rs_data_layout + [("valid", 1)]) + _get_reserved, _set_reserved = self.connections.get_dependency(LSUReservedKey()) + self.bus = self.connections.get_dependency(WishboneDataKey()) + scoreboard_get_dirty, scoreboard_set_dirty = self.connections.get_dependency(VectorScoreboardKey()) + self.write_vrf, self.read_req_vrf, self.read_resp_vrf = self.connections.get_dependency(VectorVRFAccessKey()) + + 
m.submodules.internal = internal = VectorLSUDummyInternals( + self.gen_params, self.bus, current_instr, self.write_vrf, self.read_req_vrf, self.read_resp_vrf + ) + result_ready = internal.result_ready + + with Transaction().body(m): + m.d.comb += reserved.eq(_get_reserved(m).reserved) + + @def_method(m, self.select, ~reserved) + def _(): + # We always return 0, because we have only one place in instruction storage. + _set_reserved(m, reserved=1) + return {"rs_entry_id": 0} + + @def_method(m, self.insert_v) + def _(arg): + m.d.sync += assign(current_instr, arg) + m.d.sync += current_instr.valid.eq(1) + # no support for instructions which use v0 or vs2, so don't wait for them + m.d.sync += current_instr.rp_v0_rdy.eq(1) + m.d.sync += current_instr.rp_s2_rdy.eq(1) + cast_rp_s3 = Signal(self.v_params.vrp_count_bits) + m.d.top_comb += cast_rp_s3.eq(arg.rp_s3.id) + m.d.sync += current_instr.rp_s3_rdy.eq(~scoreboard_get_dirty(m, id=cast_rp_s3)) + cast_rp_dst = Signal(self.v_params.vrp_count_bits) + m.d.top_comb += cast_rp_dst.eq(arg.rp_dst.id) + scoreboard_set_dirty(m, id=cast_rp_dst, dirty=1) + + @def_method(m, self.update_v) + def _(tag: Value, value: Value): + with m.If(current_instr.rp_s2 == tag): + m.d.sync += current_instr.rp_s2_rdy.eq(1) + with m.If(current_instr.rp_s3 == tag): + m.d.sync += current_instr.rp_s3_rdy.eq(1) + with m.If(current_instr.rp_v0 == tag): + m.d.sync += current_instr.rp_v0_rdy.eq(1) + + frontend_insert = self.connections.get_dependency(VectorFrontendInsertKey()) + self.insert.proxy(m, frontend_insert) + + # Scalar updating will be handled by VectorFrontend + @def_method(m, self.update) + def _(arg): + pass + + # Result announcements will be handled by VectorAnnouncer + @def_method(m, self.get_result, 0) + def _(arg): + pass + + @def_method(m, self.get_result_v, result_ready) + def _(): + m.d.comb += internal.get_result_ack.eq(1) + + m.d.sync += current_instr.eq(0) + _set_reserved(m, reserved=0) + + cast_rp_dst = 
Signal(self.v_params.vrp_count_bits) + m.d.top_comb += cast_rp_dst.eq(current_instr.rp_dst.id) + scoreboard_set_dirty(m, id=cast_rp_dst, dirty=0) + + return { + "rob_id": current_instr.rob_id, + "rp_dst": current_instr.rp_dst, + "result": 0, + "exception": internal.op_exception, + } + + @def_method(m, self.precommit) + def _(rob_id: Value): + with m.If(current_instr.valid & (rob_id == current_instr.rob_id)): + m.d.comb += internal.execute.eq(1) + + return m + + +class VectorLSUBlockComponent(BlockComponentParams): + def get_module(self, gen_params: GenParams) -> FuncBlock: + connections = gen_params.get(DependencyManager) + unit = VectorLSU(gen_params) + connections.add_dependency(InstructionPrecommitKey(), unit.precommit) + connections.add_dependency(VectorLSUKey(), unit) + return unit + + def get_optypes(self) -> set[OpType]: + return {OpType.V_LOAD, OpType.V_STORE} + + def get_rs_entry_count(self) -> int: + return 1 diff --git a/coreblocks/params/configurations.py b/coreblocks/params/configurations.py index c1c401d71..270f8f0a2 100644 --- a/coreblocks/params/configurations.py +++ b/coreblocks/params/configurations.py @@ -15,7 +15,9 @@ from coreblocks.fu.zbc import ZbcComponent from coreblocks.fu.zbs import ZbsComponent from coreblocks.fu.exception import ExceptionUnitComponent +from coreblocks.fu.vector_unit.v_core import VectorBlockComponent from coreblocks.lsu.dummyLsu import LSUBlockComponent +from coreblocks.lsu.vector_lsu import VectorLSUBlockComponent from coreblocks.structs_common.csr import CSRBlockComponent __all__ = [ @@ -24,6 +26,7 @@ "basic_core_config", "tiny_core_config", "full_core_config", + "vector_core_config", "test_core_config", "test_vector_core_config", ] @@ -186,4 +189,22 @@ def replace(self, **kwargs): ) # Core configuration with vector extension -vector_core_config = CoreConfiguration(_implied_extensions=Extension.V) +vector_core_config = CoreConfiguration( + allow_partial_extensions=True, + 
vector_config=VectorUnitConfiguration(register_bank_count=1), + func_units_config=( + RSBlockComponent( + [ + ALUComponent(), + ShiftUnitComponent(), + JumpComponent(), + ExceptionUnitComponent(), + ], + rs_entries=4, + ), + LSUBlockComponent(), + CSRBlockComponent(), + VectorBlockComponent(8), + VectorLSUBlockComponent(), + ), +) diff --git a/coreblocks/params/isa.py b/coreblocks/params/isa.py index c0bf27a28..465490fa6 100644 --- a/coreblocks/params/isa.py +++ b/coreblocks/params/isa.py @@ -18,6 +18,7 @@ "ISA", "RegisterType", "funct6_to_funct7", + "load_store_width_to_eew", "SEW", "EEW", "EMUL", @@ -65,15 +66,15 @@ class Opcode(IntEnum, shape=5): class Funct3(IntEnum, shape=3): - JALR = BEQ = B = ADD = SUB = FENCE = PRIV = MUL = MULW = _EINSTRACCESSFAULT = OPIVV = 0b000 + JALR = BEQ = B = ADD = SUB = FENCE = PRIV = MUL = MULW = _EINSTRACCESSFAULT = OPIVV = VMEM8 = 0b000 BNE = H = SLL = FENCEI = CSRRW = MULH = BCLR = BINV = BSET = CLZ = CPOP = CTZ = ROL \ = SEXTB = SEXTH = CLMUL = _EILLEGALINSTR = OPFVV = 0b001 # fmt: skip W = SLT = CSRRS = MULHSU = SH1ADD = CLMULR = _EBREAKPOINT = OPMVV = 0b010 D = SLTU = CSRRC = MULHU = CLMULH = _EINSTRPAGEFAULT = OPIVI = 0b011 BLT = BU = XOR = DIV = DIVW = SH2ADD = MIN = XNOR = ZEXTH = OPIVX = 0b100 - BGE = HU = SR = CSRRWI = DIVU = DIVUW = BEXT = ORCB = REV8 = ROR = MINU = OPFVF = 0b101 - BLTU = OR = CSRRSI = REM = REMW = SH3ADD = MAX = ORN = OPMVX = 0b110 - BGEU = AND = CSRRCI = REMU = REMUW = ANDN = MAXU = OPCFG = 0b111 + BGE = HU = SR = CSRRWI = DIVU = DIVUW = BEXT = ORCB = REV8 = ROR = MINU = OPFVF = VMEM16 = 0b101 + BLTU = OR = CSRRSI = REM = REMW = SH3ADD = MAX = ORN = OPMVX = VMEM32 = 0b110 + BGEU = AND = CSRRCI = REMU = REMUW = ANDN = MAXU = OPCFG = VMEM64 = 0b111 class Funct6(IntEnum, shape=6): @@ -369,6 +370,27 @@ def lmul_to_int(lmul: LMUL) -> int: return math.ceil(lmul_to_float(lmul)) +def load_store_width_to_eew(funct3: Funct3 | int) -> EEW: + """Convert vector load/store funct3 to EEW. 
+ + Parameters + ---------- + funct3 : Funct3 | int + Value to convert. + """ + match funct3: + # constants taken from RISC-V V extension specification + case 0: + return EEW.w8 + case 5: + return EEW.w16 + case 6: + return EEW.w32 + case 7: + return EEW.w64 + raise ValueError("Wrong vector load/store width.") + + @unique class Extension(enum.IntFlag): """ diff --git a/coreblocks/params/keys.py b/coreblocks/params/keys.py index 9d289f0c8..c906a3b1b 100644 --- a/coreblocks/params/keys.py +++ b/coreblocks/params/keys.py @@ -7,6 +7,7 @@ if TYPE_CHECKING: from coreblocks.structs_common.csr_generic import GenericCSRRegisters # noqa: F401 + from coreblocks.lsu.vector_lsu import VectorLSU # noqa: F401 __all__ = [ "WishboneDataKey", @@ -16,6 +17,11 @@ "GenericCSRRegistersKey", "ROBBlockInterruptsKey", "ROBPeekKey", + "LSUReservedKey", + "VectorFrontendInsertKey", + "VectorVRFAccessKey", + "VectorLSUKey", + "VectorScoreboardKey", ] @@ -52,3 +58,32 @@ class ROBBlockInterruptsKey(SimpleKey[Method]): @dataclass(frozen=True) class ROBPeekKey(SimpleKey[Method]): pass + + +@dataclass(frozen=True) +class LSUReservedKey(SimpleKey[tuple[Method, Method]]): + pass + + +# TODO To remove after refactor +@dataclass(frozen=True) +class VectorFrontendInsertKey(SimpleKey[Method]): + pass + + +# TODO To remove after refactor +@dataclass(frozen=True) +class VectorVRFAccessKey(SimpleKey[tuple[list[Method], list[Method], list[Method]]]): + pass + + +# TODO To remove after refactor +@dataclass(frozen=True) +class VectorLSUKey(SimpleKey["VectorLSU"]): + pass + + +# TODO To remove after refactor +@dataclass(frozen=True) +class VectorScoreboardKey(SimpleKey[tuple[Method, Method]]): + pass diff --git a/coreblocks/params/optypes.py b/coreblocks/params/optypes.py index 1dff55b34..a4a02901f 100644 --- a/coreblocks/params/optypes.py +++ b/coreblocks/params/optypes.py @@ -53,7 +53,8 @@ class OpType(IntEnum): V_PERMUTATION = auto() V_CONTROL = auto() # optype used both for vsetvl and internal instructions 
V_REDUCTION = auto() - V_MEMORY = auto() + V_LOAD = auto() + V_STORE = auto() # @@ -122,7 +123,8 @@ class OpType(IntEnum): OpType.V_PERMUTATION, OpType.V_CONTROL, OpType.V_REDUCTION, - OpType.V_MEMORY, + OpType.V_LOAD, + OpType.V_STORE, ], } diff --git a/coreblocks/scheduler/scheduler.py b/coreblocks/scheduler/scheduler.py index 637371cb4..3395d282b 100644 --- a/coreblocks/scheduler/scheduler.py +++ b/coreblocks/scheduler/scheduler.py @@ -3,7 +3,7 @@ from amaranth import * from coreblocks.transactions import Method, Transaction, TModule -from coreblocks.transactions.lib import FIFO, Forwarder +from coreblocks.transactions.lib import FIFO, Forwarder, condition from coreblocks.params import SchedulerLayouts, GenParams, OpType, RegisterType from coreblocks.utils import assign, AssignType from coreblocks.utils.protocols import FuncBlock @@ -315,8 +315,6 @@ def __init__( def elaborate(self, platform): m = TModule() - # This transaction will not be stalled by single RS because insert methods do not use conditional calling, - # therefore we can use single transaction here. 
with Transaction().body(m): instr = self.get_instr(m) source1 = self.rf_read1(m, {"reg_id": instr.regs_p.s1.id}) @@ -325,8 +323,8 @@ def elaborate(self, platform): data = { # when operand value is valid the convention is to set operand source to 0 "rs_data": { - "rp_s1": Mux(source1.valid, 0, instr.regs_p.s1), - "rp_s2": Mux(source2.valid, 0, instr.regs_p.s2), + "rp_s1": Mux(source1.valid & (instr.regs_p.s1.type == RegisterType.X), 0, instr.regs_p.s1), + "rp_s2": Mux(source2.valid & (instr.regs_p.s2.type == RegisterType.X), 0, instr.regs_p.s2), "rp_s1_reg": instr.regs_p.s1.id, "rp_s2_reg": instr.regs_p.s2.id, "rp_dst": instr.regs_p.dst, @@ -340,15 +338,16 @@ def elaborate(self, platform): }, } - for i, rs_insert in enumerate(self.rs_insert): - # connect only matching fields - arg = Record.like(rs_insert.data_in) - m.d.comb += assign(arg, data, fields=AssignType.COMMON) - # this assignment truncates signal width from max rs_entry_bits to target RS specific width - m.d.comb += arg.rs_entry_id.eq(instr.rs_entry_id) + with condition(m, priority=False) as branch: + for i, rs_insert in enumerate(self.rs_insert): + # connect only matching fields + arg = Record.like(rs_insert.data_in) + m.d.top_comb += assign(arg, data, fields=AssignType.COMMON) + # this assignment truncates signal width from max rs_entry_bits to target RS specific width + m.d.top_comb += arg.rs_entry_id.eq(instr.rs_entry_id) - with m.If(instr.rs_selected == i): - rs_insert(m, arg) + with branch(instr.rs_selected == i): + rs_insert(m, arg) return m diff --git a/coreblocks/stages/backend.py b/coreblocks/stages/backend.py index 47c1ef82d..663666d4b 100644 --- a/coreblocks/stages/backend.py +++ b/coreblocks/stages/backend.py @@ -1,6 +1,6 @@ from amaranth import * -from coreblocks.params import GenParams +from coreblocks.params import GenParams, RegisterType from coreblocks.transactions import Method, Transaction, TModule __all__ = ["ResultAnnouncement"] @@ -59,7 +59,7 @@ def elaborate(self, platform): result 
= self.m_get_result(m) self.m_rob_mark_done(m, rob_id=result.rob_id, exception=result.exception) - with m.If(result.exception == 0): + with m.If((result.exception == 0) & (result.rp_dst.type == RegisterType.X)): self.m_rf_write_val(m, reg_id=result.rp_dst.id, reg_val=result.result) with m.If(result.rp_dst.id != 0): self.m_rs_write_val(m, tag=result.rp_dst, value=result.result) diff --git a/coreblocks/structs_common/scoreboard.py b/coreblocks/structs_common/scoreboard.py index e3e77a7c3..a13df06b3 100644 --- a/coreblocks/structs_common/scoreboard.py +++ b/coreblocks/structs_common/scoreboard.py @@ -1,7 +1,6 @@ from amaranth import * from coreblocks.transactions import * from coreblocks.params import * -from coreblocks.utils.utils import PriorityUniqnessChecker __all__ = ["Scoreboard"] @@ -17,9 +16,8 @@ class Scoreboard(Elaboratable): Methods to get the dirty bit for the given index id. Layout: ScoreboardLayouts.get_dirty_* set_dirty_list : list[Method] - Methods to set the dirty bit for the given index id. If more than - one method try to write to the same index, the method - with the lowest index in list has a priority. + Methods to set the dirty bit for the given index id. + No conflict detection. 
Layout: ScoreboardLayouts.set_dirty_in """ @@ -45,25 +43,18 @@ def __init__(self, entries_number: int, superscalarity: int = 1, *, data_forward ] self.set_dirty_list = [Method(i=self.layouts.set_dirty_in, name=f"set{i}") for i in range(self.superscalarity)] - for i in range(1, self.superscalarity): - self.set_dirty_list[i - 1].schedule_before(self.set_dirty_list[i]) - def elaborate(self, platform) -> TModule: m = TModule() data = Signal(self.entries_number, name="data") - m.submodules.checker = checker = PriorityUniqnessChecker( - self.superscalarity, len(Record(self.layouts.set_dirty_in).id), non_valid_ok=True - ) if self.data_forward: data_forward = Signal(self.entries_number, name="data_forward") data_forward_valid = Signal(self.entries_number, name="data_forward_valid") - @loop_def_method(m, self.set_dirty_list, ready_list=checker.valids) + # TODO add conflict detection + @loop_def_method(m, self.set_dirty_list) def _(i, id, dirty): - m.d.top_comb += checker.inputs[i].eq(id) - m.d.top_comb += checker.input_valids[i].eq(self.set_dirty_list[i].run) m.d.sync += data.bit_select(id, 1).eq(dirty) if self.data_forward: m.d.comb += data_forward.bit_select(id, 1).eq(dirty) diff --git a/coreblocks/transactions/core.py b/coreblocks/transactions/core.py index 53511909a..d4544ec4a 100644 --- a/coreblocks/transactions/core.py +++ b/coreblocks/transactions/core.py @@ -71,7 +71,7 @@ def rec(transaction: Transaction, source: TransactionBase): for method in source.method_uses.keys(): if not method.defined: raise RuntimeError( - f"Trying to use method '{method.name}' which is not defined yet. " + f"Trying to use method '{method.name}:{method.owned_name}' which is not defined yet. " + "Are you sure that it was added to proper submodule?" 
) if method in self.methods_by_transaction[transaction]: diff --git a/coreblocks/transactions/lib.py b/coreblocks/transactions/lib.py index c0012f54a..209eae978 100644 --- a/coreblocks/transactions/lib.py +++ b/coreblocks/transactions/lib.py @@ -34,6 +34,7 @@ "condition", "connected_conditions", "condition_switch", + "def_one_caller_wrapper", "AnyToAnySimpleRoutingBlock", "OmegaRoutingNetwork", "PriorityOrderingTransProxyTrans", @@ -45,6 +46,8 @@ "DownCounter", "Barrier", "ContentAddressableMemory", + "BufferedMethodCall", + "BufferedReqResp", ] # FIFOs @@ -561,62 +564,6 @@ def _(arg): return m -class MethodFilter(Elaboratable): - """Method filter. - - Takes a target method and creates a method which calls the target method - only when some condition is true. The condition function takes two - parameters, a module and the input `Record` of the method. Non-zero - return value is interpreted as true. Alternatively to using a function, - a `Method` can be passed as a condition. - - Caveat: because of the limitations of transaction scheduling, the target - method is locked for usage even if it is not called. - - Attributes - ---------- - method: Method - The transformed method. - """ - - def __init__( - self, target: Method, condition: Callable[[TModule, Record], ValueLike], default: Optional[RecordDict] = None - ): - """ - Parameters - ---------- - target: Method - The target method. - condition: function or Method - The condition which, when true, allows the call to `target`. When - false, `default` is returned. - default: Value or dict, optional - The default value returned from the filtered method when the condition - is false. If omitted, zero is returned. 
- """ - if default is None: - default = Record.like(target.data_out) - - self.target = target - self.method = Method.like(target) - self.condition = condition - self.default = default - - def elaborate(self, platform): - m = TModule() - - ret = Record.like(self.target.data_out) - m.d.comb += assign(ret, self.default, fields=AssignType.ALL) - - @def_method(m, self.method) - def _(arg): - with m.If(self.condition(m, arg)): - m.d.comb += ret.eq(self.target(m, arg)) - return ret - - return m - - class MethodProduct(Elaboratable): def __init__( self, @@ -1330,6 +1277,79 @@ def _internal_branch(cond: ValueLike): m.d.top_comb += all_conds.eq(~Cat(all_not_conds_list).all()) +class MethodFilter(Elaboratable): + """Method filter. + + Takes a target method and creates a method which calls the target method + only when some condition is true. The condition function takes two + parameters, a module and the input `Record` of the method. Non-zero + return value is interpreted as true. Alternatively to using a function, + a `Method` can be passed as a condition. + + By default the target method is locked for usage even if it is not called. + If this is not the desired effect, set `use_condition` to True. + + Attributes + ---------- + method: Method + The transformed method. + """ + + def __init__( + self, + target: Method, + condition: Callable[[TModule, Record], ValueLike], + default: Optional[RecordDict] = None, + use_condition: bool = False, + ): + """ + Parameters + ---------- + target: Method + The target method. + condition: function or Method + The condition which, when true, allows the call to `target`. When + false, `default` is returned. + default: Value or dict, optional + The default value returned from the filtered method when the condition + is false. If omitted, zero is returned. + use_condition : bool + Instead of `m.If` use simultaneous `condition` which allows executing + this filter if the condition is False and target is not ready. 
+ """ + if default is None: + default = Record.like(target.data_out) + + self.target = target + self.use_condition = use_condition + self.method = Method(i=target.data_in.layout, o=target.data_out.layout, single_caller=self.use_condition) + self.condition = condition + self.default = default + + def elaborate(self, platform): + m = TModule() + + ret = Record.like(self.target.data_out) + m.d.comb += assign(ret, self.default, fields=AssignType.ALL) + + @def_method(m, self.method) + def _(arg): + if self.use_condition: + cond = Signal() + m.d.top_comb += cond.eq(self.condition(m, arg)) + with condition(m, nonblocking=False) as branch: + with branch(cond): + m.d.comb += ret.eq(self.target(m, arg)) + with branch(~cond): + pass + else: + with m.If(self.condition(m, arg)): + m.d.comb += ret.eq(self.target(m, arg)) + return ret + + return m + + def def_one_caller_wrapper(method_to_wrap: Method, wrapper: Method) -> TModule: """ Function used to a wrap method that can only have one caller. After wrapping @@ -2058,3 +2078,111 @@ def _(addr): m.d.comb += encoder_addr.input.eq(if_addr) return m + + +class BufferedMethodCall(Elaboratable): + """Wrap method call with fifos + + This module takes a method and calls it when it gets an argument, but + first storing the argument in the fifo buffer. Similarly, the results + of the call are also stored in the fifo buffer and are available after + a cycle. + + Attributes + ---------- + call_in : Method + Method to pass data to be buffered before forwarding them + to the target method. + call_out : Method + The method used to read the buffered results of the target method. + """ + + def __init__(self, called_method: Method, buffor_depth: int = 2): + """ + Parameters + ---------- + called_method : Method + Target method for which input and output should be buffered. + buffor_depth : int + The depth of the buffers. 
+ """ + self.called_method = called_method + self.buffor_depth = buffor_depth + + self.call_in = Method(i=self.called_method.data_in.layout) + self.call_out = Method(o=self.called_method.data_out.layout) + + def elaborate(self, platform): + m = TModule() + + fifo_in = BasicFifo(self.called_method.data_in.layout, self.buffor_depth) + # TODO add possibility to use outside buffer to reduce latency + fifo_out = BasicFifo(self.called_method.data_out.layout, self.buffor_depth) + + self.call_in.proxy(m, fifo_in.write) + self.call_out.proxy(m, fifo_out.read) + + with Transaction().body(m): + fifo_out.write(m, self.called_method(m, fifo_in.read(m))) + + m.submodules.fifo_in = fifo_in + m.submodules.fifo_out = fifo_out + return m + + +class BufferedReqResp(Elaboratable): + """Wrap the request-response methods pair with the buffer + + This module takes a request-response methods pair and provides + the wrappers that: + + - passes request arguments to the original request method, + - transforms the request arguments with the transformation specified by the user + - stores transformed arguments in the buffer + - passes transformed arguments from the buffer to the original response method to retrieve the response + + Attributes + ---------- + req : Method + The request method wrapper. + resp : Method + The response method wrapper. 
+ """ + + def __init__( + self, + req_method: Method, + resp_method: Method, + buffor_depth: int = 2, + resp_in_transform: Optional[Tuple[MethodLayout, Callable[[TModule, Record], RecordDict]]] = None, + ): + self.req_method = req_method + self.resp_method = resp_method + self.buffor_depth = buffor_depth + self.resp_in_transform = resp_in_transform + + self.req = Method(i=self.req_method.data_in.layout) + self.resp = Method(o=self.resp_method.data_out.layout) + + def elaborate(self, platform): + m = TModule() + + fifo_req = BasicFifo(self.req_method.data_in.layout, 2) + buffered_resp = BufferedMethodCall(self.resp_method, self.buffor_depth) + resp_in_transformer = MethodTransformer(buffered_resp.call_in, i_transform=self.resp_in_transform) + + with Transaction().body(m): + self.req_method(m, fifo_req.read(m)) + + @def_method(m, self.req) + def _(arg): + fifo_req.write(m, arg) + resp_in_transformer.method(m, arg) + + self.resp.proxy(m, buffered_resp.call_out) + + m.submodules.fifo_req = fifo_req + m.submodules.buffered_resp = buffered_resp + m.submodules.resp_in_transformer = resp_in_transformer + + return m diff --git a/coreblocks/utils/fifo.py b/coreblocks/utils/fifo.py index d1974b3a9..6c9d84da4 100644 --- a/coreblocks/utils/fifo.py +++ b/coreblocks/utils/fifo.py @@ -53,9 +53,6 @@ def __init__(self, layout: MethodLayout, depth: int) -> None: # current fifo depth self.level = Signal((self.depth).bit_length()) - self.clear.add_conflict(self.read, Priority.LEFT) - self.clear.add_conflict(self.write, Priority.LEFT) - # for interface compatibility with MultiportFifo self.read_methods = [self.read] self.write_methods = [self.write] @@ -63,13 +60,14 @@ def __init__(self, layout: MethodLayout, depth: int) -> None: def elaborate(self, platform): m = TModule() - m.submodules.buff_rdport = self.buff_rdport = self.buff.read_port( - domain="comb", transparent=True - ) # FWFT behaviour + next_read_idx = Signal.like(self.read_idx) + m.d.top_comb += 
next_read_idx.eq(mod_incr(self.read_idx, self.depth)) + + m.submodules.buff_rdport = self.buff_rdport = self.buff.read_port(domain="sync", transparent=True) m.submodules.buff_wrport = self.buff_wrport = self.buff.write_port() - m.d.comb += self.read_ready.eq(self.level > 0) - m.d.comb += self.write_ready.eq(self.level < self.depth) + m.d.top_comb += self.read_ready.eq(self.level != 0) + m.d.top_comb += self.write_ready.eq(self.level != self.depth) with m.If(self.read.run & ~self.write.run): m.d.sync += self.level.eq(self.level - 1) @@ -78,20 +76,20 @@ def elaborate(self, platform): with m.If(self.clear.run): m.d.sync += self.level.eq(0) - m.d.comb += self.buff_rdport.addr.eq(self.read_idx) - m.d.comb += self.head.eq(self.buff_rdport.data) + m.d.top_comb += self.buff_rdport.addr.eq(Mux(self.read.run, next_read_idx, self.read_idx)) + m.d.top_comb += self.head.eq(self.buff_rdport.data) @def_method(m, self.write, ready=self.write_ready) def _(arg: Record) -> None: - m.d.comb += self.buff_wrport.addr.eq(self.write_idx) - m.d.comb += self.buff_wrport.data.eq(arg) + m.d.top_comb += self.buff_wrport.addr.eq(self.write_idx) + m.d.top_comb += self.buff_wrport.data.eq(arg) m.d.comb += self.buff_wrport.en.eq(1) m.d.sync += self.write_idx.eq(mod_incr(self.write_idx, self.depth)) @def_method(m, self.read, self.read_ready) def _() -> ValueLike: - m.d.sync += self.read_idx.eq(mod_incr(self.read_idx, self.depth)) + m.d.sync += self.read_idx.eq(next_read_idx) return self.head @def_method(m, self.clear) diff --git a/docker/riscv-toolchain.Dockerfile b/docker/riscv-toolchain.Dockerfile index ec33ce01b..cc70c937f 100644 --- a/docker/riscv-toolchain.Dockerfile +++ b/docker/riscv-toolchain.Dockerfile @@ -12,6 +12,6 @@ RUN apt-get update && \ RUN git clone https://github.com/riscv/riscv-gnu-toolchain && \ cd riscv-gnu-toolchain && \ git checkout 2023.05.14 && \ - ./configure 
--with-multilib-generator="rv32i-ilp32--a*zifence*zicsr;rv32im-ilp32--a*zifence*zicsr;rv32ic-ilp32--a*zifence*zicsr;rv32imc-ilp32--a*zifence*zicsr;rv32imfc-ilp32f--a*zifence;rv32i_zmmul-ilp32--a*zifence*zicsr;rv32ic_zmmul-ilp32--a*zifence*zicsr" && \ + ./configure --with-multilib-generator="rv32i-ilp32--a*zifence*zicsr;rv32im-ilp32--a*zifence*zicsr;rv32ic-ilp32--a*zifence*zicsr;rv32imc-ilp32--a*zifence*zicsr;rv32imfc-ilp32f--a*zifence;rv32i_zmmul-ilp32--a*zifence*zicsr;rv32ic_zmmul-ilp32--a*zifence*zicsr;rv32i_zve32x-ilp32--zifence*zicsr" && \ make -j$(nproc) && \ cd / && rm -rf riscv-gnu-toolchain diff --git a/scripts/gen_verilog.py b/scripts/gen_verilog.py index 683e771c1..93720cc1f 100755 --- a/scripts/gen_verilog.py +++ b/scripts/gen_verilog.py @@ -5,7 +5,7 @@ import argparse from amaranth.build import Platform -from amaranth.back import verilog +from amaranth.back import verilog, rtlil from amaranth import Module, Elaboratable if __name__ == "__main__": @@ -24,6 +24,7 @@ "basic": basic_core_config, "tiny": tiny_core_config, "full": full_core_config, + "vector": vector_core_config, } @@ -43,13 +44,18 @@ def elaborate(self, platform: Platform): return tm -def gen_verilog(core_config: CoreConfiguration, output_path): +def gen_output(core_config: CoreConfiguration, output_path, type): top = Top(GenParams(core_config)) with open(output_path, "w") as f: signals = list(flatten_signals(top.wb_instr)) + list(flatten_signals(top.wb_data)) - f.write(verilog.convert(top, ports=signals, strip_internal_attrs=True)) + if type == "verilog": + f.write(verilog.convert(top, ports=signals, strip_internal_attrs=True)) + elif type == "rtlil": + f.write(rtlil.convert(top, ports=signals)) + else: + raise ValueError(f"Output type '{type}' not known.") def main(): @@ -74,6 +80,14 @@ def main(): "-o", "--output", action="store", default="core.v", help="Output file path. 
Default: %(default)s" ) + parser.add_argument( + "-t", + "--type", + action="store", + default="verilog", + help="Choose generation target. Available values: 'verilog', 'rtlil'. Default: %(default)s", + ) + args = parser.parse_args() os.environ["AMARANTH_verbose"] = "true" if args.verbose else "false" @@ -81,7 +95,7 @@ def main(): if args.config not in str_to_coreconfig: raise KeyError(f"Unknown config '{args.config}'") - gen_verilog(str_to_coreconfig[args.config], args.output) + gen_output(str_to_coreconfig[args.config], args.output, args.type) if __name__ == "__main__": diff --git a/scripts/run_benchmarks.py b/scripts/run_benchmarks.py index 1a0c9a2b4..76604901c 100755 --- a/scripts/run_benchmarks.py +++ b/scripts/run_benchmarks.py @@ -16,6 +16,14 @@ import test.regression.benchmark # noqa: E402 from test.regression.pysim import PySimulation # noqa: E402 +from coreblocks.params.configurations import * # noqa: E402 + +str_to_coreconfig: dict[str, CoreConfiguration] = { + "basic": basic_core_config, + "tiny": tiny_core_config, + "full": full_core_config, + "vector": vector_core_config, +} def cd_to_topdir(): @@ -53,7 +61,16 @@ def load_benchmarks(): def run_benchmarks_with_cocotb(benchmarks: list[str], traces: bool) -> bool: - arglist = ["make", "-C", "test/regression/cocotb", "-f", "benchmark.Makefile", "--no-print-directory"] + cpu_count = len(os.sched_getaffinity(0)) + arglist = [ + "make", + "-C", + "test/regression/cocotb", + "-f", + "benchmark.Makefile", + "--no-print-directory", + f"-j{cpu_count}", + ] test_cases = ",".join(benchmarks) arglist += [f"TESTCASE={test_cases}"] @@ -66,7 +83,7 @@ def run_benchmarks_with_cocotb(benchmarks: list[str], traces: bool) -> bool: return res.returncode == 0 -def run_benchmarks_with_pysim(benchmarks: list[str], traces: bool, verbose: bool) -> bool: +def run_benchmarks_with_pysim(benchmarks: list[str], traces: bool, verbose: bool, core_conf: CoreConfiguration) -> bool: suite = unittest.TestSuite() def _gen_test(test_name: 
str): @@ -75,7 +92,9 @@ def test_fn(): if traces: traces_file = "benchmark." + test_name asyncio.run( - test.regression.benchmark.run_benchmark(PySimulation(verbose, traces_file=traces_file), test_name) + test.regression.benchmark.run_benchmark( + PySimulation(verbose, traces_file=traces_file, core_conf=core_conf), test_name + ) ) test_fn.__name__ = test_name @@ -92,11 +111,17 @@ def test_fn(): return result.wasSuccessful() -def run_benchmarks(benchmarks: list[str], backend: Literal["pysim", "cocotb"], traces: bool, verbose: bool) -> bool: +def run_benchmarks( + benchmarks: list[str], + backend: Literal["pysim", "cocotb"], + traces: bool, + verbose: bool, + core_conf: CoreConfiguration, +) -> bool: if backend == "cocotb": return run_benchmarks_with_cocotb(benchmarks, traces) elif backend == "pysim": - return run_benchmarks_with_pysim(benchmarks, traces, verbose) + return run_benchmarks_with_pysim(benchmarks, traces, verbose, core_conf) return False @@ -106,6 +131,14 @@ def main(): parser.add_argument("-t", "--trace", action="store_true", help="Dump waveforms") parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output") parser.add_argument("-b", "--backend", default="cocotb", choices=["cocotb", "pysim"], help="Simulation backend") + parser.add_argument( + "-c", + "--config", + action="store", + default="full", + help="Select core configuration. " + + f"Available configurations: {', '.join(list(str_to_coreconfig.keys()))}. 
Default: %(default)s", + ) parser.add_argument( "-o", "--output", @@ -131,7 +164,7 @@ def main(): print(f"Could not find benchmark '{args.benchmark_name}'") sys.exit(1) - success = run_benchmarks(benchmarks, args.backend, args.trace, args.verbose) + success = run_benchmarks(benchmarks, args.backend, args.trace, args.verbose, str_to_coreconfig[args.config]) if not success: print("Benchmark execution failed") sys.exit(1) diff --git a/scripts/synthesize.py b/scripts/synthesize.py index 43ce88b7b..a46fef3e3 100755 --- a/scripts/synthesize.py +++ b/scripts/synthesize.py @@ -25,6 +25,7 @@ "basic": basic_core_config, "tiny": tiny_core_config, "full": full_core_config, + "vector": vector_core_config, } diff --git a/test/asm/vector.asm b/test/asm/vector.asm new file mode 100644 index 000000000..d368805ce --- /dev/null +++ b/test/asm/vector.asm @@ -0,0 +1,46 @@ +initMem: + # Initialise first 256 bits of memory with 32-bits values + li x1, 1 + sw x1, 0(x0) + li x1, 2 + sw x1, 4(x0) + li x1, 5 + sw x1, 8(x0) + li x1, 10 + sw x1, 12(x0) + li x1, 3 + sw x1, 16(x0) + li x1, 0 + sw x1, 20(x0) + li x1, 14 + sw x1, 24(x0) + li x1, 42 + sw x1, 28(x0) # done at cycle ~235 +doVectorOperations: + vsetivli x0, 8, e32,m1,ta,ma + # Load 8 first 32-bits elements from address 0 from memory to registers v0 and v1 + vle32.v v0, (x0) + vle32.v v1, (x0) + vadd.vv v2, v0, v0 + vadd.vv v2, v2, v1 + # Store vector on addresses from byte 32 to check interleaving of vector and + # scalar instructions. 
Value "32" is calculated as a scalar sum + li x20, 11 + li x21, 21 + add x1, x20, x21 + # Add a canary on first entry which shouldn't be modified + li x20, 0xDEADBEEF + sw x20, 64(x0) + vse32.v v2, (x1) +getFromMem: + lw x1, 32(x0) + lw x2, 36(x0) + lw x3, 40(x0) + lw x4, 44(x0) + lw x5, 48(x0) + lw x6, 52(x0) + lw x7, 56(x0) + lw x8, 60(x0) + lw x9, 64(x0) +infloop: + j infloop diff --git a/test/asm/vector_bytes.asm b/test/asm/vector_bytes.asm new file mode 100644 index 000000000..04e2f3a8d --- /dev/null +++ b/test/asm/vector_bytes.asm @@ -0,0 +1,55 @@ +initMem: + # Initialise first 256 bits of memory + li x1, 1 + sw x1, 0(x0) + li x1, 2 + sw x1, 4(x0) + li x1, 5 + sw x1, 8(x0) + li x1, 10 + sw x1, 12(x0) + li x1, 3 + sw x1, 16(x0) + li x1, 0 + sw x1, 20(x0) + li x1, 14 + sw x1, 24(x0) + li x1, 42 + sw x1, 28(x0) +byteDataInit: + li x1, 30 + sb x1, 5(x0) + li x1, 25 + sb x1, 6(x0) + li x1, 255 + sb x1, 21(x0) + li x1, 181 + sb x1, 29(x0) + li x1, 15 + sb x1, 30(x0) + vsetivli x0, 31, e8,m1,ta,ma + vle8.v v3, (x0) + vadd.vi v2, v3, 0 + vsetivli x0, 30, e8,m1,tu,ma + li x1, 9 # loop counter +byteLoop: + vadd.vv v3, v3, v2 + addi x1, x1, -1 + bne x1, x0, byteLoop + li x1, 32 + li x2, 0x55 + sb x2, 63(x0) + vsetivli x0, 31, e8,m1,ta,ma + vse8.v v3, (x1) +getFromMem: + lw x1, 32(x0) + lw x2, 36(x0) + lw x3, 40(x0) + lw x4, 44(x0) + lw x5, 48(x0) + lw x6, 52(x0) + lw x7, 56(x0) + lw x8, 60(x0) +infloop: + j infloop + diff --git a/test/common.py b/test/common.py index 6f5ef24e9..600365e0c 100644 --- a/test/common.py +++ b/test/common.py @@ -29,7 +29,18 @@ from coreblocks.transactions.core import SignalBundle, Method, TransactionModule from coreblocks.transactions.lib import AdapterBase, AdapterTrans, Adapter, MethodLayout from coreblocks.transactions._utils import method_def_helper -from coreblocks.params import RegisterType, Funct3, Funct7, OpType, GenParams, Opcode, SEW, LMUL, eew_to_bits +from coreblocks.params import ( + RegisterType, + Funct3, + Funct7, + OpType, + 
GenParams, + Opcode, + SEW, + LMUL, + eew_to_bits, + lmul_to_int, +) from coreblocks.utils import ( ValueLike, HasElaborate, @@ -168,15 +179,15 @@ def convert_vtype_to_imm(vtype) -> int: return imm -def generate_vtype(gen_params: GenParams, max_vl: Optional[int] = None): +def generate_vtype(gen_params: GenParams, max_vl: Optional[int] = None, const_lmul: Optional[LMUL] = None): sew = random.choice([sew for sew in list(SEW) if eew_to_bits(sew) <= gen_params.v_params.elen]) - lmul = random.choice(list(LMUL)) + lmul = random.choice(list(LMUL)) if const_lmul is None else const_lmul ta = random.randrange(2) ma = random.randrange(2) - if max_vl is not None: - vl = random.randrange(max_vl) - else: - vl = random.randrange(2**16) + vl_lim = gen_params.v_params.vlen // eew_to_bits(sew) * lmul_to_int(lmul) + if max_vl is not None and max_vl < vl_lim: + vl_lim = max_vl + vl = random.randrange(vl_lim) return { "sew": sew, "lmul": lmul, @@ -200,6 +211,7 @@ def generate_instr( non_uniform_s2_val=True, overwriting: dict = {}, max_vl: Optional[int] = None, + const_lmul: Optional[LMUL] = None, ): rec = {} if max_reg_bits is None: @@ -208,43 +220,45 @@ def generate_instr( reg_phys_width = max_reg_bits for field in layout: - if "regs_l" in field[0]: + if "regs_l" == field[0]: if max_reg_bits is None: width = gen_params.isa.reg_cnt_log else: width = max_reg_bits rec["regs_l"] = generate_register_set(width, support_vector=support_vector) - if "regs_p" in field[0]: + if "regs_p" == field[0]: rec["regs_p"] = generate_register_set(reg_phys_width, support_vector=support_vector) for label in ["rp_dst", "rp_s1", "rp_s2", "rp_s3"]: - if label in field[0]: + if label == field[0]: rec[label] = generate_register_entry(reg_phys_width, support_vector=support_vector) - if "exec_fn" in field[0]: + if "exec_fn" == field[0]: rec["exec_fn"] = generate_exec_fn(optypes, funct7, funct3) - if "opcode" in field[0]: + if "opcode" == field[0]: rec["opcode"] = random.choice(list(Opcode)) - if "imm" in field[0]: 
+ if "imm" == field[0]: rec["imm"] = random.randrange(max_imm) - if "imm2" in field[0]: + if "imm2" == field[0]: rec["imm2"] = random.randrange(2**gen_params.imm2_width) - if "rob_id" in field[0]: + if "rob_id" == field[0]: rec["rob_id"] = random.randrange(2**gen_params.rob_entries_bits) - if "pc" in field[0]: + if "pc" == field[0]: rec["pc"] = random.randrange(2**32) - if "illegal" in field[0]: + if "illegal" == field[0]: rec["illegal"] = random.randrange(2) if generate_illegal else 0 - if "s1_val" in field[0]: + if "s1_val" == field[0]: rec["s1_val"] = random.randrange(2**gen_params.isa.xlen) - if "s2_val" in field[0]: + if "s2_val" == field[0]: if non_uniform_s2_val and random.random() < 0.5: s2_val = 0 else: s2_val = random.randrange(2**gen_params.isa.xlen) rec["s2_val"] = s2_val if "vtype" in field[0]: - rec["vtype"] = generate_vtype(gen_params, max_vl=max_vl) + rec["vtype"] = generate_vtype(gen_params, max_vl=max_vl, const_lmul=const_lmul) if "rp_v0" in field[0]: rec["rp_v0"] = {"id": random.randrange(gen_params.v_params.vrp_count)} + if field[0] in ["rp_s1_reg", "rp_s2_reg"]: + rec[field[0]] = random.randrange(2**gen_params.phys_regs_bits) return overwrite_dict_values(rec, overwriting) diff --git a/test/external/embench/Makefile b/test/external/embench/Makefile index 5398901be..3ec635a8f 100644 --- a/test/external/embench/Makefile +++ b/test/external/embench/Makefile @@ -3,7 +3,10 @@ all: build copy-config: cp -v -r -f board_config/coreblocks-sim embench-iot/config/riscv32/boards -build: copy-config +copy-coreblocks-benchmarks: + cp -v -r -f coreblocks_benchmarks/* embench-iot/src + +build: copy-config copy-coreblocks-benchmarks embench-iot/build_all.py \ --builddir ../build \ --logdir ../logs \ diff --git a/test/external/embench/board_config/coreblocks-sim/board.cfg b/test/external/embench/board_config/coreblocks-sim/board.cfg index 96eaae307..5dde8009a 100644 --- a/test/external/embench/board_config/coreblocks-sim/board.cfg +++ 
b/test/external/embench/board_config/coreblocks-sim/board.cfg @@ -1,5 +1,6 @@ -cc = 'riscv64-unknown-elf-gcc' -cflags = (['-c', '-fdata-sections', '-march=rv32ic_zmmul_zicsr', '-mabi=ilp32']) -ldflags = (['-Wl,-gc-sections', '-march=rv32ic_zmmul_zicsr', '-mabi=ilp32', '-nostartfiles', '-T../../../common/link.ld']) +cc = 'clang' +ld = 'riscv64-unknown-elf-gcc' +cflags = (['--gcc-toolchain=/usr/local','-c', '-fdata-sections', '-march=rv32i_zve32x', '-mabi=ilp32', '--target=riscv32', '--sysroot=/usr/local/riscv64-unknown-elf','-v', '-O3', '-mcmodel=medany' ]) +ldflags = (['-Wl,-gc-sections', '-march=rv32i_zicsr_zve32x', '-mabi=ilp32', '-nostartfiles', '-T../../../common/link.ld']) user_libs = (['-lm']) cpu_mhz = 0.01 diff --git a/test/external/embench/board_config/coreblocks-sim/boardsupport.c b/test/external/embench/board_config/coreblocks-sim/boardsupport.c index e0b851a15..ddbb8df20 100644 --- a/test/external/embench/board_config/coreblocks-sim/boardsupport.c +++ b/test/external/embench/board_config/coreblocks-sim/boardsupport.c @@ -6,9 +6,15 @@ asm volatile ("csrr %0, " #reg : "=r"(__tmp)); \ __tmp; }) -#define rdcycle() ((((uint64_t) read_csr(cycleh)) << 32) | read_csr(cycle)) #define rdinstret() ((((uint64_t) read_csr(instreth)) << 32) | read_csr(instret)) +uint64_t rdcycle() +{ + uint32_t low = read_csr(cycle); + uint64_t high = read_csr(cycleh); + return (high << 32) | low; +} + typedef struct { uint64_t cycle_cnt; uint64_t instr_cnt; @@ -19,14 +25,18 @@ typedef struct { static uint64_t cycle_cnt_start; static uint64_t instr_cnt_start; -void start_trigger() { - cycle_cnt_start = rdcycle(); +inline void start_trigger() { instr_cnt_start = rdinstret(); + asm volatile ("":::"memory"); + cycle_cnt_start = rdcycle(); } -void stop_trigger() { - TO_HOST.cycle_cnt = rdcycle() - cycle_cnt_start; - TO_HOST.instr_cnt = rdinstret() - instr_cnt_start; +inline void stop_trigger() { + uint64_t cycle_cnt_end = rdcycle(); + asm volatile ("":::"memory"); + uint64_t 
instr_cnt_end = rdinstret(); + TO_HOST.cycle_cnt = cycle_cnt_end - cycle_cnt_start; + TO_HOST.instr_cnt = instr_cnt_end - instr_cnt_start; } void initialise_board () { diff --git a/test/external/embench/common/link.ld b/test/external/embench/common/link.ld index 13d42abe9..5835af192 100644 --- a/test/external/embench/common/link.ld +++ b/test/external/embench/common/link.ld @@ -5,8 +5,8 @@ _STACK_SIZE = 0x1000; MEMORY { - imem (rxai!w) : ORIGIN = 0x00000000, LENGTH = 64K - dmem (wxa!ri) : ORIGIN = 0x10000000, LENGTH = 128K + imem : ORIGIN = 0x00000000, LENGTH = 64K + dmem : ORIGIN = 0x10000000, LENGTH = 128K } SECTIONS diff --git a/test/external/embench/coreblocks_benchmarks/vadd-lot-of-scalars/add-lot-of-scalars.c b/test/external/embench/coreblocks_benchmarks/vadd-lot-of-scalars/add-lot-of-scalars.c new file mode 100644 index 000000000..36332cdb4 --- /dev/null +++ b/test/external/embench/coreblocks_benchmarks/vadd-lot-of-scalars/add-lot-of-scalars.c @@ -0,0 +1,76 @@ +#include "support.h" + +typedef unsigned long DWORD; +#define _LEN 32 +const DWORD LEN = _LEN; +DWORD tab_in[_LEN]; +DWORD tab_out[_LEN]; +const unsigned int body_iterations = 50; + +DWORD __attribute__((noinline)) vadd_body(DWORD counter) +{ + DWORD buf1, buf2; + asm volatile ( + "addi x0, x0, 0 \n" + "vsetvli x0, %[LEN], e32,m1,ta,ma \n" + "vle32.v v1, (%[tab_in]) \n" + "vadd.vi v2, v1, 0 \n" + "start_vadd_%=: \n" + "vadd.vv v2, v2, v1 \n" + "addi %[counter], %[counter], -1 \n" + "li %[buf1], 2 \n" + "li %[buf2], 4 \n" + "add %[buf1], %[buf2], %[buf1] \n" + "add %[buf1], %[buf2], %[buf1] \n" + "addi %[buf1], %[buf1], -1 \n" + "bne x0, %[counter], start_vadd_%= \n" + "vse32.v v2, (%[tab_out]) \n" + : [counter]"+r"(counter), + [buf1] "=&r" (buf1), + [buf2] "=&r" (buf2) + : [LEN]"r"(LEN), + [tab_in]"r"(tab_in), + [tab_out]"r"(tab_out) + : "v1", "v2", "memory"); + return 0; +} + +void initialise_benchmark (void) +{ + for(unsigned int i = 0; i < LEN; i++) + { + tab_in[i]=i; + } +} + +void warm_caches 
(int __attribute__((unused)) heat) +{ + vadd_body(4); + return; +} + +int benchmark (void) +{ + return vadd_body(body_iterations); +} + +int verify_benchmark (int __attribute__((unused)) r) +{ + int expected =0; + int got = 0; + for(unsigned int i = 0; i < LEN; i++) + { + got += tab_out[i]; + expected += tab_in[i]*(body_iterations+1); + } + +// asm volatile( +// "li t0, 0x80000004 \n" +// "sw %[out], 0(t0) \n" +// "li t0, 0x80000000 \n" +// "sw a0, 0(t0) \n" +// : +// : [out] "r"(r) +// : "memory"); + return expected == got; +} diff --git a/test/external/embench/coreblocks_benchmarks/vadd-mem/vadd-mem.c b/test/external/embench/coreblocks_benchmarks/vadd-mem/vadd-mem.c new file mode 100644 index 000000000..67dfca18e --- /dev/null +++ b/test/external/embench/coreblocks_benchmarks/vadd-mem/vadd-mem.c @@ -0,0 +1,72 @@ +#include "support.h" + +typedef unsigned long DWORD; +#define _LEN 32 +const DWORD LEN = _LEN; +DWORD tab_in[_LEN]; +DWORD tab_out[_LEN]; +const unsigned int body_iterations = 50; + +DWORD __attribute__((noinline)) vadd_body(DWORD counter) +{ + asm volatile ( + "addi x0, x0, 0 \n" + "vsetvli x0, %[LEN], e32,m1,ta,ma \n" + "vle32.v v1, (%[tab_in]) \n" + "vadd.vi v3, v1, 10 \n" + "vadd.vi v2, v1, 0 \n" + "start_vadd_%=: \n" + "vle32.v v1, (%[tab_in]) \n" + "vadd.vv v2, v2, v3 \n" + "vadd.vv v2, v2, v3 \n" + "vadd.vv v2, v2, v1 \n" + "addi %[counter], %[counter], -1 \n" + "bne x0, %[counter], start_vadd_%= \n" + "vse32.v v2, (%[tab_out])" + : [counter]"+r"(counter) + : [LEN]"r"(LEN), + [tab_out]"r"(tab_out), + [tab_in]"r"(tab_in) + : "v1", "v2", "v3", "memory"); + return 0; +} + +void initialise_benchmark (void) +{ + for(unsigned int i = 0; i < LEN; i++) + { + tab_in[i]=i; + } +} + +void warm_caches (int __attribute__((unused)) heat) +{ + vadd_body(4); + return; +} + +int benchmark (void) +{ + return vadd_body(body_iterations); +} + +int verify_benchmark (int __attribute__((unused)) r) +{ + int expected =0; + int got = 0; + for(unsigned int i = 0; i < LEN; 
i++) + { + got += tab_out[i]; + expected += tab_in[i]*(body_iterations*3+1) + body_iterations*20; + } + +// asm volatile( +// "li t0, 0x80000004 \n" +// "sw %[out], 0(t0) \n" +// "li t0, 0x80000000 \n" +// "sw a0, 0(t0) \n" +// : +// : [out] "r"(r) +// : "memory"); + return expected == got; +} diff --git a/test/external/embench/coreblocks_benchmarks/vadd/vadd.c b/test/external/embench/coreblocks_benchmarks/vadd/vadd.c new file mode 100644 index 000000000..6359d0f11 --- /dev/null +++ b/test/external/embench/coreblocks_benchmarks/vadd/vadd.c @@ -0,0 +1,69 @@ +#include "support.h" + +typedef unsigned long DWORD; +#define _LEN 32 +const DWORD LEN = _LEN; +DWORD tab_in[_LEN]; +DWORD tab_out[_LEN]; +const unsigned int body_iterations = 50; + +DWORD __attribute__((noinline)) vadd_body(DWORD counter) +{ + asm volatile ( + "addi x0, x0, 0 \n" + "vsetvli x0, %[LEN], e32,m1,ta,ma \n" + "vle32.v v1, (%[tab_in]) \n" + "vadd.vi v2, v1, 0 \n" + "start_vadd_%=: \n" + "vadd.vv v2, v2, v1 \n" + "addi %[counter], %[counter], -1 \n" + "bne x0, %[counter], start_vadd_%= \n" + "vse32.v v2, (%[tab_out])" + : [counter]"+r"(counter) + : [LEN]"r"(LEN), + [tab_in]"r"(tab_in), + [tab_out]"r"(tab_out) + : "v1", "v2", "memory"); + return 0; +} + +void initialise_benchmark (void) +{ + for(unsigned int i = 0; i < LEN; i++) + { + tab_in[i]=i; + } +} + +void warm_caches (int __attribute__((unused)) heat) +{ + vadd_body(4); + return; +} + +int benchmark (void) +{ + return vadd_body(body_iterations); +} + +int verify_benchmark (int __attribute__((unused)) r) +{ + int expected =0; + int got = 0; + for(unsigned int i = 0; i < LEN; i++) + { + got += tab_out[i]; + expected += tab_in[i]*(body_iterations +1); + } + +// asm volatile( +// "li t0, 0x80000004 \n" +// "sw %[out], 0(t0) \n" +// "li t0, 0x80000000 \n" +// "sw a0, 0(t0) \n" +// : +// : [out] "r"(r) +// : "memory"); + return 1; + return expected == got; +} diff --git a/test/external/embench/coreblocks_benchmarks/vmem/vmem.c 
b/test/external/embench/coreblocks_benchmarks/vmem/vmem.c new file mode 100644 index 000000000..be8285086 --- /dev/null +++ b/test/external/embench/coreblocks_benchmarks/vmem/vmem.c @@ -0,0 +1,73 @@ +#include "support.h" + +typedef unsigned long DWORD; +#define _LEN 32 +const DWORD LEN = _LEN; +DWORD tab_in[_LEN]; +DWORD tab_out[_LEN]; +DWORD support_tab[_LEN]; +const unsigned int body_iterations = 50; + +DWORD __attribute__((noinline)) vadd_body(DWORD counter) +{ + asm volatile ( + "vsetvli x0, %[LEN], e32,m1,ta,ma \n" + "start_vadd_%=: \n" + "vle32.v v1, (%[tab_in]) \n" + "vle32.v v2, (%[support_tab]) \n" + "vadd.vv v2, v2, v1 \n" + "vse32.v v2, (%[support_tab]) \n" + "addi %[counter], %[counter], -1 \n" + "bne x0, %[counter], start_vadd_%= \n" + "vle32.v v2, (%[support_tab]) \n" + "vse32.v v2, (%[tab_out]) \n" + : [counter]"+r"(counter) + : [LEN]"r"(LEN), + [tab_in]"r"(tab_in), + [tab_out]"r"(tab_out), + [support_tab] "r"(support_tab) + : "v1", "v2", "v3", "memory"); + return 0; +} + +void initialise_benchmark (void) +{ + for(unsigned int i = 0; i < LEN; i++) + { + tab_in[i]=i; + support_tab[i] = 0; + } +} + +void warm_caches (int __attribute__((unused)) heat) +{ + vadd_body(4); + initialise_benchmark(); + return; +} + +int benchmark (void) +{ + return vadd_body(body_iterations); +} + +int verify_benchmark (int __attribute__((unused)) r) +{ + int expected =0; + int got = 0; + for(unsigned int i = 0; i < LEN; i++) + { + got += tab_out[i]; + expected += tab_in[i]*body_iterations; + } + +// asm volatile( +// "li t0, 0x80000004 \n" +// "sw %[out], 0(t0) \n" +// "li t0, 0x80000000 \n" +// "sw a0, 0(t0) \n" +// : +// : [out] "r"(r) +// : "memory"); + return expected == got; +} diff --git a/test/frontend/test_decoder.py b/test/frontend/test_decoder.py index c7453d991..c26f63d78 100644 --- a/test/frontend/test_decoder.py +++ b/test/frontend/test_decoder.py @@ -1573,6 +1573,379 @@ def __init__( ), # vsetivli x1, 8, e32,m8,ta,ma ] + DECODER_TESTS_V_MEMORY = [ + InstrTest( 
+ 0x02008187, + Opcode.LOAD_FP, + Funct3.VMEM8, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + imm2=0x020, + op=OpType.V_LOAD, + ), # vle8.v v3, (x1) + InstrTest( + 0x0200D187, + Opcode.LOAD_FP, + Funct3.VMEM16, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + imm2=0x020, + op=OpType.V_LOAD, + ), # vle16.v v3, (x1) + InstrTest( + 0x0200E187, + Opcode.LOAD_FP, + Funct3.VMEM32, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + imm2=0x020, + op=OpType.V_LOAD, + ), # vle32.v v3, (x1) + InstrTest( + 0x0200F187, + Opcode.LOAD_FP, + Funct3.VMEM64, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + imm2=0x020, + op=OpType.V_LOAD, + ), # vle64.v v3, (x1) + InstrTest( + 0x02B08187, + Opcode.LOAD_FP, + Funct3.VMEM8, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + imm2=0x02B, + op=OpType.V_LOAD, + ), # vlm.v v3, (x1) + InstrTest( + 0x0A208187, + Opcode.LOAD_FP, + Funct3.VMEM8, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + rs2=2, + rs2_rf=RegisterType.X, + imm2=0x0A2, + op=OpType.V_LOAD, + ), # vlse8.v v3, (x1), x2 + InstrTest( + 0x0A20D187, + Opcode.LOAD_FP, + Funct3.VMEM16, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + rs2=2, + rs2_rf=RegisterType.X, + imm2=0x0A2, + op=OpType.V_LOAD, + ), # vlse16.v v3, (x1), x2 + InstrTest( + 0x0A20E187, + Opcode.LOAD_FP, + Funct3.VMEM32, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + rs2=2, + rs2_rf=RegisterType.X, + imm2=0x0A2, + op=OpType.V_LOAD, + ), # vlse32.v v3, (x1), x2 + InstrTest( + 0x0A20F187, + Opcode.LOAD_FP, + Funct3.VMEM64, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + rs2=2, + rs2_rf=RegisterType.X, + imm2=0x0A2, + op=OpType.V_LOAD, + ), # vlse64.v v3, (x1), x2 + InstrTest( + 0x06208187, + Opcode.LOAD_FP, + Funct3.VMEM8, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + rs2=2, + rs2_rf=RegisterType.V, + imm2=0x062, + 
op=OpType.V_LOAD, + ), # vluxei8.v v3, (x1), v2 + InstrTest( + 0x0620D187, + Opcode.LOAD_FP, + Funct3.VMEM16, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + rs2=2, + rs2_rf=RegisterType.V, + imm2=0x062, + op=OpType.V_LOAD, + ), # vluxei16.v v3, (x1), v2 + InstrTest( + 0x0620E187, + Opcode.LOAD_FP, + Funct3.VMEM32, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + rs2=2, + rs2_rf=RegisterType.V, + imm2=0x062, + op=OpType.V_LOAD, + ), # vluxei32.v v3, (x1), v2 + InstrTest( + 0x0E208187, + Opcode.LOAD_FP, + Funct3.VMEM8, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + rs2=2, + rs2_rf=RegisterType.V, + imm2=0x0E2, + op=OpType.V_LOAD, + ), # vloxei8.v v3, (x1), v2 + InstrTest( + 0x0E20D187, + Opcode.LOAD_FP, + Funct3.VMEM16, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + rs2=2, + rs2_rf=RegisterType.V, + imm2=0x0E2, + op=OpType.V_LOAD, + ), # vloxei16.v v3, (x1), v2 + InstrTest( + 0x0E20E187, + Opcode.LOAD_FP, + Funct3.VMEM32, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + rs2=2, + rs2_rf=RegisterType.V, + imm2=0x0E2, + op=OpType.V_LOAD, + ), # vloxei32.v v3, (x1), v2 + InstrTest( + 0x020081A7, + Opcode.STORE_FP, + Funct3.VMEM8, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + imm2=0x020, + op=OpType.V_STORE, + ), # vse8.v v3, (x1) + InstrTest( + 0x0200D1A7, + Opcode.STORE_FP, + Funct3.VMEM16, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + imm2=0x020, + op=OpType.V_STORE, + ), # vse16.v v3, (x1) + InstrTest( + 0x0200E1A7, + Opcode.STORE_FP, + Funct3.VMEM32, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + imm2=0x020, + op=OpType.V_STORE, + ), # vse32.v v3, (x1) + InstrTest( + 0x0200F1A7, + Opcode.STORE_FP, + Funct3.VMEM64, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + imm2=0x020, + op=OpType.V_STORE, + ), # vse64.v v3, (x1) + InstrTest( + 0x02B081A7, + Opcode.STORE_FP, + Funct3.VMEM8, + 
rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + imm2=0x02B, + op=OpType.V_STORE, + ), # vsm.v v3, (x1) + InstrTest( + 0x0A2081A7, + Opcode.STORE_FP, + Funct3.VMEM8, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + rs2=2, + rs2_rf=RegisterType.X, + imm2=0x0A2, + op=OpType.V_STORE, + ), # vsse8.v v3, (x1), x2 + InstrTest( + 0x0A20D1A7, + Opcode.STORE_FP, + Funct3.VMEM16, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + rs2=2, + rs2_rf=RegisterType.X, + imm2=0x0A2, + op=OpType.V_STORE, + ), # vsse16.v v3, (x1), x2 + InstrTest( + 0x0A20E1A7, + Opcode.STORE_FP, + Funct3.VMEM32, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + rs2=2, + rs2_rf=RegisterType.X, + imm2=0x0A2, + op=OpType.V_STORE, + ), # vsse32.v v3, (x1), x2 + InstrTest( + 0x0A20F1A7, + Opcode.STORE_FP, + Funct3.VMEM64, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + rs2=2, + rs2_rf=RegisterType.X, + imm2=0x0A2, + op=OpType.V_STORE, + ), # vsse64.v v3, (x1), x2 + InstrTest( + 0x062081A7, + Opcode.STORE_FP, + Funct3.VMEM8, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + rs2=2, + rs2_rf=RegisterType.V, + imm2=0x062, + op=OpType.V_STORE, + ), # vsuxei8.v v3, (x1), v2 + InstrTest( + 0x0620D1A7, + Opcode.STORE_FP, + Funct3.VMEM16, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + rs2=2, + rs2_rf=RegisterType.V, + imm2=0x062, + op=OpType.V_STORE, + ), # vsuxei16.v v3, (x1), v2 + InstrTest( + 0x0620E1A7, + Opcode.STORE_FP, + Funct3.VMEM32, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + rs2=2, + rs2_rf=RegisterType.V, + imm2=0x062, + op=OpType.V_STORE, + ), # vsuxei32.v v3, (x1), v2 + InstrTest( + 0x0E2081A7, + Opcode.STORE_FP, + Funct3.VMEM8, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + rs2=2, + rs2_rf=RegisterType.V, + imm2=0x0E2, + op=OpType.V_STORE, + ), # vsoxei8.v v3, (x1), v2 + InstrTest( + 0x0E20D1A7, + Opcode.STORE_FP, + 
Funct3.VMEM16, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + rs2=2, + rs2_rf=RegisterType.V, + imm2=0x0E2, + op=OpType.V_STORE, + ), # vsoxei16.v v3, (x1), v2 + InstrTest( + 0x0E20E1A7, + Opcode.STORE_FP, + Funct3.VMEM32, + rd=3, + rd_rf=RegisterType.V, + rs1=1, + rs1_rf=RegisterType.X, + rs2=2, + rs2_rf=RegisterType.V, + imm2=0x0E2, + op=OpType.V_STORE, + ), # vsoxei32.v v3, (x1), v2 + ] + def setUp(self): gen = GenParams( test_core_config.replace( @@ -1686,6 +2059,10 @@ def test_v_control(self): for test in self.DECODER_TESTS_V_CONTROL: self.do_test(test) + def test_v_memory(self): + for test in self.DECODER_TESTS_V_MEMORY: + self.do_test(test) + class TestDecoderEExtLegal(TestCaseWithSimulator): E_TEST = [ diff --git a/test/fu/vector_unit/common.py b/test/fu/vector_unit/common.py index 2f2894ecd..2272901a9 100644 --- a/test/fu/vector_unit/common.py +++ b/test/fu/vector_unit/common.py @@ -18,14 +18,12 @@ def generate_vsetvl( ): v_params = gen_params.v_params instr = generate_instr(gen_params, layout, max_vl=max_vl, max_reg_bits=max_reg_bits) - vtype = generate_vtype(gen_params, max_vl=max_vl) - if const_lmul is not None: - vtype["lmul"] = const_lmul + if "rp_s1_reg" in instr: + instr["rp_s1_reg"] = instr["rp_s1"]["id"] + vtype = generate_vtype(gen_params, max_vl=max_vl, const_lmul=const_lmul) imm2 = convert_vtype_to_imm(vtype) while not allow_illegal and eew_to_bits(vtype["sew"]) > v_params.elen: - vtype = generate_vtype(gen_params, max_vl=max_vl) - if const_lmul is not None: - vtype["lmul"] = const_lmul + vtype = generate_vtype(gen_params, max_vl=max_vl, const_lmul=const_lmul) imm2 = convert_vtype_to_imm(vtype) vsetvl_type = random.randrange(4) if vsetvl_type == 2: diff --git a/test/fu/vector_unit/test_v_core.py b/test/fu/vector_unit/test_v_core.py index 62c89b9bd..14dbd8de4 100644 --- a/test/fu/vector_unit/test_v_core.py +++ b/test/fu/vector_unit/test_v_core.py @@ -6,6 +6,9 @@ from test.fu.vector_unit.common import * from collections 
import deque from parameterized import parameterized_class +from coreblocks.lsu.vector_lsu import * +from coreblocks.peripherals.wishbone import * +from test.peripherals.test_wishbone import WishboneInterfaceWrapper @parameterized_class(["seed", "register_bank_count", "test_number"], [(14, 1, 70), (15, 2, 40)]) @@ -22,6 +25,7 @@ def setUp(self): ) ) self.v_params = self.gen_params.v_params + wb_params = WishboneParameters(data_width=self.v_params.elen, addr_width=32) self.vxrs_layouts = VectorXRSLayout( self.gen_params, rs_entries_bits=log2_int(self.v_params.vxrs_entries, False) @@ -30,11 +34,21 @@ def setUp(self): self.rob_block_interrupt = MethodMock(i=self.gen_params.get(ROBLayouts).block_interrupts) self.rob_peek = MethodMock(o=self.gen_params.get(ROBLayouts).peek_layout) self.exception_report = MethodMock(i=self.gen_params.get(ExceptionRegisterLayouts).report) + self.get_reserved = MethodMock(o=[("reserved", 1)]) + self.set_reserved = MethodMock(i=[("reserved", 1)]) + self.bus = WishboneMaster(wb_params) + self.wishbone = WishboneInterfaceWrapper(self.bus.wbMaster) self.gen_params.get(DependencyManager).add_dependency( ROBBlockInterruptsKey(), self.rob_block_interrupt.get_method() ) self.gen_params.get(DependencyManager).add_dependency(ROBPeekKey(), self.rob_peek.get_method()) self.gen_params.get(DependencyManager).add_dependency(ExceptionReportKey(), self.exception_report.get_method()) + self.gen_params.get(DependencyManager).add_dependency( + LSUReservedKey(), (self.get_reserved.get_method(), self.set_reserved.get_method()) + ) + self.gen_params.get(DependencyManager).add_dependency(WishboneDataKey(), self.bus) + self.vlsu = VectorLSU(self.gen_params) + self.gen_params.get(DependencyManager).add_dependency(VectorLSUKey(), self.vlsu) self.circ = SimpleTestCircuit(VectorCore(self.gen_params)) self.m = ModuleConnector( @@ -42,12 +56,26 @@ def setUp(self): rob_block_interrupt=self.rob_block_interrupt, rob_peek=self.rob_peek, 
exception_report=self.exception_report, + get_reserved=self.get_reserved, + set_reserved=self.set_reserved, + vlsu=self.vlsu, + bus=self.bus, ) self.generator = get_vector_instr_generator() self.instr_q = deque() self.instr_ended_q = deque() self.lowest_used_rob_id = 0 + self.reserved = 0 + + @def_method_mock(lambda self: self.get_reserved) + def get_reserved_process(self): + return {"reserved": self.reserved} + + @def_method_mock(lambda self: self.set_reserved, sched_prio=1) + def set_reserved_process(self, reserved): + self.assertTrue(False) + self.reserved = reserved @def_method_mock(lambda self: self.rob_block_interrupt) def rob_block_interrupt_process(self, arg): @@ -179,3 +207,5 @@ def test_liveness(self): sim.add_sync_process(self.rob_block_interrupt_process) sim.add_sync_process(self.rob_peek_process) sim.add_sync_process(self.exception_report_process) + sim.add_sync_process(self.get_reserved_process) + sim.add_sync_process(self.set_reserved_process) diff --git a/test/fu/vector_unit/test_v_elems_downloader.py b/test/fu/vector_unit/test_v_elems_downloader.py index f27c7bfd9..86974f6f4 100644 --- a/test/fu/vector_unit/test_v_elems_downloader.py +++ b/test/fu/vector_unit/test_v_elems_downloader.py @@ -4,6 +4,7 @@ from coreblocks.fu.vector_unit.v_layouts import * from coreblocks.fu.vector_unit.v_elems_downloader import * from coreblocks.fu.vector_unit.vrf import * +from coreblocks.transactions.lib import * from test.fu.vector_unit.common import * from collections import deque from parameterized import parameterized_class @@ -22,19 +23,30 @@ def setUp(self): vector_config=VectorUnitConfiguration(vrp_count=self.vrp_count, _vrl_count=7) ) ) - self.test_number = 30 + self.test_number = 50 self.v_params = self.gen_params.v_params self.layout = VectorBackendLayouts(self.gen_params) + self.vrf_layout = VRFFragmentLayouts(self.gen_params) vrf = VRFFragment(gen_params=self.gen_params) self.fu_receiver = MethodMock(i=self.layout.downloader_data_out) self.circ = 
SimpleTestCircuit( - VectorElemsDownloader(self.gen_params, vrf.read_req, vrf.read_resp, self.fu_receiver.get_method()) + VectorElemsDownloader( + self.gen_params, + vrf.read_req, + vrf.read_resp, + self.fu_receiver.get_method(), + ) ) self.write = TestbenchIO(AdapterTrans(vrf.write)) - self.m = ModuleConnector(circ=self.circ, fu_receiver=self.fu_receiver, vrf=vrf, vrf_write=self.write) + self.m = ModuleConnector( + circ=self.circ, + fu_receiver=self.fu_receiver, + vrf=vrf, + vrf_write=self.write, + ) self.received_data = deque() diff --git a/test/fu/vector_unit/test_v_frontend.py b/test/fu/vector_unit/test_v_frontend.py index 9f56368ce..ec843a5e1 100644 --- a/test/fu/vector_unit/test_v_frontend.py +++ b/test/fu/vector_unit/test_v_frontend.py @@ -23,7 +23,6 @@ def setUp(self): self.put_vvrs = MethodMock(i=self.layouts.instr_to_vvrs) self.announce = MethodMock(i=FuncUnitLayouts(self.gen_params).accept) self.announce2 = MethodMock(i=FuncUnitLayouts(self.gen_params).accept) - self.announce_mult = MethodMock(i=self.layouts.translator_report_multiplier) self.exception_report = MethodMock(i=self.gen_params.get(ExceptionRegisterLayouts).report) self.gen_params.get(DependencyManager).add_dependency(ExceptionReportKey(), self.exception_report.get_method()) self.frat = FRAT(gen_params=self.gen_params, superscalarity=2) @@ -37,7 +36,6 @@ def setUp(self): self.rob_block_interrupt.get_method(), self.announce.get_method(), self.announce2.get_method(), - self.announce_mult.get_method(), self.freerf.allocate, self.frat.get_rename_list[0], self.frat.get_rename_list[1], @@ -56,7 +54,6 @@ def setUp(self): put_vvrs=self.put_vvrs, announce=self.announce, announce2=self.announce2, - announce_mult=self.announce_mult, exception_report=self.exception_report, deallocate=self.deallocate, initialise=ModuleConnector(*self.initialise_list), @@ -68,7 +65,6 @@ def setUp(self): self.received_instr_mem = deque() self.received_block_interrupt = deque() self.received_announce = deque() - 
self.received_mult = deque() self._robs = deque() self._org_robs = deque() self.initialise_requests = deque() @@ -86,6 +82,7 @@ def f(): @def_method_mock(lambda self: self.exception_report) def report_process(self, arg): + # We don't expect any errors self.assertTrue(False) @def_method_mock(lambda self: self.put_vvrs) @@ -114,10 +111,6 @@ def announce_process(self, arg): def announce2_process(self, arg): self.received_announce.append(arg) - @def_method_mock(lambda self: self.announce_mult) - def mult_process(self, arg): - self.received_mult.append(arg) - def input_process(self, generator): def f(): for i in range(self.test_number): @@ -143,23 +136,23 @@ def remove_duplicates(self, lista): return nowa def checker(self): - while len(self.received_mult) + len(self.received_announce) < self.test_number: + while len(self.received_instr) + len(self.received_instr_mem) + len(self.received_announce) < self.test_number: yield # be sure that there is no other instructions in pipeline - yield from self.tick(30) - self.assertEqual(len(self.received_mult) + len(self.received_announce), self.test_number) + yield from self.tick(2) + self.assertEqual( + len(self.received_instr) + len(self.received_instr_mem) + len(self.received_announce), self.test_number + ) def compare_fields(org_instr, vtype, list_to_pop): - lmul = lmul_to_int(vtype["lmul"]) - for _ in range(lmul): - announced_instr = list_to_pop.popleft() - self.assertFieldsEqual(org_instr, announced_instr, ["rob_id", "exec_fn"]) + announced_instr = list_to_pop.popleft() + self.assertFieldsEqual(org_instr, announced_instr, ["rob_id", "exec_fn"]) for org_instr, vtype in self.orginal_instr: if org_instr["exec_fn"]["op_type"] == OpType.V_CONTROL: announced_instr = self.received_announce.popleft() self.assertFieldsEqual(org_instr, announced_instr, ["rob_id", "rp_dst"]) - elif org_instr["exec_fn"]["op_type"] == OpType.V_MEMORY: + elif org_instr["exec_fn"]["op_type"] in [OpType.V_LOAD, OpType.V_STORE]: compare_fields(org_instr, 
vtype, self.received_instr_mem) else: compare_fields(org_instr, vtype, self.received_instr) @@ -176,7 +169,7 @@ def deallocator_process(self): def test_random(self): random.seed(14) - with self.run_simulation(self.m) as sim: + with self.run_simulation(self.m, 500) as sim: sim.add_sync_process(self.checker) sim.add_sync_process( self.input_process( @@ -190,29 +183,6 @@ def test_random(self): sim.add_sync_process(self.rob_block_interrupt_process) sim.add_sync_process(self.announce_process) sim.add_sync_process(self.announce2_process) - sim.add_sync_process(self.mult_process) - sim.add_sync_process(self.report_process) - sim.add_sync_process(self.deallocator_process) - for f in self.initialise_process_list: - sim.add_sync_process(f) - - def test_heavy_load(self): - random.seed(15) - with self.run_simulation(self.m) as sim: - sim.add_sync_process(self.checker) - sim.add_sync_process( - self.input_process( - lambda: self.generate_vector_instr( - self.gen_params, self.layouts.verification_in, vsetvl_different_rp_id=True, const_lmul=LMUL.m8 - ) - ) - ) - sim.add_sync_process(self.put_vvrs_process) - sim.add_sync_process(self.put_mem_process) - sim.add_sync_process(self.rob_block_interrupt_process) - sim.add_sync_process(self.announce_process) - sim.add_sync_process(self.announce2_process) - sim.add_sync_process(self.mult_process) sim.add_sync_process(self.report_process) sim.add_sync_process(self.deallocator_process) for f in self.initialise_process_list: diff --git a/test/fu/vector_unit/test_v_instruction_verification.py b/test/fu/vector_unit/test_v_instruction_verification.py index 74d7bcd81..110576571 100644 --- a/test/fu/vector_unit/test_v_instruction_verification.py +++ b/test/fu/vector_unit/test_v_instruction_verification.py @@ -9,6 +9,7 @@ class TestVInstructionVerification(TestCaseWithSimulator): def setUp(self): + self.maxDiff = None random.seed(14) self.gen_params = GenParams(test_vector_core_config) self.test_number = 100 @@ -56,6 +57,8 @@ def 
test_passing(self): data_q = deque() rbi_q = deque() put_q = deque() + retire_q = deque() + report_q = deque() def create_mocks(): @def_method_mock(lambda: self.rob_block_interrupts) @@ -76,19 +79,37 @@ def get_vstart(): @def_method_mock(lambda: self.retire) def retire(rob_id, result, rp_dst, exception): - self.assertTrue(False) + self.assertTrue(retire_q, f"rob_id: {rob_id}") + data = retire_q.popleft() + self.assertEqual(exception, 1) + self.assertEqual(rp_dst, data["rp_dst"]) @def_method_mock(lambda: self.exception_report) def report(rob_id, cause): - self.assertTrue(False) + self.assertTrue(report_q, f"rob_id: {rob_id}") + report_q.popleft() + self.assertEqual(ExceptionCause.ILLEGAL_INSTRUCTION, cause) return rbi, put_instr, get_vill, get_vstart, retire, report + def _get_load_store_width(funct3): + try: + return eew_to_bits(load_store_width_to_eew(funct3)) + except ValueError: + return 64 + def process(): for _ in range(self.test_number): data = generate_instr(self.gen_params, self.vf_layout.verification_in, support_vector=True) + if ( + data["exec_fn"]["op_type"] in [OpType.V_LOAD, OpType.V_STORE] + and _get_load_store_width(data["exec_fn"]["funct3"]) > self.v_params.elen + ): + retire_q.append(data) + report_q.append(data) + else: + data_q.append(data) yield from self.circ.issue.call(data) - data_q.append(data) # wait few cycles to be sure that all mocks were called for _ in range(2): yield diff --git a/test/fu/vector_unit/test_v_status.py b/test/fu/vector_unit/test_v_status.py index aa00d90c5..ec013a4a5 100644 --- a/test/fu/vector_unit/test_v_status.py +++ b/test/fu/vector_unit/test_v_status.py @@ -68,7 +68,7 @@ def process(): self.assertEqual(org_vtype["vl"], retire_data["result"]) for (org_data, org_vtype), resp in zip(data_normal_q, put_q): - self.assertDictContainsSubset(get_dict_without(org_data, ["imm2"]), resp) + self.assertDictContainsSubset(get_dict_without(org_data, ["imm2", "rp_s1_reg"]), resp) self.assertDictContainsSubset({"vtype": org_vtype}, 
resp) with self.run_simulation(self.m) as sim: diff --git a/test/fu/vector_unit/test_vrf.py b/test/fu/vector_unit/test_vrf.py index ee23a1b3a..386bd18fd 100644 --- a/test/fu/vector_unit/test_vrf.py +++ b/test/fu/vector_unit/test_vrf.py @@ -37,76 +37,62 @@ def setUp(self): self.circ = SimpleTestCircuit(VRFFragment(gen_params=gp)) self.read_port_count = 4 - self.test_number = 100 + self.test_iterations = 10 + self.iteration_len = 10 self.reference_memory = [ [2**self.vp.elen - 1 for __ in range(self.vp.elens_in_bank)] for _ in range(self.vp.vrp_count) ] - self.expected_reads = deque() - self.vrp_running = deque() + self.expected_reads = [deque() for i in range(self.read_port_count)] self.received_reads = deque() self.passive_requestors = 0 self.passive_receivers = 0 + self.barierr_writer = SimBarrier(1 + self.read_port_count) + self.read_cond_var = CondVar(transparent=False) def writer(self): - cycle = 0 - for _ in range(self.test_number): - cycle += 1 - req = generate_write(self.vp) - yield from self.circ.write.call(req) - yield Settle() - current_val = self.reference_memory[req["vrp_id"]][req["addr"]] - new_val = (req["data"] & expand_mask(req["valid_mask"])) | (current_val & ~expand_mask(req["valid_mask"])) - self.reference_memory[req["vrp_id"]][req["addr"]] = new_val - while random.random() < self.wait_chance: - yield + for __ in range(self.test_iterations): + for _ in range(self.iteration_len): + req = generate_write(self.vp) + yield from self.circ.write.call(req) + yield Settle() + current_val = self.reference_memory[req["vrp_id"]][req["addr"]] + new_val = (req["data"] & expand_mask(req["valid_mask"])) | ( + current_val & ~expand_mask(req["valid_mask"]) + ) + self.reference_memory[req["vrp_id"]][req["addr"]] = new_val + while random.random() < self.wait_chance: + yield + yield from self.read_cond_var.notify() + yield from self.barierr_writer.wait() def generate_read_requestor(self, k): def f(): - for _ in range(self.test_number): - req = 
generate_read_req(self.vp) - yield from self.circ.read_req[k].call(req) - self.expected_reads.append((req["vrp_id"], self.reference_memory[req["vrp_id"]][req["addr"]])) - self.vrp_running.append(req["vrp_id"]) - while random.random() < self.wait_chance: - yield - self.passive_requestors += 1 + for __ in range(self.test_iterations): + yield from self.read_cond_var.wait() + for _ in range(self.iteration_len): + req = generate_read_req(self.vp) + yield from self.circ.read_req[k].call(req) + self.expected_reads[k].append(self.reference_memory[req["vrp_id"]][req["addr"]]) + while random.random() < self.wait_chance: + yield + yield from self.barierr_writer.wait() return f def generate_read_receiver(self, k): def f(): - for _ in range(self.test_number): - while not self.vrp_running: - yield - vrp_id = self.vrp_running.popleft() - d = {"vrp_id": vrp_id} - resp = yield from self.circ.read_resp[k].call(d) - self.received_reads.append((vrp_id, resp["data"])) + for _ in range(self.test_iterations * self.iteration_len): + resp = yield from self.circ.read_resp[k].call() + self.assertIn(resp["data"], self.expected_reads[k]) + self.expected_reads[k].remove(resp["data"]) while random.random() < self.wait_chance: yield - self.passive_receivers += 1 - yield Passive() return f - def checker(self): - while self.passive_requestors + self.passive_receivers < 2 * self.read_port_count: - yield - - for vrp_id, val in self.expected_reads: - for rec in self.received_reads: - rec_vrp, rec_val = rec - if rec_vrp == vrp_id: - self.assertEqual(val, rec_val, f"For {vrp_id}") - break - else: - raise RuntimeError(f"Not found: {vrp_id} in {self.received_reads}") - self.received_reads.remove(rec) - def test_random(self): with self.run_simulation(self.circ, 5000) as sim: sim.add_sync_process(self.writer) - sim.add_sync_process(self.checker) for i in range(self.read_port_count): sim.add_sync_process(self.generate_read_receiver(i)) sim.add_sync_process(self.generate_read_requestor(i)) diff --git 
a/test/lsu/test_vector_lsu.py b/test/lsu/test_vector_lsu.py new file mode 100644 index 000000000..e9d9ab927 --- /dev/null +++ b/test/lsu/test_vector_lsu.py @@ -0,0 +1,226 @@ +import math +import random +from test.common import * +from coreblocks.lsu.vector_lsu import * +from coreblocks.fu.vector_unit.v_layouts import * +from coreblocks.params.configurations import * +from test.fu.vector_unit.common import * +from collections import deque +from coreblocks.peripherals.wishbone import * +from test.peripherals.test_wishbone import WishboneInterfaceWrapper + + +class VRFStub: + def __init__(self, gen_params: GenParams): + self.gen_params = gen_params + self.v_params = self.gen_params.v_params + self.write = MethodMock(i=self.gen_params.get(VRFFragmentLayouts).write) + self.read_req = MethodMock(i=self.gen_params.get(VRFFragmentLayouts).read_req) + self.read_resp = MethodMock(o=self.gen_params.get(VRFFragmentLayouts).read_resp_o) + + self.methods = ModuleConnector(write=self.write, read_req=self.read_req, read_resp=self.read_resp) + + self.regs = [[0 for __ in range(self.v_params.elens_in_bank)] for _ in range(self.v_params.vrp_count)] + self.reqs = deque() + + @def_method_mock(lambda self: self.write) + def write_process(self, vrp_id, addr, valid_mask, data): + expanded = expand_mask(valid_mask) + # check if LSU want to save the same value as generated by wishbone process + assert (self.regs[vrp_id][addr] & expanded) == (data & expanded) + + @def_method_mock(lambda self: self.read_req, sched_prio=1) + def read_req_process(self, arg): + self.reqs.append(arg) + + @def_method_mock(lambda self: self.read_resp, enable=lambda self: self.reqs) + def read_resp_process(self): + req = self.reqs.popleft() + return {"data": self.regs[req["vrp_id"]][req["addr"]]} + + +class TestVectorLSU(TestCaseWithSimulator): + def setUp(self): + random.seed(14) + self.gen_params = GenParams( + test_vector_core_config.replace(vector_config=VectorUnitConfiguration(vrp_count=8, _vrl_count=7)) + ) + 
self.test_number = 40 + self.v_params = self.gen_params.v_params + wb_params = WishboneParameters(data_width=self.v_params.elen, addr_width=32) + self.layouts = self.gen_params.get(VectorLSULayouts) + self.vxrs_layouts = VectorXRSLayout( + self.gen_params, rs_entries_bits=log2_int(self.v_params.vxrs_entries, False) + ) + self.scoreboard_layout = ScoreboardLayouts(self.v_params.vrp_count) + + self.exception_report = MethodMock(i=self.gen_params.get(ExceptionRegisterLayouts).report) + self.insert_x = MethodMock(i=self.vxrs_layouts.insert_in) + self.get_reserved = MethodMock(o=[("reserved", 1)]) + self.set_reserved = MethodMock(i=[("reserved", 1)]) + self.scoreboard_get_dirty = MethodMock( + i=self.scoreboard_layout.get_dirty_in, o=self.scoreboard_layout.get_dirty_out + ) + self.scoreboard_set_dirty = MethodMock(i=self.scoreboard_layout.set_dirty_in) + self.vrfs = [VRFStub(self.gen_params) for _ in range(self.v_params.register_bank_count)] + + self.bus = WishboneMaster(wb_params) + self.wishbone = WishboneInterfaceWrapper(self.bus.wbMaster) + + self.connections = self.gen_params.get(DependencyManager) + self.connections.add_dependency(ExceptionReportKey(), self.exception_report.get_method()) + self.connections.add_dependency( + LSUReservedKey(), (self.get_reserved.get_method(), self.set_reserved.get_method()) + ) + self.connections.add_dependency(WishboneDataKey(), self.bus) + self.connections.add_dependency( + VectorVRFAccessKey(), + ( + [vrf.write.get_method() for vrf in self.vrfs], + [vrf.read_req.get_method() for vrf in self.vrfs], + [vrf.read_resp.get_method() for vrf in self.vrfs], + ), + ) + self.connections.add_dependency(VectorFrontendInsertKey(), self.insert_x.get_method()) + self.connections.add_dependency( + VectorScoreboardKey(), (self.scoreboard_get_dirty.get_method(), self.scoreboard_set_dirty.get_method()) + ) + self.circ = SimpleTestCircuit(VectorLSU(self.gen_params)) + self.m = ModuleConnector( + circ=self.circ, + vrfs=ModuleConnector(*[vrf.methods 
for vrf in self.vrfs]), + bus=self.bus, + exception_report=self.exception_report, + insert_x=self.insert_x, + get_reserved=self.get_reserved, + set_reserved=self.set_reserved, + scoreboard_get=self.scoreboard_get_dirty, + scoreboard_set=self.scoreboard_set_dirty, + ) + + self.current_instr = None + self.elens_to_send = -1 + self.reserved = 0 + + @def_method_mock(lambda self: self.exception_report) + def exception_process(self, arg): + self.assertTrue(False) + + @def_method_mock(lambda self: self.insert_x) + def insert_x_process(self, arg): + pass + + @def_method_mock(lambda self: self.get_reserved) + def get_reserved_process(self): + return {"reserved": self.reserved} + + @def_method_mock(lambda self: self.set_reserved, sched_prio=1) + def set_reserved_process(self, reserved): + self.reserved = reserved + + @def_method_mock(lambda self: self.scoreboard_get_dirty) + def scoreboard_get_dirty_process(self, arg): + return {"dirty": 0} + + @def_method_mock(lambda self: self.scoreboard_set_dirty) + def scoreboard_set_dirty_process(self, arg): + pass + + def wishbone_process(self): + yield Passive() + current_elen = 0 + while True: + yield from self.wishbone.slave_wait() + self.assertIsNotNone(self.current_instr) + assert self.current_instr is not None + elems_in_elen = self.v_params.elen // eew_to_bits(self.current_instr["vtype"]["sew"]) + elens_to_check = math.ceil(self.current_instr["vtype"]["vl"] / elems_in_elen) + elems_in_last_elen = self.current_instr["vtype"]["vl"] % elems_in_elen + + is_load = self.current_instr["exec_fn"]["op_type"] == OpType.V_LOAD + if is_load: + exp_data = 0 + exp_sel = 0 + else: + exp_data = self.vrfs[current_elen // self.v_params.elens_in_bank].regs[ + self.current_instr["rp_s3"]["id"] + ][current_elen % self.v_params.elens_in_bank] + if current_elen + 1 == elens_to_check and elems_in_last_elen != 0: + exp_sel = elem_mask_to_byte_mask( + self.v_params.elen, 2**elems_in_last_elen - 1, self.current_instr["vtype"]["sew"] + ) + else: + exp_sel 
= 2**self.v_params.bytes_in_elen - 1 + + yield from self.wishbone.slave_verify( + self.current_instr["s1_val"] // 4 + current_elen, exp_data, not is_load, exp_sel + ) + + if is_load: + resp_data = random.randrange(2**self.v_params.elen) + self.vrfs[current_elen // self.v_params.elens_in_bank].regs[self.current_instr["rp_dst"]["id"]][ + current_elen % self.v_params.elens_in_bank + ] = resp_data + else: + resp_data = 0 + yield from self.wishbone.slave_respond(resp_data) + yield Settle() + current_elen += 1 + self.elens_to_send -= 1 + if current_elen == elens_to_check: + current_elen = 0 + + def insert_process(self): + for _ in range(self.test_number): + instr = generate_instr( + self.gen_params, + self.layouts.rs_data_layout, + const_lmul=LMUL.m1, + optypes=[OpType.V_LOAD, OpType.V_STORE], + funct3=[Funct3.VMEM8, Funct3.VMEM16, Funct3.VMEM32], + max_reg_bits=3, + overwriting={"rp_s3": {"type": RegisterType.V}, "rp_dst": {"type": RegisterType.V}}, + ) + instr["s1_val"] &= ~0x3 + self.current_instr = instr + elems_in_elen = self.v_params.elen // eew_to_bits(self.current_instr["vtype"]["sew"]) + self.elens_to_send = math.ceil(self.current_instr["vtype"]["vl"] / elems_in_elen) + yield from self.circ.select.call() + yield from self.circ.insert_v.call(instr) + yield from self.circ.update_v.call(tag=instr["rp_s3"], value=0) + + while self.elens_to_send != 0: + result = yield from self.circ.get_result_v.call_try() + self.assertIsNone(result) + + result = yield from self.circ.get_result_v.call() + self.assertEqual(result["rob_id"], instr["rob_id"]) + self.assertEqual(result["rp_dst"], instr["rp_dst"]) + self.assertEqual(result["result"], 0) + self.assertEqual(result["exception"], 0) + + def precommit_process(self): + yield Passive() + while True: + if self.current_instr is None: + yield + else: + yield from self.circ.precommit.call(rob_id=self.current_instr["rob_id"]) + + def test_random(self): + with self.run_simulation( + self.m, + ) as sim: + 
sim.add_sync_process(self.insert_process) + sim.add_sync_process(self.wishbone_process) + sim.add_sync_process(self.precommit_process) + sim.add_sync_process(self.exception_process) + sim.add_sync_process(self.insert_x_process) + sim.add_sync_process(self.get_reserved_process) + sim.add_sync_process(self.set_reserved_process) + sim.add_sync_process(self.scoreboard_get_dirty_process) + sim.add_sync_process(self.scoreboard_set_dirty_process) + for i in range(self.v_params.register_bank_count): + sim.add_sync_process(self.vrfs[i].write_process) + sim.add_sync_process(self.vrfs[i].read_req_process) + sim.add_sync_process(self.vrfs[i].read_resp_process) diff --git a/test/regression/benchmark.py b/test/regression/benchmark.py index e43b6ee81..02252c787 100644 --- a/test/regression/benchmark.py +++ b/test/regression/benchmark.py @@ -54,13 +54,15 @@ async def run_benchmark(sim_backend: SimulationBackend, benchmark_name: str): mem_model = CoreMemoryModel(mem_segments) - success = await sim_backend.run(mem_model, timeout_cycles=5000000) + success = await sim_backend.run(mem_model, timeout_cycles=1000000) if not success: raise RuntimeError("Simulation timed out") if mmio.return_code() != 0: - raise RuntimeError("The benchmark exited with a non-zero return code: %d" % mmio.return_code()) + raise RuntimeError( + "The benchmark exited with a non-zero return code: {0:d} (hex: 0x{0:X})".format(mmio.return_code()) + ) results = {"cycle": mmio.cycle_cnt(), "instr": mmio.instr_cnt()} diff --git a/test/regression/pysim.py b/test/regression/pysim.py index aedf32f60..832700e39 100644 --- a/test/regression/pysim.py +++ b/test/regression/pysim.py @@ -9,13 +9,15 @@ from coreblocks.core import Core from coreblocks.params import GenParams -from coreblocks.params.configurations import full_core_config +from coreblocks.params.configurations import full_core_config, CoreConfiguration from coreblocks.peripherals.wishbone import WishboneBus class PySimulation(SimulationBackend): - def 
__init__(self, verbose: bool, traces_file: Optional[str] = None): - self.gp = GenParams(full_core_config) + def __init__( + self, verbose: bool, traces_file: Optional[str] = None, core_conf: CoreConfiguration = full_core_config + ): + self.gp = GenParams(core_conf) self.running = False self.cycle_cnt = 0 self.verbose = verbose diff --git a/test/structs_common/test_scoreboard.py b/test/structs_common/test_scoreboard.py index ac37f28ed..4abd4ad6d 100644 --- a/test/structs_common/test_scoreboard.py +++ b/test/structs_common/test_scoreboard.py @@ -19,7 +19,7 @@ def setUp(self): def create_process(self, k, forward): def f(): for _ in range(self.test_number): - id = random.randrange(self.entries_count) + id = self.generator((yield Now()), lambda: random.randrange(self.entries_count)) dirty = yield from self.circ.get_dirty_list[k].call(id=id) if forward: yield Settle() @@ -36,19 +36,7 @@ def f(): @parameterized.expand([(False,), (True,)]) def test_random(self, forward): self.circ = SimpleTestCircuit(Scoreboard(self.entries_count, self.superscalarity, data_forward=forward)) + self.generator = get_unique_generator() with self.run_simulation(self.circ) as sim: for k in range(self.superscalarity): sim.add_sync_process(self.create_process(k, forward)) - - def conflict_process(self): - yield from self.circ.set_dirty_list[0].call_init(id=0, dirty=0) - yield from self.circ.set_dirty_list[1].call_init(id=0, dirty=0) - yield Settle() - yield - self.assertEqual((yield from self.circ.set_dirty_list[0].done()), 1) - self.assertEqual((yield from self.circ.set_dirty_list[1].done()), 0) - - def test_conflict(self): - self.circ = SimpleTestCircuit(Scoreboard(self.entries_count, self.superscalarity, data_forward=False)) - with self.run_simulation(self.circ) as sim: - sim.add_sync_process(self.conflict_process) diff --git a/test/test_core.py b/test/test_core.py index d00102ff9..ecdddd85b 100644 --- a/test/test_core.py +++ b/test/test_core.py @@ -7,7 +7,7 @@ from coreblocks.core import Core 
from coreblocks.params import GenParams -from coreblocks.params.configurations import CoreConfiguration, basic_core_config, full_core_config +from coreblocks.params.configurations import CoreConfiguration, basic_core_config, full_core_config, vector_core_config from coreblocks.peripherals.wishbone import WishboneBus, WishboneMemorySlave from typing import Optional, cast @@ -240,9 +240,32 @@ def test_randomized(self): @parameterized_class( ("name", "source_file", "cycle_count", "expected_regvals", "configuration"), [ - ("fibonacci", "fibonacci.asm", 1200, {2: 2971215073}, basic_core_config), - ("fibonacci_mem", "fibonacci_mem.asm", 610, {3: 55}, basic_core_config), + ("fibonacci", "fibonacci.asm", 900, {2: 2971215073}, basic_core_config), + ("fibonacci_mem", "fibonacci_mem.asm", 570, {3: 55}, basic_core_config), ("csr", "csr.asm", 200, {1: 1, 2: 4}, full_core_config), + ( + "vector", + "vector.asm", + 500, # 500 + {1: 3, 2: 6, 3: 15, 4: 30, 5: 9, 6: 0, 7: 42, 8: 126, 9: 0xDEADBEEF}, + vector_core_config, + ), + ( + "vector_bytes", + "vector_bytes.asm", + 730, # 630 + { + 1: 10, + 2: 20 + (300 % 256) * 2**8 + 250 * 2**16, + 3: 50, + 4: 100, + 5: 30, + 6: 0 + (2550 % 256) * 2**8, + 7: 140, + 8: (420 % 256) + (1810 % 256) * 2**8 + 15 * 2**16 + 0x55 * 2**24, + }, + vector_core_config, + ), ], ) class TestCoreAsmSource(TestCoreBase): @@ -270,7 +293,7 @@ def test_asm_source(self): "-mabi=ilp32", # Specified manually, because toolchains from most distributions don't support new extensioins # and this test should be accessible locally. 
- "-march=rv32im_zicsr", + "-march=rv32im_zicsr_zve32x", "-o", asm_tmp.name, self.base_dir + self.source_file, diff --git a/test/utils/test_fifo.py b/test/utils/test_fifo.py index 59a79130f..78ee52654 100644 --- a/test/utils/test_fifo.py +++ b/test/utils/test_fifo.py @@ -93,7 +93,8 @@ def target(): yield v = yield from fifoc.read_methods[port_id].call_try() - if v is not None: + yield Settle() + if v is not None and (not clears or cycle != clears[-1]): readed.append((cycle, port_id, v["data"])) packet_counter -= 1 if packet_counter == 0: