From 30b55cd8bba332b1faf93ff9993e42bd8c3fc99b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Urba=C5=84czyk?= Date: Sun, 31 Mar 2024 11:11:06 +0100 Subject: [PATCH 1/5] Support for superscalarity in the instruction cache (#624) --- coreblocks/cache/icache.py | 57 +++++------ coreblocks/cache/iface.py | 2 +- coreblocks/cache/refiller.py | 98 ++++++++++++------- coreblocks/frontend/fetch/fetch.py | 16 +++- coreblocks/interface/layouts.py | 11 ++- coreblocks/params/configurations.py | 8 +- coreblocks/params/genparams.py | 14 ++- coreblocks/params/icache_params.py | 40 +++++--- test/cache/test_icache.py | 144 ++++++++++++++++------------ test/frontend/test_fetch.py | 4 +- test/frontend/test_rvc.py | 4 +- test/regression/memory.py | 4 +- test/test_core.py | 2 +- 13 files changed, 246 insertions(+), 158 deletions(-) diff --git a/coreblocks/cache/icache.py b/coreblocks/cache/icache.py index f94c6e07c..bcfbd37cc 100644 --- a/coreblocks/cache/icache.py +++ b/coreblocks/cache/icache.py @@ -11,6 +11,7 @@ from coreblocks.interface.layouts import ICacheLayouts from transactron.utils import assign, OneHotSwitchDynamic from transactron.lib import * +from transactron.lib import logging from coreblocks.peripherals.bus_adapter import BusMasterInterface from coreblocks.cache.iface import CacheInterface, CacheRefillerInterface @@ -21,19 +22,7 @@ "ICacheBypass", ] - -def extract_instr_from_word(m: TModule, params: ICacheParameters, word: Signal, addr: Value): - instr_out = Signal(params.instr_width) - if len(word) == 32: - m.d.comb += instr_out.eq(word) - elif len(word) == 64: - with m.If(addr[2] == 0): - m.d.comb += instr_out.eq(word[:32]) # Take lower 4 bytes - with m.Else(): - m.d.comb += instr_out.eq(word[32:]) # Take upper 4 bytes - else: - raise RuntimeError("Word size different than 32 and 64 is not supported") - return instr_out +log = logging.HardwareLogger("frontend.icache") class ICacheBypass(Elaboratable, CacheInterface): @@ -45,6 +34,9 @@ def __init__(self, layouts: 
ICacheLayouts, params: ICacheParameters, bus_master: self.accept_res = Method(o=layouts.accept_res) self.flush = Method() + if params.words_in_fetch_block != 1: + raise ValueError("ICacheBypass only supports fetch block size equal to the word size.") + def elaborate(self, platform): m = TModule() @@ -63,7 +55,7 @@ def _(addr: Value) -> None: def _(): res = self.bus_master.get_read_response(m) return { - "instr": extract_instr_from_word(m, self.params, res.data, req_addr), + "fetch_block": res.data, "error": res.err, } @@ -82,10 +74,10 @@ class ICache(Elaboratable, CacheInterface): Refilling a cache line is abstracted away from this module. ICache module needs two methods from the refiller `refiller_start`, which is called whenever we need to refill a cache line. - `refiller_accept` should be ready to be called whenever the refiller has another word ready - to be written to cache. `refiller_accept` should set `last` bit when either an error occurs - or the transfer is over. After issuing `last` bit, `refiller_accept` shouldn't be ready until - the next transfer is started. + `refiller_accept` should be ready to be called whenever the refiller has another fetch block + ready to be written to cache. `refiller_accept` should set `last` bit when either an error + occurs or the transfer is over. After issuing `last` bit, `refiller_accept` shouldn't be ready + until the next transfer is started. 
""" def __init__(self, layouts: ICacheLayouts, params: ICacheParameters, refiller: CacheRefillerInterface) -> None: @@ -194,12 +186,10 @@ def elaborate(self, platform): tag_hit = [tag_data.valid & (tag_data.tag == request_addr.tag) for tag_data in self.mem.tag_rd_data] tag_hit_any = reduce(operator.or_, tag_hit) - mem_out = Signal(self.params.word_width) + mem_out = Signal(self.params.fetch_block_bytes * 8) for i in OneHotSwitchDynamic(m, Cat(tag_hit)): m.d.comb += mem_out.eq(self.mem.data_rd_data[i]) - instr_out = extract_instr_from_word(m, self.params, mem_out, Value.cast(request_addr)) - refill_error_saved = Signal() m.d.comb += needs_refill.eq(request_valid & ~tag_hit_any & ~refill_error_saved) @@ -208,7 +198,7 @@ def elaborate(self, platform): self.perf_misses.incr(m, cond=refill_finish_last) self.perf_hits.incr(m, cond=~refill_finish_last) - self.res_fwd.write(m, instr=instr_out, error=refill_error_saved) + self.res_fwd.write(m, fetch_block=mem_out, error=refill_error_saved) m.d.sync += refill_error_saved.eq(0) @def_method(m, self.accept_res) @@ -245,6 +235,7 @@ def _(addr: Value) -> None: @def_method(m, self.flush, ready=accepting_requests) def _() -> None: + log.info(m, True, "Flushing the cache...") m.d.sync += flush_index.eq(0) m.d.comb += flush_start.eq(1) @@ -254,6 +245,7 @@ def _() -> None: with Transaction().body(m, request=fsm.ongoing("LOOKUP") & needs_refill): # Align to the beginning of the cache line aligned_addr = self.serialize_addr(request_addr) & ~((1 << self.params.offset_bits) - 1) + log.debug(m, True, "Refilling line 0x{:x}", aligned_addr) self.refiller.start_refill(m, addr=aligned_addr) m.d.sync += refill_finish_last.eq(0) @@ -265,7 +257,7 @@ def _() -> None: m.d.top_comb += [ self.mem.data_wr_addr.index.eq(deserialized["index"]), self.mem.data_wr_addr.offset.eq(deserialized["offset"]), - self.mem.data_wr_data.eq(ret.data), + self.mem.data_wr_data.eq(ret.fetch_block), ] m.d.comb += self.mem.data_wr_en.eq(1) @@ -301,7 +293,7 @@ class 
ICacheMemory(Elaboratable): Writes are multiplexed using one-hot `way_wr_en` signal. Read data lines from all ways are separately exposed (as an array). - The data memory is addressed using a machine word. + The data memory is addressed using fetch blocks. """ def __init__(self, params: ICacheParameters) -> None: @@ -319,11 +311,13 @@ def __init__(self, params: ICacheParameters) -> None: self.data_addr_layout = make_layout(("index", self.params.index_bits), ("offset", self.params.offset_bits)) + self.fetch_block_bits = params.fetch_block_bytes * 8 + self.data_rd_addr = Signal(self.data_addr_layout) - self.data_rd_data = Array([Signal(self.params.word_width) for _ in range(self.params.num_of_ways)]) + self.data_rd_data = Array([Signal(self.fetch_block_bits) for _ in range(self.params.num_of_ways)]) self.data_wr_addr = Signal(self.data_addr_layout) self.data_wr_en = Signal() - self.data_wr_data = Signal(self.params.word_width) + self.data_wr_data = Signal(self.fetch_block_bits) def elaborate(self, platform): m = TModule() @@ -345,17 +339,18 @@ def elaborate(self, platform): tag_mem_wp.en.eq(self.tag_wr_en & way_wr), ] - data_mem = Memory(width=self.params.word_width, depth=self.params.num_of_sets * self.params.words_in_block) + data_mem = Memory( + width=self.fetch_block_bits, depth=self.params.num_of_sets * self.params.fetch_blocks_in_line + ) data_mem_rp = data_mem.read_port() data_mem_wp = data_mem.write_port() m.submodules[f"data_mem_{i}_rp"] = data_mem_rp m.submodules[f"data_mem_{i}_wp"] = data_mem_wp - # We address the data RAM using machine words, so we have to + # We address the data RAM using fetch blocks, so we have to # discard a few least significant bits from the address. 
- redundant_offset_bits = exact_log2(self.params.word_width_bytes) - rd_addr = Cat(self.data_rd_addr.offset, self.data_rd_addr.index)[redundant_offset_bits:] - wr_addr = Cat(self.data_wr_addr.offset, self.data_wr_addr.index)[redundant_offset_bits:] + rd_addr = Cat(self.data_rd_addr.offset, self.data_rd_addr.index)[self.params.fetch_block_bytes_log :] + wr_addr = Cat(self.data_wr_addr.offset, self.data_wr_addr.index)[self.params.fetch_block_bytes_log :] m.d.comb += [ self.data_rd_data[i].eq(data_mem_rp.data), diff --git a/coreblocks/cache/iface.py b/coreblocks/cache/iface.py index c2c54d2ff..95bb00fd9 100644 --- a/coreblocks/cache/iface.py +++ b/coreblocks/cache/iface.py @@ -35,7 +35,7 @@ class CacheRefillerInterface(HasElaborate, Protocol): start_refill : Method A method that is used to start a refill for a given cache line. accept_refill : Method - A method that is used to accept one word from the requested cache line. + A method that is used to accept one fetch block from the requested cache line. 
""" start_refill: Method diff --git a/coreblocks/cache/refiller.py b/coreblocks/cache/refiller.py index 311764852..92fea2911 100644 --- a/coreblocks/cache/refiller.py +++ b/coreblocks/cache/refiller.py @@ -14,6 +14,7 @@ class SimpleCommonBusCacheRefiller(Elaboratable, CacheRefillerInterface): def __init__(self, layouts: ICacheLayouts, params: ICacheParameters, bus_master: BusMasterInterface): + self.layouts = layouts self.params = params self.bus_master = bus_master @@ -23,51 +24,84 @@ def __init__(self, layouts: ICacheLayouts, params: ICacheParameters, bus_master: def elaborate(self, platform): m = TModule() - refill_address = Signal(self.params.word_width - self.params.offset_bits) + m.submodules.resp_fwd = resp_fwd = Forwarder(self.layouts.accept_refill) + + cache_line_address = Signal(self.params.word_width - self.params.offset_bits) + refill_active = Signal() - word_counter = Signal(range(self.params.words_in_block)) + flushing = Signal() - m.submodules.address_fwd = address_fwd = Forwarder( - [("word_counter", word_counter.shape()), ("refill_address", refill_address.shape())] - ) + sending_requests = Signal() + req_word_counter = Signal(range(self.params.words_in_line)) - with Transaction().body(m): - address = address_fwd.read(m) + with Transaction().body(m, request=sending_requests): self.bus_master.request_read( m, - addr=Cat(address["word_counter"], address["refill_address"]), + addr=Cat(req_word_counter, cache_line_address), sel=C(1).replicate(self.bus_master.params.data_width // self.bus_master.params.granularity), ) - @def_method(m, self.start_refill, ready=~refill_active) - def _(addr) -> None: - address = addr[self.params.offset_bits :] - m.d.sync += refill_address.eq(address) - m.d.sync += refill_active.eq(1) - m.d.sync += word_counter.eq(0) + m.d.sync += req_word_counter.eq(req_word_counter + 1) + with m.If(req_word_counter == (self.params.words_in_line - 1)): + m.d.sync += sending_requests.eq(0) - address_fwd.write(m, word_counter=0, 
refill_address=address) + resp_word_counter = Signal(range(self.params.words_in_line)) + block_buffer = Signal(self.params.word_width * (self.params.words_in_fetch_block - 1)) - @def_method(m, self.accept_refill, ready=refill_active) - def _(): - fetched = self.bus_master.get_read_response(m) + # The transaction reads responses from the bus, builds the fetch block and when + # receives the last word of the fetch block, dispatches it. + with Transaction().body(m): + bus_response = self.bus_master.get_read_response(m) + + block = Signal(self.params.fetch_block_bytes * 8) + m.d.av_comb += block.eq(Cat(block_buffer, bus_response.data)) + m.d.sync += block_buffer.eq(block[self.params.word_width :]) + + words_in_fetch_block_log = exact_log2(self.params.words_in_fetch_block) + current_fetch_block = resp_word_counter[words_in_fetch_block_log:] + word_in_fetch_block = resp_word_counter[:words_in_fetch_block_log] + + with m.If(~flushing): + with m.If((word_in_fetch_block == self.params.words_in_fetch_block - 1) | bus_response.err): + fetch_block_addr = Cat( + C(0, exact_log2(self.params.word_width_bytes)), + C(0, words_in_fetch_block_log), + current_fetch_block, + cache_line_address, + ) + + resp_fwd.write( + m, + addr=fetch_block_addr, + fetch_block=block, + error=bus_response.err, + last=(resp_word_counter == self.params.words_in_line - 1) | bus_response.err, + ) + + with m.If(resp_word_counter == self.params.words_in_line - 1): + m.d.sync += refill_active.eq(0) + with m.Elif(bus_response.err): + m.d.sync += sending_requests.eq(0) + m.d.sync += flushing.eq(1) + + m.d.sync += resp_word_counter.eq(resp_word_counter + 1) + + with m.If(flushing & (resp_word_counter == req_word_counter)): + m.d.sync += refill_active.eq(0) + m.d.sync += flushing.eq(0) - last = (word_counter == (self.params.words_in_block - 1)) | fetched.err + @def_method(m, self.start_refill, ready=~refill_active) + def _(addr) -> None: + m.d.sync += cache_line_address.eq(addr[self.params.offset_bits :]) + 
m.d.sync += req_word_counter.eq(0) + m.d.sync += sending_requests.eq(1) - next_word_counter = Signal.like(word_counter) - m.d.top_comb += next_word_counter.eq(word_counter + 1) + m.d.sync += resp_word_counter.eq(0) - m.d.sync += word_counter.eq(next_word_counter) - with m.If(last): - m.d.sync += refill_active.eq(0) - with m.Else(): - address_fwd.write(m, word_counter=next_word_counter, refill_address=refill_address) + m.d.sync += refill_active.eq(1) - return { - "addr": Cat(C(0, exact_log2(self.params.word_width_bytes)), word_counter, refill_address), - "data": fetched.data, - "error": fetched.err, - "last": last, - } + @def_method(m, self.accept_refill) + def _(): + return resp_fwd.read(m) return m diff --git a/coreblocks/frontend/fetch/fetch.py b/coreblocks/frontend/fetch/fetch.py index add09c6c1..0901dc451 100644 --- a/coreblocks/frontend/fetch/fetch.py +++ b/coreblocks/frontend/fetch/fetch.py @@ -40,6 +40,9 @@ def __init__(self, gen_params: GenParams, icache: CacheInterface, cont: Method) # ExceptionCauseRegister uses separate Transaction for it, so performace is not affected. self.stall_exception.add_conflict(self.resume, Priority.LEFT) + # For now assume that the fetch block is 4 bytes long (a machine word). 
+ assert self.gen_params.fetch_block_bytes == 4 + def elaborate(self, platform): m = TModule() @@ -74,7 +77,7 @@ def stall(exception=False): target = self.fetch_target_queue.read(m) res = self.icache.accept_res(m) - opcode = res.instr[2:7] + opcode = res.fetch_block[2:7] # whether we have to wait for the retirement of this instruction before we make futher speculation unsafe_instr = opcode == Opcode.SYSTEM @@ -90,7 +93,7 @@ def stall(exception=False): with m.If(unsafe_instr): stall() - m.d.comb += instr.eq(res.instr) + m.d.comb += instr.eq(res.fetch_block) self.cont(m, instr=instr, pc=target.addr, access_fault=fetch_error, rvc=0) @@ -136,6 +139,9 @@ def __init__(self, gen_params: GenParams, icache: CacheInterface, cont: Method) self.perf_rvc = HwCounter("frontend.ifu.rvc", "Number of decompressed RVC instructions") + # For now assume that the fetch block is 4 bytes long (a machine word). + assert self.gen_params.fetch_block_bytes == 4 + def elaborate(self, platform) -> TModule: m = TModule() @@ -175,8 +181,8 @@ def elaborate(self, platform) -> TModule: req_limiter.release(m) is_unaligned = current_pc[1] - resp_upper_half = cache_resp.instr[16:] - resp_lower_half = cache_resp.instr[:16] + resp_upper_half = cache_resp.fetch_block[16:] + resp_lower_half = cache_resp.fetch_block[:16] resp_first_half = Mux(is_unaligned, resp_upper_half, resp_lower_half) resp_valid = ~flushing & (cache_resp.error == 0) is_resp_upper_rvc = Signal() @@ -188,7 +194,7 @@ def elaborate(self, platform) -> TModule: is_rvc = is_instr_compressed(instr_lo_half) - full_instr = Mux(half_instr_buff_v, Cat(half_instr_buff, resp_lower_half), cache_resp.instr) + full_instr = Mux(half_instr_buff_v, Cat(half_instr_buff, resp_lower_half), cache_resp.fetch_block) instr = Signal(32) m.d.top_comb += instr.eq(Mux(is_rvc, decompress.instr_out, full_instr)) diff --git a/coreblocks/interface/layouts.py b/coreblocks/interface/layouts.py index 5db15302e..0e831f033 100644 --- a/coreblocks/interface/layouts.py +++ 
b/coreblocks/interface/layouts.py @@ -392,13 +392,16 @@ class ICacheLayouts: def __init__(self, gen_params: GenParams): fields = gen_params.get(CommonLayoutFields) - self.error: LayoutListField = ("last", 1) + self.last: LayoutListField = ("last", 1) """This is the last cache refill result.""" + self.fetch_block: LayoutListField = ("fetch_block", gen_params.fetch_block_bytes * 8) + """The block of data the fetch unit operates on.""" + self.issue_req = make_layout(fields.addr) self.accept_res = make_layout( - fields.instr, + self.fetch_block, fields.error, ) @@ -408,9 +411,9 @@ def __init__(self, gen_params: GenParams): self.accept_refill = make_layout( fields.addr, - fields.data, + self.fetch_block, fields.error, - self.error, + self.last, ) diff --git a/coreblocks/params/configurations.py b/coreblocks/params/configurations.py index a9dee4931..c2d51a1ca 100644 --- a/coreblocks/params/configurations.py +++ b/coreblocks/params/configurations.py @@ -62,8 +62,10 @@ class CoreConfiguration: Associativity of the instruction cache. icache_sets_bits: int Log of the number of sets of the instruction cache. - icache_block_size_bits: int + icache_line_bytes_log: int Log of the cache line size (in bytes). + fetch_block_bytes_log: int + Log of the size of the fetch block (in bytes). allow_partial_extensions: bool Allow partial support of extensions. 
_implied_extensions: Extenstion @@ -87,7 +89,9 @@ class CoreConfiguration: icache_enable: bool = True icache_ways: int = 2 icache_sets_bits: int = 7 - icache_block_size_bits: int = 5 + icache_line_bytes_log: int = 5 + + fetch_block_bytes_log: int = 2 allow_partial_extensions: bool = False diff --git a/coreblocks/params/genparams.py b/coreblocks/params/genparams.py index 5b6fe0ce2..33dd5346c 100644 --- a/coreblocks/params/genparams.py +++ b/coreblocks/params/genparams.py @@ -35,16 +35,17 @@ def __init__(self, cfg: CoreConfiguration): self.pma = cfg.pma bytes_in_word = self.isa.xlen // 8 - self.wb_params = WishboneParameters( - data_width=self.isa.xlen, addr_width=self.isa.xlen - exact_log2(bytes_in_word) - ) + bytes_in_word_log = exact_log2(bytes_in_word) + self.wb_params = WishboneParameters(data_width=self.isa.xlen, addr_width=self.isa.xlen - bytes_in_word_log) self.icache_params = ICacheParameters( addr_width=self.isa.xlen, word_width=self.isa.xlen, + fetch_block_bytes_log=cfg.fetch_block_bytes_log, num_of_ways=cfg.icache_ways, num_of_sets_bits=cfg.icache_sets_bits, - block_size_bits=cfg.icache_block_size_bits, + line_bytes_log=cfg.icache_line_bytes_log, + enable=cfg.icache_enable, ) self.debug_signals_enabled = cfg.debug_signals @@ -65,4 +66,9 @@ def __init__(self, cfg: CoreConfiguration): self.max_rs_entries_bits = (self.max_rs_entries - 1).bit_length() self.start_pc = cfg.start_pc + self.fetch_block_bytes_log = cfg.fetch_block_bytes_log + if self.fetch_block_bytes_log < bytes_in_word_log: + raise ValueError("Fetch block must be not smaller than the machine word.") + self.fetch_block_bytes = 2**self.fetch_block_bytes_log + self._toolchain_isa_str = gen_isa_string(extensions, cfg.xlen, skip_internal=True) diff --git a/coreblocks/params/icache_params.py b/coreblocks/params/icache_params.py index 2506d7b37..e71a07bf9 100644 --- a/coreblocks/params/icache_params.py +++ b/coreblocks/params/icache_params.py @@ -11,35 +11,49 @@ class ICacheParameters: Associativity of 
the cache. num_of_sets_bits : int Log of the number of cache sets. - block_size_bits : int - Log of the size of a single cache block in bytes. + line_bytes_log : int + Log of the size of a single cache line in bytes. enable : bool Enable the instruction cache. If disabled, requestes are bypassed to the bus. """ - def __init__(self, *, addr_width, word_width, num_of_ways, num_of_sets_bits, block_size_bits, enable=True): + def __init__( + self, + *, + addr_width, + word_width, + fetch_block_bytes_log, + num_of_ways, + num_of_sets_bits, + line_bytes_log, + enable=True + ): self.addr_width = addr_width self.word_width = word_width + self.fetch_block_bytes_log = fetch_block_bytes_log self.num_of_ways = num_of_ways self.num_of_sets_bits = num_of_sets_bits - self.block_size_bits = block_size_bits + self.line_bytes_log = line_bytes_log self.enable = enable + self.fetch_block_bytes = 2**fetch_block_bytes_log self.num_of_sets = 2**num_of_sets_bits - self.block_size_bytes = 2**block_size_bits - - # We are sanely assuming that the instruction width is 4 bytes. 
- self.instr_width = 32 + self.line_size_bytes = 2**line_bytes_log self.word_width_bytes = word_width // 8 - if self.block_size_bytes % self.word_width_bytes != 0: - raise ValueError("block_size_bytes must be divisble by the machine word size") - - self.offset_bits = block_size_bits + self.offset_bits = line_bytes_log self.index_bits = num_of_sets_bits self.tag_bits = self.addr_width - self.offset_bits - self.index_bits self.index_start_bit = self.offset_bits self.index_end_bit = self.offset_bits + self.index_bits - 1 - self.words_in_block = self.block_size_bytes // self.word_width_bytes + self.words_in_line = self.line_size_bytes // self.word_width_bytes + self.words_in_fetch_block = self.fetch_block_bytes // self.word_width_bytes + self.fetch_blocks_in_line = self.line_size_bytes // self.fetch_block_bytes + + if not enable: + return + + if line_bytes_log < self.fetch_block_bytes_log: + raise ValueError("The instruction cache line size must be not smaller than the fetch block size.") diff --git a/test/cache/test_icache.py b/test/cache/test_icache.py index 3bd198c43..43f800a5e 100644 --- a/test/cache/test_icache.py +++ b/test/cache/test_icache.py @@ -53,21 +53,25 @@ def elaborate(self, platform): @parameterized_class( - ("name", "isa_xlen", "block_size"), + ("name", "isa_xlen", "line_size", "fetch_block"), [ - ("blk_size16B_rv32i", 32, 4), - ("blk_size32B_rv32i", 32, 5), - ("blk_size32B_rv64i", 64, 5), - ("blk_size64B_rv32i", 32, 6), + ("line16B_block4B_rv32i", 32, 4, 2), + ("line32B_block8B_rv32i", 32, 5, 3), + ("line32B_block8B_rv64i", 64, 5, 3), + ("line64B_block16B_rv32i", 32, 6, 4), + ("line16B_block16B_rv32i", 32, 4, 4), ], ) class TestSimpleCommonBusCacheRefiller(TestCaseWithSimulator): isa_xlen: int - block_size: int + line_size: int + fetch_block: int def setUp(self) -> None: self.gen_params = GenParams( - test_core_config.replace(xlen=self.isa_xlen, icache_block_size_bits=self.block_size) + test_core_config.replace( + xlen=self.isa_xlen, 
icache_line_bytes_log=self.line_size, fetch_block_bytes_log=self.fetch_block + ) ) self.cp = self.gen_params.icache_params self.test_module = SimpleCommonBusCacheRefillerTestCircuit(self.gen_params) @@ -75,22 +79,24 @@ def setUp(self) -> None: random.seed(42) self.bad_addresses = set() + self.bad_fetch_blocks = set() self.mem = dict() self.requests = deque() for _ in range(100): # Make the address aligned to the beginning of a cache line - addr = random.randrange(2**self.gen_params.isa.xlen) & ~(self.cp.block_size_bytes - 1) + addr = random.randrange(2**self.gen_params.isa.xlen) & ~(self.cp.line_size_bytes - 1) self.requests.append(addr) if random.random() < 0.21: # Choose an address in this cache line to be erroneous - bad_addr = addr + random.randrange(self.cp.block_size_bytes) + bad_addr = addr + random.randrange(self.cp.line_size_bytes) # Make the address aligned to the machine word size bad_addr = bad_addr & ~(self.cp.word_width_bytes - 1) self.bad_addresses.add(bad_addr) + self.bad_fetch_blocks.add(bad_addr & ~(self.cp.fetch_block_bytes - 1)) def wishbone_slave(self): yield Passive() @@ -119,22 +125,26 @@ def refiller_process(self): req_addr = self.requests.pop() yield from self.test_module.start_refill.call(addr=req_addr) - for i in range(self.cp.words_in_block): + for i in range(self.cp.fetch_blocks_in_line): ret = yield from self.test_module.accept_refill.call() - cur_addr = req_addr + i * self.cp.word_width_bytes + cur_addr = req_addr + i * self.cp.fetch_block_bytes self.assertEqual(ret["addr"], cur_addr) - if cur_addr in self.bad_addresses: + if cur_addr in self.bad_fetch_blocks: self.assertEqual(ret["error"], 1) self.assertEqual(ret["last"], 1) break - self.assertEqual(ret["data"], self.mem[ret["addr"]]) + fetch_block = ret["fetch_block"] + for j in range(self.cp.words_in_fetch_block): + word = (fetch_block >> (j * self.cp.word_width)) & (2**self.cp.word_width - 1) + self.assertEqual(word, self.mem[cur_addr + j * self.cp.word_width_bytes]) + 
self.assertEqual(ret["error"], 0) - last = 1 if i == self.cp.words_in_block - 1 else 0 + last = 1 if i == self.cp.fetch_blocks_in_line - 1 else 0 self.assertEqual(ret["last"], last) def test(self): @@ -170,17 +180,20 @@ def elaborate(self, platform): @parameterized_class( - ("name", "isa_xlen"), + ("name", "isa_xlen", "fetch_block"), [ - ("rv32i", 32), - ("rv64i", 64), + ("rv32i", 32, 2), + ("rv64i", 64, 3), ], ) class TestICacheBypass(TestCaseWithSimulator): isa_xlen: str + fetch_block: int def setUp(self) -> None: - self.gen_params = GenParams(test_core_config.replace(xlen=self.isa_xlen)) + self.gen_params = GenParams( + test_core_config.replace(xlen=self.isa_xlen, fetch_block_bytes_log=self.fetch_block, icache_enable=False) + ) self.cp = self.gen_params.icache_params self.m = ICacheBypassTestCircuit(self.gen_params) @@ -231,7 +244,7 @@ def wishbone_slave(self): def user_process(self): while self.requests: - req_addr = self.requests.popleft() + req_addr = self.requests.popleft() & ~(self.cp.fetch_block_bytes - 1) yield from self.m.issue_req.call(addr=req_addr) while random.random() < 0.5: @@ -243,7 +256,11 @@ def user_process(self): self.assertTrue(ret["error"]) else: self.assertFalse(ret["error"]) - self.assertEqual(ret["instr"], self.mem[req_addr]) + + data = self.mem[req_addr] + if self.gen_params.isa.xlen == 64: + data |= self.mem[req_addr + 4] << 32 + self.assertEqual(ret["fetch_block"], data) while random.random() < 0.5: yield @@ -291,16 +308,18 @@ def elaborate(self, platform): @parameterized_class( - ("name", "isa_xlen", "block_size"), + ("name", "isa_xlen", "line_size", "fetch_block"), [ - ("blk_size16B_rv32i", 32, 4), - ("blk_size64B_rv32i", 32, 6), - ("blk_size32B_rv64i", 64, 5), + ("line16B_block8B_rv32i", 32, 4, 2), + ("line64B_block16B_rv32i", 32, 6, 4), + ("line32B_block16B_rv64i", 64, 5, 4), + ("line32B_block32B_rv64i", 64, 5, 5), ], ) class TestICache(TestCaseWithSimulator): isa_xlen: int - block_size: int + line_size: int + fetch_block: int def 
setUp(self) -> None: random.seed(42) @@ -321,7 +340,8 @@ def init_module(self, ways, sets) -> None: xlen=self.isa_xlen, icache_ways=ways, icache_sets_bits=exact_log2(sets), - icache_block_size_bits=self.block_size, + icache_line_bytes_log=self.line_size, + fetch_block_bytes_log=self.fetch_block, ) ) self.cp = self.gen_params.icache_params @@ -330,32 +350,32 @@ def init_module(self, ways, sets) -> None: @def_method_mock(lambda self: self.m.refiller.start_refill_mock) def start_refill_mock(self, addr): self.refill_requests.append(addr) - self.refill_word_cnt = 0 + self.refill_block_cnt = 0 self.refill_in_fly = True self.refill_addr = addr @def_method_mock(lambda self: self.m.refiller.accept_refill_mock, enable=lambda self: self.refill_in_fly) def accept_refill_mock(self): - addr = self.refill_addr + self.refill_word_cnt * self.cp.word_width_bytes - data = self.load_or_gen_mem(addr) - if self.gen_params.isa.xlen == 64: - data = self.load_or_gen_mem(addr + 4) << 32 | data + addr = self.refill_addr + self.refill_block_cnt * self.cp.fetch_block_bytes - self.refill_word_cnt += 1 + fetch_block = 0 + bad_addr = False + for i in range(0, self.cp.fetch_block_bytes, 4): + fetch_block |= self.load_or_gen_mem(addr + i) << (8 * i) + if addr + i in self.bad_addrs: + bad_addr = True - err = addr in self.bad_addrs - if self.gen_params.isa.xlen == 64: - err = err or (addr + 4) in self.bad_addrs + self.refill_block_cnt += 1 - last = self.refill_word_cnt == self.cp.words_in_block or err + last = self.refill_block_cnt == self.cp.fetch_blocks_in_line or bad_addr if last: self.refill_in_fly = False return { "addr": addr, - "data": data, - "error": err, + "fetch_block": fetch_block, + "error": bad_addr, "last": last, } @@ -380,13 +400,17 @@ def expect_resp(self, wait=False): self.assert_resp((yield from self.m.accept_res.get_outputs())) def assert_resp(self, resp: RecordIntDictRet): - addr = self.issued_requests.popleft() + addr = self.issued_requests.popleft() & 
~(self.cp.fetch_block_bytes - 1) if (addr & ~((1 << self.cp.offset_bits) - 1)) in self.bad_cache_lines: self.assertTrue(resp["error"]) else: self.assertFalse(resp["error"]) - self.assertEqual(resp["instr"], self.mem[addr]) + fetch_block = 0 + for i in range(0, self.cp.fetch_block_bytes, 4): + fetch_block |= self.mem[addr + i] << (8 * i) + + self.assertEqual(resp["fetch_block"], fetch_block) def expect_refill(self, addr: int): self.assertEqual(self.refill_requests.popleft(), addr) @@ -407,13 +431,13 @@ def cache_user_process(): self.expect_refill(0x00010000) # Accesses to the same cache line shouldn't cause a cache miss - for i in range(self.cp.words_in_block): - yield from self.call_cache(0x00010000 + i * 4) + for i in range(self.cp.fetch_blocks_in_line): + yield from self.call_cache(0x00010000 + i * self.cp.fetch_block_bytes) self.assertEqual(len(self.refill_requests), 0) # Now go beyond the first cache line - yield from self.call_cache(0x00010000 + self.cp.block_size_bytes) - self.expect_refill(0x00010000 + self.cp.block_size_bytes) + yield from self.call_cache(0x00010000 + self.cp.line_size_bytes) + self.expect_refill(0x00010000 + self.cp.line_size_bytes) # Trigger cache aliasing yield from self.call_cache(0x00020000) @@ -422,14 +446,14 @@ def cache_user_process(): self.expect_refill(0x00010000) # Fill the whole cache - for i in range(0, self.cp.block_size_bytes * self.cp.num_of_sets, 4): + for i in range(0, self.cp.line_size_bytes * self.cp.num_of_sets, 4): yield from self.call_cache(i) for i in range(self.cp.num_of_sets): - self.expect_refill(i * self.cp.block_size_bytes) + self.expect_refill(i * self.cp.line_size_bytes) # Now do some accesses within the cached memory for i in range(50): - yield from self.call_cache(random.randrange(0, self.cp.block_size_bytes * self.cp.num_of_sets, 4)) + yield from self.call_cache(random.randrange(0, self.cp.line_size_bytes * self.cp.num_of_sets, 4)) self.assertEqual(len(self.refill_requests), 0) with 
self.run_simulation(self.m) as sim: @@ -460,7 +484,7 @@ def test_pipeline(self): def cache_process(): # Fill the cache for i in range(self.cp.num_of_sets): - addr = 0x00010000 + i * self.cp.block_size_bytes + addr = 0x00010000 + i * self.cp.line_size_bytes yield from self.call_cache(addr) self.expect_refill(addr) @@ -468,7 +492,7 @@ def cache_process(): # Create a stream of requests to ensure the pipeline is working yield from self.m.accept_res.enable() - for i in range(0, self.cp.num_of_sets * self.cp.block_size_bytes, 4): + for i in range(0, self.cp.num_of_sets * self.cp.line_size_bytes, 4): addr = 0x00010000 + i self.issued_requests.append(addr) @@ -488,7 +512,7 @@ def cache_process(): yield from self.tick(5) # Check how the cache handles queuing the requests - yield from self.send_req(addr=0x00010000 + 3 * self.cp.block_size_bytes) + yield from self.send_req(addr=0x00010000 + 3 * self.cp.line_size_bytes) yield from self.send_req(addr=0x00010004) # Wait a few cycles. There are two requests queued @@ -508,7 +532,7 @@ def cache_process(): # Schedule two requests, the first one causing a cache miss yield from self.send_req(addr=0x00020000) - yield from self.send_req(addr=0x00010000 + self.cp.block_size_bytes) + yield from self.send_req(addr=0x00010000 + self.cp.line_size_bytes) yield from self.m.accept_res.enable() @@ -522,7 +546,7 @@ def cache_process(): # Schedule two requests, the second one causing a cache miss yield from self.send_req(addr=0x00020004) - yield from self.send_req(addr=0x00030000 + self.cp.block_size_bytes) + yield from self.send_req(addr=0x00030000 + self.cp.line_size_bytes) yield from self.m.accept_res.enable() @@ -536,7 +560,7 @@ def cache_process(): # Schedule two requests, both causing a cache miss yield from self.send_req(addr=0x00040000) - yield from self.send_req(addr=0x00050000 + self.cp.block_size_bytes) + yield from self.send_req(addr=0x00050000 + self.cp.line_size_bytes) yield from self.m.accept_res.enable() @@ -556,14 +580,14 @@ def 
cache_process(): # Fill the whole cache for s in range(self.cp.num_of_sets): for w in range(self.cp.num_of_ways): - addr = w * 0x00010000 + s * self.cp.block_size_bytes + addr = w * 0x00010000 + s * self.cp.line_size_bytes yield from self.call_cache(addr) self.expect_refill(addr) # Everything should be in the cache for s in range(self.cp.num_of_sets): for w in range(self.cp.num_of_ways): - addr = w * 0x00010000 + s * self.cp.block_size_bytes + addr = w * 0x00010000 + s * self.cp.line_size_bytes yield from self.call_cache(addr) self.assertEqual(len(self.refill_requests), 0) @@ -573,7 +597,7 @@ def cache_process(): # The cache should be empty for s in range(self.cp.num_of_sets): for w in range(self.cp.num_of_ways): - addr = w * 0x00010000 + s * self.cp.block_size_bytes + addr = w * 0x00010000 + s * self.cp.line_size_bytes yield from self.call_cache(addr) self.expect_refill(addr) @@ -605,7 +629,7 @@ def cache_process(): yield # Schedule two requests and then flush - yield from self.send_req(0x00000000 + self.cp.block_size_bytes) + yield from self.send_req(0x00000000 + self.cp.line_size_bytes) yield from self.send_req(0x00010000) yield from self.m.flush_cache.call() self.mem[0x00010000] = random.randrange(2**self.gen_params.isa.ilen) @@ -613,7 +637,7 @@ def cache_process(): # And accept the results self.assert_resp((yield from self.m.accept_res.call())) self.assert_resp((yield from self.m.accept_res.call())) - self.expect_refill(0x00000000 + self.cp.block_size_bytes) + self.expect_refill(0x00000000 + self.cp.line_size_bytes) # Just make sure that the line is truly flushed yield from self.call_cache(0x00010000) @@ -629,7 +653,7 @@ def cache_process(): self.add_bad_addr(0x00010000) # Bad addr at the beggining of the line self.add_bad_addr(0x00020008) # Bad addr in the middle of the line self.add_bad_addr( - 0x00030000 + self.cp.block_size_bytes - self.cp.word_width_bytes + 0x00030000 + self.cp.line_size_bytes - self.cp.word_width_bytes ) # Bad addr at the end of the line 
yield from self.call_cache(0x00010008) @@ -698,7 +722,7 @@ def cache_process(): def test_random(self): self.init_module(4, 8) - max_addr = 16 * self.cp.block_size_bytes * self.cp.num_of_sets + max_addr = 16 * self.cp.line_size_bytes * self.cp.num_of_sets iterations = 1000 for i in range(0, max_addr, 4): diff --git a/test/frontend/test_fetch.py b/test/frontend/test_fetch.py index b9ff1388c..3684f7cad 100644 --- a/test/frontend/test_fetch.py +++ b/test/frontend/test_fetch.py @@ -84,7 +84,7 @@ def cache_process(self): data |= 0b1100000 data &= ~0b0010000 # but not system - self.output_q.append({"instr": data, "error": 0}) + self.output_q.append({"fetch_block": data, "error": 0}) # Speculative fetch. Skip, because this instruction shouldn't be executed. if addr != next_pc: @@ -229,7 +229,7 @@ def get_mem_or_random(addr): data = (get_mem_or_random(req_addr + 2) << 16) | get_mem_or_random(req_addr) err = (req_addr in self.memerr) or (req_addr + 2 in self.memerr) - self.output_q.append({"instr": data, "error": err}) + self.output_q.append({"fetch_block": data, "error": err}) @def_method_mock(lambda self: self.icache.issue_req_io, enable=lambda self: len(self.input_q) < 2, sched_prio=1) def issue_req_mock(self, addr): diff --git a/test/frontend/test_rvc.py b/test/frontend/test_rvc.py index 0b099f751..0a92be4e5 100644 --- a/test/frontend/test_rvc.py +++ b/test/frontend/test_rvc.py @@ -280,7 +280,9 @@ class TestInstrDecompress(TestCaseWithSimulator): test_cases: list[tuple[int, ValueLike]] def test(self): - self.gen_params = GenParams(test_core_config.replace(compressed=True, xlen=self.isa_xlen)) + self.gen_params = GenParams( + test_core_config.replace(compressed=True, xlen=self.isa_xlen, fetch_block_bytes_log=3) + ) self.m = InstrDecompress(self.gen_params) def process(): diff --git a/test/regression/memory.py b/test/regression/memory.py index 70b8a9496..a34ef764d 100644 --- a/test/regression/memory.py +++ b/test/regression/memory.py @@ -164,9 +164,9 @@ def 
load_segment(segment: Segment, *, disable_write_protection: bool = False) -> config = CoreConfiguration() if flags_raw & P_FLAGS.PF_X: # align instruction section to full icache lines - align_bits = config.icache_block_size_bits + align_bits = config.icache_line_bytes_log # workaround for fetching/stalling issue - extend_end = 2**config.icache_block_size_bits + extend_end = 2**config.icache_line_bytes_log else: align_bits = 0 extend_end = 0 diff --git a/test/test_core.py b/test/test_core.py index dbb8692f8..44c68fe4c 100644 --- a/test/test_core.py +++ b/test/test_core.py @@ -36,7 +36,7 @@ def elaborate(self, platform): wb_data_bus = WishboneSignature(self.gen_params.wb_params).create() # Align the size of the memory to the length of a cache line. - instr_mem_depth = align_to_power_of_two(len(self.instr_mem), self.gen_params.icache_params.block_size_bits) + instr_mem_depth = align_to_power_of_two(len(self.instr_mem), self.gen_params.icache_params.line_bytes_log) self.wb_mem_slave = WishboneMemorySlave( wb_params=self.gen_params.wb_params, width=32, depth=instr_mem_depth, init=self.instr_mem ) From 8ec353dabd3137d4e01acd345e0a8b3d0c68a129 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Urba=C5=84czyk?= Date: Mon, 1 Apr 2024 10:59:30 +0100 Subject: [PATCH 2/5] Refactor RISC-V instruction models (#631) --- coreblocks/frontend/decoder/isa.py | 1 + coreblocks/frontend/decoder/rvc.py | 2 +- coreblocks/params/instr.py | 333 ++++++++++++++++++----------- test/frontend/test_rvc.py | 46 ++-- test/params/test_instr.py | 63 ++++++ test/test_core.py | 6 +- 6 files changed, 295 insertions(+), 156 deletions(-) create mode 100644 test/params/test_instr.py diff --git a/coreblocks/frontend/decoder/isa.py b/coreblocks/frontend/decoder/isa.py index 229d65c9b..10bb72854 100644 --- a/coreblocks/frontend/decoder/isa.py +++ b/coreblocks/frontend/decoder/isa.py @@ -40,6 +40,7 @@ class Opcode(IntEnum, shape=5): JALR = 0b11001 JAL = 0b11011 SYSTEM = 0b11100 + RESERVED = 0b11111 class 
Funct3(IntEnum, shape=3): diff --git a/coreblocks/frontend/decoder/rvc.py b/coreblocks/frontend/decoder/rvc.py index 4ff48c07d..2fe9d42ee 100644 --- a/coreblocks/frontend/decoder/rvc.py +++ b/coreblocks/frontend/decoder/rvc.py @@ -209,7 +209,7 @@ def _quadrant_2(self) -> list[DecodedInstr]: shamt = Cat(self.instr_in[2:7], self.instr_in[12]) ldsp_imm = Cat(C(0, 3), self.instr_in[5:7], self.instr_in[12], self.instr_in[2:5], C(0, 3)) lwsp_imm = Cat(C(0, 2), self.instr_in[4:7], self.instr_in[12], self.instr_in[2:4], C(0, 4)) - sdsp_imm = Cat(C(0, 3), self.instr_in[10:13], self.instr_in[7:10], C(0, 2)) + sdsp_imm = Cat(C(0, 3), self.instr_in[10:13], self.instr_in[7:10], C(0, 3)) swsp_imm = Cat(C(0, 2), self.instr_in[9:13], self.instr_in[7:9], C(0, 4)) slli = ( diff --git a/coreblocks/params/instr.py b/coreblocks/params/instr.py index 370d25b84..f3755b25d 100644 --- a/coreblocks/params/instr.py +++ b/coreblocks/params/instr.py @@ -1,14 +1,24 @@ -from abc import abstractmethod, ABC +""" + +Based on riscv-python-model by Stefan Wallentowitz +https://github.com/wallento/riscv-python-model +""" + +from dataclasses import dataclass +from abc import ABC +from enum import Enum +from typing import Optional from amaranth.hdl import ValueCastable from amaranth import * -from transactron.utils import ValueLike, int_to_signed +from transactron.utils import ValueLike from coreblocks.params.isa_params import * from coreblocks.frontend.decoder.isa import * __all__ = [ + "RISCVInstr", "RTypeInstr", "ITypeInstr", "STypeInstr", @@ -20,154 +30,219 @@ ] +@dataclass(kw_only=True) +class Field: + """Information about a field in a RISC-V instruction. + + Attributes + ---------- + base: int | list[int] + A bit position (or a list of positions) where this field (or parts of the field) + would map in the instruction. + size: int | list[int] + Size (or sizes of the parts) of the field + signed: bool + Whether this field encodes a signed value. 
+ offset: int + How many bits of this field should be skipped when encoding the instruction. + For example, the immediate of the jump instruction always skips the least + significant bit. This only affects encoding procedures, so externally (for example + when creating an instance of an instruction) full-size values should always be used. + static_value: Optional[Value] + A fixed value that the field always holds for a given type of instruction, or None. + """ + + base: int | list[int] + size: int | list[int] + + signed: bool = False + offset: int = 0 + static_value: Optional[Value] = None + + _name: str = "" + + def bases(self) -> list[int]: + return [self.base] if isinstance(self.base, int) else self.base + + def sizes(self) -> list[int]: + return [self.size] if isinstance(self.size, int) else self.size + + def shape(self) -> Shape: + return Shape(width=sum(self.sizes()) + self.offset, signed=self.signed) + + def __set_name__(self, owner, name): + self._name = name + + def __get__(self, obj, objtype=None) -> Value: + if self.static_value is not None: + return self.static_value + + return obj.__dict__.get(self._name, C(0, self.shape())) + + def __set__(self, obj, value) -> None: + if self.static_value is not None: + raise AttributeError("Can't overwrite the static value of a field.") + + expected_shape = self.shape() + + field_val: Value = C(0) + if isinstance(value, Enum): + field_val = Const(value.value, expected_shape) + elif isinstance(value, int): + field_val = Const(value, expected_shape) + else: + field_val = Value.cast(value) + + if field_val.shape().width != expected_shape.width: + raise AttributeError( + f"Expected width of the value: {expected_shape.width}, given: {field_val.shape().width}" + ) + if field_val.shape().signed and not expected_shape.signed: + raise AttributeError( + f"Expected signedness of the value: {expected_shape.signed}, given: {field_val.shape().signed}" + ) + + obj.__dict__[self._name] = field_val + + def get_parts(self, value: Value) 
-> list[Value]: + base = self.bases() + size = self.sizes() + offset = self.offset + + ret: list[Value] = [] + for i in range(len(base)): + ret.append(value[offset : offset + size[i]]) + offset += size[i] + + return ret + + +def _get_fields(cls: type) -> list[Field]: + fields = [cls.__dict__[member] for member in vars(cls) if isinstance(cls.__dict__[member], Field)] + field_ids = set([id(field) for field in fields]) + for base in cls.__bases__: + for field in _get_fields(base): + if id(field) in field_ids: + continue + fields.append(field) + field_ids.add(id(field)) + + return fields + + class RISCVInstr(ABC, ValueCastable): - @abstractmethod - def pack(self) -> Value: - pass + opcode = Field(base=0, size=7) + + def __init__(self, opcode: Opcode): + self.opcode = Cat(C(0b11, 2), opcode) + + def encode(self) -> int: + const = Const.cast(self.as_value()) + return const.value # type: ignore @ValueCastable.lowermethod - def as_value(self): - return self.pack() + def as_value(self) -> Value: + parts: list[tuple[int, Value]] = [] + + for field in _get_fields(type(self)): + value = field.__get__(self, type(self)) + parts += zip(field.bases(), field.get_parts(value)) + + parts.sort() + return Cat([part[1] for part in parts]) - def shape(self): + def shape(self) -> Shape: return self.as_value().shape() -class RTypeInstr(RISCVInstr): +class InstructionFunct3Type(RISCVInstr): + funct3 = Field(base=12, size=3) + + +class InstructionFunct7Type(RISCVInstr): + funct7 = Field(base=25, size=7) + + +class RTypeInstr(InstructionFunct3Type, InstructionFunct7Type): + rd = Field(base=7, size=5) + rs1 = Field(base=15, size=5) + rs2 = Field(base=20, size=5) + def __init__( - self, - opcode: ValueLike, - rd: ValueLike, - funct3: ValueLike, - rs1: ValueLike, - rs2: ValueLike, - funct7: ValueLike, + self, opcode: Opcode, funct3: ValueLike, funct7: ValueLike, rd: ValueLike, rs1: ValueLike, rs2: ValueLike ): - self.opcode = Value.cast(opcode) - self.rd = Value.cast(rd) - self.funct3 = 
Value.cast(funct3) - self.rs1 = Value.cast(rs1) - self.rs2 = Value.cast(rs2) - self.funct7 = Value.cast(funct7) - - def pack(self) -> Value: - return Cat(C(0b11, 2), self.opcode, self.rd, self.funct3, self.rs1, self.rs2, self.funct7) - - @staticmethod - def encode(opcode: int, rd: int, funct3: int, rs1: int, rs2: int, funct7: int): - return int(f"{funct7:07b}{rs2:05b}{rs1:05b}{funct3:03b}{rd:05b}{opcode:05b}11", 2) - - -class ITypeInstr(RISCVInstr): - def __init__(self, opcode: ValueLike, rd: ValueLike, funct3: ValueLike, rs1: ValueLike, imm: ValueLike): - self.opcode = Value.cast(opcode) - self.rd = Value.cast(rd) - self.funct3 = Value.cast(funct3) - self.rs1 = Value.cast(rs1) - self.imm = Value.cast(imm) - - def pack(self) -> Value: - return Cat(C(0b11, 2), self.opcode, self.rd, self.funct3, self.rs1, self.imm) - - @staticmethod - def encode(opcode: int, rd: int, funct3: int, rs1: int, imm: int): - imm = int_to_signed(imm, 12) - return int(f"{imm:012b}{rs1:05b}{funct3:03b}{rd:05b}{opcode:05b}11", 2) - - -class STypeInstr(RISCVInstr): - def __init__(self, opcode: ValueLike, imm: ValueLike, funct3: ValueLike, rs1: ValueLike, rs2: ValueLike): - self.opcode = Value.cast(opcode) - self.imm = Value.cast(imm) - self.funct3 = Value.cast(funct3) - self.rs1 = Value.cast(rs1) - self.rs2 = Value.cast(rs2) - - def pack(self) -> Value: - return Cat(C(0b11, 2), self.opcode, self.imm[0:5], self.funct3, self.rs1, self.rs2, self.imm[5:12]) - - @staticmethod - def encode(opcode: int, imm: int, funct3: int, rs1: int, rs2: int): - imm = int_to_signed(imm, 12) - imm_str = f"{imm:012b}" - return int(f"{imm_str[5:12]:07b}{rs2:05b}{rs1:05b}{funct3:03b}{imm_str[0:5]:05b}{opcode:05b}11", 2) - - -class BTypeInstr(RISCVInstr): - def __init__(self, opcode: ValueLike, imm: ValueLike, funct3: ValueLike, rs1: ValueLike, rs2: ValueLike): - self.opcode = Value.cast(opcode) - self.imm = Value.cast(imm) - self.funct3 = Value.cast(funct3) - self.rs1 = Value.cast(rs1) - self.rs2 = Value.cast(rs2) - - 
def pack(self) -> Value: - return Cat( - C(0b11, 2), - self.opcode, - self.imm[11], - self.imm[1:5], - self.funct3, - self.rs1, - self.rs2, - self.imm[5:11], - self.imm[12], - ) + super().__init__(opcode) + self.funct3 = funct3 + self.funct7 = funct7 + self.rd = rd + self.rs1 = rs1 + self.rs2 = rs2 - @staticmethod - def encode(opcode: int, imm: int, funct3: int, rs1: int, rs2: int): - imm = int_to_signed(imm, 13) - imm_str = f"{imm:013b}" - return int( - f"{imm_str[12]:01b}{imm_str[5:11]:06b}{rs2:05b}{rs1:05b}{funct3:03b}{imm_str[1:5]:04b}" - + f"{imm_str[11]:01b}{opcode:05b}11", - 2, - ) +class ITypeInstr(InstructionFunct3Type): + rd = Field(base=7, size=5) + rs1 = Field(base=15, size=5) + imm = Field(base=20, size=12, signed=True) + + def __init__(self, opcode: Opcode, funct3: ValueLike, rd: ValueLike, rs1: ValueLike, imm: ValueLike): + super().__init__(opcode) + self.funct3 = funct3 + self.rd = rd + self.rs1 = rs1 + self.imm = imm -class UTypeInstr(RISCVInstr): - def __init__(self, opcode: ValueLike, rd: ValueLike, imm: ValueLike): - self.opcode = Value.cast(opcode) - self.rd = Value.cast(rd) - self.imm = Value.cast(imm) - def pack(self) -> Value: - return Cat(C(0b11, 2), self.opcode, self.rd, self.imm[12:]) +class STypeInstr(InstructionFunct3Type): + rs1 = Field(base=15, size=5) + rs2 = Field(base=20, size=5) + imm = Field(base=[7, 25], size=[5, 7], signed=True) - @staticmethod - def encode(opcode: int, rd: int, imm: int): - imm = int_to_signed(imm, 20) - return int(f"{imm:020b}{rd:05b}{opcode:05b}11", 2) + def __init__(self, opcode: Opcode, funct3: ValueLike, rs1: ValueLike, rs2: ValueLike, imm: ValueLike): + super().__init__(opcode) + self.funct3 = funct3 + self.rs1 = rs1 + self.rs2 = rs2 + self.imm = imm + + +class BTypeInstr(InstructionFunct3Type): + rs1 = Field(base=15, size=5) + rs2 = Field(base=20, size=5) + imm = Field(base=[8, 25, 7, 31], size=[4, 6, 1, 1], offset=1, signed=True) + + def __init__(self, opcode: Opcode, funct3: ValueLike, rs1: ValueLike, 
rs2: ValueLike, imm: ValueLike): + super().__init__(opcode) + self.funct3 = funct3 + self.rs1 = rs1 + self.rs2 = rs2 + self.imm = imm + + +class UTypeInstr(RISCVInstr): + rd = Field(base=7, size=5) + imm = Field(base=12, size=20, offset=12, signed=True) + + def __init__(self, opcode: Opcode, rd: ValueLike, imm: ValueLike): + super().__init__(opcode) + self.rd = rd + self.imm = imm class JTypeInstr(RISCVInstr): - def __init__(self, opcode: ValueLike, rd: ValueLike, imm: ValueLike): - self.opcode = Value.cast(opcode) - self.rd = Value.cast(rd) - self.imm = Value.cast(imm) - - def pack(self) -> Value: - return Cat(C(0b11, 2), self.opcode, self.rd, self.imm[12:20], self.imm[11], self.imm[1:11], self.imm[20]) - - @staticmethod - def encode(opcode: int, rd: int, imm: int): - imm = int_to_signed(imm, 21) - imm_str = f"{imm:021b}" - return int( - f"{imm_str[20]:01b}{imm_str[1:11]:010b}{imm_str[11]:01b}{imm_str[12:20]:08b}{rd:05b}{opcode:05b}11", 2 - ) + rd = Field(base=7, size=5) + imm = Field(base=[21, 20, 12, 31], size=[10, 1, 8, 1], offset=1, signed=True) + def __init__(self, opcode: Opcode, rd: ValueLike, imm: ValueLike): + super().__init__(opcode) + self.rd = rd + self.imm = imm -class IllegalInstr(RISCVInstr): - def __init__(self): - pass - def pack(self) -> Value: - return C(1).replicate(32) # Instructions with all bits set to 1 are reserved to be illegal. 
+class IllegalInstr(RISCVInstr): + illegal = Field(base=7, size=25, static_value=Cat(1).replicate(25)) - @staticmethod - def encode(opcode: int, rd: int, imm: int): - return int("1" * 32, 2) + def __init__(self): + super().__init__(opcode=Opcode.RESERVED) class EBreakInstr(ITypeInstr): diff --git a/test/frontend/test_rvc.py b/test/frontend/test_rvc.py index 0a92be4e5..8d8fba5a5 100644 --- a/test/frontend/test_rvc.py +++ b/test/frontend/test_rvc.py @@ -25,17 +25,17 @@ # c.addi x2, -28 ( 0x1111, - ITypeInstr(opcode=Opcode.OP_IMM, rd=Registers.X2, funct3=Funct3.ADD, rs1=Registers.X2, imm=C(-28, 12)), + ITypeInstr(opcode=Opcode.OP_IMM, rd=Registers.X2, funct3=Funct3.ADD, rs1=Registers.X2, imm=-28), ), # c.li x31, -7 ( 0x5FE5, - ITypeInstr(opcode=Opcode.OP_IMM, rd=Registers.X31, funct3=Funct3.ADD, rs1=Registers.ZERO, imm=C(-7, 12)), + ITypeInstr(opcode=Opcode.OP_IMM, rd=Registers.X31, funct3=Funct3.ADD, rs1=Registers.ZERO, imm=-7), ), # c.addi16sp 496 (0x617D, ITypeInstr(opcode=Opcode.OP_IMM, rd=Registers.SP, funct3=Funct3.ADD, rs1=Registers.SP, imm=496)), # c.lui x7, -3 - (0x73F5, UTypeInstr(opcode=Opcode.LUI, rd=Registers.X7, imm=C(-3, 20) << 12)), + (0x73F5, UTypeInstr(opcode=Opcode.LUI, rd=Registers.X7, imm=Cat(C(0, 12), C(-3, 20)))), # c.srli x10, 3 ( 0x810D, @@ -44,7 +44,7 @@ rd=Registers.X10, funct3=Funct3.SR, rs1=Registers.X10, - rs2=C(3, 5), + rs2=Registers.X3, funct7=Funct7.SL, ), ), @@ -56,7 +56,7 @@ rd=Registers.X12, funct3=Funct3.SR, rs1=Registers.X12, - rs2=C(8, 5), + rs2=Registers.X8, funct7=Funct7.SA, ), ), @@ -111,16 +111,16 @@ ), ), # c.j 2012 - (0xAFF1, JTypeInstr(opcode=Opcode.JAL, rd=Registers.ZERO, imm=C(2012, 21))), + (0xAFF1, JTypeInstr(opcode=Opcode.JAL, rd=Registers.ZERO, imm=2012)), # c.beqz x8, -6 ( 0xDC6D, - BTypeInstr(opcode=Opcode.BRANCH, imm=C(-6, 13), funct3=Funct3.BEQ, rs1=Registers.X8, rs2=Registers.ZERO), + BTypeInstr(opcode=Opcode.BRANCH, imm=-6, funct3=Funct3.BEQ, rs1=Registers.X8, rs2=Registers.ZERO), ), # c.bnez x15, 20 ( 0xEB91, 
- BTypeInstr(opcode=Opcode.BRANCH, imm=C(20, 13), funct3=Funct3.BNE, rs1=Registers.X15, rs2=Registers.ZERO), + BTypeInstr(opcode=Opcode.BRANCH, imm=20, funct3=Funct3.BNE, rs1=Registers.X15, rs2=Registers.ZERO), ), # c.slli x13, 31 ( @@ -130,18 +130,16 @@ rd=Registers.X13, funct3=Funct3.SLL, rs1=Registers.X13, - rs2=C(31, 5), + rs2=Registers.X31, funct7=Funct7.SL, ), ), # c.lwsp x2, 4 - (0x4112, ITypeInstr(opcode=Opcode.LOAD, rd=Registers.X2, funct3=Funct3.W, rs1=Registers.SP, imm=C(4, 12))), + (0x4112, ITypeInstr(opcode=Opcode.LOAD, rd=Registers.X2, funct3=Funct3.W, rs1=Registers.SP, imm=4)), # c.jr x30 ( 0x8F02, - ITypeInstr( - opcode=Opcode.JALR, rd=Registers.ZERO, funct3=Funct3.JALR, rs1=Registers.X30, imm=C(0).replicate(12) - ), + ITypeInstr(opcode=Opcode.JALR, rd=Registers.ZERO, funct3=Funct3.JALR, rs1=Registers.X30, imm=0), ), # c.mv x2, x26 ( @@ -170,7 +168,7 @@ ), ), # c.swsp x31, 20 - (0xCA7E, STypeInstr(opcode=Opcode.STORE, imm=C(20, 12), funct3=Funct3.W, rs1=Registers.SP, rs2=Registers.X31)), + (0xCA7E, STypeInstr(opcode=Opcode.STORE, imm=20, funct3=Funct3.W, rs1=Registers.SP, rs2=Registers.X31)), ] RV32_TESTS = [ @@ -179,9 +177,9 @@ # c.sd x14, 0(x13) (0xE298, IllegalInstr()), # c.jal 40 - (0x2025, JTypeInstr(opcode=Opcode.JAL, rd=Registers.RA, imm=C(40, 21))), + (0x2025, JTypeInstr(opcode=Opcode.JAL, rd=Registers.RA, imm=40)), # c.jal -412 - (0x3595, JTypeInstr(opcode=Opcode.JAL, rd=Registers.RA, imm=C(-412, 21))), + (0x3595, JTypeInstr(opcode=Opcode.JAL, rd=Registers.RA, imm=-412)), # c.srli x10, 32 (0x9101, IllegalInstr()), # c.srai x12, 40 @@ -196,13 +194,13 @@ RV64_TESTS = [ # c.ld x8, 8(x9) - (0x6480, ITypeInstr(opcode=Opcode.LOAD, rd=Registers.X8, funct3=Funct3.D, rs1=Registers.X9, imm=C(8, 12))), + (0x6480, ITypeInstr(opcode=Opcode.LOAD, rd=Registers.X8, funct3=Funct3.D, rs1=Registers.X9, imm=8)), # c.sd x14, 0(x13) - (0xE298, STypeInstr(opcode=Opcode.STORE, imm=C(0, 12), funct3=Funct3.D, rs1=Registers.X13, rs2=Registers.X14)), + (0xE298, 
STypeInstr(opcode=Opcode.STORE, imm=0, funct3=Funct3.D, rs1=Registers.X13, rs2=Registers.X14)), # c.addiw x13, -12, ( 0x36D1, - ITypeInstr(opcode=Opcode.OP_IMM_32, rd=Registers.X13, funct3=Funct3.ADD, rs1=Registers.X13, imm=C(-12, 12)), + ITypeInstr(opcode=Opcode.OP_IMM_32, rd=Registers.X13, funct3=Funct3.ADD, rs1=Registers.X13, imm=-12), ), # c.srli x10, 32 ( @@ -212,7 +210,7 @@ rd=Registers.X10, funct3=Funct3.SR, rs1=Registers.X10, - rs2=C(0, 5), + rs2=Registers.X0, funct7=Funct7.SL | 1, ), ), @@ -224,7 +222,7 @@ rd=Registers.X12, funct3=Funct3.SR, rs1=Registers.X12, - rs2=C(8, 5), + rs2=Registers.X8, funct7=Funct7.SA | 1, ), ), @@ -260,14 +258,14 @@ rd=Registers.X13, funct3=Funct3.SLL, rs1=Registers.X13, - rs2=C(31, 5), + rs2=Registers.X31, funct7=Funct7.SL | 1, ), ), # c.ldsp x29, 40 - (0x7EA2, ITypeInstr(opcode=Opcode.LOAD, rd=Registers.X29, funct3=Funct3.D, rs1=Registers.SP, imm=C(40, 12))), + (0x7EA2, ITypeInstr(opcode=Opcode.LOAD, rd=Registers.X29, funct3=Funct3.D, rs1=Registers.SP, imm=40)), # c.sdsp x4, 8 - (0xE412, STypeInstr(opcode=Opcode.STORE, imm=C(8, 12), funct3=Funct3.D, rs1=Registers.SP, rs2=Registers.X4)), + (0xE412, STypeInstr(opcode=Opcode.STORE, imm=8, funct3=Funct3.D, rs1=Registers.SP, rs2=Registers.X4)), ] diff --git a/test/params/test_instr.py b/test/params/test_instr.py new file mode 100644 index 000000000..0ed97e19c --- /dev/null +++ b/test/params/test_instr.py @@ -0,0 +1,63 @@ +import unittest +from typing import Sequence + +from amaranth import * + +from coreblocks.params.instr import * +from coreblocks.frontend.decoder.isa import * + + +class InstructionTest(unittest.TestCase): + def do_run(self, test_cases: Sequence[tuple[RISCVInstr, int]]): + for instr, raw_instr in test_cases: + self.assertEqual(instr.encode(), raw_instr) + + def test_r_type(self): + test_cases = [ + (RTypeInstr(opcode=Opcode.OP, rd=21, funct3=Funct3.AND, rs1=10, rs2=31, funct7=Funct7.AND), 0x1F57AB3), + ] + + self.do_run(test_cases) + + def test_i_type(self): + 
test_cases = [ + (ITypeInstr(opcode=Opcode.LOAD_FP, rd=22, funct3=Funct3.D, rs1=10, imm=2047), 0x7FF53B07), + (ITypeInstr(opcode=Opcode.LOAD_FP, rd=22, funct3=Funct3.D, rs1=10, imm=-2048), 0x80053B07), + ] + + self.do_run(test_cases) + + def test_s_type(self): + test_cases = [ + (STypeInstr(opcode=Opcode.STORE_FP, imm=2047, funct3=Funct3.D, rs1=31, rs2=0), 0x7E0FBFA7), + (STypeInstr(opcode=Opcode.STORE_FP, imm=-2048, funct3=Funct3.D, rs1=5, rs2=13), 0x80D2B027), + ] + + self.do_run(test_cases) + + def test_b_type(self): + test_cases = [ + (BTypeInstr(opcode=Opcode.BRANCH, imm=4094, funct3=Funct3.BNE, rs1=10, rs2=0), 0x7E051FE3), + (BTypeInstr(opcode=Opcode.BRANCH, imm=-4096, funct3=Funct3.BEQ, rs1=31, rs2=4), 0x804F8063), + ] + + self.do_run(test_cases) + + def test_u_type(self): + test_cases = [ + (UTypeInstr(opcode=Opcode.LUI, rd=10, imm=3102 << 12), 0xC1E537), + (UTypeInstr(opcode=Opcode.LUI, rd=31, imm=1048575 << 12), 0xFFFFFFB7), + ] + + self.do_run(test_cases) + + def test_j_type(self): + test_cases = [ + (JTypeInstr(opcode=Opcode.JAL, rd=0, imm=0), 0x6F), + (JTypeInstr(opcode=Opcode.JAL, rd=0, imm=2), 0x20006F), + (JTypeInstr(opcode=Opcode.JAL, rd=10, imm=1048572), 0x7FDFF56F), + (JTypeInstr(opcode=Opcode.JAL, rd=3, imm=-230), 0xF1BFF1EF), + (JTypeInstr(opcode=Opcode.JAL, rd=15, imm=-1048576), 0x800007EF), + ] + + self.do_run(test_cases) diff --git a/test/test_core.py b/test/test_core.py index 44c68fe4c..7bb939ac8 100644 --- a/test/test_core.py +++ b/test/test_core.py @@ -79,8 +79,10 @@ def push_register_load_imm(self, reg_id, val): if val & 0x800: lui_imm = (lui_imm + 1) & (0xFFFFF) - yield from self.push_instr(UTypeInstr.encode(Opcode.LUI, reg_id, lui_imm)) - yield from self.push_instr(ITypeInstr.encode(Opcode.OP_IMM, reg_id, Funct3.ADD, reg_id, addi_imm)) + yield from self.push_instr(UTypeInstr(opcode=Opcode.LUI, rd=reg_id, imm=lui_imm << 12).encode()) + yield from self.push_instr( + ITypeInstr(opcode=Opcode.OP_IMM, rd=reg_id, funct3=Funct3.ADD, 
rs1=reg_id, imm=addi_imm).encode() + ) class TestCoreAsmSourceBase(TestCoreBase): From cc02760c3955695c4710cd25ce5ee64a5a126884 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Urba=C5=84czyk?= Date: Mon, 1 Apr 2024 11:13:13 +0100 Subject: [PATCH 3/5] Fix instruction cache refilling bug (#636) --- coreblocks/cache/icache.py | 77 ++++++++++++++++++++------------------ test/cache/test_icache.py | 24 ++++++++++++ transactron/lib/fifo.py | 8 ++++ transactron/lib/reqres.py | 9 ++++- 4 files changed, 79 insertions(+), 39 deletions(-) diff --git a/coreblocks/cache/icache.py b/coreblocks/cache/icache.py index bcfbd37cc..0b60cf37c 100644 --- a/coreblocks/cache/icache.py +++ b/coreblocks/cache/icache.py @@ -142,14 +142,13 @@ def elaborate(self, platform): ] m.submodules.mem = self.mem = ICacheMemory(self.params) - m.submodules.req_fifo = self.req_fifo = FIFO(layout=self.addr_layout, depth=2) - m.submodules.res_fwd = self.res_fwd = Forwarder(layout=self.layouts.accept_res) + m.submodules.req_zipper = req_zipper = ArgumentsToResultsZipper(self.addr_layout, self.layouts.accept_res) # State machine logic needs_refill = Signal() refill_finish = Signal() - refill_finish_last = Signal() refill_error = Signal() + refill_error_saved = Signal() flush_start = Signal() flush_finish = Signal() @@ -158,6 +157,7 @@ def elaborate(self, platform): self.perf_flushes.incr(m, cond=flush_finish) with m.FSM(reset="FLUSH") as fsm: + with m.State("FLUSH"): with m.If(flush_finish): m.next = "LOOKUP" @@ -180,35 +180,44 @@ def elaborate(self, platform): m.d.sync += way_selector.eq(way_selector.rotate_left(1)) # Fast path - read requests - request_valid = self.req_fifo.read.ready - request_addr = Signal(self.addr_layout) + mem_read_addr = Signal(self.addr_layout) + prev_mem_read_addr = Signal(self.addr_layout) + m.d.comb += assign(mem_read_addr, prev_mem_read_addr) - tag_hit = [tag_data.valid & (tag_data.tag == request_addr.tag) for tag_data in self.mem.tag_rd_data] - tag_hit_any = reduce(operator.or_, 
tag_hit) + mem_read_output_valid = Signal() + with Transaction(name="MemRead").body( + m, request=fsm.ongoing("LOOKUP") & (mem_read_output_valid | refill_error_saved) + ): + req_addr = req_zipper.peek_arg(m) - mem_out = Signal(self.params.fetch_block_bytes * 8) - for i in OneHotSwitchDynamic(m, Cat(tag_hit)): - m.d.comb += mem_out.eq(self.mem.data_rd_data[i]) + tag_hit = [tag_data.valid & (tag_data.tag == req_addr.tag) for tag_data in self.mem.tag_rd_data] + tag_hit_any = reduce(operator.or_, tag_hit) - refill_error_saved = Signal() - m.d.comb += needs_refill.eq(request_valid & ~tag_hit_any & ~refill_error_saved) + with m.If(tag_hit_any | refill_error_saved): + self.perf_hits.incr(m, cond=tag_hit_any) + mem_out = Signal(self.params.fetch_block_bytes * 8) + for i in OneHotSwitchDynamic(m, Cat(tag_hit)): + m.d.av_comb += mem_out.eq(self.mem.data_rd_data[i]) + + req_zipper.write_results(m, fetch_block=mem_out, error=refill_error_saved) + m.d.sync += refill_error_saved.eq(0) + m.d.sync += mem_read_output_valid.eq(0) + with m.Else(): + self.perf_misses.incr(m) - with Transaction().body(m, request=request_valid & fsm.ongoing("LOOKUP") & (tag_hit_any | refill_error_saved)): - self.perf_errors.incr(m, cond=refill_error_saved) - self.perf_misses.incr(m, cond=refill_finish_last) - self.perf_hits.incr(m, cond=~refill_finish_last) + m.d.comb += needs_refill.eq(1) - self.res_fwd.write(m, fetch_block=mem_out, error=refill_error_saved) - m.d.sync += refill_error_saved.eq(0) + # Align to the beginning of the cache line + aligned_addr = self.serialize_addr(req_addr) & ~((1 << self.params.offset_bits) - 1) + log.debug(m, True, "Refilling line 0x{:x}", aligned_addr) + self.refiller.start_refill(m, addr=aligned_addr) @def_method(m, self.accept_res) def _(): - self.req_fifo.read(m) self.req_latency.stop(m) - return self.res_fwd.read(m) - mem_read_addr = Signal(self.addr_layout) - m.d.comb += assign(mem_read_addr, request_addr) + output = req_zipper.read(m) + return output.results 
@def_method(m, self.issue_req, ready=accepting_requests) def _(addr: Value) -> None: @@ -216,11 +225,11 @@ def _(addr: Value) -> None: self.req_latency.start(m) deserialized = self.deserialize_addr(addr) - # Forward read address only if the method is called m.d.comb += assign(mem_read_addr, deserialized) - m.d.sync += assign(request_addr, deserialized) + m.d.sync += assign(prev_mem_read_addr, deserialized) + req_zipper.write_args(m, deserialized) - self.req_fifo.write(m, deserialized) + m.d.sync += mem_read_output_valid.eq(1) m.d.comb += [ self.mem.tag_rd_index.eq(mem_read_addr.index), @@ -242,18 +251,12 @@ def _() -> None: m.d.comb += flush_finish.eq(flush_index == self.params.num_of_sets - 1) # Slow path - data refilling - with Transaction().body(m, request=fsm.ongoing("LOOKUP") & needs_refill): - # Align to the beginning of the cache line - aligned_addr = self.serialize_addr(request_addr) & ~((1 << self.params.offset_bits) - 1) - log.debug(m, True, "Refilling line 0x{:x}", aligned_addr) - self.refiller.start_refill(m, addr=aligned_addr) - - m.d.sync += refill_finish_last.eq(0) - with Transaction().body(m): ret = self.refiller.accept_refill(m) deserialized = self.deserialize_addr(ret.addr) + self.perf_errors.incr(m, cond=ret.error) + m.d.top_comb += [ self.mem.data_wr_addr.index.eq(deserialized["index"]), self.mem.data_wr_addr.offset.eq(deserialized["offset"]), @@ -262,9 +265,9 @@ def _() -> None: m.d.comb += self.mem.data_wr_en.eq(1) m.d.comb += refill_finish.eq(ret.last) - m.d.sync += refill_finish_last.eq(1) m.d.comb += refill_error.eq(ret.error) - m.d.sync += refill_error_saved.eq(ret.error) + with m.If(ret.error): + m.d.sync += refill_error_saved.eq(1) with m.If(fsm.ongoing("FLUSH")): m.d.comb += [ @@ -277,9 +280,9 @@ def _() -> None: with m.Else(): m.d.comb += [ self.mem.way_wr_en.eq(way_selector), - self.mem.tag_wr_index.eq(request_addr.index), + self.mem.tag_wr_index.eq(mem_read_addr.index), self.mem.tag_wr_data.valid.eq(~refill_error), - 
self.mem.tag_wr_data.tag.eq(mem_read_addr.tag), self.mem.tag_wr_en.eq(refill_finish), ] diff --git a/test/cache/test_icache.py b/test/cache/test_icache.py index 43f800a5e..f53cff894 100644 --- a/test/cache/test_icache.py +++ b/test/cache/test_icache.py @@ -715,6 +715,30 @@ def cache_process(): yield from self.expect_resp(wait=True) yield yield from self.m.accept_res.disable() + yield + + # The second request will cause an error + yield from self.send_req(addr=0x00021004) + yield from self.send_req(addr=0x00030000) + + yield from self.tick(10) + + # Accept the first response + yield from self.m.accept_res.enable() + yield from self.expect_resp(wait=True) + yield + + # Wait before accepting the second response + yield from self.m.accept_res.disable() + yield from self.tick(10) + yield from self.m.accept_res.enable() + yield from self.expect_resp(wait=True) + + yield + + # This request should not cause an error + yield from self.send_req(addr=0x00011000) + yield from self.expect_resp(wait=True) with self.run_simulation(self.m) as sim: sim.add_sync_process(cache_process) diff --git a/transactron/lib/fifo.py b/transactron/lib/fifo.py index 92ac0f7bb..24cacfadc 100644 --- a/transactron/lib/fifo.py +++ b/transactron/lib/fifo.py @@ -13,6 +13,9 @@ class BasicFifo(Elaboratable): read: Method Reads from the FIFO. Accepts an empty argument, returns a structure. Ready only if the FIFO is not empty. + peek: Method + Returns the element at the front without removing it. Ready only if the FIFO + is not empty. The method is nonexclusive. write: Method Writes to the FIFO. Accepts a structure, returns empty result. Ready only if the FIFO is not full. 
@@ -40,6 +43,7 @@ def __init__(self, layout: MethodLayout, depth: int, *, src_loc: int | SrcLoc = src_loc = get_src_loc(src_loc) self.read = Method(o=self.layout, src_loc=src_loc) + self.peek = Method(o=self.layout, nonexclusive=True, src_loc=src_loc) self.write = Method(i=self.layout, src_loc=src_loc) self.clear = Method(src_loc=src_loc) self.head = Signal(from_method_layout(layout)) @@ -93,6 +97,10 @@ def _() -> ValueLike: m.d.sync += self.read_idx.eq(next_read_idx) return self.head + @def_method(m, self.peek, self.read_ready) + def _() -> ValueLike: + return self.head + @def_method(m, self.clear) def _() -> None: m.d.sync += self.read_idx.eq(0) diff --git a/transactron/lib/reqres.py b/transactron/lib/reqres.py index f9aeb6e06..a3f6e2908 100644 --- a/transactron/lib/reqres.py +++ b/transactron/lib/reqres.py @@ -1,7 +1,7 @@ from amaranth import * from ..core import * from ..utils import SrcLoc, get_src_loc, MethodLayout -from .connectors import Forwarder, FIFO +from .connectors import Forwarder from transactron.lib import BasicFifo from amaranth.utils import * @@ -39,6 +39,8 @@ class ArgumentsToResultsZipper(Elaboratable): Attributes ---------- + peek_arg: Method + A nonexclusive method to read (but not delete) the head of the arg queue. write_args: Method Method to write arguments with `args_layout` format to 2-FIFO. 
write_results: Method @@ -65,6 +67,7 @@ def __init__(self, args_layout: MethodLayout, results_layout: MethodLayout, src_ self.args_layout = args_layout self.output_layout = [("args", self.args_layout), ("results", results_layout)] + self.peek_arg = Method(o=self.args_layout, nonexclusive=True, src_loc=self.src_loc) self.write_args = Method(i=self.args_layout, src_loc=self.src_loc) self.write_results = Method(i=self.results_layout, src_loc=self.src_loc) self.read = Method(o=self.output_layout, src_loc=self.src_loc) @@ -72,7 +75,7 @@ def __init__(self, args_layout: MethodLayout, results_layout: MethodLayout, src_ def elaborate(self, platform): m = TModule() - fifo = FIFO(self.args_layout, depth=2, src_loc=self.src_loc) + fifo = BasicFifo(self.args_layout, depth=2, src_loc=self.src_loc) forwarder = Forwarder(self.results_layout, src_loc=self.src_loc) m.submodules.fifo = fifo @@ -92,6 +95,8 @@ def _(): results = forwarder.read(m) return {"args": args, "results": results} + self.peek_arg.proxy(m, fifo.peek) + return m From 6ef2f847cc9d34d2c051458a28d4552249e473f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Urba=C5=84czyk?= Date: Mon, 1 Apr 2024 11:38:45 +0100 Subject: [PATCH 4/5] Add TaggedCounter (#637) --- coreblocks/func_blocks/fu/alu.py | 11 ++ coreblocks/func_blocks/fu/jumpbranch.py | 18 ++-- test/transactron/test_metrics.py | 82 +++++++++++++++ transactron/lib/metrics.py | 130 +++++++++++++++++++++++- 4 files changed, 230 insertions(+), 11 deletions(-) diff --git a/coreblocks/func_blocks/fu/alu.py b/coreblocks/func_blocks/fu/alu.py index adfcc6a3f..d824cacb3 100644 --- a/coreblocks/func_blocks/fu/alu.py +++ b/coreblocks/func_blocks/fu/alu.py @@ -3,6 +3,7 @@ from transactron import * from transactron.lib import FIFO +from transactron.lib.metrics import * from coreblocks.frontend.decoder.isa import Funct3, Funct7 from coreblocks.frontend.decoder.optypes import OpType @@ -219,9 +220,17 @@ def __init__(self, gen_params: GenParams, alu_fn=AluFn()): self.issue = 
Method(i=layouts.issue) self.accept = Method(o=layouts.accept) + self.perf_instr = TaggedCounter( + "backend.fu.alu.instr", + "Counts of instructions executed by the jumpbranch unit", + tags=AluFn.Fn, + ) + def elaborate(self, platform): m = TModule() + m.submodules += [self.perf_instr] + m.submodules.alu = alu = Alu(self.gen_params, alu_fn=self.alu_fn) m.submodules.fifo = fifo = FIFO(self.gen_params.get(FuncUnitLayouts).accept, 2) m.submodules.decoder = decoder = self.alu_fn.get_decoder(self.gen_params) @@ -238,6 +247,8 @@ def _(arg): m.d.comb += alu.in1.eq(arg.s1_val) m.d.comb += alu.in2.eq(Mux(arg.imm, arg.imm, arg.s2_val)) + self.perf_instr.incr(m, decoder.decode_fn) + fifo.write(m, rob_id=arg.rob_id, result=alu.out, rp_dst=arg.rp_dst, exception=0) return m diff --git a/coreblocks/func_blocks/fu/jumpbranch.py b/coreblocks/func_blocks/fu/jumpbranch.py index aeb6fed22..9730650ee 100644 --- a/coreblocks/func_blocks/fu/jumpbranch.py +++ b/coreblocks/func_blocks/fu/jumpbranch.py @@ -136,8 +136,11 @@ def __init__(self, gen_params: GenParams, jb_fn=JumpBranchFn()): self.dm = gen_params.get(DependencyManager) self.dm.add_dependency(BranchVerifyKey(), self.fifo_branch_resolved.read) - self.perf_jumps = HwCounter("backend.fu.jumpbranch.jumps", "Number of jump instructions issued") - self.perf_branches = HwCounter("backend.fu.jumpbranch.branches", "Number of branch instructions issued") + self.perf_instr = TaggedCounter( + "backend.fu.jumpbranch.instr", + "Counts of instructions executed by the jumpbranch unit", + tags=JumpBranchFn.Fn, + ) self.perf_misaligned = HwCounter( "backend.fu.jumpbranch.misaligned", "Number of instructions with misaligned target address" ) @@ -145,7 +148,10 @@ def __init__(self, gen_params: GenParams, jb_fn=JumpBranchFn()): def elaborate(self, platform): m = TModule() - m.submodules += [self.perf_jumps, self.perf_branches, self.perf_misaligned] + m.submodules += [ + self.perf_instr, + self.perf_misaligned, + ] m.submodules.jb = jb = 
JumpBranch(self.gen_params, fn=self.jb_fn) m.submodules.fifo_res = fifo_res = FIFO(self.gen_params.get(FuncUnitLayouts).accept, 2) @@ -169,12 +175,10 @@ def _(arg): m.d.top_comb += jb.in_rvc.eq(arg.exec_fn.funct7) is_auipc = decoder.decode_fn == JumpBranchFn.Fn.AUIPC - is_jump = (decoder.decode_fn == JumpBranchFn.Fn.JAL) | (decoder.decode_fn == JumpBranchFn.Fn.JALR) jump_result = Mux(jb.taken, jb.jmp_addr, jb.reg_res) - self.perf_jumps.incr(m, cond=is_jump) - self.perf_branches.incr(m, cond=(~is_jump & ~is_auipc)) + self.perf_instr.incr(m, decoder.decode_fn) exception = Signal() exception_report = self.dm.get_dependency(ExceptionReportKey()) @@ -216,7 +220,7 @@ def _(arg): log.debug( m, True, - "jumping from 0x{:08x} to 0x{:08x}; misprediction: {}", + "branch resolved from 0x{:08x} to 0x{:08x}; misprediction: {}", jb.in_pc, jump_result, misprediction, diff --git a/test/transactron/test_metrics.py b/test/transactron/test_metrics.py index 12acdfd27..7a91616dd 100644 --- a/test/transactron/test_metrics.py +++ b/test/transactron/test_metrics.py @@ -1,6 +1,9 @@ import json import random import queue +from typing import Type +from enum import IntFlag, IntEnum, auto, Enum + from parameterized import parameterized_class from amaranth import * @@ -138,6 +141,85 @@ def test_process(): sim.add_sync_process(test_process) +class OneHotEnum(IntFlag): + ADD = auto() + XOR = auto() + OR = auto() + + +class PlainIntEnum(IntEnum): + TEST_1 = auto() + TEST_2 = auto() + TEST_3 = auto() + + +class TaggedCounterCircuit(Elaboratable): + def __init__(self, tags: range | Type[Enum] | list[int]): + self.counter = TaggedCounter("counter", "", tags=tags) + + self.cond = Signal() + self.tag = Signal(self.counter.tag_width) + + def elaborate(self, platform): + m = TModule() + + m.submodules.counter = self.counter + + with Transaction().body(m): + self.counter.incr(m, self.tag, cond=self.cond) + + return m + + +class TestTaggedCounter(TestCaseWithSimulator): + def setUp(self) -> None: + 
random.seed(42) + + def do_test_enum(self, tags: range | Type[Enum] | list[int], tag_values: list[int]): + m = TaggedCounterCircuit(tags) + DependencyContext.get().add_dependency(HwMetricsEnabledKey(), True) + + counts: dict[int, int] = {} + for i in tag_values: + counts[i] = 0 + + def test_process(): + for _ in range(200): + for i in tag_values: + self.assertEqual(counts[i], (yield m.counter.counters[i].value)) + + tag = random.choice(list(tag_values)) + + yield m.cond.eq(1) + yield m.tag.eq(tag) + yield + yield m.cond.eq(0) + yield + + counts[tag] += 1 + + with self.run_simulation(m) as sim: + sim.add_sync_process(test_process) + + def test_one_hot_enum(self): + self.do_test_enum(OneHotEnum, [e.value for e in OneHotEnum]) + + def test_plain_int_enum(self): + self.do_test_enum(PlainIntEnum, [e.value for e in PlainIntEnum]) + + def test_negative_range(self): + r = range(-10, 15, 3) + self.do_test_enum(r, list(r)) + + def test_positive_range(self): + r = range(0, 30, 2) + self.do_test_enum(r, list(r)) + + def test_value_list(self): + values = [-2137, 2, 4, 8, 42] + self.do_test_enum(values, values) + + class ExpHistogramCircuit(Elaboratable): def __init__(self, bucket_cnt: int, sample_width: int): self.sample_width = sample_width diff --git a/transactron/lib/metrics.py b/transactron/lib/metrics.py index 2e706e0a3..f3d5b9e0d 100644 --- a/transactron/lib/metrics.py +++ b/transactron/lib/metrics.py @@ -1,14 +1,14 @@ from dataclasses import dataclass, field from dataclasses_json import dataclass_json -from typing import Optional +from typing import Optional, Type from abc import ABC +from enum import Enum from amaranth import * -from amaranth.utils import bits_for +from amaranth.utils import bits_for, ceil_log2, exact_log2 -from transactron.utils import ValueLike +from transactron.utils import ValueLike, OneHotSwitchDynamic, SignalBundle from transactron import Method, def_method, TModule -from transactron.utils import SignalBundle from transactron.lib import FIFO from 
transactron.utils.dependencies import ListKey, DependencyContext, SimpleKey @@ -17,6 +17,7 @@ "MetricModel", "HwMetric", "HwCounter", + "TaggedCounter", "HwExpHistogram", "LatencyMeasurer", "HardwareMetricsManager", @@ -230,6 +231,127 @@ def incr(self, m: TModule, *, cond: ValueLike = C(1)): self._incr(m) +class TaggedCounter(Elaboratable, HwMetric): + """Hardware Tagged Counter + + Like HwCounter, but contains multiple counters, each with its own tag. + Only one counter can be increased at a time and the value of the tag + can be provided dynamically. The type of the tag can be either an int + enum, a range or a list of integers (negative numbers are ok). + + Internally, it detects if tag values can be one-hot encoded and if so, + it generates a more optimized circuit. + + Attributes + ---------- + tag_width: int + The length of the signal holding a tag value. + one_hot: bool + Whether tag values can be one-hot encoded. + counters: dict[int, HwMetricRegister] + Mapping from a tag value to a register holding a counter for that tag. + """ + + def __init__( + self, + fully_qualified_name: str, + description: str = "", + *, + tags: range | Type[Enum] | list[int], + registers_width: int = 32, + ): + """ + Parameters + ---------- + fully_qualified_name: str + The fully qualified name of the metric. + description: str + A human-readable description of the metric's functionality. + tags: range | Type[Enum] | list[int] + Tag values. + registers_width: int + Width of the underlying registers. Defaults to 32 bits.
+ """ + + super().__init__(fully_qualified_name, description) + + if isinstance(tags, range) or isinstance(tags, list): + counters_meta = [(i, f"{i}") for i in tags] + else: + counters_meta = [(i.value, i.name) for i in tags] + + values = [value for value, _ in counters_meta] + self.tag_width = max(bits_for(max(values)), bits_for(min(values))) + + self.one_hot = True + negative_values = False + for value in values: + if value < 0: + self.one_hot = False + negative_values = True + break + + log = ceil_log2(value) + if 2**log != value: + self.one_hot = False + + self._incr = Method(i=[("tag", Shape(self.tag_width, signed=negative_values))]) + + self.counters: dict[int, HwMetricRegister] = {} + for tag_value, name in counters_meta: + value_str = ("1<<" + str(exact_log2(tag_value))) if self.one_hot else str(tag_value) + description = f"the counter for tag {name} (value={value_str})" + + self.counters[tag_value] = HwMetricRegister( + name, + registers_width, + description, + ) + + self.add_registers(list(self.counters.values())) + + def elaborate(self, platform): + if not self.metrics_enabled(): + return TModule() + + m = TModule() + + @def_method(m, self._incr) + def _(tag): + if self.one_hot: + sorted_tags = sorted(list(self.counters.keys())) + for i in OneHotSwitchDynamic(m, tag): + counter = self.counters[sorted_tags[i]] + m.d.sync += counter.value.eq(counter.value + 1) + else: + for tag_value, counter in self.counters.items(): + with m.If(tag == tag_value): + m.d.sync += counter.value.eq(counter.value + 1) + + return m + + def incr(self, m: TModule, tag: ValueLike, *, cond: ValueLike = C(1)): + """ + Increases the counter of a given tag by 1. + + Should be called in the body of either a transaction or a method. + + Parameters + ---------- + m: TModule + Transactron module + tag: ValueLike + The tag of the counter. + cond: ValueLike + When set to high, the counter will be increased. By default set to high. 
+ """ + if not self.metrics_enabled(): + return + + with m.If(cond): + self._incr(m, tag) + + class HwExpHistogram(Elaboratable, HwMetric): """Hardware Exponential Histogram From f8add3c53d5eaf7dd1107792e63f79396e9e3578 Mon Sep 17 00:00:00 2001 From: Marek Materzok Date: Mon, 1 Apr 2024 16:49:33 +0200 Subject: [PATCH 5/5] More metrics for RF, RS and ROB (#632) --- coreblocks/cache/icache.py | 2 +- coreblocks/core_structs/rf.py | 29 +++- coreblocks/core_structs/rob.py | 18 ++- coreblocks/func_blocks/csr/csr.py | 1 + coreblocks/func_blocks/fu/common/rs.py | 32 +++- .../func_blocks/fu/common/rs_func_block.py | 13 +- coreblocks/func_blocks/lsu/dummyLsu.py | 2 + coreblocks/params/configurations.py | 6 + coreblocks/params/fu_params.py | 2 + test/regression/cocotb/benchmark.Makefile | 2 +- test/regression/cocotb/signature.Makefile | 2 +- test/regression/cocotb/test.Makefile | 2 +- test/scheduler/test_scheduler.py | 2 +- test/scheduler/test_wakeup_select.py | 4 +- test/structs_common/test_rs.py | 12 +- test/transactions/test_transaction_lib.py | 45 +++++- test/transactron/test_metrics.py | 123 ++++++++++++--- transactron/lib/metrics.py | 146 +++++++++++++++++- transactron/lib/storage.py | 76 ++++++++- 19 files changed, 466 insertions(+), 53 deletions(-) diff --git a/coreblocks/cache/icache.py b/coreblocks/cache/icache.py index 0b60cf37c..08cd51784 100644 --- a/coreblocks/cache/icache.py +++ b/coreblocks/cache/icache.py @@ -115,7 +115,7 @@ def __init__(self, layouts: ICacheLayouts, params: ICacheParameters, refiller: C self.perf_misses = HwCounter("frontend.icache.misses") self.perf_errors = HwCounter("frontend.icache.fetch_errors") self.perf_flushes = HwCounter("frontend.icache.flushes") - self.req_latency = LatencyMeasurer( + self.req_latency = FIFOLatencyMeasurer( "frontend.icache.req_latency", "Latencies of cache requests", slots_number=2, max_latency=500 ) diff --git a/coreblocks/core_structs/rf.py b/coreblocks/core_structs/rf.py index f7a9b8a7f..d6d5e76e8 100644 --- 
a/coreblocks/core_structs/rf.py +++ b/coreblocks/core_structs/rf.py @@ -1,7 +1,9 @@ from amaranth import * -from transactron import Method, def_method, TModule +from transactron import Method, Transaction, def_method, TModule from coreblocks.interface.layouts import RFLayouts from coreblocks.params import GenParams +from transactron.lib.metrics import HwExpHistogram, TaggedLatencyMeasurer +from transactron.utils.amaranth_ext.functions import popcount from transactron.utils.transactron_helpers import make_layout __all__ = ["RegisterFile"] @@ -20,9 +22,24 @@ def __init__(self, *, gen_params: GenParams): self.write = Method(i=layouts.rf_write) self.free = Method(i=layouts.rf_free) + self.perf_rf_valid_time = TaggedLatencyMeasurer( + "struct.rf.valid_time", + description="Distribution of time registers are valid in RF", + slots_number=2**gen_params.phys_regs_bits, + max_latency=1000, + ) + self.perf_num_valid = HwExpHistogram( + "struct.rf.num_valid", + description="Number of valid registers in RF", + bucket_count=gen_params.phys_regs_bits + 1, + sample_width=gen_params.phys_regs_bits + 1, + ) + def elaborate(self, platform): m = TModule() + m.submodules += [self.perf_rf_valid_time, self.perf_num_valid] + being_written = Signal(self.gen_params.phys_regs_bits) written_value = Signal(self.gen_params.isa.xlen) @@ -56,10 +73,20 @@ def _(reg_id: Value, reg_val: Value): with m.If(~(zero_reg)): m.d.sync += self.entries[reg_id].reg_val.eq(reg_val) m.d.sync += self.entries[reg_id].valid.eq(1) + self.perf_rf_valid_time.start(m, slot=reg_id) @def_method(m, self.free) def _(reg_id: Value): with m.If(reg_id != 0): m.d.sync += self.entries[reg_id].valid.eq(0) + self.perf_rf_valid_time.stop(m, slot=reg_id) + + if self.perf_num_valid.metrics_enabled(): + num_valid = Signal(self.gen_params.phys_regs_bits + 1) + m.d.comb += num_valid.eq( + popcount(Cat(self.entries[reg_id].valid for reg_id in range(2**self.gen_params.phys_regs_bits))) + ) + with Transaction(name="perf").body(m): + 
self.perf_num_valid.add(m, num_valid) return m diff --git a/coreblocks/core_structs/rob.py b/coreblocks/core_structs/rob.py index 1f3806d46..25b14bab3 100644 --- a/coreblocks/core_structs/rob.py +++ b/coreblocks/core_structs/rob.py @@ -1,5 +1,5 @@ from amaranth import * -from transactron import Method, def_method, TModule +from transactron import Method, Transaction, def_method, TModule from transactron.lib.metrics import * from coreblocks.interface.layouts import ROBLayouts from coreblocks.params import GenParams @@ -18,17 +18,23 @@ def __init__(self, gen_params: GenParams) -> None: self.data = Array(Signal(layouts.internal_layout) for _ in range(2**gen_params.rob_entries_bits)) self.get_indices = Method(o=layouts.get_indices, nonexclusive=True) - self.perf_rob_wait_time = LatencyMeasurer( + self.perf_rob_wait_time = FIFOLatencyMeasurer( "backend.rob.wait_time", description="Distribution of time instructions spend in ROB", slots_number=(2**gen_params.rob_entries_bits + 1), max_latency=1000, ) + self.perf_rob_size = HwExpHistogram( + "backend.rob.size", + description="Number of instructions in ROB", + bucket_count=gen_params.rob_entries_bits + 1, + sample_width=gen_params.rob_entries_bits, + ) def elaborate(self, platform): m = TModule() - m.submodules += [self.perf_rob_wait_time] + m.submodules += [self.perf_rob_wait_time, self.perf_rob_size] start_idx = Signal(self.params.rob_entries_bits) end_idx = Signal(self.params.rob_entries_bits) @@ -70,4 +76,10 @@ def _(rob_id: Value, exception): def _(): return {"start": start_idx, "end": end_idx} + if self.perf_rob_size.metrics_enabled(): + rob_size = Signal(self.params.rob_entries_bits) + m.d.comb += rob_size.eq((end_idx - start_idx)[0 : self.params.rob_entries_bits]) + with Transaction(name="perf").body(m): + self.perf_rob_size.add(m, rob_size) + return m diff --git a/coreblocks/func_blocks/csr/csr.py b/coreblocks/func_blocks/csr/csr.py index 43ddfe957..697de5c63 100644 --- a/coreblocks/func_blocks/csr/csr.py +++ 
b/coreblocks/func_blocks/csr/csr.py @@ -236,6 +236,7 @@ def _(rob_id: Value, side_fx: Value): return m +@dataclass(frozen=True) class CSRBlockComponent(BlockComponentParams): def get_module(self, gen_params: GenParams) -> FuncBlock: connections = gen_params.get(DependencyManager) diff --git a/coreblocks/func_blocks/fu/common/rs.py b/coreblocks/func_blocks/fu/common/rs.py index 56287df27..1911690b4 100644 --- a/coreblocks/func_blocks/fu/common/rs.py +++ b/coreblocks/func_blocks/fu/common/rs.py @@ -2,11 +2,13 @@ from typing import Optional from amaranth import * from amaranth.lib.coding import PriorityEncoder -from transactron import Method, def_method, TModule +from transactron import Method, Transaction, def_method, TModule from coreblocks.params import GenParams from coreblocks.frontend.decoder import OpType from coreblocks.interface.layouts import RSLayouts +from transactron.lib.metrics import HwExpHistogram, TaggedLatencyMeasurer from transactron.utils import RecordDict +from transactron.utils.amaranth_ext.functions import popcount from transactron.utils.transactron_helpers import make_layout __all__ = ["RS"] @@ -14,7 +16,11 @@ class RS(Elaboratable): def __init__( - self, gen_params: GenParams, rs_entries: int, ready_for: Optional[Iterable[Iterable[OpType]]] = None + self, + gen_params: GenParams, + rs_entries: int, + rs_number: int, + ready_for: Optional[Iterable[Iterable[OpType]]] = None, ) -> None: ready_for = ready_for or ((op for op in OpType),) self.gen_params = gen_params @@ -38,10 +44,24 @@ def __init__( self.data = Array(Signal(self.internal_layout) for _ in range(self.rs_entries)) self.data_ready = Signal(self.rs_entries) + self.perf_rs_wait_time = TaggedLatencyMeasurer( + f"fu.block_{rs_number}.rs.valid_time", + description=f"Distribution of time instructions wait in RS {rs_number}", + slots_number=2**self.rs_entries_bits, + max_latency=1000, + ) + self.perf_num_full = HwExpHistogram( + f"fu.block_{rs_number}.rs.num_full", + description=f"Number of 
full entries in RS {rs_number}", + bucket_count=self.rs_entries_bits + 1, + sample_width=self.rs_entries_bits + 1, + ) + def elaborate(self, platform): m = TModule() m.submodules.enc_select = PriorityEncoder(width=self.rs_entries) + m.submodules += [self.perf_rs_wait_time, self.perf_num_full] for i, record in enumerate(self.data): m.d.comb += self.data_ready[i].eq( @@ -71,6 +91,7 @@ def _(rs_entry_id: Value, rs_data: Value) -> None: m.d.sync += self.data[rs_entry_id].rs_data.eq(rs_data) m.d.sync += self.data[rs_entry_id].rec_full.eq(1) m.d.sync += self.data[rs_entry_id].rec_reserved.eq(1) + self.perf_rs_wait_time.start(m, slot=rs_entry_id) @def_method(m, self.update) def _(reg_id: Value, reg_val: Value) -> None: @@ -89,6 +110,7 @@ def _(rs_entry_id: Value) -> RecordDict: record = self.data[rs_entry_id] m.d.sync += record.rec_reserved.eq(0) m.d.sync += record.rec_full.eq(0) + self.perf_rs_wait_time.stop(m, slot=rs_entry_id) return { "s1_val": record.rs_data.s1_val, "s2_val": record.rs_data.s2_val, @@ -105,4 +127,10 @@ def _(rs_entry_id: Value) -> RecordDict: def _() -> RecordDict: return {"ready_list": ready_list} + if self.perf_num_full.metrics_enabled(): + num_full = Signal(self.rs_entries_bits + 1) + m.d.comb += num_full.eq(popcount(Cat(self.data[entry_id].rec_full for entry_id in range(self.rs_entries)))) + with Transaction(name="perf").body(m): + self.perf_num_full.add(m, num_full) + return m diff --git a/coreblocks/func_blocks/fu/common/rs_func_block.py b/coreblocks/func_blocks/fu/common/rs_func_block.py index 66fed3d0e..35801dc12 100644 --- a/coreblocks/func_blocks/fu/common/rs_func_block.py +++ b/coreblocks/func_blocks/fu/common/rs_func_block.py @@ -31,7 +31,9 @@ class RSFuncBlock(FuncBlock, Elaboratable): layout described by `FuncUnitLayouts`. 
""" - def __init__(self, gen_params: GenParams, func_units: Iterable[tuple[FuncUnit, set[OpType]]], rs_entries: int): + def __init__( + self, gen_params: GenParams, func_units: Iterable[tuple[FuncUnit, set[OpType]]], rs_entries: int, rs_number: int + ): """ Parameters ---------- @@ -41,10 +43,13 @@ def __init__(self, gen_params: GenParams, func_units: Iterable[tuple[FuncUnit, s Functional units to be used by this module. rs_entries: int Number of entries in RS. + rs_number: int + The number of this RS block. Used for debugging. """ self.gen_params = gen_params self.rs_entries = rs_entries self.rs_entries_bits = (rs_entries - 1).bit_length() + self.rs_number = rs_number self.rs_layouts = gen_params.get(RSLayouts, rs_entries_bits=self.rs_entries_bits) self.fu_layouts = gen_params.get(FuncUnitLayouts) self.func_units = list(func_units) @@ -60,6 +65,7 @@ def elaborate(self, platform): m.submodules.rs = self.rs = RS( gen_params=self.gen_params, rs_entries=self.rs_entries, + rs_number=self.rs_number, ready_for=(optypes for _, optypes in self.func_units), ) @@ -87,10 +93,13 @@ def elaborate(self, platform): class RSBlockComponent(BlockComponentParams): func_units: Collection[FunctionalComponentParams] rs_entries: int + rs_number: int = -1 # overwritten by CoreConfiguration def get_module(self, gen_params: GenParams) -> FuncBlock: modules = list((u.get_module(gen_params), u.get_optypes()) for u in self.func_units) - rs_unit = RSFuncBlock(gen_params=gen_params, func_units=modules, rs_entries=self.rs_entries) + rs_unit = RSFuncBlock( + gen_params=gen_params, func_units=modules, rs_entries=self.rs_entries, rs_number=self.rs_number + ) return rs_unit def get_optypes(self) -> set[OpType]: diff --git a/coreblocks/func_blocks/lsu/dummyLsu.py b/coreblocks/func_blocks/lsu/dummyLsu.py index ccda62e32..08a5d8604 100644 --- a/coreblocks/func_blocks/lsu/dummyLsu.py +++ b/coreblocks/func_blocks/lsu/dummyLsu.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from amaranth import * 
from amaranth.lib.data import View @@ -320,6 +321,7 @@ def _(rob_id: Value, side_fx: Value): return m +@dataclass(frozen=True) class LSUBlockComponent(BlockComponentParams): def get_module(self, gen_params: GenParams) -> FuncBlock: connections = gen_params.get(DependencyManager) diff --git a/coreblocks/params/configurations.py b/coreblocks/params/configurations.py index c2d51a1ca..1d17289f5 100644 --- a/coreblocks/params/configurations.py +++ b/coreblocks/params/configurations.py @@ -74,6 +74,12 @@ class CoreConfiguration: Definitions of PMAs per contiguous segments of memory. """ + def __post_init__(self): + self.func_units_config = [ + dataclasses.replace(conf, rs_number=k) if hasattr(conf, "rs_number") else conf + for k, conf in enumerate(self.func_units_config) + ] + xlen: int = 32 func_units_config: Collection[BlockComponentParams] = basic_configuration diff --git a/coreblocks/params/fu_params.py b/coreblocks/params/fu_params.py index 297e9e9fc..4884d7c9f 100644 --- a/coreblocks/params/fu_params.py +++ b/coreblocks/params/fu_params.py @@ -1,4 +1,5 @@ from abc import abstractmethod, ABC +from dataclasses import dataclass from collections.abc import Collection, Iterable from coreblocks.func_blocks.interface.func_protocols import FuncBlock, FuncUnit @@ -20,6 +21,7 @@ ] +@dataclass(frozen=True) class BlockComponentParams(ABC): @abstractmethod def get_module(self, gen_params: "GenParams") -> FuncBlock: diff --git a/test/regression/cocotb/benchmark.Makefile b/test/regression/cocotb/benchmark.Makefile index 9962315fb..e49b55b39 100644 --- a/test/regression/cocotb/benchmark.Makefile +++ b/test/regression/cocotb/benchmark.Makefile @@ -14,7 +14,7 @@ SIM_BUILD = build/benchmark # Yosys/Amaranth borkedness workaround ifeq ($(SIM),verilator) - EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC + EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC -Wno-UNSIGNED BUILD_ARGS += -j`nproc` endif diff --git 
a/test/regression/cocotb/signature.Makefile b/test/regression/cocotb/signature.Makefile index b4f690635..a03d0a5f8 100644 --- a/test/regression/cocotb/signature.Makefile +++ b/test/regression/cocotb/signature.Makefile @@ -14,7 +14,7 @@ SIM_BUILD = build/signature # Yosys/Amaranth borkedness workaround ifeq ($(SIM),verilator) - EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC + EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC -Wno-UNSIGNED BUILD_ARGS += -j`nproc` endif diff --git a/test/regression/cocotb/test.Makefile b/test/regression/cocotb/test.Makefile index 210618067..5b9f7aad9 100644 --- a/test/regression/cocotb/test.Makefile +++ b/test/regression/cocotb/test.Makefile @@ -14,7 +14,7 @@ SIM_BUILD = build/test # Yosys/Amaranth borkedness workaround ifeq ($(SIM),verilator) - EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC + EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC -Wno-UNSIGNED BUILD_ARGS += -j`nproc` endif diff --git a/test/scheduler/test_scheduler.py b/test/scheduler/test_scheduler.py index 3c50efab6..2fcf54a50 100644 --- a/test/scheduler/test_scheduler.py +++ b/test/scheduler/test_scheduler.py @@ -127,7 +127,7 @@ def setUp(self): self.rs_count = len(self.optype_sets) self.gen_params = GenParams( test_core_config.replace( - func_units_config=tuple(RSBlockComponent([], rs_entries=4) for _ in range(self.rs_count)) + func_units_config=tuple(RSBlockComponent([], rs_entries=4, rs_number=k) for k in range(self.rs_count)) ) ) self.expected_rename_queue = deque() diff --git a/test/scheduler/test_wakeup_select.py b/test/scheduler/test_wakeup_select.py index 4ff298da9..3e406e1af 100644 --- a/test/scheduler/test_wakeup_select.py +++ b/test/scheduler/test_wakeup_select.py @@ -43,7 +43,9 @@ def elaborate(self, platform): class TestWakeupSelect(TestCaseWithSimulator): def setUp(self): self.gen_params = GenParams( - 
test_core_config.replace(func_units_config=tuple(RSBlockComponent([], rs_entries=16) for _ in range(2))) + test_core_config.replace( + func_units_config=tuple(RSBlockComponent([], rs_entries=16, rs_number=k) for k in range(2)) + ) ) self.m = WakeupTestCircuit(self.gen_params) self.cycles = 50 diff --git a/test/structs_common/test_rs.py b/test/structs_common/test_rs.py index 4e86a46de..c62852cb0 100644 --- a/test/structs_common/test_rs.py +++ b/test/structs_common/test_rs.py @@ -24,7 +24,7 @@ class TestRSMethodInsert(TestCaseWithSimulator): def test_insert(self): self.gen_params = GenParams(test_core_config) self.rs_entries_bits = self.gen_params.max_rs_entries_bits - self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, None)) + self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, 0, None)) self.insert_list = [ { "rs_entry_id": id, @@ -69,7 +69,7 @@ class TestRSMethodSelect(TestCaseWithSimulator): def test_select(self): self.gen_params = GenParams(test_core_config) self.rs_entries_bits = self.gen_params.max_rs_entries_bits - self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, None)) + self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, 0, None)) self.insert_list = [ { "rs_entry_id": id, @@ -132,7 +132,7 @@ class TestRSMethodUpdate(TestCaseWithSimulator): def test_update(self): self.gen_params = GenParams(test_core_config) self.rs_entries_bits = self.gen_params.max_rs_entries_bits - self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, None)) + self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, 0, None)) self.insert_list = [ { "rs_entry_id": id, @@ -223,7 +223,7 @@ class TestRSMethodTake(TestCaseWithSimulator): def test_take(self): self.gen_params = GenParams(test_core_config) self.rs_entries_bits = self.gen_params.max_rs_entries_bits - self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, None)) + self.m = 
SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, 0, None)) self.insert_list = [ { "rs_entry_id": id, @@ -322,7 +322,7 @@ class TestRSMethodGetReadyList(TestCaseWithSimulator): def test_get_ready_list(self): self.gen_params = GenParams(test_core_config) self.rs_entries_bits = self.gen_params.max_rs_entries_bits - self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, None)) + self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, 0, None)) self.insert_list = [ { "rs_entry_id": id, @@ -378,7 +378,7 @@ def test_two_get_ready_lists(self): self.rs_entries = self.gen_params.max_rs_entries self.rs_entries_bits = self.gen_params.max_rs_entries_bits self.m = SimpleTestCircuit( - RS(self.gen_params, 2**self.rs_entries_bits, [[OpType(1), OpType(2)], [OpType(3), OpType(4)]]) + RS(self.gen_params, 2**self.rs_entries_bits, 0, [[OpType(1), OpType(2)], [OpType(3), OpType(4)]]) ) self.insert_list = [ { diff --git a/test/transactions/test_transaction_lib.py b/test/transactions/test_transaction_lib.py index c8e758ce7..78119067f 100644 --- a/test/transactions/test_transaction_lib.py +++ b/test/transactions/test_transaction_lib.py @@ -142,7 +142,7 @@ def test_mem(self, max_addr, writer_rand, reader_req_rand, reader_resp_rand, see MemoryBank(data_layout=[("data", data_width)], elem_count=max_addr, safe_writes=safe_writes) ) - data_dict: dict[int, int] = dict((i, 0) for i in range(max_addr)) + data: list[int] = list(0 for _ in range(max_addr)) read_req_queue = deque() addr_queue = deque() @@ -155,7 +155,7 @@ def writer(): yield from m.write.call(data=d, addr=a) for _ in range(2): yield Settle() - data_dict[a] = d + data[a] = d yield from self.random_wait(writer_rand, min_cycle_cnt=1) def reader_req(): @@ -165,7 +165,7 @@ def reader_req(): for _ in range(1): yield Settle() if safe_writes: - d = data_dict[a] + d = data[a] read_req_queue.append(d) else: addr_queue.append((cycle, a)) @@ -188,7 +188,7 @@ def internal_reader_resp(): else: yield 
continue - d = data_dict[a] + d = data[a] # check when internal method has been run to capture # memory state for tests purposes if (yield m._dut._internal_read_resp_trans.grant): @@ -232,6 +232,43 @@ def process(): sim.add_sync_process(process) +class TestAsyncMemoryBank(TestCaseWithSimulator): + @parameterized.expand([(9, 3, 3, 14), (16, 1, 1, 15), (16, 1, 1, 16), (12, 3, 1, 17)]) + def test_mem(self, max_addr, writer_rand, reader_rand, seed): + test_count = 200 + + data_width = 6 + m = SimpleTestCircuit(AsyncMemoryBank(data_layout=[("data", data_width)], elem_count=max_addr)) + + data: list[int] = list(0 for i in range(max_addr)) + + random.seed(seed) + + def writer(): + for cycle in range(test_count): + d = random.randrange(2**data_width) + a = random.randrange(max_addr) + yield from m.write.call(data=d, addr=a) + for _ in range(2): + yield Settle() + data[a] = d + yield from self.random_wait(writer_rand, min_cycle_cnt=1) + + def reader(): + for cycle in range(test_count): + a = random.randrange(max_addr) + d = yield from m.read.call(addr=a) + for _ in range(1): + yield Settle() + expected_d = data[a] + self.assertEqual(d["data"], expected_d) + yield from self.random_wait(reader_rand, min_cycle_cnt=1) + + with self.run_simulation(m) as sim: + sim.add_sync_process(reader) + sim.add_sync_process(writer) + + class ManyToOneConnectTransTestCircuit(Elaboratable): def __init__(self, count: int, lay: MethodLayout): self.count = count diff --git a/test/transactron/test_metrics.py b/test/transactron/test_metrics.py index 7a91616dd..a8af19af9 100644 --- a/test/transactron/test_metrics.py +++ b/test/transactron/test_metrics.py @@ -7,11 +7,12 @@ from parameterized import parameterized_class from amaranth import * -from amaranth.sim import Passive, Settle +from amaranth.sim import Settle from transactron.lib.metrics import * from transactron import * from transactron.testing import TestCaseWithSimulator, data_layout, SimpleTestCircuit +from 
transactron.testing.infrastructure import Now from transactron.utils.dependencies import DependencyContext @@ -308,6 +309,21 @@ def test_process(): sim.add_sync_process(test_process) +class TestLatencyMeasurerBase(TestCaseWithSimulator): + def check_latencies(self, m: SimpleTestCircuit, latencies: list[int]): + self.assertEqual(min(latencies), (yield m._dut.histogram.min.value)) + self.assertEqual(max(latencies), (yield m._dut.histogram.max.value)) + self.assertEqual(sum(latencies), (yield m._dut.histogram.sum.value)) + self.assertEqual(len(latencies), (yield m._dut.histogram.count.value)) + + for i in range(m._dut.histogram.bucket_count): + bucket_start = 0 if i == 0 else 2 ** (i - 1) + bucket_end = 1e10 if i == m._dut.histogram.bucket_count - 1 else 2**i + + count = sum(1 for x in latencies if bucket_start <= x < bucket_end) + self.assertEqual(count, (yield m._dut.histogram.buckets[i].value)) + + @parameterized_class( ("slots_number", "expected_consumer_wait"), [ @@ -319,31 +335,20 @@ def test_process(): (5, 5), ], ) -class TestLatencyMeasurer(TestCaseWithSimulator): +class TestFIFOLatencyMeasurer(TestLatencyMeasurerBase): slots_number: int expected_consumer_wait: float def test_latency_measurer(self): random.seed(42) - m = SimpleTestCircuit(LatencyMeasurer("latency", slots_number=self.slots_number, max_latency=300)) + m = SimpleTestCircuit(FIFOLatencyMeasurer("latency", slots_number=self.slots_number, max_latency=300)) DependencyContext.get().add_dependency(HwMetricsEnabledKey(), True) latencies: list[int] = [] event_queue = queue.Queue() - time = 0 - - def ticker(): - nonlocal time - - yield Passive() - - while True: - yield - time += 1 - finish = False def producer(): @@ -354,6 +359,7 @@ def producer(): # Make sure that the time is updated first. yield Settle() + time = yield Now() event_queue.put(time) yield from self.random_wait_geom(0.8) @@ -365,26 +371,95 @@ def consumer(): # Make sure that the time is updated first. 
yield Settle() + time = yield Now() latencies.append(time - event_queue.get()) yield from self.random_wait_geom(1.0 / self.expected_consumer_wait) - self.assertEqual(min(latencies), (yield m._dut.histogram.min.value)) - self.assertEqual(max(latencies), (yield m._dut.histogram.max.value)) - self.assertEqual(sum(latencies), (yield m._dut.histogram.sum.value)) - self.assertEqual(len(latencies), (yield m._dut.histogram.count.value)) + self.check_latencies(m, latencies) + + with self.run_simulation(m) as sim: + sim.add_sync_process(producer) + sim.add_sync_process(consumer) + + +@parameterized_class( + ("slots_number", "expected_consumer_wait"), + [ + (2, 5), + (2, 10), + (5, 10), + (10, 1), + (10, 10), + (5, 5), + ], +) +class TestIndexedLatencyMeasurer(TestLatencyMeasurerBase): + slots_number: int + expected_consumer_wait: float + + def test_latency_measurer(self): + random.seed(42) + + m = SimpleTestCircuit(TaggedLatencyMeasurer("latency", slots_number=self.slots_number, max_latency=300)) + DependencyContext.get().add_dependency(HwMetricsEnabledKey(), True) + + latencies: list[int] = [] + + events = list(0 for _ in range(self.slots_number)) + free_slots = list(k for k in range(self.slots_number)) + used_slots: list[int] = [] + + finish = False + + def producer(): + nonlocal finish + + for _ in range(200): + while not free_slots: + yield + continue + yield Settle() + + slot_id = random.choice(free_slots) + yield from m._start.call(slot=slot_id) + + time = yield Now() + + events[slot_id] = time + free_slots.remove(slot_id) + used_slots.append(slot_id) - for i in range(m._dut.histogram.bucket_count): - bucket_start = 0 if i == 0 else 2 ** (i - 1) - bucket_end = 1e10 if i == m._dut.histogram.bucket_count - 1 else 2**i + yield from self.random_wait_geom(0.8) + + finish = True + + def consumer(): + while not finish: + while not used_slots: + yield + continue + + slot_id = random.choice(used_slots) + + yield from m._stop.call(slot=slot_id) + + time = yield Now() + + yield 
Settle() + yield Settle() + + latencies.append(time - events[slot_id]) + used_slots.remove(slot_id) + free_slots.append(slot_id) + + yield from self.random_wait_geom(1.0 / self.expected_consumer_wait) - count = sum(1 for x in latencies if bucket_start <= x < bucket_end) - self.assertEqual(count, (yield m._dut.histogram.buckets[i].value)) + self.check_latencies(m, latencies) with self.run_simulation(m) as sim: sim.add_sync_process(producer) sim.add_sync_process(consumer) - sim.add_sync_process(ticker) class MetricManagerTestCircuit(Elaboratable): diff --git a/transactron/lib/metrics.py b/transactron/lib/metrics.py index f3d5b9e0d..17921e619 100644 --- a/transactron/lib/metrics.py +++ b/transactron/lib/metrics.py @@ -9,7 +9,7 @@ from transactron.utils import ValueLike, OneHotSwitchDynamic, SignalBundle from transactron import Method, def_method, TModule -from transactron.lib import FIFO +from transactron.lib import FIFO, AsyncMemoryBank, logging from transactron.utils.dependencies import ListKey, DependencyContext, SimpleKey __all__ = [ @@ -19,7 +19,8 @@ "HwCounter", "TaggedCounter", "HwExpHistogram", - "LatencyMeasurer", + "FIFOLatencyMeasurer", + "TaggedLatencyMeasurer", "HardwareMetricsManager", "HwMetricsEnabledKey", ] @@ -476,7 +477,7 @@ def add(self, m: TModule, sample: Value): self._add(m, sample) -class LatencyMeasurer(Elaboratable): +class FIFOLatencyMeasurer(Elaboratable): """ Measures duration between two events, e.g. request processing latency. It can track multiple events at the same time, i.e. the second event can @@ -501,7 +502,7 @@ def __init__( The fully qualified name of the metric. description: str A human-readable description of the metric's functionality. - slots_number: str + slots_number: int A number of events that the module can track simultaneously. max_latency: int The maximum latency of an event. 
Used to set signal widths and @@ -595,6 +596,143 @@ def metrics_enabled(self) -> bool: return DependencyContext.get().get_dependency(HwMetricsEnabledKey()) +class TaggedLatencyMeasurer(Elaboratable): + """ + Measures duration between two events, e.g. request processing latency. + It can track multiple events at the same time, i.e. the second event can + be registered as started, before the first finishes. However, each event + needs to have a unique slot tag. + + The module exposes an exponential histogram of the measured latencies. + """ + + def __init__( + self, + fully_qualified_name: str, + description: str = "", + *, + slots_number: int, + max_latency: int, + ): + """ + Parameters + ---------- + fully_qualified_name: str + The fully qualified name of the metric. + description: str + A human-readable description of the metric's functionality. + slots_number: int + A number of events that the module can track simultaneously. + max_latency: int + The maximum latency of an event. Used to set signal widths and + number of buckets in the histogram. If a latency turns out to be + bigger than the maximum, it will overflow and result in a false + measurement. + """ + self.fully_qualified_name = fully_qualified_name + self.description = description + self.slots_number = slots_number + self.max_latency = max_latency + + self._start = Method(i=[("slot", range(0, slots_number))]) + self._stop = Method(i=[("slot", range(0, slots_number))]) + + # This bucket count gives us the best possible granularity.
+ bucket_count = bits_for(self.max_latency) + 1 + self.histogram = HwExpHistogram( + self.fully_qualified_name, + self.description, + bucket_count=bucket_count, + sample_width=bits_for(self.max_latency), + ) + + self.log = logging.HardwareLogger(fully_qualified_name) + + def elaborate(self, platform): + if not self.metrics_enabled(): + return TModule() + + m = TModule() + + epoch_width = bits_for(self.max_latency) + + m.submodules.slots = self.slots = AsyncMemoryBank( + data_layout=[("epoch", epoch_width)], elem_count=self.slots_number + ) + m.submodules.histogram = self.histogram + + slots_taken = Signal(self.slots_number) + slots_taken_start = Signal.like(slots_taken) + slots_taken_stop = Signal.like(slots_taken) + + m.d.comb += slots_taken_start.eq(slots_taken) + m.d.comb += slots_taken_stop.eq(slots_taken_start) + m.d.sync += slots_taken.eq(slots_taken_stop) + + epoch = Signal(epoch_width) + + m.d.sync += epoch.eq(epoch + 1) + + @def_method(m, self._start) + def _(slot: Value): + m.d.comb += slots_taken_start.eq(slots_taken | (1 << slot)) + self.log.error(m, (slots_taken & (1 << slot)).any(), "taken slot {} taken again", slot) + self.slots.write(m, addr=slot, data=epoch) + + @def_method(m, self._stop) + def _(slot: Value): + m.d.comb += slots_taken_stop.eq(slots_taken_start & ~(C(1, self.slots_number) << slot)) + self.log.error(m, ~(slots_taken & (1 << slot)).any(), "free slot {} freed again", slot) + ret = self.slots.read(m, addr=slot) + # The result of subtracting two unsigned n-bit values is a signed (n+1)-bit value, + # so we need to cast the result and discard the most significant bit. + duration = (epoch - ret.epoch).as_unsigned()[:-1] + self.histogram.add(m, duration) + + return m + + def start(self, m: TModule, *, slot: ValueLike): + """ + Registers the start of an event for a given slot tag. + + Should be called in the body of either a transaction or a method.
+ + Parameters + ---------- + m: TModule + Transactron module + slot: ValueLike + The slot tag of the event. + """ + + if not self.metrics_enabled(): + return + + self._start(m, slot) + + def stop(self, m: TModule, *, slot: ValueLike): + """ + Registers the end of the event for a given slot tag. + + Should be called in the body of either a transaction or a method. + + Parameters + ---------- + m: TModule + Transactron module + slot: ValueLike + The slot tag of the event. + """ + + if not self.metrics_enabled(): + return + + self._stop(m, slot) + + def metrics_enabled(self) -> bool: + return DependencyContext.get().get_dependency(HwMetricsEnabledKey()) + + class HardwareMetricsManager: """ Collects all metrics registered in the circuit and provides an easy diff --git a/transactron/lib/storage.py b/transactron/lib/storage.py index e6d3e5cf5..3bbf07624 100644 --- a/transactron/lib/storage.py +++ b/transactron/lib/storage.py @@ -8,7 +8,7 @@ from transactron.utils import assign, AssignType, LayoutList from .reqres import ArgumentsToResultsZipper -__all__ = ["MemoryBank"] +__all__ = ["MemoryBank", "AsyncMemoryBank"] class MemoryBank(Elaboratable): @@ -136,3 +136,77 @@ def _(arg): m.d.comb += assign(write_args, arg, fields=AssignType.ALL) return m + + +class AsyncMemoryBank(Elaboratable): + """AsyncMemoryBank module. + + Provides a transactional interface to asynchronous Amaranth Memory with one + read and one write port. It optionally supports writing with a given granularity. + + Attributes + ---------- + read: Method + The read method. Accepts an `addr` from which data should be read. + Returns the `data_layout` View stored at `addr`. The read is asynchronous, + so the result is returned by the same call. + write: Method + The write method. Accepts `addr` where data should be saved, `data` in form of `data_layout` + and optionally `mask` if `granularity` is not None. `1` in mask means that appropriate part should be written.
+ """ + + def __init__( + self, *, data_layout: LayoutList, elem_count: int, granularity: Optional[int] = None, src_loc: int | SrcLoc = 0 + ): + """ + Parameters + ---------- + data_layout: method layout + The format of structures stored in the Memory. + elem_count: int + Number of elements stored in Memory. + granularity: Optional[int] + Granularity of write, forwarded to Amaranth. If `None` the whole structure is always saved at once. + If not, the width of `data_layout` is split into `granularity` parts, which can be saved independently. + src_loc: int | SrcLoc + How many stack frames deep the source location is taken from. + Alternatively, the source location to use instead of the default. + """ + self.src_loc = get_src_loc(src_loc) + self.data_layout = make_layout(*data_layout) + self.elem_count = elem_count + self.granularity = granularity + self.width = from_method_layout(self.data_layout).size + self.addr_width = bits_for(self.elem_count - 1) + + self.read_req_layout: LayoutList = [("addr", self.addr_width)] + write_layout = [("addr", self.addr_width), ("data", self.data_layout)] + if self.granularity is not None: + write_layout.append(("mask", self.width // self.granularity)) + self.write_layout = make_layout(*write_layout) + + self.read = Method(i=self.read_req_layout, o=self.data_layout, src_loc=self.src_loc) + self.write = Method(i=self.write_layout, src_loc=self.src_loc) + + def elaborate(self, platform) -> TModule: + m = TModule() + + mem = Memory(width=self.width, depth=self.elem_count) + m.submodules.read_port = read_port = mem.read_port(domain="comb") + m.submodules.write_port = write_port = mem.write_port() + + @def_method(m, self.read) + def _(addr): + m.d.comb += read_port.addr.eq(addr) + return read_port.data + + @def_method(m, self.write) + def _(arg): + m.d.comb += write_port.addr.eq(arg.addr) + m.d.comb += write_port.data.eq(arg.data) + if self.granularity is None: + m.d.comb += write_port.en.eq(1) + else: + m.d.comb += 
write_port.en.eq(arg.mask) + + return m