diff --git a/coreblocks/cache/icache.py b/coreblocks/cache/icache.py
index 0b60cf37c..08cd51784 100644
--- a/coreblocks/cache/icache.py
+++ b/coreblocks/cache/icache.py
@@ -115,7 +115,7 @@ def __init__(self, layouts: ICacheLayouts, params: ICacheParameters, refiller: C
         self.perf_misses = HwCounter("frontend.icache.misses")
         self.perf_errors = HwCounter("frontend.icache.fetch_errors")
         self.perf_flushes = HwCounter("frontend.icache.flushes")
-        self.req_latency = LatencyMeasurer(
+        self.req_latency = FIFOLatencyMeasurer(
             "frontend.icache.req_latency", "Latencies of cache requests", slots_number=2, max_latency=500
         )
diff --git a/coreblocks/core_structs/rf.py b/coreblocks/core_structs/rf.py
index f7a9b8a7f..d6d5e76e8 100644
--- a/coreblocks/core_structs/rf.py
+++ b/coreblocks/core_structs/rf.py
@@ -1,7 +1,9 @@
 from amaranth import *
-from transactron import Method, def_method, TModule
+from transactron import Method, Transaction, def_method, TModule
 from coreblocks.interface.layouts import RFLayouts
 from coreblocks.params import GenParams
+from transactron.lib.metrics import HwExpHistogram, TaggedLatencyMeasurer
+from transactron.utils.amaranth_ext.functions import popcount
 from transactron.utils.transactron_helpers import make_layout
 
 __all__ = ["RegisterFile"]
@@ -20,9 +22,24 @@ def __init__(self, *, gen_params: GenParams):
         self.write = Method(i=layouts.rf_write)
         self.free = Method(i=layouts.rf_free)
 
+        self.perf_rf_valid_time = TaggedLatencyMeasurer(
+            "struct.rf.valid_time",
+            description="Distribution of time registers are valid in RF",
+            slots_number=2**gen_params.phys_regs_bits,
+            max_latency=1000,
+        )
+        self.perf_num_valid = HwExpHistogram(
+            "struct.rf.num_valid",
+            description="Number of valid registers in RF",
+            bucket_count=gen_params.phys_regs_bits + 1,
+            sample_width=gen_params.phys_regs_bits + 1,
+        )
+
     def elaborate(self, platform):
         m = TModule()
 
+        m.submodules += [self.perf_rf_valid_time, self.perf_num_valid]
+
         being_written = Signal(self.gen_params.phys_regs_bits)
         written_value = Signal(self.gen_params.isa.xlen)
 
@@ -56,10 +73,20 @@ def _(reg_id: Value, reg_val: Value):
             with m.If(~(zero_reg)):
                 m.d.sync += self.entries[reg_id].reg_val.eq(reg_val)
                 m.d.sync += self.entries[reg_id].valid.eq(1)
+                self.perf_rf_valid_time.start(m, slot=reg_id)
 
         @def_method(m, self.free)
         def _(reg_id: Value):
             with m.If(reg_id != 0):
                 m.d.sync += self.entries[reg_id].valid.eq(0)
+                self.perf_rf_valid_time.stop(m, slot=reg_id)
+
+        if self.perf_num_valid.metrics_enabled():
+            num_valid = Signal(self.gen_params.phys_regs_bits + 1)
+            m.d.comb += num_valid.eq(
+                popcount(Cat(self.entries[reg_id].valid for reg_id in range(2**self.gen_params.phys_regs_bits)))
+            )
+            with Transaction(name="perf").body(m):
+                self.perf_num_valid.add(m, num_valid)
 
         return m
diff --git a/coreblocks/core_structs/rob.py b/coreblocks/core_structs/rob.py
index 1f3806d46..25b14bab3 100644
--- a/coreblocks/core_structs/rob.py
+++ b/coreblocks/core_structs/rob.py
@@ -1,5 +1,5 @@
 from amaranth import *
-from transactron import Method, def_method, TModule
+from transactron import Method, Transaction, def_method, TModule
 from transactron.lib.metrics import *
 from coreblocks.interface.layouts import ROBLayouts
 from coreblocks.params import GenParams
@@ -18,17 +18,23 @@ def __init__(self, gen_params: GenParams) -> None:
         self.data = Array(Signal(layouts.internal_layout) for _ in range(2**gen_params.rob_entries_bits))
         self.get_indices = Method(o=layouts.get_indices, nonexclusive=True)
 
-        self.perf_rob_wait_time = LatencyMeasurer(
+        self.perf_rob_wait_time = FIFOLatencyMeasurer(
             "backend.rob.wait_time",
             description="Distribution of time instructions spend in ROB",
             slots_number=(2**gen_params.rob_entries_bits + 1),
             max_latency=1000,
         )
+        self.perf_rob_size = HwExpHistogram(
+            "backend.rob.size",
+            description="Number of instructions in ROB",
+            bucket_count=gen_params.rob_entries_bits + 1,
+            sample_width=gen_params.rob_entries_bits,
+        )
 
     def elaborate(self, platform):
         m = TModule()
 
-        m.submodules += [self.perf_rob_wait_time]
+        m.submodules += [self.perf_rob_wait_time, self.perf_rob_size]
 
         start_idx = Signal(self.params.rob_entries_bits)
         end_idx = Signal(self.params.rob_entries_bits)
@@ -70,4 +76,10 @@ def _(rob_id: Value, exception):
         def _():
             return {"start": start_idx, "end": end_idx}
 
+        if self.perf_rob_size.metrics_enabled():
+            rob_size = Signal(self.params.rob_entries_bits)
+            m.d.comb += rob_size.eq((end_idx - start_idx)[0 : self.params.rob_entries_bits])
+            with Transaction(name="perf").body(m):
+                self.perf_rob_size.add(m, rob_size)
+
         return m
diff --git a/coreblocks/func_blocks/csr/csr.py b/coreblocks/func_blocks/csr/csr.py
index 43ddfe957..697de5c63 100644
--- a/coreblocks/func_blocks/csr/csr.py
+++ b/coreblocks/func_blocks/csr/csr.py
@@ -236,6 +236,7 @@ def _(rob_id: Value, side_fx: Value):
         return m
 
 
+@dataclass(frozen=True)
 class CSRBlockComponent(BlockComponentParams):
     def get_module(self, gen_params: GenParams) -> FuncBlock:
         connections = gen_params.get(DependencyManager)
diff --git a/coreblocks/func_blocks/fu/common/rs.py b/coreblocks/func_blocks/fu/common/rs.py
index 56287df27..1911690b4 100644
--- a/coreblocks/func_blocks/fu/common/rs.py
+++ b/coreblocks/func_blocks/fu/common/rs.py
@@ -2,11 +2,13 @@
 from typing import Optional
 from amaranth import *
 from amaranth.lib.coding import PriorityEncoder
-from transactron import Method, def_method, TModule
+from transactron import Method, Transaction, def_method, TModule
 from coreblocks.params import GenParams
 from coreblocks.frontend.decoder import OpType
 from coreblocks.interface.layouts import RSLayouts
+from transactron.lib.metrics import HwExpHistogram, TaggedLatencyMeasurer
 from transactron.utils import RecordDict
+from transactron.utils.amaranth_ext.functions import popcount
 from transactron.utils.transactron_helpers import make_layout
 
 __all__ = ["RS"]
@@ -14,7 +16,11 @@
 class RS(Elaboratable):
     def __init__(
-        self, gen_params: GenParams, rs_entries: int, ready_for: Optional[Iterable[Iterable[OpType]]] = None
+        self,
+        gen_params: GenParams,
+        rs_entries: int,
+        rs_number: int,
+        ready_for: Optional[Iterable[Iterable[OpType]]] = None,
     ) -> None:
         ready_for = ready_for or ((op for op in OpType),)
         self.gen_params = gen_params
@@ -38,10 +44,24 @@ def __init__(
         self.data = Array(Signal(self.internal_layout) for _ in range(self.rs_entries))
         self.data_ready = Signal(self.rs_entries)
 
+        self.perf_rs_wait_time = TaggedLatencyMeasurer(
+            f"fu.block_{rs_number}.rs.valid_time",
+            description=f"Distribution of time instructions wait in RS {rs_number}",
+            slots_number=2**self.rs_entries_bits,
+            max_latency=1000,
+        )
+        self.perf_num_full = HwExpHistogram(
+            f"fu.block_{rs_number}.rs.num_full",
+            description=f"Number of full entries in RS {rs_number}",
+            bucket_count=self.rs_entries_bits + 1,
+            sample_width=self.rs_entries_bits + 1,
+        )
+
     def elaborate(self, platform):
         m = TModule()
 
         m.submodules.enc_select = PriorityEncoder(width=self.rs_entries)
+        m.submodules += [self.perf_rs_wait_time, self.perf_num_full]
 
         for i, record in enumerate(self.data):
             m.d.comb += self.data_ready[i].eq(
@@ -71,6 +91,7 @@ def _(rs_entry_id: Value, rs_data: Value) -> None:
             m.d.sync += self.data[rs_entry_id].rs_data.eq(rs_data)
             m.d.sync += self.data[rs_entry_id].rec_full.eq(1)
             m.d.sync += self.data[rs_entry_id].rec_reserved.eq(1)
+            self.perf_rs_wait_time.start(m, slot=rs_entry_id)
 
         @def_method(m, self.update)
         def _(reg_id: Value, reg_val: Value) -> None:
@@ -89,6 +110,7 @@ def _(rs_entry_id: Value) -> RecordDict:
             record = self.data[rs_entry_id]
             m.d.sync += record.rec_reserved.eq(0)
             m.d.sync += record.rec_full.eq(0)
+            self.perf_rs_wait_time.stop(m, slot=rs_entry_id)
             return {
                 "s1_val": record.rs_data.s1_val,
                 "s2_val": record.rs_data.s2_val,
@@ -105,4 +127,10 @@ def _(rs_entry_id: Value) -> RecordDict:
         def _() -> RecordDict:
             return {"ready_list": ready_list}
 
+        if self.perf_num_full.metrics_enabled():
+            num_full = Signal(self.rs_entries_bits + 1)
+            m.d.comb += num_full.eq(popcount(Cat(self.data[entry_id].rec_full for entry_id in range(self.rs_entries))))
+            with Transaction(name="perf").body(m):
+                self.perf_num_full.add(m, num_full)
+
         return m
diff --git a/coreblocks/func_blocks/fu/common/rs_func_block.py b/coreblocks/func_blocks/fu/common/rs_func_block.py
index 66fed3d0e..35801dc12 100644
--- a/coreblocks/func_blocks/fu/common/rs_func_block.py
+++ b/coreblocks/func_blocks/fu/common/rs_func_block.py
@@ -31,7 +31,9 @@ class RSFuncBlock(FuncBlock, Elaboratable):
     layout described by `FuncUnitLayouts`.
     """
 
-    def __init__(self, gen_params: GenParams, func_units: Iterable[tuple[FuncUnit, set[OpType]]], rs_entries: int):
+    def __init__(
+        self, gen_params: GenParams, func_units: Iterable[tuple[FuncUnit, set[OpType]]], rs_entries: int, rs_number: int
+    ):
         """
         Parameters
         ----------
@@ -41,10 +43,13 @@ def __init__(self, gen_params: GenParams, func_units: Iterable[tuple[FuncUnit, s
             Functional units to be used by this module.
         rs_entries: int
             Number of entries in RS.
+        rs_number: int
+            The number of this RS block. Used for debugging.
         """
         self.gen_params = gen_params
         self.rs_entries = rs_entries
         self.rs_entries_bits = (rs_entries - 1).bit_length()
+        self.rs_number = rs_number
         self.rs_layouts = gen_params.get(RSLayouts, rs_entries_bits=self.rs_entries_bits)
         self.fu_layouts = gen_params.get(FuncUnitLayouts)
         self.func_units = list(func_units)
@@ -60,6 +65,7 @@ def elaborate(self, platform):
         m.submodules.rs = self.rs = RS(
             gen_params=self.gen_params,
             rs_entries=self.rs_entries,
+            rs_number=self.rs_number,
             ready_for=(optypes for _, optypes in self.func_units),
         )
 
@@ -87,10 +93,13 @@ def elaborate(self, platform):
 class RSBlockComponent(BlockComponentParams):
     func_units: Collection[FunctionalComponentParams]
     rs_entries: int
+    rs_number: int = -1  # overwritten by CoreConfiguration
 
     def get_module(self, gen_params: GenParams) -> FuncBlock:
         modules = list((u.get_module(gen_params), u.get_optypes()) for u in self.func_units)
-        rs_unit = RSFuncBlock(gen_params=gen_params, func_units=modules, rs_entries=self.rs_entries)
+        rs_unit = RSFuncBlock(
+            gen_params=gen_params, func_units=modules, rs_entries=self.rs_entries, rs_number=self.rs_number
+        )
         return rs_unit
 
     def get_optypes(self) -> set[OpType]:
diff --git a/coreblocks/func_blocks/lsu/dummyLsu.py b/coreblocks/func_blocks/lsu/dummyLsu.py
index ccda62e32..08a5d8604 100644
--- a/coreblocks/func_blocks/lsu/dummyLsu.py
+++ b/coreblocks/func_blocks/lsu/dummyLsu.py
@@ -1,3 +1,4 @@
+from dataclasses import dataclass
 from amaranth import *
 from amaranth.lib.data import View
 
@@ -320,6 +321,7 @@ def _(rob_id: Value, side_fx: Value):
         return m
 
 
+@dataclass(frozen=True)
 class LSUBlockComponent(BlockComponentParams):
     def get_module(self, gen_params: GenParams) -> FuncBlock:
         connections = gen_params.get(DependencyManager)
diff --git a/coreblocks/params/configurations.py b/coreblocks/params/configurations.py
index c2d51a1ca..1d17289f5 100644
--- a/coreblocks/params/configurations.py
+++ b/coreblocks/params/configurations.py
@@ -74,6 +74,12 @@ class CoreConfiguration:
         Definitions of PMAs per contiguous segments of memory.
     """
 
+    def __post_init__(self):
+        self.func_units_config = [
+            dataclasses.replace(conf, rs_number=k) if hasattr(conf, "rs_number") else conf
+            for k, conf in enumerate(self.func_units_config)
+        ]
+
     xlen: int = 32
     func_units_config: Collection[BlockComponentParams] = basic_configuration
diff --git a/coreblocks/params/fu_params.py b/coreblocks/params/fu_params.py
index 297e9e9fc..4884d7c9f 100644
--- a/coreblocks/params/fu_params.py
+++ b/coreblocks/params/fu_params.py
@@ -1,4 +1,5 @@
 from abc import abstractmethod, ABC
+from dataclasses import dataclass
 from collections.abc import Collection, Iterable
 
 from coreblocks.func_blocks.interface.func_protocols import FuncBlock, FuncUnit
@@ -20,6 +21,7 @@
 ]
 
 
+@dataclass(frozen=True)
 class BlockComponentParams(ABC):
     @abstractmethod
     def get_module(self, gen_params: "GenParams") -> FuncBlock:
diff --git a/test/regression/cocotb/benchmark.Makefile b/test/regression/cocotb/benchmark.Makefile
index 9962315fb..e49b55b39 100644
--- a/test/regression/cocotb/benchmark.Makefile
+++ b/test/regression/cocotb/benchmark.Makefile
@@ -14,7 +14,7 @@
 SIM_BUILD = build/benchmark
 
 # Yosys/Amaranth borkedness workaround
 ifeq ($(SIM),verilator)
-    EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC
+    EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC -Wno-UNSIGNED
     BUILD_ARGS += -j`nproc`
 endif
diff --git a/test/regression/cocotb/signature.Makefile b/test/regression/cocotb/signature.Makefile
index b4f690635..a03d0a5f8 100644
--- a/test/regression/cocotb/signature.Makefile
+++ b/test/regression/cocotb/signature.Makefile
@@ -14,7 +14,7 @@
 SIM_BUILD = build/signature
 
 # Yosys/Amaranth borkedness workaround
 ifeq ($(SIM),verilator)
-    EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC
+    EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC -Wno-UNSIGNED
     BUILD_ARGS += -j`nproc`
 endif
diff --git a/test/regression/cocotb/test.Makefile b/test/regression/cocotb/test.Makefile
index 210618067..5b9f7aad9 100644
--- a/test/regression/cocotb/test.Makefile
+++ b/test/regression/cocotb/test.Makefile
@@ -14,7 +14,7 @@
 SIM_BUILD = build/test
 
 # Yosys/Amaranth borkedness workaround
 ifeq ($(SIM),verilator)
-    EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC
+    EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC -Wno-UNSIGNED
     BUILD_ARGS += -j`nproc`
 endif
diff --git a/test/scheduler/test_scheduler.py b/test/scheduler/test_scheduler.py
index 3c50efab6..2fcf54a50 100644
--- a/test/scheduler/test_scheduler.py
+++ b/test/scheduler/test_scheduler.py
@@ -127,7 +127,7 @@ def setUp(self):
         self.rs_count = len(self.optype_sets)
         self.gen_params = GenParams(
             test_core_config.replace(
-                func_units_config=tuple(RSBlockComponent([], rs_entries=4) for _ in range(self.rs_count))
+                func_units_config=tuple(RSBlockComponent([], rs_entries=4, rs_number=k) for k in range(self.rs_count))
             )
         )
         self.expected_rename_queue = deque()
diff --git a/test/scheduler/test_wakeup_select.py b/test/scheduler/test_wakeup_select.py
index 4ff298da9..3e406e1af 100644
--- a/test/scheduler/test_wakeup_select.py
+++ b/test/scheduler/test_wakeup_select.py
@@ -43,7 +43,9 @@ def elaborate(self, platform):
 class TestWakeupSelect(TestCaseWithSimulator):
     def setUp(self):
         self.gen_params = GenParams(
-            test_core_config.replace(func_units_config=tuple(RSBlockComponent([], rs_entries=16) for _ in range(2)))
+            test_core_config.replace(
+                func_units_config=tuple(RSBlockComponent([], rs_entries=16, rs_number=k) for k in range(2))
+            )
         )
         self.m = WakeupTestCircuit(self.gen_params)
         self.cycles = 50
diff --git a/test/structs_common/test_rs.py b/test/structs_common/test_rs.py
index 4e86a46de..c62852cb0 100644
--- a/test/structs_common/test_rs.py
+++ b/test/structs_common/test_rs.py
@@ -24,7 +24,7 @@ class TestRSMethodInsert(TestCaseWithSimulator):
     def test_insert(self):
         self.gen_params = GenParams(test_core_config)
         self.rs_entries_bits = self.gen_params.max_rs_entries_bits
-        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, None))
+        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, 0, None))
         self.insert_list = [
             {
                 "rs_entry_id": id,
@@ -69,7 +69,7 @@ class TestRSMethodSelect(TestCaseWithSimulator):
     def test_select(self):
         self.gen_params = GenParams(test_core_config)
         self.rs_entries_bits = self.gen_params.max_rs_entries_bits
-        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, None))
+        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, 0, None))
         self.insert_list = [
             {
                 "rs_entry_id": id,
@@ -132,7 +132,7 @@ class TestRSMethodUpdate(TestCaseWithSimulator):
     def test_update(self):
         self.gen_params = GenParams(test_core_config)
         self.rs_entries_bits = self.gen_params.max_rs_entries_bits
-        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, None))
+        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, 0, None))
         self.insert_list = [
             {
                 "rs_entry_id": id,
@@ -223,7 +223,7 @@ class TestRSMethodTake(TestCaseWithSimulator):
     def test_take(self):
         self.gen_params = GenParams(test_core_config)
         self.rs_entries_bits = self.gen_params.max_rs_entries_bits
-        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, None))
+        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, 0, None))
         self.insert_list = [
             {
                 "rs_entry_id": id,
@@ -322,7 +322,7 @@ class TestRSMethodGetReadyList(TestCaseWithSimulator):
     def test_get_ready_list(self):
         self.gen_params = GenParams(test_core_config)
         self.rs_entries_bits = self.gen_params.max_rs_entries_bits
-        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, None))
+        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, 0, None))
         self.insert_list = [
             {
                 "rs_entry_id": id,
@@ -378,7 +378,7 @@ def test_two_get_ready_lists(self):
         self.rs_entries = self.gen_params.max_rs_entries
         self.rs_entries_bits = self.gen_params.max_rs_entries_bits
         self.m = SimpleTestCircuit(
-            RS(self.gen_params, 2**self.rs_entries_bits, [[OpType(1), OpType(2)], [OpType(3), OpType(4)]])
+            RS(self.gen_params, 2**self.rs_entries_bits, 0, [[OpType(1), OpType(2)], [OpType(3), OpType(4)]])
         )
         self.insert_list = [
             {
diff --git a/test/transactions/test_transaction_lib.py b/test/transactions/test_transaction_lib.py
index c8e758ce7..78119067f 100644
--- a/test/transactions/test_transaction_lib.py
+++ b/test/transactions/test_transaction_lib.py
@@ -142,7 +142,7 @@ def test_mem(self, max_addr, writer_rand, reader_req_rand, reader_resp_rand, see
             MemoryBank(data_layout=[("data", data_width)], elem_count=max_addr, safe_writes=safe_writes)
         )
 
-        data_dict: dict[int, int] = dict((i, 0) for i in range(max_addr))
+        data: list[int] = list(0 for _ in range(max_addr))
         read_req_queue = deque()
         addr_queue = deque()
 
@@ -155,7 +155,7 @@ def writer():
                 yield from m.write.call(data=d, addr=a)
                 for _ in range(2):
                     yield Settle()
-                data_dict[a] = d
+                data[a] = d
                 yield from self.random_wait(writer_rand, min_cycle_cnt=1)
 
         def reader_req():
@@ -165,7 +165,7 @@ def reader_req():
                 for _ in range(1):
                     yield Settle()
                 if safe_writes:
-                    d = data_dict[a]
+                    d = data[a]
                     read_req_queue.append(d)
                 else:
                     addr_queue.append((cycle, a))
@@ -188,7 +188,7 @@ def internal_reader_resp():
                 else:
                     yield
                     continue
-                d = data_dict[a]
+                d = data[a]
                 # check when internal method has been run to capture
                 # memory state for tests purposes
                 if (yield m._dut._internal_read_resp_trans.grant):
@@ -232,6 +232,43 @@ def process():
             sim.add_sync_process(process)
 
 
+class TestAsyncMemoryBank(TestCaseWithSimulator):
+    @parameterized.expand([(9, 3, 3, 14), (16, 1, 1, 15), (16, 1, 1, 16), (12, 3, 1, 17)])
+    def test_mem(self, max_addr, writer_rand, reader_rand, seed):
+        test_count = 200
+
+        data_width = 6
+        m = SimpleTestCircuit(AsyncMemoryBank(data_layout=[("data", data_width)], elem_count=max_addr))
+
+        data: list[int] = list(0 for i in range(max_addr))
+
+        random.seed(seed)
+
+        def writer():
+            for cycle in range(test_count):
+                d = random.randrange(2**data_width)
+                a = random.randrange(max_addr)
+                yield from m.write.call(data=d, addr=a)
+                for _ in range(2):
+                    yield Settle()
+                data[a] = d
+                yield from self.random_wait(writer_rand, min_cycle_cnt=1)
+
+        def reader():
+            for cycle in range(test_count):
+                a = random.randrange(max_addr)
+                d = yield from m.read.call(addr=a)
+                for _ in range(1):
+                    yield Settle()
+                expected_d = data[a]
+                self.assertEqual(d["data"], expected_d)
+                yield from self.random_wait(reader_rand, min_cycle_cnt=1)
+
+        with self.run_simulation(m) as sim:
+            sim.add_sync_process(reader)
+            sim.add_sync_process(writer)
+
+
 class ManyToOneConnectTransTestCircuit(Elaboratable):
     def __init__(self, count: int, lay: MethodLayout):
         self.count = count
diff --git a/test/transactron/test_metrics.py b/test/transactron/test_metrics.py
index 7a91616dd..a8af19af9 100644
--- a/test/transactron/test_metrics.py
+++ b/test/transactron/test_metrics.py
@@ -7,11 +7,12 @@
 from parameterized import parameterized_class
 
 from amaranth import *
-from amaranth.sim import Passive, Settle
+from amaranth.sim import Settle
 
 from transactron.lib.metrics import *
 from transactron import *
 from transactron.testing import TestCaseWithSimulator, data_layout, SimpleTestCircuit
+from transactron.testing.infrastructure import Now
 from transactron.utils.dependencies import DependencyContext
@@ -308,6 +309,21 @@ def test_process():
         sim.add_sync_process(test_process)
 
 
+class TestLatencyMeasurerBase(TestCaseWithSimulator):
+    def check_latencies(self, m: SimpleTestCircuit, latencies: list[int]):
+        self.assertEqual(min(latencies), (yield m._dut.histogram.min.value))
+        self.assertEqual(max(latencies), (yield m._dut.histogram.max.value))
+        self.assertEqual(sum(latencies), (yield m._dut.histogram.sum.value))
+        self.assertEqual(len(latencies), (yield m._dut.histogram.count.value))
+
+        for i in range(m._dut.histogram.bucket_count):
+            bucket_start = 0 if i == 0 else 2 ** (i - 1)
+            bucket_end = 1e10 if i == m._dut.histogram.bucket_count - 1 else 2**i
+
+            count = sum(1 for x in latencies if bucket_start <= x < bucket_end)
+            self.assertEqual(count, (yield m._dut.histogram.buckets[i].value))
+
+
 @parameterized_class(
     ("slots_number", "expected_consumer_wait"),
     [
@@ -319,31 +335,20 @@
         (5, 5),
     ],
 )
-class TestLatencyMeasurer(TestCaseWithSimulator):
+class TestFIFOLatencyMeasurer(TestLatencyMeasurerBase):
     slots_number: int
     expected_consumer_wait: float
 
     def test_latency_measurer(self):
         random.seed(42)
 
-        m = SimpleTestCircuit(LatencyMeasurer("latency", slots_number=self.slots_number, max_latency=300))
+        m = SimpleTestCircuit(FIFOLatencyMeasurer("latency", slots_number=self.slots_number, max_latency=300))
         DependencyContext.get().add_dependency(HwMetricsEnabledKey(), True)
 
         latencies: list[int] = []
         event_queue = queue.Queue()
 
-        time = 0
-
-        def ticker():
-            nonlocal time
-
-            yield Passive()
-
-            while True:
-                yield
-                time += 1
-
         finish = False
 
         def producer():
@@ -354,6 +359,7 @@ def producer():
 
                 # Make sure that the time is updated first.
                 yield Settle()
+                time = yield Now()
                 event_queue.put(time)
                 yield from self.random_wait_geom(0.8)
 
@@ -365,26 +371,95 @@ def consumer():
 
                 # Make sure that the time is updated first.
                 yield Settle()
+                time = yield Now()
                 latencies.append(time - event_queue.get())
 
                 yield from self.random_wait_geom(1.0 / self.expected_consumer_wait)
 
-            self.assertEqual(min(latencies), (yield m._dut.histogram.min.value))
-            self.assertEqual(max(latencies), (yield m._dut.histogram.max.value))
-            self.assertEqual(sum(latencies), (yield m._dut.histogram.sum.value))
-            self.assertEqual(len(latencies), (yield m._dut.histogram.count.value))
+            self.check_latencies(m, latencies)
+
+        with self.run_simulation(m) as sim:
+            sim.add_sync_process(producer)
+            sim.add_sync_process(consumer)
+
+
+@parameterized_class(
+    ("slots_number", "expected_consumer_wait"),
+    [
+        (2, 5),
+        (2, 10),
+        (5, 10),
+        (10, 1),
+        (10, 10),
+        (5, 5),
+    ],
+)
+class TestIndexedLatencyMeasurer(TestLatencyMeasurerBase):
+    slots_number: int
+    expected_consumer_wait: float
+
+    def test_latency_measurer(self):
+        random.seed(42)
+
+        m = SimpleTestCircuit(TaggedLatencyMeasurer("latency", slots_number=self.slots_number, max_latency=300))
+        DependencyContext.get().add_dependency(HwMetricsEnabledKey(), True)
+
+        latencies: list[int] = []
+
+        events = list(0 for _ in range(self.slots_number))
+        free_slots = list(k for k in range(self.slots_number))
+        used_slots: list[int] = []
+
+        finish = False
+
+        def producer():
+            nonlocal finish
+
+            for _ in range(200):
+                while not free_slots:
+                    yield
+                    continue
+                yield Settle()
+
+                slot_id = random.choice(free_slots)
+                yield from m._start.call(slot=slot_id)
+
+                time = yield Now()
+
+                events[slot_id] = time
+                free_slots.remove(slot_id)
+                used_slots.append(slot_id)
 
-            for i in range(m._dut.histogram.bucket_count):
-                bucket_start = 0 if i == 0 else 2 ** (i - 1)
-                bucket_end = 1e10 if i == m._dut.histogram.bucket_count - 1 else 2**i
+                yield from self.random_wait_geom(0.8)
+
+            finish = True
+
+        def consumer():
+            while not finish:
+                while not used_slots:
+                    yield
+                    continue
+
+                slot_id = random.choice(used_slots)
+
+                yield from m._stop.call(slot=slot_id)
+
+                time = yield Now()
+
+                yield Settle()
+                yield Settle()
+
+                latencies.append(time - events[slot_id])
+                used_slots.remove(slot_id)
+                free_slots.append(slot_id)
+
+                yield from self.random_wait_geom(1.0 / self.expected_consumer_wait)
 
-                count = sum(1 for x in latencies if bucket_start <= x < bucket_end)
-                self.assertEqual(count, (yield m._dut.histogram.buckets[i].value))
+            self.check_latencies(m, latencies)
 
         with self.run_simulation(m) as sim:
             sim.add_sync_process(producer)
             sim.add_sync_process(consumer)
-            sim.add_sync_process(ticker)
 
 
 class MetricManagerTestCircuit(Elaboratable):
diff --git a/transactron/lib/metrics.py b/transactron/lib/metrics.py
index f3d5b9e0d..17921e619 100644
--- a/transactron/lib/metrics.py
+++ b/transactron/lib/metrics.py
@@ -9,7 +9,7 @@
 from transactron.utils import ValueLike, OneHotSwitchDynamic, SignalBundle
 from transactron import Method, def_method, TModule
-from transactron.lib import FIFO
+from transactron.lib import FIFO, AsyncMemoryBank, logging
 from transactron.utils.dependencies import ListKey, DependencyContext, SimpleKey
 
 __all__ = [
@@ -19,7 +19,8 @@
     "HwCounter",
     "TaggedCounter",
     "HwExpHistogram",
-    "LatencyMeasurer",
+    "FIFOLatencyMeasurer",
+    "TaggedLatencyMeasurer",
     "HardwareMetricsManager",
     "HwMetricsEnabledKey",
 ]
@@ -476,7 +477,7 @@ def add(self, m: TModule, sample: Value):
         self._add(m, sample)
 
 
-class LatencyMeasurer(Elaboratable):
+class FIFOLatencyMeasurer(Elaboratable):
     """
     Measures duration between two events, e.g. request processing latency.
     It can track multiple events at the same time, i.e. the second event can
@@ -501,7 +502,7 @@ def __init__(
             The fully qualified name of the metric.
         description: str
             A human-readable description of the metric's functionality.
-        slots_number: str
+        slots_number: int
             A number of events that the module can track simultaneously.
         max_latency: int
             The maximum latency of an event. Used to set signal widths and
@@ -595,6 +596,143 @@ def metrics_enabled(self) -> bool:
         return DependencyContext.get().get_dependency(HwMetricsEnabledKey())
 
 
+class TaggedLatencyMeasurer(Elaboratable):
+    """
+    Measures duration between two events, e.g. request processing latency.
+    It can track multiple events at the same time, i.e. the second event can
+    be registered as started, before the first finishes. However, each event
+    needs to have a unique slot tag.
+
+    The module exposes an exponential histogram of the measured latencies.
+    """
+
+    def __init__(
+        self,
+        fully_qualified_name: str,
+        description: str = "",
+        *,
+        slots_number: int,
+        max_latency: int,
+    ):
+        """
+        Parameters
+        ----------
+        fully_qualified_name: str
+            The fully qualified name of the metric.
+        description: str
+            A human-readable description of the metric's functionality.
+        slots_number: int
+            A number of events that the module can track simultaneously.
+        max_latency: int
+            The maximum latency of an event. Used to set signal widths and
+            number of buckets in the histogram. If a latency turns out to be
+            bigger than the maximum, it will overflow and result in a false
+            measurement.
+        """
+        self.fully_qualified_name = fully_qualified_name
+        self.description = description
+        self.slots_number = slots_number
+        self.max_latency = max_latency
+
+        self._start = Method(i=[("slot", range(0, slots_number))])
+        self._stop = Method(i=[("slot", range(0, slots_number))])
+
+        # This bucket count gives us the best possible granularity.
+        bucket_count = bits_for(self.max_latency) + 1
+        self.histogram = HwExpHistogram(
+            self.fully_qualified_name,
+            self.description,
+            bucket_count=bucket_count,
+            sample_width=bits_for(self.max_latency),
+        )
+
+        self.log = logging.HardwareLogger(fully_qualified_name)
+
+    def elaborate(self, platform):
+        if not self.metrics_enabled():
+            return TModule()
+
+        m = TModule()
+
+        epoch_width = bits_for(self.max_latency)
+
+        m.submodules.slots = self.slots = AsyncMemoryBank(
+            data_layout=[("epoch", epoch_width)], elem_count=self.slots_number
+        )
+        m.submodules.histogram = self.histogram
+
+        slots_taken = Signal(self.slots_number)
+        slots_taken_start = Signal.like(slots_taken)
+        slots_taken_stop = Signal.like(slots_taken)
+
+        m.d.comb += slots_taken_start.eq(slots_taken)
+        m.d.comb += slots_taken_stop.eq(slots_taken_start)
+        m.d.sync += slots_taken.eq(slots_taken_stop)
+
+        epoch = Signal(epoch_width)
+
+        m.d.sync += epoch.eq(epoch + 1)
+
+        @def_method(m, self._start)
+        def _(slot: Value):
+            m.d.comb += slots_taken_start.eq(slots_taken | (1 << slot))
+            self.log.error(m, (slots_taken & (1 << slot)).any(), "taken slot {} taken again", slot)
+            self.slots.write(m, addr=slot, data=epoch)
+
+        @def_method(m, self._stop)
+        def _(slot: Value):
+            m.d.comb += slots_taken_stop.eq(slots_taken_start & ~(C(1, self.slots_number) << slot))
+            self.log.error(m, ~(slots_taken & (1 << slot)).any(), "free slot {} freed again", slot)
+            ret = self.slots.read(m, addr=slot)
+            # The result of subtracting two unsigned n-bit values is a signed (n+1)-bit value,
+            # so we need to cast the result and discard the most significant bit.
+            duration = (epoch - ret.epoch).as_unsigned()[:-1]
+            self.histogram.add(m, duration)
+
+        return m
+
+    def start(self, m: TModule, *, slot: ValueLike):
+        """
+        Registers the start of an event for a given slot tag.
+
+        Should be called in the body of either a transaction or a method.
+
+        Parameters
+        ----------
+        m: TModule
+            Transactron module
+        slot: ValueLike
+            The slot tag of the event.
+        """
+
+        if not self.metrics_enabled():
+            return
+
+        self._start(m, slot)
+
+    def stop(self, m: TModule, *, slot: ValueLike):
+        """
+        Registers the end of the event for a given slot tag.
+
+        Should be called in the body of either a transaction or a method.
+
+        Parameters
+        ----------
+        m: TModule
+            Transactron module
+        slot: ValueLike
+            The slot tag of the event.
+        """
+
+        if not self.metrics_enabled():
+            return
+
+        self._stop(m, slot)
+
+    def metrics_enabled(self) -> bool:
+        return DependencyContext.get().get_dependency(HwMetricsEnabledKey())
+
+
 class HardwareMetricsManager:
     """
     Collects all metrics registered in the circuit and provides an easy
diff --git a/transactron/lib/storage.py b/transactron/lib/storage.py
index e6d3e5cf5..3bbf07624 100644
--- a/transactron/lib/storage.py
+++ b/transactron/lib/storage.py
@@ -8,7 +8,7 @@
 from transactron.utils import assign, AssignType, LayoutList
 from .reqres import ArgumentsToResultsZipper
 
-__all__ = ["MemoryBank"]
+__all__ = ["MemoryBank", "AsyncMemoryBank"]
 
 
 class MemoryBank(Elaboratable):
@@ -136,3 +136,77 @@ def _(arg):
             m.d.comb += assign(write_args, arg, fields=AssignType.ALL)
 
         return m
+
+
+class AsyncMemoryBank(Elaboratable):
+    """AsyncMemoryBank module.
+
+    Provides a transactional interface to asynchronous Amaranth Memory with one
+    read and one write port. It optionally supports writing with a given granularity.
+
+    Attributes
+    ----------
+    read: Method
+        The read method. Accepts an `addr` from which data should be read and returns
+        the `data_layout` view stored at that address.
+    write: Method
+        The write method. Accepts `addr` where data should be saved, `data` in the form of `data_layout`
+        and optionally `mask` if `granularity` is not None. `1` in mask means that the appropriate part
+        should be written.
+    """
+
+    def __init__(
+        self, *, data_layout: LayoutList, elem_count: int, granularity: Optional[int] = None, src_loc: int | SrcLoc = 0
+    ):
+        """
+        Parameters
+        ----------
+        data_layout: method layout
+            The format of structures stored in the Memory.
+        elem_count: int
+            Number of elements stored in Memory.
+        granularity: Optional[int]
+            Granularity of write, forwarded to Amaranth. If `None` the whole structure is always saved at once.
+            If not, the width of `data_layout` is split into `granularity` parts, which can be saved independently.
+        src_loc: int | SrcLoc
+            How many stack frames deep the source location is taken from.
+            Alternatively, the source location to use instead of the default.
+        """
+        self.src_loc = get_src_loc(src_loc)
+        self.data_layout = make_layout(*data_layout)
+        self.elem_count = elem_count
+        self.granularity = granularity
+        self.width = from_method_layout(self.data_layout).size
+        self.addr_width = bits_for(self.elem_count - 1)
+
+        self.read_req_layout: LayoutList = [("addr", self.addr_width)]
+        write_layout = [("addr", self.addr_width), ("data", self.data_layout)]
+        if self.granularity is not None:
+            write_layout.append(("mask", self.width // self.granularity))
+        self.write_layout = make_layout(*write_layout)
+
+        self.read = Method(i=self.read_req_layout, o=self.data_layout, src_loc=self.src_loc)
+        self.write = Method(i=self.write_layout, src_loc=self.src_loc)
+
+    def elaborate(self, platform) -> TModule:
+        m = TModule()
+
+        mem = Memory(width=self.width, depth=self.elem_count)
+        m.submodules.read_port = read_port = mem.read_port(domain="comb")
+        m.submodules.write_port = write_port = mem.write_port()
+
+        @def_method(m, self.read)
+        def _(addr):
+            m.d.comb += read_port.addr.eq(addr)
+            return read_port.data
+
+        @def_method(m, self.write)
+        def _(arg):
+            m.d.comb += write_port.addr.eq(arg.addr)
+            m.d.comb += write_port.data.eq(arg.data)
+            if self.granularity is None:
+                m.d.comb += write_port.en.eq(1)
+            else:
+                m.d.comb += write_port.en.eq(arg.mask)
+
+        return m
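Usage note (not part of the patch): a minimal sketch of how the new AsyncMemoryBank is exercised, modeled directly on the TestAsyncMemoryBank test added above. The test-class and process names below are illustrative only; it assumes the same transactron.testing harness the other tests in this diff use.

from transactron.lib import AsyncMemoryBank
from transactron.testing import TestCaseWithSimulator, SimpleTestCircuit


class AsyncMemoryBankUsageExample(TestCaseWithSimulator):
    def test_write_then_read(self):
        # A 16-entry bank holding a single 8-bit "data" field.
        m = SimpleTestCircuit(AsyncMemoryBank(data_layout=[("data", 8)], elem_count=16))

        def process():
            # The write port is synchronous, so the value becomes visible to the
            # combinational read port from the following cycle onwards.
            yield from m.write.call(data=0x5A, addr=3)
            res = yield from m.read.call(addr=3)
            self.assertEqual(res["data"], 0x5A)

        with self.run_simulation(m) as sim:
            sim.add_sync_process(process)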