From 2084d88d7bbf3bae3d2872bc5742ab7243ec7087 Mon Sep 17 00:00:00 2001
From: Marek Materzok <tilk@tilk.eu>
Date: Wed, 27 Mar 2024 12:05:37 +0100
Subject: [PATCH 01/14] Add metrics for structs

---
 coreblocks/core_structs/rf.py                 | 20 +++++++++++++++++-
 coreblocks/core_structs/rob.py                | 16 ++++++++++++--
 coreblocks/func_blocks/fu/common/rs.py        | 21 +++++++++++++++++--
 .../func_blocks/fu/common/rs_func_block.py    |  9 ++++++--
 coreblocks/params/configurations.py           |  7 +++++--
 5 files changed, 64 insertions(+), 9 deletions(-)

diff --git a/coreblocks/core_structs/rf.py b/coreblocks/core_structs/rf.py
index f7a9b8a7f..b255d71a6 100644
--- a/coreblocks/core_structs/rf.py
+++ b/coreblocks/core_structs/rf.py
@@ -1,7 +1,10 @@
+import operator
 from amaranth import *
-from transactron import Method, def_method, TModule
+from functools import reduce
+from transactron import Method, Transaction, def_method, TModule
 from coreblocks.interface.layouts import RFLayouts
 from coreblocks.params import GenParams
+from transactron.lib.metrics import HwExpHistogram
 from transactron.utils.transactron_helpers import make_layout
 
 __all__ = ["RegisterFile"]
@@ -20,8 +23,15 @@ def __init__(self, *, gen_params: GenParams):
         self.write = Method(i=layouts.rf_write)
         self.free = Method(i=layouts.rf_free)
 
+        self.perf_num_valid = HwExpHistogram(
+            "struct.rf.num_valid", description="Number of valid registers in RF", bucket_count=gen_params.phys_regs_bits,
+            sample_width=gen_params.phys_regs_bits + 1
+        )
+
     def elaborate(self, platform):
         m = TModule()
+        
+        m.submodules += [self.perf_num_valid]
 
         being_written = Signal(self.gen_params.phys_regs_bits)
         written_value = Signal(self.gen_params.isa.xlen)
@@ -62,4 +72,12 @@ def _(reg_id: Value):
             with m.If(reg_id != 0):
                 m.d.sync += self.entries[reg_id].valid.eq(0)
 
+        if self.perf_num_valid.metrics_enabled():
+            num_valid = Signal(self.gen_params.phys_regs_bits + 1)
+            m.d.comb += num_valid.eq(reduce(
+                operator.add, (self.entries[reg_id].valid for reg_id in range(2**self.gen_params.phys_regs_bits))
+            ))
+            with Transaction(name="perf").body(m):
+                self.perf_num_valid.add(m, num_valid)
+
         return m
diff --git a/coreblocks/core_structs/rob.py b/coreblocks/core_structs/rob.py
index 1f3806d46..20b8eaff7 100644
--- a/coreblocks/core_structs/rob.py
+++ b/coreblocks/core_structs/rob.py
@@ -1,5 +1,5 @@
 from amaranth import *
-from transactron import Method, def_method, TModule
+from transactron import Method, Transaction, def_method, TModule
 from transactron.lib.metrics import *
 from coreblocks.interface.layouts import ROBLayouts
 from coreblocks.params import GenParams
@@ -24,11 +24,17 @@ def __init__(self, gen_params: GenParams) -> None:
             slots_number=(2**gen_params.rob_entries_bits + 1),
             max_latency=1000,
         )
+        self.perf_rob_size = HwExpHistogram(
+            "backend.rob.size",
+            description="Number of instructions in ROB",
+            bucket_count=gen_params.rob_entries_bits,
+            sample_width=gen_params.rob_entries_bits + 1
+        )
 
     def elaborate(self, platform):
         m = TModule()
 
-        m.submodules += [self.perf_rob_wait_time]
+        m.submodules += [self.perf_rob_wait_time, self.perf_rob_size]
 
         start_idx = Signal(self.params.rob_entries_bits)
         end_idx = Signal(self.params.rob_entries_bits)
@@ -70,4 +76,10 @@ def _(rob_id: Value, exception):
         def _():
             return {"start": start_idx, "end": end_idx}
 
+        if self.perf_rob_size.metrics_enabled():
+            rob_size = Signal(self.params.rob_entries_bits + 1)
+            m.d.comb += rob_size.eq(end_idx - start_idx)
+            with Transaction(name="perf").body(m):
+                self.perf_rob_size.add(m, rob_size)
+
         return m
diff --git a/coreblocks/func_blocks/fu/common/rs.py b/coreblocks/func_blocks/fu/common/rs.py
index 56287df27..fcef4210b 100644
--- a/coreblocks/func_blocks/fu/common/rs.py
+++ b/coreblocks/func_blocks/fu/common/rs.py
@@ -1,11 +1,14 @@
+import operator
+from functools import reduce
 from collections.abc import Iterable
 from typing import Optional
 from amaranth import *
 from amaranth.lib.coding import PriorityEncoder
-from transactron import Method, def_method, TModule
+from transactron import Method, Transaction, def_method, TModule
 from coreblocks.params import GenParams
 from coreblocks.frontend.decoder import OpType
 from coreblocks.interface.layouts import RSLayouts
+from transactron.lib.metrics import HwExpHistogram
 from transactron.utils import RecordDict
 from transactron.utils.transactron_helpers import make_layout
 
@@ -14,7 +17,7 @@
 
 class RS(Elaboratable):
     def __init__(
-        self, gen_params: GenParams, rs_entries: int, ready_for: Optional[Iterable[Iterable[OpType]]] = None
+        self, gen_params: GenParams, rs_entries: int, rs_number: int, ready_for: Optional[Iterable[Iterable[OpType]]] = None
     ) -> None:
         ready_for = ready_for or ((op for op in OpType),)
         self.gen_params = gen_params
@@ -38,10 +41,18 @@ def __init__(
         self.data = Array(Signal(self.internal_layout) for _ in range(self.rs_entries))
         self.data_ready = Signal(self.rs_entries)
 
+        self.perf_num_full = HwExpHistogram(
+            f"fu.block_{rs_number}.rs.num_full",
+            description=f"Number of full entries in RS {rs_number}",
+            bucket_count=self.rs_entries_bits,
+            sample_width=self.rs_entries_bits + 1
+        )
+
     def elaborate(self, platform):
         m = TModule()
 
         m.submodules.enc_select = PriorityEncoder(width=self.rs_entries)
+        m.submodules += [self.perf_num_full]
 
         for i, record in enumerate(self.data):
             m.d.comb += self.data_ready[i].eq(
@@ -105,4 +116,10 @@ def _(rs_entry_id: Value) -> RecordDict:
             def _() -> RecordDict:
                 return {"ready_list": ready_list}
 
+        if self.perf_num_full.metrics_enabled():
+            num_full = Signal(self.rs_entries_bits + 1)
+            m.d.comb += num_full.eq(reduce(operator.add, (self.data[entry_id].rec_full for entry_id in range(self.rs_entries))))
+            with Transaction(name="perf").body(m):
+                self.perf_num_full.add(m, num_full)
+
         return m
diff --git a/coreblocks/func_blocks/fu/common/rs_func_block.py b/coreblocks/func_blocks/fu/common/rs_func_block.py
index 66fed3d0e..058995aea 100644
--- a/coreblocks/func_blocks/fu/common/rs_func_block.py
+++ b/coreblocks/func_blocks/fu/common/rs_func_block.py
@@ -31,7 +31,7 @@ class RSFuncBlock(FuncBlock, Elaboratable):
         layout described by `FuncUnitLayouts`.
     """
 
-    def __init__(self, gen_params: GenParams, func_units: Iterable[tuple[FuncUnit, set[OpType]]], rs_entries: int):
+    def __init__(self, gen_params: GenParams, func_units: Iterable[tuple[FuncUnit, set[OpType]]], rs_entries: int, rs_number: int):
         """
         Parameters
         ----------
@@ -41,10 +41,13 @@ def __init__(self, gen_params: GenParams, func_units: Iterable[tuple[FuncUnit, s
             Functional units to be used by this module.
         rs_entries: int
             Number of entries in RS.
+        rs_number: int
+            The number of this RS block. Used for debugging.
         """
         self.gen_params = gen_params
         self.rs_entries = rs_entries
         self.rs_entries_bits = (rs_entries - 1).bit_length()
+        self.rs_number = rs_number
         self.rs_layouts = gen_params.get(RSLayouts, rs_entries_bits=self.rs_entries_bits)
         self.fu_layouts = gen_params.get(FuncUnitLayouts)
         self.func_units = list(func_units)
@@ -60,6 +63,7 @@ def elaborate(self, platform):
         m.submodules.rs = self.rs = RS(
             gen_params=self.gen_params,
             rs_entries=self.rs_entries,
+            rs_number=self.rs_number,
             ready_for=(optypes for _, optypes in self.func_units),
         )
 
@@ -87,10 +91,11 @@ def elaborate(self, platform):
 class RSBlockComponent(BlockComponentParams):
     func_units: Collection[FunctionalComponentParams]
     rs_entries: int
+    rs_number: int
 
     def get_module(self, gen_params: GenParams) -> FuncBlock:
         modules = list((u.get_module(gen_params), u.get_optypes()) for u in self.func_units)
-        rs_unit = RSFuncBlock(gen_params=gen_params, func_units=modules, rs_entries=self.rs_entries)
+        rs_unit = RSFuncBlock(gen_params=gen_params, func_units=modules, rs_entries=self.rs_entries, rs_number=self.rs_number)
         return rs_unit
 
     def get_optypes(self) -> set[OpType]:
diff --git a/coreblocks/params/configurations.py b/coreblocks/params/configurations.py
index a9dee4931..8b9257a75 100644
--- a/coreblocks/params/configurations.py
+++ b/coreblocks/params/configurations.py
@@ -26,6 +26,7 @@
     RSBlockComponent(
         [ALUComponent(), ShiftUnitComponent(), JumpComponent(), ExceptionUnitComponent(), PrivilegedUnitComponent()],
         rs_entries=4,
+        rs_number=0
     ),
     LSUBlockComponent(),
     CSRBlockComponent(),
@@ -106,7 +107,7 @@ def replace(self, **kwargs):
 tiny_core_config = CoreConfiguration(
     embedded=True,
     func_units_config=(
-        RSBlockComponent([ALUComponent(), ShiftUnitComponent(), JumpComponent()], rs_entries=2),
+        RSBlockComponent([ALUComponent(), ShiftUnitComponent(), JumpComponent()], rs_entries=2, rs_number=0),
         LSUBlockComponent(),
     ),
     phys_regs_bits=basic_core_config.phys_regs_bits - 1,
@@ -128,6 +129,7 @@ def replace(self, **kwargs):
                 PrivilegedUnitComponent(),
             ],
             rs_entries=4,
+            rs_number=0
         ),
         RSBlockComponent(
             [
@@ -135,6 +137,7 @@ def replace(self, **kwargs):
                 DivComponent(),
             ],
             rs_entries=2,
+            rs_number=1
         ),
         LSUBlockComponent(),
         CSRBlockComponent(),
@@ -144,7 +147,7 @@ def replace(self, **kwargs):
 
 # Core configuration used in internal testbenches
 test_core_config = CoreConfiguration(
-    func_units_config=tuple(RSBlockComponent([], rs_entries=4) for _ in range(2)),
+    func_units_config=tuple(RSBlockComponent([], rs_entries=4, rs_number=k) for k in range(2)),
     rob_entries_bits=7,
     phys_regs_bits=7,
     _implied_extensions=Extension.I,

From 0afb16ded385f7c839ad533333346744a7ab765b Mon Sep 17 00:00:00 2001
From: Marek Materzok <tilk@tilk.eu>
Date: Wed, 27 Mar 2024 12:43:40 +0100
Subject: [PATCH 02/14] Fix metric histogram sizing, reformat, silence Verilator UNSIGNED warning

---
 coreblocks/core_structs/rf.py                    | 16 ++++++++++------
 coreblocks/core_structs/rob.py                   |  8 ++++----
 coreblocks/func_blocks/fu/common/rs.py           | 14 ++++++++++----
 .../func_blocks/fu/common/rs_func_block.py       |  8 ++++++--
 coreblocks/params/configurations.py              |  6 +++---
 test/regression/cocotb/benchmark.Makefile        |  2 +-
 test/regression/cocotb/signature.Makefile        |  2 +-
 test/regression/cocotb/test.Makefile             |  2 +-
 8 files changed, 36 insertions(+), 22 deletions(-)

diff --git a/coreblocks/core_structs/rf.py b/coreblocks/core_structs/rf.py
index b255d71a6..c6599f043 100644
--- a/coreblocks/core_structs/rf.py
+++ b/coreblocks/core_structs/rf.py
@@ -24,13 +24,15 @@ def __init__(self, *, gen_params: GenParams):
         self.free = Method(i=layouts.rf_free)
 
         self.perf_num_valid = HwExpHistogram(
-            "struct.rf.num_valid", description="Number of valid registers in RF", bucket_count=gen_params.phys_regs_bits,
-            sample_width=gen_params.phys_regs_bits + 1
+            "struct.rf.num_valid",
+            description="Number of valid registers in RF",
+            bucket_count=gen_params.phys_regs_bits + 1,
+            sample_width=gen_params.phys_regs_bits + 1,
         )
 
     def elaborate(self, platform):
         m = TModule()
-        
+
         m.submodules += [self.perf_num_valid]
 
         being_written = Signal(self.gen_params.phys_regs_bits)
@@ -74,9 +76,11 @@ def _(reg_id: Value):
 
         if self.perf_num_valid.metrics_enabled():
             num_valid = Signal(self.gen_params.phys_regs_bits + 1)
-            m.d.comb += num_valid.eq(reduce(
-                operator.add, (self.entries[reg_id].valid for reg_id in range(2**self.gen_params.phys_regs_bits))
-            ))
+            m.d.comb += num_valid.eq(
+                reduce(
+                    operator.add, (self.entries[reg_id].valid for reg_id in range(2**self.gen_params.phys_regs_bits))
+                )
+            )
             with Transaction(name="perf").body(m):
                 self.perf_num_valid.add(m, num_valid)
 
diff --git a/coreblocks/core_structs/rob.py b/coreblocks/core_structs/rob.py
index 20b8eaff7..be15cdb49 100644
--- a/coreblocks/core_structs/rob.py
+++ b/coreblocks/core_structs/rob.py
@@ -27,8 +27,8 @@ def __init__(self, gen_params: GenParams) -> None:
         self.perf_rob_size = HwExpHistogram(
             "backend.rob.size",
             description="Number of instructions in ROB",
-            bucket_count=gen_params.rob_entries_bits,
-            sample_width=gen_params.rob_entries_bits + 1
+            bucket_count=gen_params.rob_entries_bits + 1,
+            sample_width=gen_params.rob_entries_bits,
         )
 
     def elaborate(self, platform):
@@ -77,8 +77,8 @@ def _():
             return {"start": start_idx, "end": end_idx}
 
         if self.perf_rob_size.metrics_enabled():
-            rob_size = Signal(self.params.rob_entries_bits + 1)
-            m.d.comb += rob_size.eq(end_idx - start_idx)
+            rob_size = Signal(self.params.rob_entries_bits)
+            m.d.comb += rob_size.eq((end_idx - start_idx)[0 : self.params.rob_entries_bits])
             with Transaction(name="perf").body(m):
                 self.perf_rob_size.add(m, rob_size)
 
diff --git a/coreblocks/func_blocks/fu/common/rs.py b/coreblocks/func_blocks/fu/common/rs.py
index fcef4210b..dfb9da167 100644
--- a/coreblocks/func_blocks/fu/common/rs.py
+++ b/coreblocks/func_blocks/fu/common/rs.py
@@ -17,7 +17,11 @@
 
 class RS(Elaboratable):
     def __init__(
-        self, gen_params: GenParams, rs_entries: int, rs_number: int, ready_for: Optional[Iterable[Iterable[OpType]]] = None
+        self,
+        gen_params: GenParams,
+        rs_entries: int,
+        rs_number: int,
+        ready_for: Optional[Iterable[Iterable[OpType]]] = None,
     ) -> None:
         ready_for = ready_for or ((op for op in OpType),)
         self.gen_params = gen_params
@@ -44,8 +48,8 @@ def __init__(
         self.perf_num_full = HwExpHistogram(
             f"fu.block_{rs_number}.rs.num_full",
             description=f"Number of full entries in RS {rs_number}",
-            bucket_count=self.rs_entries_bits,
-            sample_width=self.rs_entries_bits + 1
+            bucket_count=self.rs_entries_bits + 1,
+            sample_width=self.rs_entries_bits + 1,
         )
 
     def elaborate(self, platform):
@@ -118,7 +122,9 @@ def _() -> RecordDict:
 
         if self.perf_num_full.metrics_enabled():
             num_full = Signal(self.rs_entries_bits + 1)
-            m.d.comb += num_full.eq(reduce(operator.add, (self.data[entry_id].rec_full for entry_id in range(self.rs_entries))))
+            m.d.comb += num_full.eq(
+                reduce(operator.add, (self.data[entry_id].rec_full for entry_id in range(self.rs_entries)))
+            )
             with Transaction(name="perf").body(m):
                 self.perf_num_full.add(m, num_full)
 
diff --git a/coreblocks/func_blocks/fu/common/rs_func_block.py b/coreblocks/func_blocks/fu/common/rs_func_block.py
index 058995aea..6345caf6b 100644
--- a/coreblocks/func_blocks/fu/common/rs_func_block.py
+++ b/coreblocks/func_blocks/fu/common/rs_func_block.py
@@ -31,7 +31,9 @@ class RSFuncBlock(FuncBlock, Elaboratable):
         layout described by `FuncUnitLayouts`.
     """
 
-    def __init__(self, gen_params: GenParams, func_units: Iterable[tuple[FuncUnit, set[OpType]]], rs_entries: int, rs_number: int):
+    def __init__(
+        self, gen_params: GenParams, func_units: Iterable[tuple[FuncUnit, set[OpType]]], rs_entries: int, rs_number: int
+    ):
         """
         Parameters
         ----------
@@ -95,7 +97,9 @@ class RSBlockComponent(BlockComponentParams):
 
     def get_module(self, gen_params: GenParams) -> FuncBlock:
         modules = list((u.get_module(gen_params), u.get_optypes()) for u in self.func_units)
-        rs_unit = RSFuncBlock(gen_params=gen_params, func_units=modules, rs_entries=self.rs_entries, rs_number=self.rs_number)
+        rs_unit = RSFuncBlock(
+            gen_params=gen_params, func_units=modules, rs_entries=self.rs_entries, rs_number=self.rs_number
+        )
         return rs_unit
 
     def get_optypes(self) -> set[OpType]:
diff --git a/coreblocks/params/configurations.py b/coreblocks/params/configurations.py
index 8b9257a75..b7fb53173 100644
--- a/coreblocks/params/configurations.py
+++ b/coreblocks/params/configurations.py
@@ -26,7 +26,7 @@
     RSBlockComponent(
         [ALUComponent(), ShiftUnitComponent(), JumpComponent(), ExceptionUnitComponent(), PrivilegedUnitComponent()],
         rs_entries=4,
-        rs_number=0
+        rs_number=0,
     ),
     LSUBlockComponent(),
     CSRBlockComponent(),
@@ -129,7 +129,7 @@ def replace(self, **kwargs):
                 PrivilegedUnitComponent(),
             ],
             rs_entries=4,
-            rs_number=0
+            rs_number=0,
         ),
         RSBlockComponent(
             [
@@ -137,7 +137,7 @@ def replace(self, **kwargs):
                 DivComponent(),
             ],
             rs_entries=2,
-            rs_number=1
+            rs_number=1,
         ),
         LSUBlockComponent(),
         CSRBlockComponent(),
diff --git a/test/regression/cocotb/benchmark.Makefile b/test/regression/cocotb/benchmark.Makefile
index 9962315fb..e49b55b39 100644
--- a/test/regression/cocotb/benchmark.Makefile
+++ b/test/regression/cocotb/benchmark.Makefile
@@ -14,7 +14,7 @@ SIM_BUILD = build/benchmark
 
 # Yosys/Amaranth borkedness workaround
 ifeq ($(SIM),verilator)
-  EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC
+  EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC -Wno-UNSIGNED
   BUILD_ARGS += -j`nproc`
 endif
 
diff --git a/test/regression/cocotb/signature.Makefile b/test/regression/cocotb/signature.Makefile
index b4f690635..a03d0a5f8 100644
--- a/test/regression/cocotb/signature.Makefile
+++ b/test/regression/cocotb/signature.Makefile
@@ -14,7 +14,7 @@ SIM_BUILD = build/signature
 
 # Yosys/Amaranth borkedness workaround
 ifeq ($(SIM),verilator)
-  EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC
+  EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC -Wno-UNSIGNED
   BUILD_ARGS += -j`nproc`
 endif
 
diff --git a/test/regression/cocotb/test.Makefile b/test/regression/cocotb/test.Makefile
index 210618067..5b9f7aad9 100644
--- a/test/regression/cocotb/test.Makefile
+++ b/test/regression/cocotb/test.Makefile
@@ -14,7 +14,7 @@ SIM_BUILD = build/test
 
 # Yosys/Amaranth borkedness workaround
 ifeq ($(SIM),verilator)
-  EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC
+  EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC -Wno-UNSIGNED
   BUILD_ARGS += -j`nproc`
 endif
 

From 9f5ecf8e8e605401e7eeef2f54e41d1800306bc2 Mon Sep 17 00:00:00 2001
From: Marek Materzok <tilk@tilk.eu>
Date: Wed, 27 Mar 2024 13:31:15 +0100
Subject: [PATCH 03/14] Towards indexed latency measurer

---
 coreblocks/cache/icache.py             |   2 +-
 coreblocks/core_structs/rf.py          |  12 ++-
 coreblocks/core_structs/rob.py         |   2 +-
 coreblocks/func_blocks/fu/common/rs.py |  12 ++-
 test/scheduler/test_scheduler.py       |   2 +-
 test/scheduler/test_wakeup_select.py   |   4 +-
 test/structs_common/test_rs.py         |  12 +--
 test/transactron/test_metrics.py       |   4 +-
 transactron/lib/metrics.py             | 130 ++++++++++++++++++++++++-
 transactron/lib/storage.py             |  77 ++++++++++++++-
 10 files changed, 236 insertions(+), 21 deletions(-)

diff --git a/coreblocks/cache/icache.py b/coreblocks/cache/icache.py
index f94c6e07c..605e22e88 100644
--- a/coreblocks/cache/icache.py
+++ b/coreblocks/cache/icache.py
@@ -123,7 +123,7 @@ def __init__(self, layouts: ICacheLayouts, params: ICacheParameters, refiller: C
         self.perf_misses = HwCounter("frontend.icache.misses")
         self.perf_errors = HwCounter("frontend.icache.fetch_errors")
         self.perf_flushes = HwCounter("frontend.icache.flushes")
-        self.req_latency = LatencyMeasurer(
+        self.req_latency = FIFOLatencyMeasurer(
             "frontend.icache.req_latency", "Latencies of cache requests", slots_number=2, max_latency=500
         )
 
diff --git a/coreblocks/core_structs/rf.py b/coreblocks/core_structs/rf.py
index c6599f043..d6bed0c9f 100644
--- a/coreblocks/core_structs/rf.py
+++ b/coreblocks/core_structs/rf.py
@@ -4,7 +4,7 @@
 from transactron import Method, Transaction, def_method, TModule
 from coreblocks.interface.layouts import RFLayouts
 from coreblocks.params import GenParams
-from transactron.lib.metrics import HwExpHistogram
+from transactron.lib.metrics import HwExpHistogram, IndexedLatencyMeasurer
 from transactron.utils.transactron_helpers import make_layout
 
 __all__ = ["RegisterFile"]
@@ -23,6 +23,12 @@ def __init__(self, *, gen_params: GenParams):
         self.write = Method(i=layouts.rf_write)
         self.free = Method(i=layouts.rf_free)
 
+        self.perf_rf_valid_time = IndexedLatencyMeasurer(
+            "struct.rf.valid_time",
+            description="Distribution of time registers are valid in RF",
+            slots_number=2**gen_params.phys_regs_bits,
+            max_latency=1000,
+        )
         self.perf_num_valid = HwExpHistogram(
             "struct.rf.num_valid",
             description="Number of valid registers in RF",
@@ -33,7 +39,7 @@ def __init__(self, *, gen_params: GenParams):
     def elaborate(self, platform):
         m = TModule()
 
-        m.submodules += [self.perf_num_valid]
+        m.submodules += [self.perf_rf_valid_time, self.perf_num_valid]
 
         being_written = Signal(self.gen_params.phys_regs_bits)
         written_value = Signal(self.gen_params.isa.xlen)
@@ -68,11 +74,13 @@ def _(reg_id: Value, reg_val: Value):
             with m.If(~(zero_reg)):
                 m.d.sync += self.entries[reg_id].reg_val.eq(reg_val)
                 m.d.sync += self.entries[reg_id].valid.eq(1)
+                self.perf_rf_valid_time.start(m, slot=reg_id)
 
         @def_method(m, self.free)
         def _(reg_id: Value):
             with m.If(reg_id != 0):
                 m.d.sync += self.entries[reg_id].valid.eq(0)
+                self.perf_rf_valid_time.stop(m, slot=reg_id)
 
         if self.perf_num_valid.metrics_enabled():
             num_valid = Signal(self.gen_params.phys_regs_bits + 1)
diff --git a/coreblocks/core_structs/rob.py b/coreblocks/core_structs/rob.py
index be15cdb49..25b14bab3 100644
--- a/coreblocks/core_structs/rob.py
+++ b/coreblocks/core_structs/rob.py
@@ -18,7 +18,7 @@ def __init__(self, gen_params: GenParams) -> None:
         self.data = Array(Signal(layouts.internal_layout) for _ in range(2**gen_params.rob_entries_bits))
         self.get_indices = Method(o=layouts.get_indices, nonexclusive=True)
 
-        self.perf_rob_wait_time = LatencyMeasurer(
+        self.perf_rob_wait_time = FIFOLatencyMeasurer(
             "backend.rob.wait_time",
             description="Distribution of time instructions spend in ROB",
             slots_number=(2**gen_params.rob_entries_bits + 1),
diff --git a/coreblocks/func_blocks/fu/common/rs.py b/coreblocks/func_blocks/fu/common/rs.py
index dfb9da167..3c05c59f6 100644
--- a/coreblocks/func_blocks/fu/common/rs.py
+++ b/coreblocks/func_blocks/fu/common/rs.py
@@ -8,7 +8,7 @@
 from coreblocks.params import GenParams
 from coreblocks.frontend.decoder import OpType
 from coreblocks.interface.layouts import RSLayouts
-from transactron.lib.metrics import HwExpHistogram
+from transactron.lib.metrics import HwExpHistogram, IndexedLatencyMeasurer
 from transactron.utils import RecordDict
 from transactron.utils.transactron_helpers import make_layout
 
@@ -45,6 +45,12 @@ def __init__(
         self.data = Array(Signal(self.internal_layout) for _ in range(self.rs_entries))
         self.data_ready = Signal(self.rs_entries)
 
+        self.perf_rs_wait_time = IndexedLatencyMeasurer(
+            f"fu.block_{rs_number}.rs.valid_time",
+            description=f"Distribution of time instructions wait in RS {rs_number}",
+            slots_number=2**self.rs_entries_bits,
+            max_latency=1000,
+        )
         self.perf_num_full = HwExpHistogram(
             f"fu.block_{rs_number}.rs.num_full",
             description=f"Number of full entries in RS {rs_number}",
@@ -56,7 +62,7 @@ def elaborate(self, platform):
         m = TModule()
 
         m.submodules.enc_select = PriorityEncoder(width=self.rs_entries)
-        m.submodules += [self.perf_num_full]
+        m.submodules += [self.perf_rs_wait_time, self.perf_num_full]
 
         for i, record in enumerate(self.data):
             m.d.comb += self.data_ready[i].eq(
@@ -86,6 +92,7 @@ def _(rs_entry_id: Value, rs_data: Value) -> None:
             m.d.sync += self.data[rs_entry_id].rs_data.eq(rs_data)
             m.d.sync += self.data[rs_entry_id].rec_full.eq(1)
             m.d.sync += self.data[rs_entry_id].rec_reserved.eq(1)
+            self.perf_rs_wait_time.start(m, slot=rs_entry_id)
 
         @def_method(m, self.update)
         def _(reg_id: Value, reg_val: Value) -> None:
@@ -104,6 +111,7 @@ def _(rs_entry_id: Value) -> RecordDict:
             record = self.data[rs_entry_id]
             m.d.sync += record.rec_reserved.eq(0)
             m.d.sync += record.rec_full.eq(0)
+            self.perf_rs_wait_time.stop(m, slot=rs_entry_id)
             return {
                 "s1_val": record.rs_data.s1_val,
                 "s2_val": record.rs_data.s2_val,
diff --git a/test/scheduler/test_scheduler.py b/test/scheduler/test_scheduler.py
index 3c50efab6..2fcf54a50 100644
--- a/test/scheduler/test_scheduler.py
+++ b/test/scheduler/test_scheduler.py
@@ -127,7 +127,7 @@ def setUp(self):
         self.rs_count = len(self.optype_sets)
         self.gen_params = GenParams(
             test_core_config.replace(
-                func_units_config=tuple(RSBlockComponent([], rs_entries=4) for _ in range(self.rs_count))
+                func_units_config=tuple(RSBlockComponent([], rs_entries=4, rs_number=k) for k in range(self.rs_count))
             )
         )
         self.expected_rename_queue = deque()
diff --git a/test/scheduler/test_wakeup_select.py b/test/scheduler/test_wakeup_select.py
index 4ff298da9..3e406e1af 100644
--- a/test/scheduler/test_wakeup_select.py
+++ b/test/scheduler/test_wakeup_select.py
@@ -43,7 +43,9 @@ def elaborate(self, platform):
 class TestWakeupSelect(TestCaseWithSimulator):
     def setUp(self):
         self.gen_params = GenParams(
-            test_core_config.replace(func_units_config=tuple(RSBlockComponent([], rs_entries=16) for _ in range(2)))
+            test_core_config.replace(
+                func_units_config=tuple(RSBlockComponent([], rs_entries=16, rs_number=k) for k in range(2))
+            )
         )
         self.m = WakeupTestCircuit(self.gen_params)
         self.cycles = 50
diff --git a/test/structs_common/test_rs.py b/test/structs_common/test_rs.py
index 4e86a46de..c62852cb0 100644
--- a/test/structs_common/test_rs.py
+++ b/test/structs_common/test_rs.py
@@ -24,7 +24,7 @@ class TestRSMethodInsert(TestCaseWithSimulator):
     def test_insert(self):
         self.gen_params = GenParams(test_core_config)
         self.rs_entries_bits = self.gen_params.max_rs_entries_bits
-        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, None))
+        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, 0, None))
         self.insert_list = [
             {
                 "rs_entry_id": id,
@@ -69,7 +69,7 @@ class TestRSMethodSelect(TestCaseWithSimulator):
     def test_select(self):
         self.gen_params = GenParams(test_core_config)
         self.rs_entries_bits = self.gen_params.max_rs_entries_bits
-        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, None))
+        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, 0, None))
         self.insert_list = [
             {
                 "rs_entry_id": id,
@@ -132,7 +132,7 @@ class TestRSMethodUpdate(TestCaseWithSimulator):
     def test_update(self):
         self.gen_params = GenParams(test_core_config)
         self.rs_entries_bits = self.gen_params.max_rs_entries_bits
-        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, None))
+        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, 0, None))
         self.insert_list = [
             {
                 "rs_entry_id": id,
@@ -223,7 +223,7 @@ class TestRSMethodTake(TestCaseWithSimulator):
     def test_take(self):
         self.gen_params = GenParams(test_core_config)
         self.rs_entries_bits = self.gen_params.max_rs_entries_bits
-        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, None))
+        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, 0, None))
         self.insert_list = [
             {
                 "rs_entry_id": id,
@@ -322,7 +322,7 @@ class TestRSMethodGetReadyList(TestCaseWithSimulator):
     def test_get_ready_list(self):
         self.gen_params = GenParams(test_core_config)
         self.rs_entries_bits = self.gen_params.max_rs_entries_bits
-        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, None))
+        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, 0, None))
         self.insert_list = [
             {
                 "rs_entry_id": id,
@@ -378,7 +378,7 @@ def test_two_get_ready_lists(self):
         self.rs_entries = self.gen_params.max_rs_entries
         self.rs_entries_bits = self.gen_params.max_rs_entries_bits
         self.m = SimpleTestCircuit(
-            RS(self.gen_params, 2**self.rs_entries_bits, [[OpType(1), OpType(2)], [OpType(3), OpType(4)]])
+            RS(self.gen_params, 2**self.rs_entries_bits, 0, [[OpType(1), OpType(2)], [OpType(3), OpType(4)]])
         )
         self.insert_list = [
             {
diff --git a/test/transactron/test_metrics.py b/test/transactron/test_metrics.py
index 12acdfd27..c52e9aed4 100644
--- a/test/transactron/test_metrics.py
+++ b/test/transactron/test_metrics.py
@@ -237,14 +237,14 @@ def test_process():
         (5, 5),
     ],
 )
-class TestLatencyMeasurer(TestCaseWithSimulator):
+class TestFIFOLatencyMeasurer(TestCaseWithSimulator):
     slots_number: int
     expected_consumer_wait: float
 
     def test_latency_measurer(self):
         random.seed(42)
 
-        m = SimpleTestCircuit(LatencyMeasurer("latency", slots_number=self.slots_number, max_latency=300))
+        m = SimpleTestCircuit(FIFOLatencyMeasurer("latency", slots_number=self.slots_number, max_latency=300))
         DependencyContext.get().add_dependency(HwMetricsEnabledKey(), True)
 
         latencies: list[int] = []
diff --git a/transactron/lib/metrics.py b/transactron/lib/metrics.py
index 2e706e0a3..82f6f0bfa 100644
--- a/transactron/lib/metrics.py
+++ b/transactron/lib/metrics.py
@@ -9,7 +9,7 @@
 from transactron.utils import ValueLike
 from transactron import Method, def_method, TModule
 from transactron.utils import SignalBundle
-from transactron.lib import FIFO
+from transactron.lib import FIFO, AsyncMemoryBank
 from transactron.utils.dependencies import ListKey, DependencyContext, SimpleKey
 
 __all__ = [
@@ -18,7 +18,8 @@
     "HwMetric",
     "HwCounter",
     "HwExpHistogram",
-    "LatencyMeasurer",
+    "FIFOLatencyMeasurer",
+    "IndexedLatencyMeasurer",
     "HardwareMetricsManager",
     "HwMetricsEnabledKey",
 ]
@@ -354,7 +355,7 @@ def add(self, m: TModule, sample: Value):
         self._add(m, sample)
 
 
-class LatencyMeasurer(Elaboratable):
+class FIFOLatencyMeasurer(Elaboratable):
     """
     Measures duration between two events, e.g. request processing latency.
     It can track multiple events at the same time, i.e. the second event can
@@ -379,7 +380,7 @@ def __init__(
             The fully qualified name of the metric.
         description: str
             A human-readable description of the metric's functionality.
-        slots_number: str
+        slots_number: int
             A number of events that the module can track simultaneously.
         max_latency: int
             The maximum latency of an event. Used to set signal widths and
@@ -473,6 +474,127 @@ def metrics_enabled(self) -> bool:
         return DependencyContext.get().get_dependency(HwMetricsEnabledKey())
 
 
+class IndexedLatencyMeasurer(Elaboratable):
+    """
+    Measures duration between two events, e.g. request processing latency.
+    It can track multiple events at the same time, i.e. the second event can
+    be registered as started, before the first finishes. However, each event
+    needs to have a unique slot index.
+
+    The module exposes an exponential histogram of the measured latencies.
+    """
+
+    def __init__(
+        self,
+        fully_qualified_name: str,
+        description: str = "",
+        *,
+        slots_number: int,
+        max_latency: int,
+    ):
+        """
+        Parameters
+        ----------
+        fully_qualified_name: str
+            The fully qualified name of the metric.
+        description: str
+            A human-readable description of the metric's functionality.
+        slots_number: int
+            A number of events that the module can track simultaneously.
+        max_latency: int
+            The maximum latency of an event. Used to set signal widths and
+            number of buckets in the histogram. If a latency turns out to be
+            bigger than the maximum, it will overflow and result in a false
+            measurement.
+        """
+        self.fully_qualified_name = fully_qualified_name
+        self.description = description
+        self.slots_number = slots_number
+        self.max_latency = max_latency
+
+        self._start = Method(i=[("slot", range(0, slots_number))])
+        self._stop = Method(i=[("slot", range(0, slots_number))])
+
+        # This bucket count gives us the best possible granularity.
+        bucket_count = bits_for(self.max_latency) + 1
+        self.histogram = HwExpHistogram(
+            self.fully_qualified_name,
+            self.description,
+            bucket_count=bucket_count,
+            sample_width=bits_for(self.max_latency),
+        )
+
+    def elaborate(self, platform):
+        if not self.metrics_enabled():
+            return TModule()
+
+        m = TModule()
+
+        epoch_width = bits_for(self.max_latency)
+
+        m.submodules.slots = self.slots = AsyncMemoryBank(
+            data_layout=[("epoch", epoch_width)], elem_count=self.slots_number
+        )
+        m.submodules.histogram = self.histogram
+
+        epoch = Signal(epoch_width)
+
+        m.d.sync += epoch.eq(epoch + 1)
+
+        @def_method(m, self._start)
+        def _(slot):
+            self.slots.write(m, slot, epoch)
+
+        @def_method(m, self._stop)
+        def _(slot):
+            ret = self.slots.read(m, slot)
+            # The result of substracting two unsigned n-bit is a signed (n+1)-bit value,
+            # so we need to cast the result and discard the most significant bit.
+            duration = (epoch - ret.epoch).as_unsigned()[:-1]
+            self.histogram.add(m, duration)
+
+        return m
+
+    def start(self, m: TModule, slot: ValueLike):
+        """
+        Registers the start of an event. Can be called before the previous events
+        finish. If there are no slots available, the method will be blocked.
+
+        Should be called in the body of either a transaction or a method.
+
+        Parameters
+        ----------
+        m: TModule
+            Transactron module
+        """
+
+        if not self.metrics_enabled():
+            return
+
+        self._start(m, slot)
+
+    def stop(self, m: TModule, slot: ValueLike):
+        """
+        Registers the end of the oldest event (the FIFO order). If there are no
+        started events in the queue, the method will block.
+
+        Should be called in the body of either a transaction or a method.
+
+        Parameters
+        ----------
+        m: TModule
+            Transactron module
+        """
+
+        if not self.metrics_enabled():
+            return
+
+        self._stop(m, slot)
+
+    def metrics_enabled(self) -> bool:
+        return DependencyContext.get().get_dependency(HwMetricsEnabledKey())
+
+
 class HardwareMetricsManager:
     """
     Collects all metrics registered in the circuit and provides an easy
diff --git a/transactron/lib/storage.py b/transactron/lib/storage.py
index e6d3e5cf5..a9be66020 100644
--- a/transactron/lib/storage.py
+++ b/transactron/lib/storage.py
@@ -8,7 +8,7 @@
 from transactron.utils import assign, AssignType, LayoutList
 from .reqres import ArgumentsToResultsZipper
 
-__all__ = ["MemoryBank"]
+__all__ = ["MemoryBank", "AsyncMemoryBank"]
 
 
 class MemoryBank(Elaboratable):
@@ -136,3 +136,78 @@ def _(arg):
             m.d.comb += assign(write_args, arg, fields=AssignType.ALL)
 
         return m
+
+
+class AsyncMemoryBank(Elaboratable):
+    """AsyncMemoryBank module.
+
+    Provides a transactional interface to asynchronous Amaranth Memory with one
+    read and one write port. It optionally supports writing with a given granularity.
+
+    Attributes
+    ----------
+    read: Method
+        The read method. Accepts an `addr` from which data should be read.
+        Returns the `data_layout` View stored at `addr`. The read port is
+        combinational, so there is no separate read response method.
+    write: Method
+        The write method. Accepts `addr` where data should be saved, `data` in form of `data_layout`
+        and optionally `mask` if `granularity` is not None. `1` in mask means that appropriate part should be written.
+    """
+
+    def __init__(
+        self, *, data_layout: LayoutList, elem_count: int, granularity: Optional[int] = None, src_loc: int | SrcLoc = 0
+    ):
+        """
+        Parameters
+        ----------
+        data_layout: method layout
+            The format of structures stored in the Memory.
+        elem_count: int
+            Number of elements stored in Memory.
+        granularity: Optional[int]
+            Granularity of write, forwarded to Amaranth. If `None` the whole structure is always saved at once.
+            If not, `data_layout` is split into parts of `granularity` bits, which can be saved independently.
+        src_loc: int | SrcLoc
+            How many stack frames deep the source location is taken from.
+            Alternatively, the source location to use instead of the default.
+        """
+        self.src_loc = get_src_loc(src_loc)
+        self.data_layout = make_layout(*data_layout)
+        self.elem_count = elem_count
+        self.granularity = granularity
+        self.width = from_method_layout(self.data_layout).size
+        self.addr_width = bits_for(self.elem_count - 1)
+
+        self.read_req_layout: LayoutList = [("addr", self.addr_width)]
+        write_layout = [("addr", self.addr_width), ("data", self.data_layout)]
+        if self.granularity is not None:
+            write_layout.append(("mask", self.width // self.granularity))
+        self.write_layout = make_layout(*write_layout)
+
+        self.read = Method(i=self.read_req_layout, o=self.data_layout, src_loc=self.src_loc)
+        self.write = Method(i=self.write_layout, src_loc=self.src_loc)
+
+    def elaborate(self, platform) -> TModule:
+        m = TModule()
+
+        mem = Memory(width=self.width, depth=self.elem_count)
+        m.submodules.read_port = read_port = mem.read_port(domain="comb")
+        m.submodules.write_port = write_port = mem.write_port(granularity=self.granularity)
+
+        @def_method(m, self.read)
+        def _(addr):
+            m.d.comb += read_port.addr.eq(addr)
+            m.d.comb += read_port.en.eq(1)
+            return read_port.data
+
+        @def_method(m, self.write)
+        def _(arg):
+            m.d.comb += write_port.addr.eq(arg.addr)
+            m.d.comb += write_port.data.eq(arg.data)
+            if self.granularity is None:
+                m.d.comb += write_port.en.eq(1)
+            else:
+                m.d.comb += write_port.en.eq(arg.mask)
+
+        return m

From bbfa39cb37b9856039a6e9b09435ebc22b62879b Mon Sep 17 00:00:00 2001
From: Marek Materzok <tilk@tilk.eu>
Date: Wed, 27 Mar 2024 16:42:40 +0100
Subject: [PATCH 04/14] Fix errors

---
 transactron/lib/metrics.py | 4 ++--
 transactron/lib/storage.py | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/transactron/lib/metrics.py b/transactron/lib/metrics.py
index 82f6f0bfa..d52377ba6 100644
--- a/transactron/lib/metrics.py
+++ b/transactron/lib/metrics.py
@@ -543,11 +543,11 @@ def elaborate(self, platform):
 
         @def_method(m, self._start)
         def _(slot):
-            self.slots.write(m, slot, epoch)
+            self.slots.write(m, addr=slot, data=epoch)
 
         @def_method(m, self._stop)
         def _(slot):
-            ret = self.slots.read(m, slot)
+            ret = self.slots.read(m, addr=slot)
             # The result of substracting two unsigned n-bit is a signed (n+1)-bit value,
             # so we need to cast the result and discard the most significant bit.
             duration = (epoch - ret.epoch).as_unsigned()[:-1]
diff --git a/transactron/lib/storage.py b/transactron/lib/storage.py
index a9be66020..3bbf07624 100644
--- a/transactron/lib/storage.py
+++ b/transactron/lib/storage.py
@@ -198,7 +198,6 @@ def elaborate(self, platform) -> TModule:
         @def_method(m, self.read)
         def _(addr):
             m.d.comb += read_port.addr.eq(addr)
-            m.d.comb += read_port.en.eq(1)
             return read_port.data
 
         @def_method(m, self.write)

From 74084719f9ca08ec2afc8e5587537a640a9a018a Mon Sep 17 00:00:00 2001
From: Marek Materzok <tilk@tilk.eu>
Date: Thu, 28 Mar 2024 11:11:53 +0100
Subject: [PATCH 05/14] Documentation

---
 transactron/lib/metrics.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/transactron/lib/metrics.py b/transactron/lib/metrics.py
index d52377ba6..816225765 100644
--- a/transactron/lib/metrics.py
+++ b/transactron/lib/metrics.py
@@ -555,10 +555,9 @@ def _(slot):
 
         return m
 
-    def start(self, m: TModule, slot: ValueLike):
+    def start(self, m: TModule, *, slot: ValueLike):
         """
-        Registers the start of an event. Can be called before the previous events
-        finish. If there are no slots available, the method will be blocked.
+        Registers the start of an event for a given slot index.
 
         Should be called in the body of either a transaction or a method.
 
@@ -566,6 +565,8 @@ def start(self, m: TModule, slot: ValueLike):
         ----------
         m: TModule
             Transactron module
+        slot: ValueLike
+            The slot index of the event.
         """
 
         if not self.metrics_enabled():
@@ -573,10 +574,9 @@ def start(self, m: TModule, slot: ValueLike):
 
         self._start(m, slot)
 
-    def stop(self, m: TModule, slot: ValueLike):
+    def stop(self, m: TModule, *, slot: ValueLike):
         """
-        Registers the end of the oldest event (the FIFO order). If there are no
-        started events in the queue, the method will block.
+        Registers the end of the event for a given slot index.
 
         Should be called in the body of either a transaction or a method.
 
@@ -584,6 +584,8 @@ def stop(self, m: TModule, slot: ValueLike):
         ----------
         m: TModule
             Transactron module
+        slot: ValueLike
+            The slot index of the event.
         """
 
         if not self.metrics_enabled():

From 8363f21f5f6469367d1797e88c075d0bd3e2b241 Mon Sep 17 00:00:00 2001
From: Marek Materzok <tilk@tilk.eu>
Date: Thu, 28 Mar 2024 11:34:23 +0100
Subject: [PATCH 06/14] Test for IndexedLatencyMeasurer

---
 test/transactron/test_metrics.py | 99 ++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)

diff --git a/test/transactron/test_metrics.py b/test/transactron/test_metrics.py
index c52e9aed4..5eaac6066 100644
--- a/test/transactron/test_metrics.py
+++ b/test/transactron/test_metrics.py
@@ -305,6 +305,105 @@ def consumer():
             sim.add_sync_process(ticker)
 
 
+@parameterized_class(
+    ("slots_number", "expected_consumer_wait"),
+    [
+        (2, 5),
+        (2, 10),
+        (5, 10),
+        (10, 1),
+        (10, 10),
+        (5, 5),
+    ],
+)
+class TestIndexedLatencyMeasurer(TestCaseWithSimulator):
+    slots_number: int
+    expected_consumer_wait: float
+
+    def test_latency_measurer(self):
+        random.seed(42)
+
+        m = SimpleTestCircuit(IndexedLatencyMeasurer("latency", slots_number=self.slots_number, max_latency=300))
+        DependencyContext.get().add_dependency(HwMetricsEnabledKey(), True)
+
+        latencies: list[int] = []
+
+        events = list(0 for _ in range(self.slots_number))
+        free_slots = list(k for k in range(self.slots_number))
+        used_slots: list[int] = []
+
+        time = 0
+
+        def ticker():
+            nonlocal time
+
+            yield Passive()
+
+            while True:
+                yield
+                time += 1
+
+        finish = False
+
+        def producer():
+            nonlocal finish
+
+            for _ in range(200):
+                if not free_slots:
+                    yield
+                    continue
+
+                slot_id = random.choice(free_slots)
+                yield from m._start.call(slot=slot_id)
+
+                # Make sure that the time is updated first.
+                yield Settle()
+
+                events[slot_id] = time
+                free_slots.remove(slot_id)
+                used_slots.append(slot_id)
+
+                yield from self.random_wait_geom(0.8)
+
+            finish = True
+
+        def consumer():
+            while not finish:
+                if not used_slots:
+                    yield
+                    continue
+
+                slot_id = random.choice(used_slots)
+
+                yield from m._stop.call(slot=slot_id)
+
+                # Make sure that the time is updated first.
+                yield Settle()
+
+                latencies.append(time - events[slot_id])
+                used_slots.remove(slot_id)
+                free_slots.append(slot_id)
+
+                yield from self.random_wait_geom(1.0 / self.expected_consumer_wait)
+
+            self.assertEqual(min(latencies), (yield m._dut.histogram.min.value))
+            self.assertEqual(max(latencies), (yield m._dut.histogram.max.value))
+            self.assertEqual(sum(latencies), (yield m._dut.histogram.sum.value))
+            self.assertEqual(len(latencies), (yield m._dut.histogram.count.value))
+
+            for i in range(m._dut.histogram.bucket_count):
+                bucket_start = 0 if i == 0 else 2 ** (i - 1)
+                bucket_end = 1e10 if i == m._dut.histogram.bucket_count - 1 else 2**i
+
+                count = sum(1 for x in latencies if bucket_start <= x < bucket_end)
+                self.assertEqual(count, (yield m._dut.histogram.buckets[i].value))
+
+        with self.run_simulation(m) as sim:
+            sim.add_sync_process(producer)
+            sim.add_sync_process(consumer)
+            sim.add_sync_process(ticker)
+
+
 class MetricManagerTestCircuit(Elaboratable):
     def __init__(self):
         self.incr_counters = Method(i=[("counter1", 1), ("counter2", 1), ("counter3", 1)])

From 18119abda9fda8c16f0df557c7c4645d8a9490c8 Mon Sep 17 00:00:00 2001
From: Marek Materzok <tilk@tilk.eu>
Date: Thu, 28 Mar 2024 12:01:46 +0100
Subject: [PATCH 07/14] Test for AsyncMemoryBank

---
 test/transactions/test_transaction_lib.py | 45 +++++++++++++++++++++--
 1 file changed, 41 insertions(+), 4 deletions(-)

diff --git a/test/transactions/test_transaction_lib.py b/test/transactions/test_transaction_lib.py
index c8e758ce7..78119067f 100644
--- a/test/transactions/test_transaction_lib.py
+++ b/test/transactions/test_transaction_lib.py
@@ -142,7 +142,7 @@ def test_mem(self, max_addr, writer_rand, reader_req_rand, reader_resp_rand, see
             MemoryBank(data_layout=[("data", data_width)], elem_count=max_addr, safe_writes=safe_writes)
         )
 
-        data_dict: dict[int, int] = dict((i, 0) for i in range(max_addr))
+        data: list[int] = list(0 for _ in range(max_addr))
         read_req_queue = deque()
         addr_queue = deque()
 
@@ -155,7 +155,7 @@ def writer():
                 yield from m.write.call(data=d, addr=a)
                 for _ in range(2):
                     yield Settle()
-                data_dict[a] = d
+                data[a] = d
                 yield from self.random_wait(writer_rand, min_cycle_cnt=1)
 
         def reader_req():
@@ -165,7 +165,7 @@ def reader_req():
                 for _ in range(1):
                     yield Settle()
                 if safe_writes:
-                    d = data_dict[a]
+                    d = data[a]
                     read_req_queue.append(d)
                 else:
                     addr_queue.append((cycle, a))
@@ -188,7 +188,7 @@ def internal_reader_resp():
                 else:
                     yield
                     continue
-                d = data_dict[a]
+                d = data[a]
                 # check when internal method has been run to capture
                 # memory state for tests purposes
                 if (yield m._dut._internal_read_resp_trans.grant):
@@ -232,6 +232,43 @@ def process():
             sim.add_sync_process(process)
 
 
+class TestAsyncMemoryBank(TestCaseWithSimulator):
+    @parameterized.expand([(9, 3, 3, 14), (16, 1, 1, 15), (16, 1, 1, 16), (12, 3, 1, 17)])
+    def test_mem(self, max_addr, writer_rand, reader_rand, seed):
+        test_count = 200
+
+        data_width = 6
+        m = SimpleTestCircuit(AsyncMemoryBank(data_layout=[("data", data_width)], elem_count=max_addr))
+
+        data: list[int] = list(0 for i in range(max_addr))
+
+        random.seed(seed)
+
+        def writer():
+            for cycle in range(test_count):
+                d = random.randrange(2**data_width)
+                a = random.randrange(max_addr)
+                yield from m.write.call(data=d, addr=a)
+                for _ in range(2):
+                    yield Settle()
+                data[a] = d
+                yield from self.random_wait(writer_rand, min_cycle_cnt=1)
+
+        def reader():
+            for cycle in range(test_count):
+                a = random.randrange(max_addr)
+                d = yield from m.read.call(addr=a)
+                for _ in range(1):
+                    yield Settle()
+                expected_d = data[a]
+                self.assertEqual(d["data"], expected_d)
+                yield from self.random_wait(reader_rand, min_cycle_cnt=1)
+
+        with self.run_simulation(m) as sim:
+            sim.add_sync_process(reader)
+            sim.add_sync_process(writer)
+
+
 class ManyToOneConnectTransTestCircuit(Elaboratable):
     def __init__(self, count: int, lay: MethodLayout):
         self.count = count

From 472adfbb45c3a7f45e08ee0d888d8e6185c6e7e1 Mon Sep 17 00:00:00 2001
From: Marek Materzok <tilk@tilk.eu>
Date: Thu, 28 Mar 2024 17:58:03 +0100
Subject: [PATCH 08/14] Address review comments

---
 coreblocks/core_structs/rf.py          | 7 ++-----
 coreblocks/func_blocks/fu/common/rs.py | 7 ++-----
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/coreblocks/core_structs/rf.py b/coreblocks/core_structs/rf.py
index d6bed0c9f..e8c4e4ef3 100644
--- a/coreblocks/core_structs/rf.py
+++ b/coreblocks/core_structs/rf.py
@@ -1,10 +1,9 @@
-import operator
 from amaranth import *
-from functools import reduce
 from transactron import Method, Transaction, def_method, TModule
 from coreblocks.interface.layouts import RFLayouts
 from coreblocks.params import GenParams
 from transactron.lib.metrics import HwExpHistogram, IndexedLatencyMeasurer
+from transactron.utils.amaranth_ext.functions import popcount
 from transactron.utils.transactron_helpers import make_layout
 
 __all__ = ["RegisterFile"]
@@ -85,9 +84,7 @@ def _(reg_id: Value):
         if self.perf_num_valid.metrics_enabled():
             num_valid = Signal(self.gen_params.phys_regs_bits + 1)
             m.d.comb += num_valid.eq(
-                reduce(
-                    operator.add, (self.entries[reg_id].valid for reg_id in range(2**self.gen_params.phys_regs_bits))
-                )
+                popcount(Cat(self.entries[reg_id].valid for reg_id in range(2**self.gen_params.phys_regs_bits)))
             )
             with Transaction(name="perf").body(m):
                 self.perf_num_valid.add(m, num_valid)
diff --git a/coreblocks/func_blocks/fu/common/rs.py b/coreblocks/func_blocks/fu/common/rs.py
index 3c05c59f6..6c418f226 100644
--- a/coreblocks/func_blocks/fu/common/rs.py
+++ b/coreblocks/func_blocks/fu/common/rs.py
@@ -1,5 +1,3 @@
-import operator
-from functools import reduce
 from collections.abc import Iterable
 from typing import Optional
 from amaranth import *
@@ -10,6 +8,7 @@
 from coreblocks.interface.layouts import RSLayouts
 from transactron.lib.metrics import HwExpHistogram, IndexedLatencyMeasurer
 from transactron.utils import RecordDict
+from transactron.utils.amaranth_ext.functions import popcount
 from transactron.utils.transactron_helpers import make_layout
 
 __all__ = ["RS"]
@@ -130,9 +129,7 @@ def _() -> RecordDict:
 
         if self.perf_num_full.metrics_enabled():
             num_full = Signal(self.rs_entries_bits + 1)
-            m.d.comb += num_full.eq(
-                reduce(operator.add, (self.data[entry_id].rec_full for entry_id in range(self.rs_entries)))
-            )
+            m.d.comb += num_full.eq(popcount(Cat(self.data[entry_id].rec_full for entry_id in range(self.rs_entries))))
             with Transaction(name="perf").body(m):
                 self.perf_num_full.add(m, num_full)
 

From 15cf0fefb9a9a2c0a4eff265f7ff136cfe77c84c Mon Sep 17 00:00:00 2001
From: Marek Materzok <tilk@tilk.eu>
Date: Sun, 31 Mar 2024 16:00:20 +0200
Subject: [PATCH 09/14] Automatic generation of RS numbers

---
 coreblocks/func_blocks/csr/csr.py                 |  1 +
 coreblocks/func_blocks/fu/common/rs_func_block.py |  2 +-
 coreblocks/func_blocks/lsu/dummyLsu.py            |  2 ++
 coreblocks/params/configurations.py               | 13 ++++++++-----
 coreblocks/params/fu_params.py                    |  2 ++
 5 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/coreblocks/func_blocks/csr/csr.py b/coreblocks/func_blocks/csr/csr.py
index 43ddfe957..697de5c63 100644
--- a/coreblocks/func_blocks/csr/csr.py
+++ b/coreblocks/func_blocks/csr/csr.py
@@ -236,6 +236,7 @@ def _(rob_id: Value, side_fx: Value):
         return m
 
 
+@dataclass(frozen=True)
 class CSRBlockComponent(BlockComponentParams):
     def get_module(self, gen_params: GenParams) -> FuncBlock:
         connections = gen_params.get(DependencyManager)
diff --git a/coreblocks/func_blocks/fu/common/rs_func_block.py b/coreblocks/func_blocks/fu/common/rs_func_block.py
index 6345caf6b..35801dc12 100644
--- a/coreblocks/func_blocks/fu/common/rs_func_block.py
+++ b/coreblocks/func_blocks/fu/common/rs_func_block.py
@@ -93,7 +93,7 @@ def elaborate(self, platform):
 class RSBlockComponent(BlockComponentParams):
     func_units: Collection[FunctionalComponentParams]
     rs_entries: int
-    rs_number: int
+    rs_number: int = -1  # overwritten by CoreConfiguration
 
     def get_module(self, gen_params: GenParams) -> FuncBlock:
         modules = list((u.get_module(gen_params), u.get_optypes()) for u in self.func_units)
diff --git a/coreblocks/func_blocks/lsu/dummyLsu.py b/coreblocks/func_blocks/lsu/dummyLsu.py
index ccda62e32..08a5d8604 100644
--- a/coreblocks/func_blocks/lsu/dummyLsu.py
+++ b/coreblocks/func_blocks/lsu/dummyLsu.py
@@ -1,3 +1,4 @@
+from dataclasses import dataclass
 from amaranth import *
 from amaranth.lib.data import View
 
@@ -320,6 +321,7 @@ def _(rob_id: Value, side_fx: Value):
         return m
 
 
+@dataclass(frozen=True)
 class LSUBlockComponent(BlockComponentParams):
     def get_module(self, gen_params: GenParams) -> FuncBlock:
         connections = gen_params.get(DependencyManager)
diff --git a/coreblocks/params/configurations.py b/coreblocks/params/configurations.py
index b7fb53173..c8dd6810c 100644
--- a/coreblocks/params/configurations.py
+++ b/coreblocks/params/configurations.py
@@ -26,7 +26,6 @@
     RSBlockComponent(
         [ALUComponent(), ShiftUnitComponent(), JumpComponent(), ExceptionUnitComponent(), PrivilegedUnitComponent()],
         rs_entries=4,
-        rs_number=0,
     ),
     LSUBlockComponent(),
     CSRBlockComponent(),
@@ -73,6 +72,12 @@ class CoreConfiguration:
         Definitions of PMAs per contiguous segments of memory.
     """
 
+    def __post_init__(self):
+        self.func_units_config = [
+            dataclasses.replace(conf, rs_number=k) if hasattr(conf, "rs_number") else conf
+            for k, conf in enumerate(self.func_units_config)
+        ]
+
     xlen: int = 32
     func_units_config: Collection[BlockComponentParams] = basic_configuration
 
@@ -107,7 +112,7 @@ def replace(self, **kwargs):
 tiny_core_config = CoreConfiguration(
     embedded=True,
     func_units_config=(
-        RSBlockComponent([ALUComponent(), ShiftUnitComponent(), JumpComponent()], rs_entries=2, rs_number=0),
+        RSBlockComponent([ALUComponent(), ShiftUnitComponent(), JumpComponent()], rs_entries=2),
         LSUBlockComponent(),
     ),
     phys_regs_bits=basic_core_config.phys_regs_bits - 1,
@@ -129,7 +134,6 @@ def replace(self, **kwargs):
                 PrivilegedUnitComponent(),
             ],
             rs_entries=4,
-            rs_number=0,
         ),
         RSBlockComponent(
             [
@@ -137,7 +141,6 @@ def replace(self, **kwargs):
                 DivComponent(),
             ],
             rs_entries=2,
-            rs_number=1,
         ),
         LSUBlockComponent(),
         CSRBlockComponent(),
@@ -147,7 +150,7 @@ def replace(self, **kwargs):
 
 # Core configuration used in internal testbenches
 test_core_config = CoreConfiguration(
-    func_units_config=tuple(RSBlockComponent([], rs_entries=4, rs_number=k) for k in range(2)),
+    func_units_config=tuple(RSBlockComponent([], rs_entries=4) for _ in range(2)),
     rob_entries_bits=7,
     phys_regs_bits=7,
     _implied_extensions=Extension.I,
diff --git a/coreblocks/params/fu_params.py b/coreblocks/params/fu_params.py
index 297e9e9fc..4884d7c9f 100644
--- a/coreblocks/params/fu_params.py
+++ b/coreblocks/params/fu_params.py
@@ -1,4 +1,5 @@
 from abc import abstractmethod, ABC
+from dataclasses import dataclass
 from collections.abc import Collection, Iterable
 
 from coreblocks.func_blocks.interface.func_protocols import FuncBlock, FuncUnit
@@ -20,6 +21,7 @@
 ]
 
 
+@dataclass(frozen=True)
 class BlockComponentParams(ABC):
     @abstractmethod
     def get_module(self, gen_params: "GenParams") -> FuncBlock:

From f9313515db9bd68b7334611fd869f7b203adfa45 Mon Sep 17 00:00:00 2001
From: Marek Materzok <tilk@tilk.eu>
Date: Sun, 31 Mar 2024 16:17:26 +0200
Subject: [PATCH 10/14] Use Now(), increase number of tests, fixes

---
 test/transactron/test_metrics.py | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/test/transactron/test_metrics.py b/test/transactron/test_metrics.py
index 5eaac6066..c197c5605 100644
--- a/test/transactron/test_metrics.py
+++ b/test/transactron/test_metrics.py
@@ -9,6 +9,7 @@
 from transactron.lib.metrics import *
 from transactron import *
 from transactron.testing import TestCaseWithSimulator, data_layout, SimpleTestCircuit
+from transactron.testing.infrastructure import Now
 from transactron.utils.dependencies import DependencyContext
 
 
@@ -332,32 +333,21 @@ def test_latency_measurer(self):
         free_slots = list(k for k in range(self.slots_number))
         used_slots: list[int] = []
 
-        time = 0
-
-        def ticker():
-            nonlocal time
-
-            yield Passive()
-
-            while True:
-                yield
-                time += 1
-
         finish = False
 
         def producer():
             nonlocal finish
 
             for _ in range(200):
-                if not free_slots:
+                while not free_slots:
                     yield
                     continue
+                yield Settle()
 
                 slot_id = random.choice(free_slots)
                 yield from m._start.call(slot=slot_id)
 
-                # Make sure that the time is updated first.
-                yield Settle()
+                time = (yield Now())
 
                 events[slot_id] = time
                 free_slots.remove(slot_id)
@@ -369,7 +359,7 @@ def producer():
 
         def consumer():
             while not finish:
-                if not used_slots:
+                while not used_slots:
                     yield
                     continue
 
@@ -377,7 +367,9 @@ def consumer():
 
                 yield from m._stop.call(slot=slot_id)
 
-                # Make sure that the time is updated first.
+                time = (yield Now())
+
+                yield Settle()
                 yield Settle()
 
                 latencies.append(time - events[slot_id])
@@ -401,7 +393,6 @@ def consumer():
         with self.run_simulation(m) as sim:
             sim.add_sync_process(producer)
             sim.add_sync_process(consumer)
-            sim.add_sync_process(ticker)
 
 
 class MetricManagerTestCircuit(Elaboratable):

From 2971e307ae2764e7530d16dd4a0a4cca4bb04579 Mon Sep 17 00:00:00 2001
From: Marek Materzok <tilk@tilk.eu>
Date: Sun, 31 Mar 2024 16:24:43 +0200
Subject: [PATCH 11/14] Use Now() in another test

---
 test/transactron/test_metrics.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/test/transactron/test_metrics.py b/test/transactron/test_metrics.py
index c197c5605..16d1cdf1e 100644
--- a/test/transactron/test_metrics.py
+++ b/test/transactron/test_metrics.py
@@ -252,17 +252,6 @@ def test_latency_measurer(self):
 
         event_queue = queue.Queue()
 
-        time = 0
-
-        def ticker():
-            nonlocal time
-
-            yield Passive()
-
-            while True:
-                yield
-                time += 1
-
         finish = False
 
         def producer():
@@ -273,6 +262,7 @@ def producer():
 
                 # Make sure that the time is updated first.
                 yield Settle()
+                time = (yield Now())
                 event_queue.put(time)
                 yield from self.random_wait_geom(0.8)
 
@@ -284,6 +274,7 @@ def consumer():
 
                 # Make sure that the time is updated first.
                 yield Settle()
+                time = (yield Now())
                 latencies.append(time - event_queue.get())
 
                 yield from self.random_wait_geom(1.0 / self.expected_consumer_wait)
@@ -303,7 +294,6 @@ def consumer():
         with self.run_simulation(m) as sim:
             sim.add_sync_process(producer)
             sim.add_sync_process(consumer)
-            sim.add_sync_process(ticker)
 
 
 @parameterized_class(

From 463f29f94c301758a3f5d56a5167af3316462ac6 Mon Sep 17 00:00:00 2001
From: Marek Materzok <tilk@tilk.eu>
Date: Sun, 31 Mar 2024 17:12:23 +0200
Subject: [PATCH 12/14] LatencyMeasurer test refactor

---
 test/transactron/test_metrics.py | 53 +++++++++++++++-----------------
 1 file changed, 24 insertions(+), 29 deletions(-)

diff --git a/test/transactron/test_metrics.py b/test/transactron/test_metrics.py
index 16d1cdf1e..7005bc41a 100644
--- a/test/transactron/test_metrics.py
+++ b/test/transactron/test_metrics.py
@@ -4,7 +4,7 @@
 from parameterized import parameterized_class
 
 from amaranth import *
-from amaranth.sim import Passive, Settle
+from amaranth.sim import Settle
 
 from transactron.lib.metrics import *
 from transactron import *
@@ -227,6 +227,21 @@ def test_process():
             sim.add_sync_process(test_process)
 
 
+class TestLatencyMeasurerBase(TestCaseWithSimulator):
+    def check_latencies(self, m: SimpleTestCircuit, latencies: list[int]):
+        self.assertEqual(min(latencies), (yield m._dut.histogram.min.value))
+        self.assertEqual(max(latencies), (yield m._dut.histogram.max.value))
+        self.assertEqual(sum(latencies), (yield m._dut.histogram.sum.value))
+        self.assertEqual(len(latencies), (yield m._dut.histogram.count.value))
+
+        for i in range(m._dut.histogram.bucket_count):
+            bucket_start = 0 if i == 0 else 2 ** (i - 1)
+            bucket_end = 1e10 if i == m._dut.histogram.bucket_count - 1 else 2**i
+
+            count = sum(1 for x in latencies if bucket_start <= x < bucket_end)
+            self.assertEqual(count, (yield m._dut.histogram.buckets[i].value))
+
+
 @parameterized_class(
     ("slots_number", "expected_consumer_wait"),
     [
@@ -238,7 +253,7 @@ def test_process():
         (5, 5),
     ],
 )
-class TestFIFOLatencyMeasurer(TestCaseWithSimulator):
+class TestFIFOLatencyMeasurer(TestLatencyMeasurerBase):
     slots_number: int
     expected_consumer_wait: float
 
@@ -262,7 +277,7 @@ def producer():
 
                 # Make sure that the time is updated first.
                 yield Settle()
-                time = (yield Now())
+                time = yield Now()
                 event_queue.put(time)
                 yield from self.random_wait_geom(0.8)
 
@@ -274,22 +289,12 @@ def consumer():
 
                 # Make sure that the time is updated first.
                 yield Settle()
-                time = (yield Now())
+                time = yield Now()
                 latencies.append(time - event_queue.get())
 
                 yield from self.random_wait_geom(1.0 / self.expected_consumer_wait)
 
-            self.assertEqual(min(latencies), (yield m._dut.histogram.min.value))
-            self.assertEqual(max(latencies), (yield m._dut.histogram.max.value))
-            self.assertEqual(sum(latencies), (yield m._dut.histogram.sum.value))
-            self.assertEqual(len(latencies), (yield m._dut.histogram.count.value))
-
-            for i in range(m._dut.histogram.bucket_count):
-                bucket_start = 0 if i == 0 else 2 ** (i - 1)
-                bucket_end = 1e10 if i == m._dut.histogram.bucket_count - 1 else 2**i
-
-                count = sum(1 for x in latencies if bucket_start <= x < bucket_end)
-                self.assertEqual(count, (yield m._dut.histogram.buckets[i].value))
+            yield from self.check_latencies(m, latencies)
 
         with self.run_simulation(m) as sim:
             sim.add_sync_process(producer)
@@ -307,7 +312,7 @@ def consumer():
         (5, 5),
     ],
 )
-class TestIndexedLatencyMeasurer(TestCaseWithSimulator):
+class TestIndexedLatencyMeasurer(TestLatencyMeasurerBase):
     slots_number: int
     expected_consumer_wait: float
 
@@ -337,7 +342,7 @@ def producer():
                 slot_id = random.choice(free_slots)
                 yield from m._start.call(slot=slot_id)
 
-                time = (yield Now())
+                time = yield Now()
 
                 events[slot_id] = time
                 free_slots.remove(slot_id)
@@ -357,7 +362,7 @@ def consumer():
 
                 yield from m._stop.call(slot=slot_id)
 
-                time = (yield Now())
+                time = yield Now()
 
                 yield Settle()
                 yield Settle()
@@ -368,17 +373,7 @@ def consumer():
 
                 yield from self.random_wait_geom(1.0 / self.expected_consumer_wait)
 
-            self.assertEqual(min(latencies), (yield m._dut.histogram.min.value))
-            self.assertEqual(max(latencies), (yield m._dut.histogram.max.value))
-            self.assertEqual(sum(latencies), (yield m._dut.histogram.sum.value))
-            self.assertEqual(len(latencies), (yield m._dut.histogram.count.value))
-
-            for i in range(m._dut.histogram.bucket_count):
-                bucket_start = 0 if i == 0 else 2 ** (i - 1)
-                bucket_end = 1e10 if i == m._dut.histogram.bucket_count - 1 else 2**i
-
-                count = sum(1 for x in latencies if bucket_start <= x < bucket_end)
-                self.assertEqual(count, (yield m._dut.histogram.buckets[i].value))
+            yield from self.check_latencies(m, latencies)
 
         with self.run_simulation(m) as sim:
             sim.add_sync_process(producer)

From 110d596f35e29d6d804ab78dfa9870c303492494 Mon Sep 17 00:00:00 2001
From: Marek Materzok <tilk@tilk.eu>
Date: Mon, 1 Apr 2024 14:30:09 +0200
Subject: [PATCH 13/14] Add assertions to IndexedLatencyMeasurer

---
 transactron/lib/metrics.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/transactron/lib/metrics.py b/transactron/lib/metrics.py
index 816225765..68f7de00e 100644
--- a/transactron/lib/metrics.py
+++ b/transactron/lib/metrics.py
@@ -9,7 +9,7 @@
 from transactron.utils import ValueLike
 from transactron import Method, def_method, TModule
 from transactron.utils import SignalBundle
-from transactron.lib import FIFO, AsyncMemoryBank
+from transactron.lib import FIFO, AsyncMemoryBank, logging
 from transactron.utils.dependencies import ListKey, DependencyContext, SimpleKey
 
 __all__ = [
@@ -524,6 +524,8 @@ def __init__(
             sample_width=bits_for(self.max_latency),
         )
 
+        self.log = logging.HardwareLogger(fully_qualified_name)
+
     def elaborate(self, platform):
         if not self.metrics_enabled():
             return TModule()
@@ -537,16 +539,28 @@ def elaborate(self, platform):
         )
         m.submodules.histogram = self.histogram
 
+        slots_taken = Signal(self.slots_number)
+        slots_taken_start = Signal.like(slots_taken)
+        slots_taken_stop = Signal.like(slots_taken)
+
+        m.d.comb += slots_taken_start.eq(slots_taken)
+        m.d.comb += slots_taken_stop.eq(slots_taken_start)
+        m.d.sync += slots_taken.eq(slots_taken_stop)
+
         epoch = Signal(epoch_width)
 
         m.d.sync += epoch.eq(epoch + 1)
 
         @def_method(m, self._start)
-        def _(slot):
+        def _(slot: Value):
+            m.d.comb += slots_taken_start.eq(slots_taken | (1 << slot))
+            self.log.error(m, (slots_taken & (1 << slot)).any(), "taken slot {} taken again", slot)
             self.slots.write(m, addr=slot, data=epoch)
 
         @def_method(m, self._stop)
-        def _(slot):
+        def _(slot: Value):
+            m.d.comb += slots_taken_stop.eq(slots_taken_start & ~(C(1, self.slots_number) << slot))
+            self.log.error(m, ~(slots_taken & (1 << slot)).any(), "free slot {} freed again", slot)
             ret = self.slots.read(m, addr=slot)
             # The result of substracting two unsigned n-bit is a signed (n+1)-bit value,
             # so we need to cast the result and discard the most significant bit.

From a3189030eff5f8b66ab36ebc88c76c8e1fbab1f9 Mon Sep 17 00:00:00 2001
From: Marek Materzok <tilk@tilk.eu>
Date: Mon, 1 Apr 2024 14:31:31 +0200
Subject: [PATCH 14/14] Change Indexed to Tagged

---
 coreblocks/core_structs/rf.py          |  4 ++--
 coreblocks/func_blocks/fu/common/rs.py |  4 ++--
 test/transactron/test_metrics.py       |  2 +-
 transactron/lib/metrics.py             | 14 +++++++-------
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/coreblocks/core_structs/rf.py b/coreblocks/core_structs/rf.py
index e8c4e4ef3..d6d5e76e8 100644
--- a/coreblocks/core_structs/rf.py
+++ b/coreblocks/core_structs/rf.py
@@ -2,7 +2,7 @@
 from transactron import Method, Transaction, def_method, TModule
 from coreblocks.interface.layouts import RFLayouts
 from coreblocks.params import GenParams
-from transactron.lib.metrics import HwExpHistogram, IndexedLatencyMeasurer
+from transactron.lib.metrics import HwExpHistogram, TaggedLatencyMeasurer
 from transactron.utils.amaranth_ext.functions import popcount
 from transactron.utils.transactron_helpers import make_layout
 
@@ -22,7 +22,7 @@ def __init__(self, *, gen_params: GenParams):
         self.write = Method(i=layouts.rf_write)
         self.free = Method(i=layouts.rf_free)
 
-        self.perf_rf_valid_time = IndexedLatencyMeasurer(
+        self.perf_rf_valid_time = TaggedLatencyMeasurer(
             "struct.rf.valid_time",
             description="Distribution of time registers are valid in RF",
             slots_number=2**gen_params.phys_regs_bits,
diff --git a/coreblocks/func_blocks/fu/common/rs.py b/coreblocks/func_blocks/fu/common/rs.py
index 6c418f226..1911690b4 100644
--- a/coreblocks/func_blocks/fu/common/rs.py
+++ b/coreblocks/func_blocks/fu/common/rs.py
@@ -6,7 +6,7 @@
 from coreblocks.params import GenParams
 from coreblocks.frontend.decoder import OpType
 from coreblocks.interface.layouts import RSLayouts
-from transactron.lib.metrics import HwExpHistogram, IndexedLatencyMeasurer
+from transactron.lib.metrics import HwExpHistogram, TaggedLatencyMeasurer
 from transactron.utils import RecordDict
 from transactron.utils.amaranth_ext.functions import popcount
 from transactron.utils.transactron_helpers import make_layout
@@ -44,7 +44,7 @@ def __init__(
         self.data = Array(Signal(self.internal_layout) for _ in range(self.rs_entries))
         self.data_ready = Signal(self.rs_entries)
 
-        self.perf_rs_wait_time = IndexedLatencyMeasurer(
+        self.perf_rs_wait_time = TaggedLatencyMeasurer(
             f"fu.block_{rs_number}.rs.valid_time",
             description=f"Distribution of time instructions wait in RS {rs_number}",
             slots_number=2**self.rs_entries_bits,
diff --git a/test/transactron/test_metrics.py b/test/transactron/test_metrics.py
index 7005bc41a..6b0e4f738 100644
--- a/test/transactron/test_metrics.py
+++ b/test/transactron/test_metrics.py
@@ -319,7 +319,7 @@ class TestIndexedLatencyMeasurer(TestLatencyMeasurerBase):
     def test_latency_measurer(self):
         random.seed(42)
 
-        m = SimpleTestCircuit(IndexedLatencyMeasurer("latency", slots_number=self.slots_number, max_latency=300))
+        m = SimpleTestCircuit(TaggedLatencyMeasurer("latency", slots_number=self.slots_number, max_latency=300))
         DependencyContext.get().add_dependency(HwMetricsEnabledKey(), True)
 
         latencies: list[int] = []
diff --git a/transactron/lib/metrics.py b/transactron/lib/metrics.py
index 68f7de00e..b7e36a86c 100644
--- a/transactron/lib/metrics.py
+++ b/transactron/lib/metrics.py
@@ -19,7 +19,7 @@
     "HwCounter",
     "HwExpHistogram",
     "FIFOLatencyMeasurer",
-    "IndexedLatencyMeasurer",
+    "TaggedLatencyMeasurer",
     "HardwareMetricsManager",
     "HwMetricsEnabledKey",
 ]
@@ -474,12 +474,12 @@ def metrics_enabled(self) -> bool:
         return DependencyContext.get().get_dependency(HwMetricsEnabledKey())
 
 
-class IndexedLatencyMeasurer(Elaboratable):
+class TaggedLatencyMeasurer(Elaboratable):
     """
     Measures duration between two events, e.g. request processing latency.
     It can track multiple events at the same time, i.e. the second event can
     be registered as started, before the first finishes. However, each event
-    needs to have an unique slot index.
+    needs to have a unique slot tag.
 
     The module exposes an exponential histogram of the measured latencies.
     """
@@ -571,7 +571,7 @@ def _(slot: Value):
 
     def start(self, m: TModule, *, slot: ValueLike):
         """
-        Registers the start of an event for a given slot index.
+        Registers the start of an event for a given slot tag.
 
         Should be called in the body of either a transaction or a method.
 
@@ -580,7 +580,7 @@ def start(self, m: TModule, *, slot: ValueLike):
         m: TModule
             Transactron module
         slot: ValueLike
-            The slot index of the event.
+            The slot tag of the event.
         """
 
         if not self.metrics_enabled():
@@ -590,7 +590,7 @@ def start(self, m: TModule, *, slot: ValueLike):
 
     def stop(self, m: TModule, *, slot: ValueLike):
         """
-        Registers the end of the event for a given slot index.
+        Registers the end of the event for a given slot tag.
 
         Should be called in the body of either a transaction or a method.
 
@@ -599,7 +599,7 @@ def stop(self, m: TModule, *, slot: ValueLike):
         m: TModule
             Transactron module
         slot: ValueLike
-            The slot index of the event.
+            The slot tag of the event.
         """
 
         if not self.metrics_enabled():