diff --git a/coreblocks/cache/icache.py b/coreblocks/cache/icache.py
index 0b60cf37c..08cd51784 100644
--- a/coreblocks/cache/icache.py
+++ b/coreblocks/cache/icache.py
@@ -115,7 +115,7 @@ def __init__(self, layouts: ICacheLayouts, params: ICacheParameters, refiller: C
         self.perf_misses = HwCounter("frontend.icache.misses")
         self.perf_errors = HwCounter("frontend.icache.fetch_errors")
         self.perf_flushes = HwCounter("frontend.icache.flushes")
-        self.req_latency = LatencyMeasurer(
+        self.req_latency = FIFOLatencyMeasurer(
             "frontend.icache.req_latency", "Latencies of cache requests", slots_number=2, max_latency=500
         )
diff --git a/coreblocks/core_structs/rf.py b/coreblocks/core_structs/rf.py
index f7a9b8a7f..d6d5e76e8 100644
--- a/coreblocks/core_structs/rf.py
+++ b/coreblocks/core_structs/rf.py
@@ -1,7 +1,9 @@
 from amaranth import *
-from transactron import Method, def_method, TModule
+from transactron import Method, Transaction, def_method, TModule
 from coreblocks.interface.layouts import RFLayouts
 from coreblocks.params import GenParams
+from transactron.lib.metrics import HwExpHistogram, TaggedLatencyMeasurer
+from transactron.utils.amaranth_ext.functions import popcount
 from transactron.utils.transactron_helpers import make_layout
 
 __all__ = ["RegisterFile"]
@@ -20,9 +22,24 @@ def __init__(self, *, gen_params: GenParams):
         self.write = Method(i=layouts.rf_write)
         self.free = Method(i=layouts.rf_free)
 
+        self.perf_rf_valid_time = TaggedLatencyMeasurer(
+            "struct.rf.valid_time",
+            description="Distribution of time registers are valid in RF",
+            slots_number=2**gen_params.phys_regs_bits,
+            max_latency=1000,
+        )
+        self.perf_num_valid = HwExpHistogram(
+            "struct.rf.num_valid",
+            description="Number of valid registers in RF",
+            bucket_count=gen_params.phys_regs_bits + 1,
+            sample_width=gen_params.phys_regs_bits + 1,
+        )
+
     def elaborate(self, platform):
         m = TModule()
 
+        m.submodules += [self.perf_rf_valid_time, self.perf_num_valid]
+
         being_written = Signal(self.gen_params.phys_regs_bits)
         written_value = Signal(self.gen_params.isa.xlen)
 
@@ -56,10 +73,20 @@ def _(reg_id: Value, reg_val: Value):
             with m.If(~(zero_reg)):
                 m.d.sync += self.entries[reg_id].reg_val.eq(reg_val)
                 m.d.sync += self.entries[reg_id].valid.eq(1)
+                self.perf_rf_valid_time.start(m, slot=reg_id)
 
         @def_method(m, self.free)
         def _(reg_id: Value):
             with m.If(reg_id != 0):
                 m.d.sync += self.entries[reg_id].valid.eq(0)
+                self.perf_rf_valid_time.stop(m, slot=reg_id)
+
+        if self.perf_num_valid.metrics_enabled():
+            num_valid = Signal(self.gen_params.phys_regs_bits + 1)
+            m.d.comb += num_valid.eq(
+                popcount(Cat(self.entries[reg_id].valid for reg_id in range(2**self.gen_params.phys_regs_bits)))
+            )
+            with Transaction(name="perf").body(m):
+                self.perf_num_valid.add(m, num_valid)
 
         return m
diff --git a/coreblocks/core_structs/rob.py b/coreblocks/core_structs/rob.py
index 1f3806d46..25b14bab3 100644
--- a/coreblocks/core_structs/rob.py
+++ b/coreblocks/core_structs/rob.py
@@ -1,5 +1,5 @@
 from amaranth import *
-from transactron import Method, def_method, TModule
+from transactron import Method, Transaction, def_method, TModule
 from transactron.lib.metrics import *
 from coreblocks.interface.layouts import ROBLayouts
 from coreblocks.params import GenParams
@@ -18,17 +18,23 @@ def __init__(self, gen_params: GenParams) -> None:
         self.data = Array(Signal(layouts.internal_layout) for _ in range(2**gen_params.rob_entries_bits))
         self.get_indices = Method(o=layouts.get_indices, nonexclusive=True)
 
-        self.perf_rob_wait_time = LatencyMeasurer(
+        self.perf_rob_wait_time = FIFOLatencyMeasurer(
             "backend.rob.wait_time",
             description="Distribution of time instructions spend in ROB",
             slots_number=(2**gen_params.rob_entries_bits + 1),
             max_latency=1000,
         )
+        self.perf_rob_size = HwExpHistogram(
+            "backend.rob.size",
+            description="Number of instructions in ROB",
+            bucket_count=gen_params.rob_entries_bits + 1,
+            sample_width=gen_params.rob_entries_bits,
+        )
 
     def elaborate(self, platform):
         m = TModule()
 
-        m.submodules += [self.perf_rob_wait_time]
+        m.submodules += [self.perf_rob_wait_time, self.perf_rob_size]
 
         start_idx = Signal(self.params.rob_entries_bits)
         end_idx = Signal(self.params.rob_entries_bits)
@@ -70,4 +76,10 @@ def _(rob_id: Value, exception):
         def _():
             return {"start": start_idx, "end": end_idx}
 
+        if self.perf_rob_size.metrics_enabled():
+            rob_size = Signal(self.params.rob_entries_bits)
+            m.d.comb += rob_size.eq((end_idx - start_idx)[0 : self.params.rob_entries_bits])
+            with Transaction(name="perf").body(m):
+                self.perf_rob_size.add(m, rob_size)
+
         return m
diff --git a/coreblocks/func_blocks/csr/csr.py b/coreblocks/func_blocks/csr/csr.py
index 43ddfe957..697de5c63 100644
--- a/coreblocks/func_blocks/csr/csr.py
+++ b/coreblocks/func_blocks/csr/csr.py
@@ -236,6 +236,7 @@ def _(rob_id: Value, side_fx: Value):
         return m
 
 
+@dataclass(frozen=True)
 class CSRBlockComponent(BlockComponentParams):
     def get_module(self, gen_params: GenParams) -> FuncBlock:
         connections = gen_params.get(DependencyManager)
diff --git a/coreblocks/func_blocks/fu/common/rs.py b/coreblocks/func_blocks/fu/common/rs.py
index 56287df27..1911690b4 100644
--- a/coreblocks/func_blocks/fu/common/rs.py
+++ b/coreblocks/func_blocks/fu/common/rs.py
@@ -2,11 +2,13 @@
 from typing import Optional
 from amaranth import *
 from amaranth.lib.coding import PriorityEncoder
-from transactron import Method, def_method, TModule
+from transactron import Method, Transaction, def_method, TModule
 from coreblocks.params import GenParams
 from coreblocks.frontend.decoder import OpType
 from coreblocks.interface.layouts import RSLayouts
+from transactron.lib.metrics import HwExpHistogram, TaggedLatencyMeasurer
 from transactron.utils import RecordDict
+from transactron.utils.amaranth_ext.functions import popcount
 from transactron.utils.transactron_helpers import make_layout
 
 __all__ = ["RS"]
@@ -14,7 +16,11 @@
 class RS(Elaboratable):
     def __init__(
-        self, gen_params: GenParams, rs_entries: int, ready_for: Optional[Iterable[Iterable[OpType]]] = None
+        self,
+        gen_params: GenParams,
+        rs_entries: int,
+        rs_number: int,
+        ready_for: Optional[Iterable[Iterable[OpType]]] = None,
     ) -> None:
         ready_for = ready_for or ((op for op in OpType),)
         self.gen_params = gen_params
@@ -38,10 +44,24 @@ def __init__(
         self.data = Array(Signal(self.internal_layout) for _ in range(self.rs_entries))
         self.data_ready = Signal(self.rs_entries)
 
+        self.perf_rs_wait_time = TaggedLatencyMeasurer(
+            f"fu.block_{rs_number}.rs.valid_time",
+            description=f"Distribution of time instructions wait in RS {rs_number}",
+            slots_number=2**self.rs_entries_bits,
+            max_latency=1000,
+        )
+        self.perf_num_full = HwExpHistogram(
+            f"fu.block_{rs_number}.rs.num_full",
+            description=f"Number of full entries in RS {rs_number}",
+            bucket_count=self.rs_entries_bits + 1,
+            sample_width=self.rs_entries_bits + 1,
+        )
+
     def elaborate(self, platform):
         m = TModule()
 
         m.submodules.enc_select = PriorityEncoder(width=self.rs_entries)
+        m.submodules += [self.perf_rs_wait_time, self.perf_num_full]
 
         for i, record in enumerate(self.data):
             m.d.comb += self.data_ready[i].eq(
@@ -71,6 +91,7 @@ def _(rs_entry_id: Value, rs_data: Value) -> None:
             m.d.sync += self.data[rs_entry_id].rs_data.eq(rs_data)
             m.d.sync += self.data[rs_entry_id].rec_full.eq(1)
             m.d.sync += self.data[rs_entry_id].rec_reserved.eq(1)
+            self.perf_rs_wait_time.start(m, slot=rs_entry_id)
 
         @def_method(m, self.update)
         def _(reg_id: Value, reg_val: Value) -> None:
@@ -89,6 +110,7 @@ def _(rs_entry_id: Value) -> RecordDict:
             record = self.data[rs_entry_id]
             m.d.sync += record.rec_reserved.eq(0)
             m.d.sync += record.rec_full.eq(0)
+            self.perf_rs_wait_time.stop(m, slot=rs_entry_id)
             return {
                 "s1_val": record.rs_data.s1_val,
                 "s2_val": record.rs_data.s2_val,
@@ -105,4 +127,10 @@ def _(rs_entry_id: Value) -> RecordDict:
         def _() -> RecordDict:
             return {"ready_list": ready_list}
 
+        if self.perf_num_full.metrics_enabled():
+            num_full = Signal(self.rs_entries_bits + 1)
+            m.d.comb += num_full.eq(popcount(Cat(self.data[entry_id].rec_full for entry_id in range(self.rs_entries))))
+            with Transaction(name="perf").body(m):
+                self.perf_num_full.add(m, num_full)
+
         return m
diff --git a/coreblocks/func_blocks/fu/common/rs_func_block.py b/coreblocks/func_blocks/fu/common/rs_func_block.py
index 66fed3d0e..35801dc12 100644
--- a/coreblocks/func_blocks/fu/common/rs_func_block.py
+++ b/coreblocks/func_blocks/fu/common/rs_func_block.py
@@ -31,7 +31,9 @@ class RSFuncBlock(FuncBlock, Elaboratable):
     layout described by `FuncUnitLayouts`.
     """
 
-    def __init__(self, gen_params: GenParams, func_units: Iterable[tuple[FuncUnit, set[OpType]]], rs_entries: int):
+    def __init__(
+        self, gen_params: GenParams, func_units: Iterable[tuple[FuncUnit, set[OpType]]], rs_entries: int, rs_number: int
+    ):
         """
         Parameters
         ----------
@@ -41,10 +43,13 @@ def __init__(self, gen_params: GenParams, func_units: Iterable[tuple[FuncUnit, s
             Functional units to be used by this module.
         rs_entries: int
             Number of entries in RS.
+        rs_number: int
+            The number of this RS block. Used for debugging.
         """
         self.gen_params = gen_params
         self.rs_entries = rs_entries
         self.rs_entries_bits = (rs_entries - 1).bit_length()
+        self.rs_number = rs_number
         self.rs_layouts = gen_params.get(RSLayouts, rs_entries_bits=self.rs_entries_bits)
         self.fu_layouts = gen_params.get(FuncUnitLayouts)
         self.func_units = list(func_units)
@@ -60,6 +65,7 @@ def elaborate(self, platform):
         m.submodules.rs = self.rs = RS(
             gen_params=self.gen_params,
             rs_entries=self.rs_entries,
+            rs_number=self.rs_number,
             ready_for=(optypes for _, optypes in self.func_units),
         )
 
@@ -87,10 +93,13 @@ def elaborate(self, platform):
 class RSBlockComponent(BlockComponentParams):
     func_units: Collection[FunctionalComponentParams]
     rs_entries: int
+    rs_number: int = -1  # overwritten by CoreConfiguration
 
     def get_module(self, gen_params: GenParams) -> FuncBlock:
         modules = list((u.get_module(gen_params), u.get_optypes()) for u in self.func_units)
-        rs_unit = RSFuncBlock(gen_params=gen_params, func_units=modules, rs_entries=self.rs_entries)
+        rs_unit = RSFuncBlock(
+            gen_params=gen_params, func_units=modules, rs_entries=self.rs_entries, rs_number=self.rs_number
+        )
         return rs_unit
 
     def get_optypes(self) -> set[OpType]:
diff --git a/coreblocks/func_blocks/lsu/dummyLsu.py b/coreblocks/func_blocks/lsu/dummyLsu.py
index ccda62e32..08a5d8604 100644
--- a/coreblocks/func_blocks/lsu/dummyLsu.py
+++ b/coreblocks/func_blocks/lsu/dummyLsu.py
@@ -1,3 +1,4 @@
+from dataclasses import dataclass
 from amaranth import *
 from amaranth.lib.data import View
 
@@ -320,6 +321,7 @@ def _(rob_id: Value, side_fx: Value):
         return m
 
 
+@dataclass(frozen=True)
 class LSUBlockComponent(BlockComponentParams):
     def get_module(self, gen_params: GenParams) -> FuncBlock:
         connections = gen_params.get(DependencyManager)
diff --git a/coreblocks/params/configurations.py b/coreblocks/params/configurations.py
index c2d51a1ca..1d17289f5 100644
--- a/coreblocks/params/configurations.py
+++ b/coreblocks/params/configurations.py
@@ -74,6 +74,12 @@ class CoreConfiguration:
         Definitions of PMAs per contiguous segments of memory.
     """
 
+    def __post_init__(self):
+        self.func_units_config = [
+            dataclasses.replace(conf, rs_number=k) if hasattr(conf, "rs_number") else conf
+            for k, conf in enumerate(self.func_units_config)
+        ]
+
     xlen: int = 32
     func_units_config: Collection[BlockComponentParams] = basic_configuration
diff --git a/coreblocks/params/fu_params.py b/coreblocks/params/fu_params.py
index 297e9e9fc..4884d7c9f 100644
--- a/coreblocks/params/fu_params.py
+++ b/coreblocks/params/fu_params.py
@@ -1,4 +1,5 @@
 from abc import abstractmethod, ABC
+from dataclasses import dataclass
 from collections.abc import Collection, Iterable
 
 from coreblocks.func_blocks.interface.func_protocols import FuncBlock, FuncUnit
@@ -20,6 +21,7 @@
 ]
 
 
+@dataclass(frozen=True)
 class BlockComponentParams(ABC):
     @abstractmethod
     def get_module(self, gen_params: "GenParams") -> FuncBlock:
diff --git a/test/regression/cocotb/benchmark.Makefile b/test/regression/cocotb/benchmark.Makefile
index 9962315fb..e49b55b39 100644
--- a/test/regression/cocotb/benchmark.Makefile
+++ b/test/regression/cocotb/benchmark.Makefile
@@ -14,7 +14,7 @@
 SIM_BUILD = build/benchmark
 
 # Yosys/Amaranth borkedness workaround
 ifeq ($(SIM),verilator)
-    EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC
+    EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC -Wno-UNSIGNED
     BUILD_ARGS += -j`nproc`
 endif
diff --git a/test/regression/cocotb/signature.Makefile b/test/regression/cocotb/signature.Makefile
index b4f690635..a03d0a5f8 100644
--- a/test/regression/cocotb/signature.Makefile
+++ b/test/regression/cocotb/signature.Makefile
@@ -14,7 +14,7 @@
 SIM_BUILD = build/signature
 
 # Yosys/Amaranth borkedness workaround
 ifeq ($(SIM),verilator)
-    EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC
+    EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC -Wno-UNSIGNED
     BUILD_ARGS += -j`nproc`
 endif
diff --git a/test/regression/cocotb/test.Makefile b/test/regression/cocotb/test.Makefile
index 210618067..5b9f7aad9 100644
--- a/test/regression/cocotb/test.Makefile
+++ b/test/regression/cocotb/test.Makefile
@@ -14,7 +14,7 @@
 SIM_BUILD = build/test
 
 # Yosys/Amaranth borkedness workaround
 ifeq ($(SIM),verilator)
-    EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC
+    EXTRA_ARGS += -Wno-CASEINCOMPLETE -Wno-CASEOVERLAP -Wno-WIDTHEXPAND -Wno-WIDTHTRUNC -Wno-UNSIGNED
     BUILD_ARGS += -j`nproc`
 endif
diff --git a/test/scheduler/test_scheduler.py b/test/scheduler/test_scheduler.py
index 3c50efab6..2fcf54a50 100644
--- a/test/scheduler/test_scheduler.py
+++ b/test/scheduler/test_scheduler.py
@@ -127,7 +127,7 @@ def setUp(self):
         self.rs_count = len(self.optype_sets)
         self.gen_params = GenParams(
             test_core_config.replace(
-                func_units_config=tuple(RSBlockComponent([], rs_entries=4) for _ in range(self.rs_count))
+                func_units_config=tuple(RSBlockComponent([], rs_entries=4, rs_number=k) for k in range(self.rs_count))
             )
         )
         self.expected_rename_queue = deque()
diff --git a/test/scheduler/test_wakeup_select.py b/test/scheduler/test_wakeup_select.py
index 4ff298da9..3e406e1af 100644
--- a/test/scheduler/test_wakeup_select.py
+++ b/test/scheduler/test_wakeup_select.py
@@ -43,7 +43,9 @@ def elaborate(self, platform):
 class TestWakeupSelect(TestCaseWithSimulator):
     def setUp(self):
         self.gen_params = GenParams(
-            test_core_config.replace(func_units_config=tuple(RSBlockComponent([], rs_entries=16) for _ in range(2)))
+            test_core_config.replace(
+                func_units_config=tuple(RSBlockComponent([], rs_entries=16, rs_number=k) for k in range(2))
+            )
         )
         self.m = WakeupTestCircuit(self.gen_params)
         self.cycles = 50
diff --git a/test/structs_common/test_rs.py b/test/structs_common/test_rs.py
index 4e86a46de..c62852cb0 100644
--- a/test/structs_common/test_rs.py
+++ b/test/structs_common/test_rs.py
@@ -24,7 +24,7 @@ class TestRSMethodInsert(TestCaseWithSimulator):
     def test_insert(self):
         self.gen_params = GenParams(test_core_config)
         self.rs_entries_bits = self.gen_params.max_rs_entries_bits
-        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, None))
+        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, 0, None))
         self.insert_list = [
             {
                 "rs_entry_id": id,
@@ -69,7 +69,7 @@ class TestRSMethodSelect(TestCaseWithSimulator):
     def test_select(self):
         self.gen_params = GenParams(test_core_config)
         self.rs_entries_bits = self.gen_params.max_rs_entries_bits
-        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, None))
+        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, 0, None))
         self.insert_list = [
             {
                 "rs_entry_id": id,
@@ -132,7 +132,7 @@ class TestRSMethodUpdate(TestCaseWithSimulator):
     def test_update(self):
         self.gen_params = GenParams(test_core_config)
         self.rs_entries_bits = self.gen_params.max_rs_entries_bits
-        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, None))
+        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, 0, None))
         self.insert_list = [
             {
                 "rs_entry_id": id,
@@ -223,7 +223,7 @@ class TestRSMethodTake(TestCaseWithSimulator):
     def test_take(self):
         self.gen_params = GenParams(test_core_config)
         self.rs_entries_bits = self.gen_params.max_rs_entries_bits
-        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, None))
+        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, 0, None))
         self.insert_list = [
             {
                 "rs_entry_id": id,
@@ -322,7 +322,7 @@ class TestRSMethodGetReadyList(TestCaseWithSimulator):
     def test_get_ready_list(self):
         self.gen_params = GenParams(test_core_config)
         self.rs_entries_bits = self.gen_params.max_rs_entries_bits
-        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, None))
+        self.m = SimpleTestCircuit(RS(self.gen_params, 2**self.rs_entries_bits, 0, None))
         self.insert_list = [
             {
                 "rs_entry_id": id,
@@ -378,7 +378,7 @@ def test_two_get_ready_lists(self):
         self.rs_entries = self.gen_params.max_rs_entries
         self.rs_entries_bits = self.gen_params.max_rs_entries_bits
         self.m = SimpleTestCircuit(
-            RS(self.gen_params, 2**self.rs_entries_bits, [[OpType(1), OpType(2)], [OpType(3), OpType(4)]])
+            RS(self.gen_params, 2**self.rs_entries_bits, 0, [[OpType(1), OpType(2)], [OpType(3), OpType(4)]])
         )
         self.insert_list = [
             {
diff --git a/test/transactions/test_transaction_lib.py b/test/transactions/test_transaction_lib.py
index c8e758ce7..78119067f 100644
--- a/test/transactions/test_transaction_lib.py
+++ b/test/transactions/test_transaction_lib.py
@@ -142,7 +142,7 @@ def test_mem(self, max_addr, writer_rand, reader_req_rand, reader_resp_rand, see
             MemoryBank(data_layout=[("data", data_width)], elem_count=max_addr, safe_writes=safe_writes)
         )
 
-        data_dict: dict[int, int] = dict((i, 0) for i in range(max_addr))
+        data: list[int] = list(0 for _ in range(max_addr))
         read_req_queue = deque()
         addr_queue = deque()
 
@@ -155,7 +155,7 @@ def writer():
                 yield from m.write.call(data=d, addr=a)
                 for _ in range(2):
                     yield Settle()
-                data_dict[a] = d
+                data[a] = d
                 yield from self.random_wait(writer_rand, min_cycle_cnt=1)
 
         def reader_req():
@@ -165,7 +165,7 @@ def reader_req():
                 for _ in range(1):
                     yield Settle()
                 if safe_writes:
-                    d = data_dict[a]
+                    d = data[a]
                     read_req_queue.append(d)
                 else:
                     addr_queue.append((cycle, a))
@@ -188,7 +188,7 @@ def internal_reader_resp():
                 else:
                     yield
                     continue
-                d = data_dict[a]
+                d = data[a]
                 # check when internal method has been run to capture
                 # memory state for tests purposes
                 if (yield m._dut._internal_read_resp_trans.grant):
@@ -232,6 +232,43 @@ def process():
             sim.add_sync_process(process)
 
 
+class TestAsyncMemoryBank(TestCaseWithSimulator):
+    @parameterized.expand([(9, 3, 3, 14), (16, 1, 1, 15), (16, 1, 1, 16), (12, 3, 1, 17)])
+    def test_mem(self, max_addr, writer_rand, reader_rand, seed):
+        test_count = 200
+
+        data_width = 6
+        m = SimpleTestCircuit(AsyncMemoryBank(data_layout=[("data", data_width)], elem_count=max_addr))
+
+        data: list[int] = list(0 for i in range(max_addr))
+
+        random.seed(seed)
+
+        def writer():
+            for cycle in range(test_count):
+                d = random.randrange(2**data_width)
+                a = random.randrange(max_addr)
+                yield from m.write.call(data=d, addr=a)
+                for _ in range(2):
+                    yield Settle()
+                data[a] = d
+                yield from self.random_wait(writer_rand, min_cycle_cnt=1)
+
+        def reader():
+            for cycle in range(test_count):
+                a = random.randrange(max_addr)
+                d = yield from m.read.call(addr=a)
+                for _ in range(1):
+                    yield Settle()
+                expected_d = data[a]
+                self.assertEqual(d["data"], expected_d)
+                yield from self.random_wait(reader_rand, min_cycle_cnt=1)
+
+        with self.run_simulation(m) as sim:
+            sim.add_sync_process(reader)
+            sim.add_sync_process(writer)
+
+
 class ManyToOneConnectTransTestCircuit(Elaboratable):
     def __init__(self, count: int, lay: MethodLayout):
         self.count = count
diff --git a/test/transactron/test_metrics.py b/test/transactron/test_metrics.py
index 7a91616dd..a8af19af9 100644
--- a/test/transactron/test_metrics.py
+++ b/test/transactron/test_metrics.py
@@ -7,11 +7,12 @@
 from parameterized import parameterized_class
 
 from amaranth import *
-from amaranth.sim import Passive, Settle
+from amaranth.sim import Settle
 
 from transactron.lib.metrics import *
 from transactron import *
 from transactron.testing import TestCaseWithSimulator, data_layout, SimpleTestCircuit
+from transactron.testing.infrastructure import Now
 from transactron.utils.dependencies import DependencyContext
@@ -308,6 +309,21 @@ def test_process():
         sim.add_sync_process(test_process)
 
 
+class TestLatencyMeasurerBase(TestCaseWithSimulator):
+    def check_latencies(self, m: SimpleTestCircuit, latencies: list[int]):
+        self.assertEqual(min(latencies), (yield m._dut.histogram.min.value))
+        self.assertEqual(max(latencies), (yield m._dut.histogram.max.value))
+        self.assertEqual(sum(latencies), (yield m._dut.histogram.sum.value))
+        self.assertEqual(len(latencies), (yield m._dut.histogram.count.value))
+
+        for i in range(m._dut.histogram.bucket_count):
+            bucket_start = 0 if i == 0 else 2 ** (i - 1)
+            bucket_end = 1e10 if i == m._dut.histogram.bucket_count - 1 else 2**i
+
+            count = sum(1 for x in latencies if bucket_start <= x < bucket_end)
+            self.assertEqual(count, (yield m._dut.histogram.buckets[i].value))
+
+
 @parameterized_class(
     ("slots_number", "expected_consumer_wait"),
     [
@@ -319,31 +335,20 @@
         (5, 5),
     ],
 )
-class TestLatencyMeasurer(TestCaseWithSimulator):
+class TestFIFOLatencyMeasurer(TestLatencyMeasurerBase):
     slots_number: int
     expected_consumer_wait: float
 
     def test_latency_measurer(self):
         random.seed(42)
 
-        m = SimpleTestCircuit(LatencyMeasurer("latency", slots_number=self.slots_number, max_latency=300))
+        m = SimpleTestCircuit(FIFOLatencyMeasurer("latency", slots_number=self.slots_number, max_latency=300))
         DependencyContext.get().add_dependency(HwMetricsEnabledKey(), True)
 
         latencies: list[int] = []
         event_queue = queue.Queue()
 
-        time = 0
-
-        def ticker():
-            nonlocal time
-
-            yield Passive()
-
-            while True:
-                yield
-                time += 1
-
         finish = False
 
         def producer():
@@ -354,6 +359,7 @@ def producer():
 
                 # Make sure that the time is updated first.
                 yield Settle()
+                time = yield Now()
                 event_queue.put(time)
                 yield from self.random_wait_geom(0.8)
 
@@ -365,26 +371,95 @@ def consumer():
 
                 # Make sure that the time is updated first.
                 yield Settle()
+                time = yield Now()
                 latencies.append(time - event_queue.get())
 
                 yield from self.random_wait_geom(1.0 / self.expected_consumer_wait)
 
-            self.assertEqual(min(latencies), (yield m._dut.histogram.min.value))
-            self.assertEqual(max(latencies), (yield m._dut.histogram.max.value))
-            self.assertEqual(sum(latencies), (yield m._dut.histogram.sum.value))
-            self.assertEqual(len(latencies), (yield m._dut.histogram.count.value))
+            self.check_latencies(m, latencies)
+
+        with self.run_simulation(m) as sim:
+            sim.add_sync_process(producer)
+            sim.add_sync_process(consumer)
+
+
+@parameterized_class(
+    ("slots_number", "expected_consumer_wait"),
+    [
+        (2, 5),
+        (2, 10),
+        (5, 10),
+        (10, 1),
+        (10, 10),
+        (5, 5),
+    ],
+)
+class TestIndexedLatencyMeasurer(TestLatencyMeasurerBase):
+    slots_number: int
+    expected_consumer_wait: float
+
+    def test_latency_measurer(self):
+        random.seed(42)
+
+        m = SimpleTestCircuit(TaggedLatencyMeasurer("latency", slots_number=self.slots_number, max_latency=300))
+        DependencyContext.get().add_dependency(HwMetricsEnabledKey(), True)
+
+        latencies: list[int] = []
+
+        events = list(0 for _ in range(self.slots_number))
+        free_slots = list(k for k in range(self.slots_number))
+        used_slots: list[int] = []
+
+        finish = False
+
+        def producer():
+            nonlocal finish
+
+            for _ in range(200):
+                while not free_slots:
+                    yield
+                    continue
+                yield Settle()
+
+                slot_id = random.choice(free_slots)
+                yield from m._start.call(slot=slot_id)
+
+                time = yield Now()
+
+                events[slot_id] = time
+                free_slots.remove(slot_id)
+                used_slots.append(slot_id)
 
-            for i in range(m._dut.histogram.bucket_count):
-                bucket_start = 0 if i == 0 else 2 ** (i - 1)
-                bucket_end = 1e10 if i == m._dut.histogram.bucket_count - 1 else 2**i
+                yield from self.random_wait_geom(0.8)
+
+            finish = True
+
+        def consumer():
+            while not finish:
+                while not used_slots:
+                    yield
+                    continue
+
+                slot_id = random.choice(used_slots)
+
+                yield from m._stop.call(slot=slot_id)
+
+                time = yield Now()
+
+                yield Settle()
+                yield Settle()
+
+                latencies.append(time - events[slot_id])
+                used_slots.remove(slot_id)
+                free_slots.append(slot_id)
+
+                yield from self.random_wait_geom(1.0 / self.expected_consumer_wait)
 
-                count = sum(1 for x in latencies if bucket_start <= x < bucket_end)
-                self.assertEqual(count, (yield m._dut.histogram.buckets[i].value))
+            self.check_latencies(m, latencies)
 
         with self.run_simulation(m) as sim:
             sim.add_sync_process(producer)
             sim.add_sync_process(consumer)
-            sim.add_sync_process(ticker)
 
 
 class MetricManagerTestCircuit(Elaboratable):
diff --git a/transactron/lib/metrics.py b/transactron/lib/metrics.py
index f3d5b9e0d..17921e619 100644
--- a/transactron/lib/metrics.py
+++ b/transactron/lib/metrics.py
@@ -9,7 +9,7 @@
 from transactron.utils import ValueLike, OneHotSwitchDynamic, SignalBundle
 from transactron import Method, def_method, TModule
-from transactron.lib import FIFO
+from transactron.lib import FIFO, AsyncMemoryBank, logging
 from transactron.utils.dependencies import ListKey, DependencyContext, SimpleKey
 
 __all__ = [
@@ -19,7 +19,8 @@
     "HwCounter",
     "TaggedCounter",
     "HwExpHistogram",
-    "LatencyMeasurer",
+    "FIFOLatencyMeasurer",
+    "TaggedLatencyMeasurer",
     "HardwareMetricsManager",
     "HwMetricsEnabledKey",
 ]
@@ -476,7 +477,7 @@ def add(self, m: TModule, sample: Value):
         self._add(m, sample)
 
 
-class LatencyMeasurer(Elaboratable):
+class FIFOLatencyMeasurer(Elaboratable):
     """
     Measures duration between two events, e.g. request processing latency.
     It can track multiple events at the same time, i.e. the second event can
@@ -501,7 +502,7 @@ def __init__(
             The fully qualified name of the metric.
         description: str
             A human-readable description of the metric's functionality.
-        slots_number: str
+        slots_number: int
             A number of events that the module can track simultaneously.
         max_latency: int
             The maximum latency of an event. Used to set signal widths and
@@ -595,6 +596,143 @@ def metrics_enabled(self) -> bool:
         return DependencyContext.get().get_dependency(HwMetricsEnabledKey())
 
 
+class TaggedLatencyMeasurer(Elaboratable):
+    """
+    Measures duration between two events, e.g. request processing latency.
+    It can track multiple events at the same time, i.e. the second event can
+    be registered as started, before the first finishes. However, each event
+    needs to have a unique slot tag.
+
+    The module exposes an exponential histogram of the measured latencies.
+    """
+
+    def __init__(
+        self,
+        fully_qualified_name: str,
+        description: str = "",
+        *,
+        slots_number: int,
+        max_latency: int,
+    ):
+        """
+        Parameters
+        ----------
+        fully_qualified_name: str
+            The fully qualified name of the metric.
+        description: str
+            A human-readable description of the metric's functionality.
+        slots_number: int
+            A number of events that the module can track simultaneously.
+        max_latency: int
+            The maximum latency of an event. Used to set signal widths and
+            number of buckets in the histogram. If a latency turns out to be
+            bigger than the maximum, it will overflow and result in a false
+            measurement.
+        """
+        self.fully_qualified_name = fully_qualified_name
+        self.description = description
+        self.slots_number = slots_number
+        self.max_latency = max_latency
+
+        self._start = Method(i=[("slot", range(0, slots_number))])
+        self._stop = Method(i=[("slot", range(0, slots_number))])
+
+        # This bucket count gives us the best possible granularity.
+        bucket_count = bits_for(self.max_latency) + 1
+        self.histogram = HwExpHistogram(
+            self.fully_qualified_name,
+            self.description,
+            bucket_count=bucket_count,
+            sample_width=bits_for(self.max_latency),
+        )
+
+        self.log = logging.HardwareLogger(fully_qualified_name)
+
+    def elaborate(self, platform):
+        if not self.metrics_enabled():
+            return TModule()
+
+        m = TModule()
+
+        epoch_width = bits_for(self.max_latency)
+
+        m.submodules.slots = self.slots = AsyncMemoryBank(
+            data_layout=[("epoch", epoch_width)], elem_count=self.slots_number
+        )
+        m.submodules.histogram = self.histogram
+
+        slots_taken = Signal(self.slots_number)
+        slots_taken_start = Signal.like(slots_taken)
+        slots_taken_stop = Signal.like(slots_taken)
+
+        m.d.comb += slots_taken_start.eq(slots_taken)
+        m.d.comb += slots_taken_stop.eq(slots_taken_start)
+        m.d.sync += slots_taken.eq(slots_taken_stop)
+
+        epoch = Signal(epoch_width)
+
+        m.d.sync += epoch.eq(epoch + 1)
+
+        @def_method(m, self._start)
+        def _(slot: Value):
+            m.d.comb += slots_taken_start.eq(slots_taken | (1 << slot))
+            self.log.error(m, (slots_taken & (1 << slot)).any(), "taken slot {} taken again", slot)
+            self.slots.write(m, addr=slot, data=epoch)
+
+        @def_method(m, self._stop)
+        def _(slot: Value):
+            m.d.comb += slots_taken_stop.eq(slots_taken_start & ~(C(1, self.slots_number) << slot))
+            self.log.error(m, ~(slots_taken & (1 << slot)).any(), "free slot {} freed again", slot)
+            ret = self.slots.read(m, addr=slot)
+            # The result of subtracting two unsigned n-bit values is a signed (n+1)-bit value,
+            # so we need to cast the result and discard the most significant bit.
+            duration = (epoch - ret.epoch).as_unsigned()[:-1]
+            self.histogram.add(m, duration)
+
+        return m
+
+    def start(self, m: TModule, *, slot: ValueLike):
+        """
+        Registers the start of an event for a given slot tag.
+
+        Should be called in the body of either a transaction or a method.
+
+        Parameters
+        ----------
+        m: TModule
+            Transactron module
+        slot: ValueLike
+            The slot tag of the event.
+        """
+
+        if not self.metrics_enabled():
+            return
+
+        self._start(m, slot)
+
+    def stop(self, m: TModule, *, slot: ValueLike):
+        """
+        Registers the end of the event for a given slot tag.
+
+        Should be called in the body of either a transaction or a method.
+
+        Parameters
+        ----------
+        m: TModule
+            Transactron module
+        slot: ValueLike
+            The slot tag of the event.
+        """
+
+        if not self.metrics_enabled():
+            return
+
+        self._stop(m, slot)
+
+    def metrics_enabled(self) -> bool:
+        return DependencyContext.get().get_dependency(HwMetricsEnabledKey())
+
+
 class HardwareMetricsManager:
     """
     Collects all metrics registered in the circuit and provides an easy
diff --git a/transactron/lib/storage.py b/transactron/lib/storage.py
index e6d3e5cf5..3bbf07624 100644
--- a/transactron/lib/storage.py
+++ b/transactron/lib/storage.py
@@ -8,7 +8,7 @@
 from transactron.utils import assign, AssignType, LayoutList
 from .reqres import ArgumentsToResultsZipper
 
-__all__ = ["MemoryBank"]
+__all__ = ["MemoryBank", "AsyncMemoryBank"]
 
 
 class MemoryBank(Elaboratable):
@@ -136,3 +136,77 @@ def _(arg):
             m.d.comb += assign(write_args, arg, fields=AssignType.ALL)
 
         return m
+
+
+class AsyncMemoryBank(Elaboratable):
+    """AsyncMemoryBank module.
+
+    Provides a transactional interface to asynchronous Amaranth Memory with one
+    read and one write port. It optionally supports writing with a given granularity.
+
+    Attributes
+    ----------
+    read: Method
+        The read method. Accepts an `addr` from which data should be read and returns
+        the `data_layout` view stored at that address.
+    write: Method
+        The write method. Accepts `addr` where data should be saved, `data` in the form of `data_layout`
+        and optionally `mask` if `granularity` is not None. `1` in mask means that the appropriate part
+        should be written.
+    """
+
+    def __init__(
+        self, *, data_layout: LayoutList, elem_count: int, granularity: Optional[int] = None, src_loc: int | SrcLoc = 0
+    ):
+        """
+        Parameters
+        ----------
+        data_layout: method layout
+            The format of structures stored in the Memory.
+        elem_count: int
+            Number of elements stored in Memory.
+        granularity: Optional[int]
+            Granularity of write, forwarded to Amaranth. If `None` the whole structure is always saved at once.
+            If not, the width of `data_layout` is split into `granularity` parts, which can be saved independently.
+        src_loc: int | SrcLoc
+            How many stack frames deep the source location is taken from.
+            Alternatively, the source location to use instead of the default.
+        """
+        self.src_loc = get_src_loc(src_loc)
+        self.data_layout = make_layout(*data_layout)
+        self.elem_count = elem_count
+        self.granularity = granularity
+        self.width = from_method_layout(self.data_layout).size
+        self.addr_width = bits_for(self.elem_count - 1)
+
+        self.read_req_layout: LayoutList = [("addr", self.addr_width)]
+        write_layout = [("addr", self.addr_width), ("data", self.data_layout)]
+        if self.granularity is not None:
+            write_layout.append(("mask", self.width // self.granularity))
+        self.write_layout = make_layout(*write_layout)
+
+        self.read = Method(i=self.read_req_layout, o=self.data_layout, src_loc=self.src_loc)
+        self.write = Method(i=self.write_layout, src_loc=self.src_loc)
+
+    def elaborate(self, platform) -> TModule:
+        m = TModule()
+
+        mem = Memory(width=self.width, depth=self.elem_count)
+        m.submodules.read_port = read_port = mem.read_port(domain="comb")
+        m.submodules.write_port = write_port = mem.write_port()
+
+        @def_method(m, self.read)
+        def _(addr):
+            m.d.comb += read_port.addr.eq(addr)
+            return read_port.data
+
+        @def_method(m, self.write)
+        def _(arg):
+            m.d.comb += write_port.addr.eq(arg.addr)
+            m.d.comb += write_port.data.eq(arg.data)
+            if self.granularity is None:
+                m.d.comb += write_port.en.eq(1)
+            else:
+                m.d.comb += write_port.en.eq(arg.mask)
+
+        return m
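Usage note (not part of the patch): a minimal sketch of how the new AsyncMemoryBank is exercised, modeled directly on the TestAsyncMemoryBank test added above. The test-class and process names below are illustrative only; it assumes the same transactron.testing harness the other tests in this diff use.

from transactron.lib import AsyncMemoryBank
from transactron.testing import TestCaseWithSimulator, SimpleTestCircuit


class AsyncMemoryBankUsageExample(TestCaseWithSimulator):
    def test_write_then_read(self):
        # A 16-entry bank holding a single 8-bit "data" field.
        m = SimpleTestCircuit(AsyncMemoryBank(data_layout=[("data", 8)], elem_count=16))

        def process():
            # The write port is synchronous, so the value becomes visible to the
            # combinational read port from the following cycle onwards.
            yield from m.write.call(data=0x5A, addr=3)
            res = yield from m.read.call(addr=3)
            self.assertEqual(res["data"], 0x5A)

        with self.run_simulation(m) as sim:
            sim.add_sync_process(process)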