Merge pull request #157 from OpenXiangShan/ruby-pf

Ruby pf
OpenXiangShan · Aug 15, 2024 · 8f7a0f5 · 8f7a0f5
2 parents 48c5acc + 7e50c7a
commit 8f7a0f5
Show file tree

Hide file tree

Showing 59 changed files with 891 additions and 338 deletions.
diff --git a/configs/common/CacheConfig.py b/configs/common/CacheConfig.py
@@ -170,17 +170,14 @@ def config_cache(options, system):
             system.tol3bus = L2XBar(clk_domain=system.cpu_clk_domain, width=256)
             system.l3.cpu_side = system.tol3bus.mem_side_ports
             system.l3.mem_side = system.membus.cpu_side_ports
-            system.l3.max_cache_level = 3
 
         for i in range(options.num_cpus):
             if options.l3cache:
                 # l2 -> tol3bus -> l3
                 system.l2_caches[i].mem_side = system.tol3bus.cpu_side_ports
                 # l3 -> membus
-                system.l2_caches[i].max_cache_level = 3
             else:
                 system.l2_caches[i].mem_side = system.membus.cpu_side_ports
-                system.l2_caches[i].max_cache_level = 2
 
     if options.memchecker:
         system.memchecker = MemChecker()
@@ -189,14 +186,7 @@ def config_cache(options, system):
         if options.caches:
             icache = icache_class(**_get_cache_opts('l1i', options))
             dcache = dcache_class(**_get_cache_opts('l1d', options))
-            if options.l2cache:
-                icache.max_cache_level = 2
-                dcache.max_cache_level = 2
-            if options.l3cache:
-                icache.max_cache_level = 3
-                dcache.max_cache_level = 3
             if dcache.prefetcher != NULL:
-                print("Add dtb for L1D prefetcher")
                 dcache.prefetcher.registerTLB(system.cpu[i].mmu.dtb)
                 if options.l1d_hwp_type == 'XSCompositePrefetcher':
                     if options.l1d_enable_spp:
@@ -233,15 +223,13 @@ def config_cache(options, system):
                 dcache.prefetcher.add_pf_downstream(system.l2_caches[i].prefetcher)
                 system.l2_caches[i].prefetcher.queue_size = 64
                 system.l2_caches[i].prefetcher.max_prefetch_requests_with_pending_translation = 128
-                print("Add L2 prefetcher {} as downstream of L1D prefetcher {}".format(i, i))
 
             if options.l3cache and options.l2_to_l3_pf_hint:
                 assert system.l2_caches[i].prefetcher != NULL and \
                     system.l3.prefetcher != NULL
                 system.l2_caches[i].prefetcher.add_pf_downstream(system.l3.prefetcher)
                 system.l3.prefetcher.queue_size = 64
                 system.l3.prefetcher.max_prefetch_requests_with_pending_translation = 128
-                print("Add L3 prefetcher as downstream of L2 prefetcher {}".format(i))
 
             # If we have a walker cache specified, instantiate two
             # instances here

diff --git a/configs/common/Caches.py b/configs/common/Caches.py
@@ -89,8 +89,6 @@ class L2Cache(Cache):
     mshrs = 64
     tgts_per_mshr = 20
     clusivity='mostly_incl'
-    prefetch_on_access = True
-    #prefetch_on_access = False
     # always writeback clean when lower level is exclusive
     writeback_clean = True
 
@@ -110,7 +108,6 @@ class L3Cache(Cache):
     tgts_per_mshr = 20
     clusivity='mostly_excl'
     writeback_clean = False
-    prefetch_on_access = True
 
     # aligned latency:
     tag_latency = 2

diff --git a/configs/common/PrefetcherConfig.py b/configs/common/PrefetcherConfig.py
@@ -0,0 +1,52 @@
+import m5
+from m5.objects import *
+from common.Caches import *
+from common import ObjectList
+
+
+def _get_hwp(hwp_option):
+    """Return NULL for None, else a new instance of the hwp_list entry."""
+    if hwp_option is None:
+        return NULL
+    hwp_cls = ObjectList.hwp_list.get(hwp_option)
+    return hwp_cls()
+
+def create_prefetcher(cpu, cache_level, options):
+    """Build the prefetcher named by ``options.<cache_level>_hwp_type``.
+
+    Returns NULL if that option is missing or yields no prefetcher. When
+    ``cpu`` is not NULL, ``cpu.mmu.dtb`` is registered on the prefetcher.
+    """
+    hwp_attr = f'{cache_level}_hwp_type'
+    if not hasattr(options, hwp_attr):
+        return NULL
+    hwp_name = getattr(options, hwp_attr)
+    hwp = _get_hwp(hwp_name)
+    print(f"create_prefetcher at {cache_level}: {hwp_name}")
+    if hwp == NULL:
+        return NULL
+
+    if cpu != NULL:
+        hwp.registerTLB(cpu.mmu.dtb)
+    hwp.queue_size = 64
+    if hwp_name == 'XSCompositePrefetcher':
+        # NOTE: the l1d_* switches below are read for every cache_level.
+        if options.l1d_enable_spp:
+            hwp.enable_spp = True
+        if options.l1d_enable_cplx:
+            hwp.enable_cplx = True
+        hwp.pht_pf_level = options.pht_pf_level
+        hwp.short_stride_thres = options.short_stride_thres
+        hwp.fuzzy_stride_matching = False
+        hwp.stream_pf_ahead = True
+        hwp.bop_large.delay_queue_enable = True
+        hwp.bop_large.bad_score = 10
+        hwp.bop_small.delay_queue_enable = True
+        hwp.bop_small.bad_score = 5
+        hwp.queue_size = 128  # replaces the generic 64 set above
+        hwp.max_prefetch_requests_with_pending_translation = 128
+        hwp.region_size = 64*16  # 64B * blocks per region
+        hwp.berti.use_byte_addr = True
+        hwp.berti.aggressive_pf = False
+        hwp.berti.trigger_pht = True
+    return hwp
diff --git a/configs/common/cores/arm/O3_ARM_v7a.py b/configs/common/cores/arm/O3_ARM_v7a.py
@@ -179,7 +179,6 @@ class O3_ARM_v7aL2(Cache):
     size = '1MB'
     assoc = 16
     write_buffers = 8
-    prefetch_on_access = True
     clusivity = 'mostly_excl'
     # Simple stride prefetcher
     prefetcher = StridePrefetcher(degree=8, latency = 1)

diff --git a/configs/common/cores/arm/ex5_LITTLE.py b/configs/common/cores/arm/ex5_LITTLE.py
@@ -122,7 +122,6 @@ class L2(Cache):
     size = '512kB'
     assoc = 8
     write_buffers = 16
-    prefetch_on_access = True
     clusivity = 'mostly_excl'
     # Simple stride prefetcher
     prefetcher = StridePrefetcher(degree=1, latency = 1)

diff --git a/configs/common/cores/arm/ex5_big.py b/configs/common/cores/arm/ex5_big.py
@@ -174,7 +174,6 @@ class L2(Cache):
     size = '2MB'
     assoc = 16
     write_buffers = 8
-    prefetch_on_access = True
     clusivity = 'mostly_excl'
     # Simple stride prefetcher
     prefetcher = StridePrefetcher(degree=8, latency = 1)

diff --git a/configs/ruby/CHI.py b/configs/ruby/CHI.py
@@ -111,25 +111,29 @@ def create_system(
     # dataAccessLatency may be set to 0 if one wants to consider parallel
     # data and tag lookups
     class L1ICache(RubyCache):
+        level = 1
         dataAccessLatency = 1
         tagAccessLatency = 1
         size = options.l1i_size
         assoc = options.l1i_assoc
 
     class L1DCache(RubyCache):
+        level = 1
         dataAccessLatency = 0
         tagAccessLatency = 1
         size = options.l1d_size
         assoc = options.l1d_assoc
 
     class L2Cache(RubyCache):
-        dataAccessLatency = 6
+        level = 2
+        dataAccessLatency = 13
         tagAccessLatency = 2
         size = options.l2_size
         assoc = options.l2_assoc
 
     class HNFCache(RubyCache):
-        dataAccessLatency = 10
+        level = 3
+        dataAccessLatency = 17
         tagAccessLatency = 2
         size = options.l3_size
         assoc = options.l3_assoc
@@ -154,11 +158,13 @@ class HNFCache(RubyCache):
             L1ICache,
             L1DCache,
             system.cache_line_size.value,
+            options
         )
         for cpu in cpus
     ]
+
     for rnf in ruby_system.rnf:
-        rnf.addPrivL2Cache(L2Cache)
+        rnf.addPrivL2Cache(L2Cache, options)
         cpu_sequencers.extend(rnf.getSequencers())
         all_cntrls.extend(rnf.getAllControllers())
         network_nodes.append(rnf)
@@ -191,10 +197,17 @@ class HNFCache(RubyCache):
     hnf_list = [i for i in range(options.num_l3caches)]
     CHI_HNF.createAddrRanges(sysranges, system.cache_line_size.value, hnf_list)
     ruby_system.hnf = [
-        CHI_HNF(i, ruby_system, HNFCache, None)
+        CHI_HNF(i, ruby_system, HNFCache, options, None)
         for i in range(options.num_l3caches)
     ]
 
+    if options.l2_to_l3_pf_hint:
+        if len(ruby_system.hnf) > 1:
+            Warning("L2 to L3 prefetch hint is not supported with multiple HNFs")
+        else:
+            for rnf in ruby_system.rnf:
+                rnf.addLLCPrefetcherDownstream(ruby_system.hnf[0].getPrefetcher())
+
     for hnf in ruby_system.hnf:
         network_nodes.append(hnf)
         network_cntrls.extend(hnf.getNetworkSideControllers())