diff --git a/WIP.md b/WIP.md index abeeb975..c8567d1c 100644 --- a/WIP.md +++ b/WIP.md @@ -2,22 +2,38 @@ - today: - - test flush caches using config options - - less important: + - todos - - perf: investigate if the many small allocations of msg for move in / move warp etc are problematic - - perf: investigate the performance overhead for finding the allocation ids + - use gpu_mem_alloc for the allocations but still allow smart comparison with play, whose traces do not include allocations + + - refactor + + - join core and inner core + - flatten ported submodule + - lint + - factor into multiple files + - some minor todos + - remove dead code + - instantiate the entire GPU in one file to find a good API + - factor out traits + + - generate plots and correlation stuff etc + + - less important: - - fix: investigate lockstep performance and see if we can reduce allocations? - fix: remove global statics to allow running tests in parallel - parse accelsim config files + - with defaults for compatibility + - test flush caches using config options + - perf: investigate if the many small allocations of msg for move in / move warp etc are problematic + - perf: investigate the performance overhead for finding the allocation ids + - perf: investigate lockstep performance and see if we can reduce allocations? + - allow basic configurations for the playground bridge - - - FIX: add l2 set index back in - - generate plots and correlation stuff etc - DONE: multiple memories - DONE: lockstep with multiple cores and clusters diff --git a/accelsim/src/stats.rs b/accelsim/src/stats.rs index 426ca526..1ce0f0a1 100644 --- a/accelsim/src/stats.rs +++ b/accelsim/src/stats.rs @@ -170,6 +170,7 @@ impl TryFrom for stats::Stats { l1c_stats: stats::PerCache::default(), l1d_stats: stats::PerCache::default(), l2d_stats, + stall_dram_full: 0, // todo }) } } diff --git a/accelsim/src/tracegen/reader.rs b/accelsim/src/tracegen/reader.rs index 4bbe84ed..168b889b 100644 --- a/accelsim/src/tracegen/reader.rs +++ b/accelsim/src/tracegen/reader.rs @@ -332,8 +332,6 @@ pub fn parse_trace_instruction( // parse addresses if mem_width > 0 { - // let width = super::get_data_width_from_opcode(&opcode)?; - let address_format: usize = parse_decimal(values.pop_front(), "mem address format")?; let address_format = AddressFormat::from_repr(address_format) .ok_or_else(|| eyre::eyre!("unknown mem address format: {:?}", address_format))?; diff --git a/benches/vectoradd.rs b/benches/vectoradd.rs index 2db45bcb..3680c965 100644 --- a/benches/vectoradd.rs +++ b/benches/vectoradd.rs @@ -1,3 +1,5 @@ +#![allow(clippy::missing_errors_doc, clippy::missing_panics_doc)] + use color_eyre::eyre; use criterion::{black_box, Criterion}; use validate::materialize::{BenchmarkConfig, Benchmarks}; @@ -21,8 +23,8 @@ fn get_bench_config(benchmark_name: &str, input_idx: usize) -> eyre::Result eyre::Result<()> { - let _stats = validate::simulate::simulate_bench_config(&bench_config)?; +pub fn run_box(bench_config: &BenchmarkConfig) -> eyre::Result<()> { + let _stats = validate::simulate::simulate_bench_config(bench_config)?; Ok(()) } @@ -31,8 +33,8 @@ pub async fn run_accelsim(bench_config: BenchmarkConfig) -> eyre::Result<()> { Ok(()) } -pub fn run_playground(bench_config: BenchmarkConfig) -> eyre::Result<()> { - let _stats = validate::playground::simulate_bench_config(&bench_config); +pub fn run_playground(bench_config: &BenchmarkConfig) -> eyre::Result<()> { + let _stats = validate::playground::simulate_bench_config(bench_config); Ok(()) } @@ -48,7 +50,7 @@
pub fn accelsim_benchmark(c: &mut Criterion) { group.bench_function("vectoradd/10000", |b| { b.to_async(&runtime) - .iter(|| run_accelsim(black_box(get_bench_config("vectorAdd", 2).unwrap()))) + .iter(|| run_accelsim(black_box(get_bench_config("vectorAdd", 2).unwrap()))); }); // group.bench_function("transpose/256/naive", |b| { // b.iter(|| run_accelsim(black_box(get_bench_config("transpose", 0).unwrap()))) @@ -61,7 +63,7 @@ pub fn play_benchmark(c: &mut Criterion) { group.sampling_mode(criterion::SamplingMode::Flat); group.bench_function("vectoradd/10000", |b| { - b.iter(|| run_playground(black_box(get_bench_config("vectorAdd", 2).unwrap()))) + b.iter(|| run_playground(&black_box(get_bench_config("vectorAdd", 2).unwrap()))); }); // group.bench_function("transpose/256/naive", |b| { // b.iter(|| run_playground(black_box(get_bench_config("transpose", 0).unwrap()))) @@ -74,7 +76,7 @@ pub fn box_benchmark(c: &mut Criterion) { group.sampling_mode(criterion::SamplingMode::Flat); group.bench_function("vectoradd/10000", |b| { - b.iter(|| run_box(black_box(get_bench_config("vectorAdd", 2).unwrap()))) + b.iter(|| run_box(&black_box(get_bench_config("vectorAdd", 2).unwrap()))); }); // group.bench_function("transpose/256/naive", |b| { // b.iter(|| run_box(black_box(get_bench_config("transpose", 0).unwrap()))) @@ -82,30 +84,29 @@ pub fn box_benchmark(c: &mut Criterion) { } criterion::criterion_group!(benches, box_benchmark, play_benchmark, accelsim_benchmark); -criterion::criterion_main!(benches); +// criterion::criterion_main!(benches); #[allow(dead_code)] -fn custom() -> eyre::Result<()> { +fn main() -> eyre::Result<()> { use std::time::Instant; let runtime = tokio::runtime::Builder::new_multi_thread() .enable_all() - .build() - .expect("build tokio runtime"); + .build()?; let mut start = Instant::now(); - let _ = run_box(black_box(get_bench_config("transpose", 0)?)); + let _ = run_box(&black_box(get_bench_config("transpose", 0)?)); println!("box took:\t\t{:?}", start.elapsed()); start = Instant::now(); - let _ = run_playground(black_box(get_bench_config("transpose", 0)?)); + let _ = run_playground(&black_box(get_bench_config("transpose", 0)?)); println!("play took:\t\t{:?}", start.elapsed()); start = Instant::now(); - let _ = runtime.block_on(async { - let _ = run_accelsim(black_box(get_bench_config("transpose", 0)?)).await?; + runtime.block_on(async { + run_accelsim(black_box(get_bench_config("transpose", 0)?)).await?; Ok::<(), eyre::Report>(()) - }); + })?; println!("accel took:\t\t{:?}", start.elapsed()); Ok(()) diff --git a/examples/pycachesim.rs b/examples/pycachesim.rs index d2b70829..0a489ef8 100644 --- a/examples/pycachesim.rs +++ b/examples/pycachesim.rs @@ -1,57 +1,58 @@ -#![allow(warnings)] - -use casimu::{cache::LRU, Cache, CacheConfig, MainMemory, Simulation}; -use std::sync::Arc; - -const CACHELINE_SIZE: usize = 64; - -fn main() { - let mut mem = MainMemory::new(); - let l3 = Arc::new(Cache::new(CacheConfig { - name: "L3".to_string(), - sets: 20480, - ways: 16, - line_size: CACHELINE_SIZE, - replacement_policy: LRU {}, - write_back: true, - write_allocate: true, - store_to: None, - load_from: None, - victims_to: None, - swap_on_load: false, - })); - mem.set_load_to(l3.clone()); - mem.set_store_from(l3.clone()); - - let l2 = Arc::new(Cache::new(CacheConfig { - name: "L2".to_string(), - sets: 512, - ways: 8, - line_size: CACHELINE_SIZE, - replacement_policy: LRU {}, - write_back: true, - write_allocate: true, - store_to: Some(l3.clone()), - load_from: Some(l3), - victims_to: None, - 
swap_on_load: false, - })); - let l1 = Arc::new(Cache::new(CacheConfig { - name: "L1".to_string(), - sets: 64, - ways: 8, - line_size: CACHELINE_SIZE, - replacement_policy: LRU {}, - write_back: true, - write_allocate: true, - store_to: Some(l2.clone()), - load_from: Some(l2), - victims_to: None, - swap_on_load: false, // incl/excl does not matter in first level - })); - - // let mut sim = Simulation::new(l1.clone(), mem); - // sim.load(23) - // cv = CacheVisualizer(cs, [10, 16]) - // sim.dump_state() -} +// #![allow(warnings)] +// +// use casimu::{cache::LRU, Cache, CacheConfig, MainMemory, Simulation}; +// use std::sync::Arc; +// +// const CACHELINE_SIZE: usize = 64; +// +// fn main() { +// let mut mem = MainMemory::new(); +// let l3 = Arc::new(Cache::new(CacheConfig { +// name: "L3".to_string(), +// sets: 20480, +// ways: 16, +// line_size: CACHELINE_SIZE, +// replacement_policy: LRU {}, +// write_back: true, +// write_allocate: true, +// store_to: None, +// load_from: None, +// victims_to: None, +// swap_on_load: false, +// })); +// mem.set_load_to(l3.clone()); +// mem.set_store_from(l3.clone()); +// +// let l2 = Arc::new(Cache::new(CacheConfig { +// name: "L2".to_string(), +// sets: 512, +// ways: 8, +// line_size: CACHELINE_SIZE, +// replacement_policy: LRU {}, +// write_back: true, +// write_allocate: true, +// store_to: Some(l3.clone()), +// load_from: Some(l3), +// victims_to: None, +// swap_on_load: false, +// })); +// let l1 = Arc::new(Cache::new(CacheConfig { +// name: "L1".to_string(), +// sets: 64, +// ways: 8, +// line_size: CACHELINE_SIZE, +// replacement_policy: LRU {}, +// write_back: true, +// write_allocate: true, +// store_to: Some(l2.clone()), +// load_from: Some(l2), +// victims_to: None, +// swap_on_load: false, // incl/excl does not matter in first level +// })); +// +// // let mut sim = Simulation::new(l1.clone(), mem); +// // sim.load(23) +// // cv = CacheVisualizer(cs, [10, 16]) +// // sim.dump_state() +// } +fn main() {} diff --git a/examples/vectoradd.rs b/examples/vectoradd.rs index f27007a7..bf3f916c 100644 --- a/examples/vectoradd.rs +++ b/examples/vectoradd.rs @@ -3,92 +3,92 @@ #![allow(clippy::cast_sign_loss)] use color_eyre::eyre; -use num_traits::{Float, NumCast, Zero}; - -#[derive(Debug)] -struct VecAdd<'s, 'a, T> { - d_a: &'a mut casimu::DevicePtr<'s, 'a, Vec>, - d_b: &'a mut casimu::DevicePtr<'s, 'a, Vec>, - d_c: &'a mut casimu::DevicePtr<'s, 'a, Vec>, - n: usize, -} - -impl<'s, 'a, T> casimu::Kernel for VecAdd<'s, 'a, T> -where - T: Float + std::fmt::Debug, -{ - type Error = std::convert::Infallible; - - fn run(&mut self, idx: &casimu::ThreadIndex) -> Result<(), Self::Error> { - // Get our global thread ID - // int id = blockIdx.x * blockDim.x + threadIdx.x; - let id: usize = (idx.block_idx.x * idx.block_dim.x + idx.thread_idx.x) as usize; - - // Make sure we do not go out of bounds - // if (id < n) c[id] = a[id] + b[id]; - // let test2: &(dyn std::ops::IndexMut) = self.d_a; - if id < self.n { - self.d_c[id] = self.d_a[id] + self.d_b[id]; - } - Ok(()) - } -} - -// Number of threads in each thread block -const BLOCK_SIZE: u32 = 1024; - -fn vectoradd(n: usize) -> eyre::Result<()> -where - T: Float + Zero + NumCast + std::iter::Sum + std::fmt::Display + std::fmt::Debug, -{ - // create host vectors - let mut a: Vec = vec![T::zero(); n]; - let mut b: Vec = vec![T::zero(); n]; - let mut c: Vec = vec![T::zero(); n]; - - // initialize vectors - for i in 0..n { - let angle = T::from(i).unwrap(); - a[i] = angle.sin() * angle.sin(); - b[i] = angle.cos() * 
angle.cos(); - c[i] = T::zero(); - } - - let sim = casimu::Simulation::new(); - - // allocate memory for each vector on simulated GPU device - let a_size = a.len() * std::mem::size_of::(); - let b_size = b.len() * std::mem::size_of::(); - let c_size = c.len() * std::mem::size_of::(); - let mut d_a = sim.allocate(&mut a, a_size as u64); - let mut d_b = sim.allocate(&mut b, b_size as u64); - let mut d_c = sim.allocate(&mut c, c_size as u64); - - // number of thread blocks in grid - let grid_size = (n as f64 / >::from(BLOCK_SIZE)).ceil() as u32; - - let kernel: VecAdd = VecAdd { - d_a: &mut d_a, - d_b: &mut d_b, - d_c: &mut d_c, - n, - }; - sim.launch_kernel(grid_size, BLOCK_SIZE, kernel)?; - - // sum up vector c and print result divided by n. - // this should equal 1 within - let total_sum: T = c.into_iter().sum(); - println!( - "Final sum = {total_sum}; sum/n = {} (should be ~1)\n", - total_sum / T::from(n).unwrap() - ); - - dbg!(&sim.stats.lock().unwrap()); - Ok(()) -} +// use num_traits::{Float, NumCast, Zero}; + +// #[derive(Debug)] +// struct VecAdd<'s, 'a, T> { +// d_a: &'a mut casimu::DevicePtr<'s, 'a, Vec>, +// d_b: &'a mut casimu::DevicePtr<'s, 'a, Vec>, +// d_c: &'a mut casimu::DevicePtr<'s, 'a, Vec>, +// n: usize, +// } +// +// impl<'s, 'a, T> casimu::Kernel for VecAdd<'s, 'a, T> +// where +// T: Float + std::fmt::Debug, +// { +// type Error = std::convert::Infallible; +// +// fn run(&mut self, idx: &casimu::ThreadIndex) -> Result<(), Self::Error> { +// // Get our global thread ID +// // int id = blockIdx.x * blockDim.x + threadIdx.x; +// let id: usize = (idx.block_idx.x * idx.block_dim.x + idx.thread_idx.x) as usize; +// +// // Make sure we do not go out of bounds +// // if (id < n) c[id] = a[id] + b[id]; +// // let test2: &(dyn std::ops::IndexMut) = self.d_a; +// if id < self.n { +// self.d_c[id] = self.d_a[id] + self.d_b[id]; +// } +// Ok(()) +// } +// } +// +// // Number of threads in each thread block +// const BLOCK_SIZE: u32 = 1024; +// +// fn vectoradd(n: usize) -> eyre::Result<()> +// where +// T: Float + Zero + NumCast + std::iter::Sum + std::fmt::Display + std::fmt::Debug, +// { +// // create host vectors +// let mut a: Vec = vec![T::zero(); n]; +// let mut b: Vec = vec![T::zero(); n]; +// let mut c: Vec = vec![T::zero(); n]; +// +// // initialize vectors +// for i in 0..n { +// let angle = T::from(i).unwrap(); +// a[i] = angle.sin() * angle.sin(); +// b[i] = angle.cos() * angle.cos(); +// c[i] = T::zero(); +// } +// +// let sim = casimu::Simulation::new(); +// +// // allocate memory for each vector on simulated GPU device +// let a_size = a.len() * std::mem::size_of::(); +// let b_size = b.len() * std::mem::size_of::(); +// let c_size = c.len() * std::mem::size_of::(); +// let mut d_a = sim.allocate(&mut a, a_size as u64); +// let mut d_b = sim.allocate(&mut b, b_size as u64); +// let mut d_c = sim.allocate(&mut c, c_size as u64); +// +// // number of thread blocks in grid +// let grid_size = (n as f64 / >::from(BLOCK_SIZE)).ceil() as u32; +// +// let kernel: VecAdd = VecAdd { +// d_a: &mut d_a, +// d_b: &mut d_b, +// d_c: &mut d_c, +// n, +// }; +// sim.launch_kernel(grid_size, BLOCK_SIZE, kernel)?; +// +// // sum up vector c and print result divided by n. 
+// // this should equal 1 within +// let total_sum: T = c.into_iter().sum(); +// println!( +// "Final sum = {total_sum}; sum/n = {} (should be ~1)\n", +// total_sum / T::from(n).unwrap() +// ); +// +// dbg!(&sim.stats.lock().unwrap()); +// Ok(()) +// } fn main() -> eyre::Result<()> { - vectoradd::(100)?; + // vectoradd::(100)?; Ok(()) } diff --git a/playground/sys/build.rs b/playground/sys/build.rs index 3a550b2e..513d7e77 100644 --- a/playground/sys/build.rs +++ b/playground/sys/build.rs @@ -41,23 +41,6 @@ fn configure_debug_mode(build: &mut cc::Build) { } } -#[allow(dead_code)] -#[deprecated = "redundant when compiling the bridge"] -fn build(sources: &[PathBuf]) -> eyre::Result<()> { - let mut build = cc::Build::new(); - build - .cpp(true) - .static_flag(true) - .files(sources) - .flag("-std=c++14") - .warnings(false); - - configure_debug_mode(&mut build); - enable_diagnostics_color(&mut build); - build.try_compile("playground")?; - Ok(()) -} - #[derive(Debug)] struct ParseCallbacks {} @@ -329,6 +312,7 @@ fn generate_bridge( build .cpp(true) .static_flag(true) + .pic(true) .warnings(false) .include(include_dir) .include(parser_include_dir) diff --git a/playground/sys/src/ref/box_interconnect.cc b/playground/sys/src/ref/box_interconnect.cc index 3ccfab49..87eed118 100644 --- a/playground/sys/src/ref/box_interconnect.cc +++ b/playground/sys/src/ref/box_interconnect.cc @@ -5,16 +5,17 @@ #include "mem_fetch.hpp" bool BoxInterconnect::HasBuffer(unsigned deviceID, unsigned int size) const { - unsigned icntID = _node_map.find(deviceID)->second; - assert(icntID == deviceID); - - // request is subnet 0 and reply is subnet 1 - bool is_memory_node = ((_subnets > 1) && deviceID >= _n_shader); - unsigned subnet = is_memory_node ? 1 : 0; - bool has_buffer = - simple_input_queue[subnet][icntID][0].size() <= _input_buffer_capacity; - - return has_buffer; + return true; + // unsigned icntID = _node_map.find(deviceID)->second; + // assert(icntID == deviceID); + // + // // request is subnet 0 and reply is subnet 1 + // bool is_memory_node = ((_subnets > 1) && deviceID >= _n_shader); + // unsigned subnet = is_memory_node ? 
1 : 0; + // bool has_buffer = + // simple_input_queue[subnet][icntID][0].size() <= _input_buffer_capacity; + // + // return has_buffer; } void BoxInterconnect::Advance() { @@ -93,7 +94,6 @@ void BoxInterconnect::Push(unsigned input_deviceID, unsigned output_deviceID, mem_fetch_ptr(mf), size, input_icntID, output_icntID, subnet); } - // simple_input_queue[subnet][input_icntID][0].push_back(data); simple_output_queue[subnet][output_icntID][0].push_back(data); } @@ -103,13 +103,13 @@ void BoxInterconnect::Init() { unsigned nodes = _net[0]->NumNodes(); unsigned classes = _icnt_config->GetInt("classes"); - simple_input_queue.resize(_subnets); + // simple_input_queue.resize(_subnets); simple_output_queue.resize(_subnets); for (int subnet = 0; subnet < _subnets; ++subnet) { - simple_input_queue[subnet].resize(nodes); + // simple_input_queue[subnet].resize(nodes); simple_output_queue[subnet].resize(nodes); for (unsigned node = 0; node < nodes; ++node) { - simple_input_queue[subnet][node].resize(classes); + // simple_input_queue[subnet][node].resize(classes); simple_output_queue[subnet][node].resize(classes); } } diff --git a/playground/sys/src/ref/box_interconnect.hpp b/playground/sys/src/ref/box_interconnect.hpp index 765034a3..631c7f25 100644 --- a/playground/sys/src/ref/box_interconnect.hpp +++ b/playground/sys/src/ref/box_interconnect.hpp @@ -26,7 +26,8 @@ class BoxInterconnect : public InterconnectInterface { std::shared_ptr logger; protected: - std::vector>>> simple_input_queue; + // std::vector>>> + // simple_input_queue; std::vector>>> simple_output_queue; }; diff --git a/src/cache.rs b/src/cache.rs index ae7a5abc..25914fa6 100644 --- a/src/cache.rs +++ b/src/cache.rs @@ -1,4 +1,3 @@ - use std::sync::Arc; pub trait ReplacementPolicy {} diff --git a/src/config/accelsim.rs b/src/config/accelsim.rs index cae629d8..997539a4 100644 --- a/src/config/accelsim.rs +++ b/src/config/accelsim.rs @@ -619,7 +619,7 @@ static ARGUMENT_REGEX: Lazy = Lazy::new(|| { .unwrap() }); -pub fn extract_arguments<'a>(config: &'a str) -> impl Iterator + '_ { +pub fn extract_arguments(config: &str) -> impl Iterator { ARGUMENT_REGEX.captures_iter(config).filter_map(|cap| { let key = cap.get(1)?.as_str().trim(); let value = cap.get(2)?.as_str().trim(); @@ -630,7 +630,7 @@ pub fn extract_arguments<'a>(config: &'a str) -> impl Iterator) -> eyre::Result { let args = extract_arguments(config.as_ref()) - .flat_map(|(key, value)| [format!("--{}", key), value.to_string()]); + .flat_map(|(key, value)| [format!("--{key}"), value.to_string()]); let args: Vec = ["test".to_string()].into_iter().chain(args).collect(); dbg!(&args); let config = Self::try_parse_from(&args)?; @@ -655,8 +655,8 @@ mod tests { # --gpgpu_shader_core_pipeline 2048:32 # --gpgpu_simd_model 1 "; - let args = super::extract_arguments(&config) - .flat_map(|(key, value)| [format!("--{}", key), value.to_string()]); + let args = super::extract_arguments(config) + .flat_map(|(key, value)| [format!("--{key}"), value.to_string()]); let mut args: std::collections::VecDeque = args.collect(); args.push_front("test".to_string()); diff --git a/src/config/mod.rs b/src/config/mod.rs index f2435adf..b1825b26 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -1,7 +1,8 @@ pub mod accelsim; use super::ported::{ - addrdec, address, core::PipelineStage, mem_sub_partition, mshr, opcodes, KernelInfo, + addrdec, address, core::PipelineStage, kernel::Kernel, mem_sub_partition, mshr, opcodes, + set_index, }; use color_eyre::eyre; use std::collections::HashMap; @@ -29,13 +30,14 
@@ pub enum CacheReplacementPolicy { FIFO, // F } -#[derive(Debug, PartialEq, Eq, Hash)] +#[derive(Debug)] pub struct L2DCacheConfig { pub inner: Arc, } impl L2DCacheConfig { #[inline] + #[must_use] pub fn set_index(&self, addr: address) -> u64 { let partition_addr = addr; @@ -48,12 +50,13 @@ impl L2DCacheConfig { } } -#[derive(Debug, PartialEq, Eq, Hash)] +#[derive(Debug)] pub struct L1DCacheConfig { /// L1 Hit Latency pub l1_latency: usize, // 1 /// l1 banks hashing function - pub l1_banks_hashing_function: CacheSetIndexFunc, // 0 + pub l1_banks_hashing_function: Box, // 0 + // pub l1_banks_hashing_function: CacheSetIndexFunc, // 0 /// l1 banks byte interleaving granularity pub l1_banks_byte_interleaving: usize, // 32 /// The number of L1 cache banks @@ -64,16 +67,19 @@ pub struct L1DCacheConfig { impl L1DCacheConfig { #[inline] + #[must_use] pub fn l1_banks_log2(&self) -> u32 { addrdec::logb2(self.l1_banks as u32) } #[inline] + #[must_use] pub fn l1_banks_byte_interleaving_log2(&self) -> u32 { addrdec::logb2(self.l1_banks_byte_interleaving as u32) } #[inline] + #[must_use] pub fn compute_set_bank(&self, addr: address) -> u64 { log::trace!( "computing set bank for address {} ({} l1 banks) using hashing function {:?}", @@ -85,20 +91,26 @@ impl L1DCacheConfig { // For sector cache, we select one sector per bank (sector interleaving) // This is what was found in Volta (one sector per bank, sector // interleaving) otherwise, line interleaving - hash_function( + + self.l1_banks_hashing_function.compute_set_index( addr, self.l1_banks, self.l1_banks_byte_interleaving_log2(), self.l1_banks_log2(), - self.l1_banks_hashing_function, ) + + // hash_function( + // addr, + // self.l1_banks, + // self.l1_banks_byte_interleaving_log2(), + // self.l1_banks_log2(), + // self.l1_banks_hashing_function, + // ) } } /// CacheConfig -/// -/// :::,::::,::,:** -#[derive(Debug, PartialEq, Eq, Hash)] +#[derive(Debug)] pub struct CacheConfig { pub kind: CacheKind, pub num_sets: usize, @@ -109,7 +121,8 @@ pub struct CacheConfig { pub write_policy: CacheWritePolicy, pub allocate_policy: CacheAllocatePolicy, pub write_allocate_policy: CacheWriteAllocatePolicy, - pub set_index_function: CacheSetIndexFunc, + // pub set_index_function: CacheSetIndexFunc, + pub set_index_function: Box, pub mshr_kind: mshr::Kind, pub mshr_entries: usize, @@ -126,6 +139,17 @@ pub struct CacheConfig { // pub disabled: bool, } +impl std::fmt::Display for CacheConfig { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let size = human_bytes::human_bytes(self.total_bytes() as f64); + write!( + f, + "{size} ({} set, {}-way, {} byte line)", + self.num_sets, self.associativity, self.line_size + ) + } +} + pub static MAX_DEFAULT_CACHE_SIZE_MULTIPLIER: u8 = 4; /// TODO: use a builder here so we can fill in the remaining values @@ -135,6 +159,7 @@ impl CacheConfig { /// /// todo: this can be replaced with the builder? #[inline] + #[must_use] pub fn data_port_width(&self) -> usize { // default granularity is line size let width = self.data_port_width.unwrap_or(self.line_size as usize); @@ -144,49 +169,58 @@ impl CacheConfig { /// The total size of the cache in bytes. #[inline] + #[must_use] pub fn total_bytes(&self) -> usize { self.line_size as usize * self.num_sets * self.associativity } /// Number of lines in total. #[inline] + #[must_use] pub fn total_lines(&self) -> usize { self.num_sets * self.associativity } /// Maximum number of lines. 
#[inline] + #[must_use] pub fn max_num_lines(&self) -> usize { self.max_cache_multiplier() as usize * self.num_sets * self.associativity } /// this is virtual (possibly different) #[inline] + #[must_use] pub fn max_cache_multiplier(&self) -> u8 { MAX_DEFAULT_CACHE_SIZE_MULTIPLIER } #[inline] + #[must_use] pub fn line_size_log2(&self) -> u32 { - addrdec::logb2(self.line_size as u32) + addrdec::logb2(self.line_size) } #[inline] + #[must_use] pub fn num_sets_log2(&self) -> u32 { addrdec::logb2(self.num_sets as u32) } #[inline] + #[must_use] pub fn sector_size(&self) -> u32 { mem_sub_partition::SECTOR_SIZE } #[inline] + #[must_use] pub fn sector_size_log2(&self) -> u32 { addrdec::logb2(self.sector_size()) } #[inline] + #[must_use] pub fn atom_size(&self) -> u32 { if self.kind == CacheKind::Sector { mem_sub_partition::SECTOR_SIZE @@ -197,17 +231,25 @@ impl CacheConfig { // do not use enabled but options #[inline] + #[must_use] pub fn set_index(&self, addr: address) -> u64 { - hash_function( + self.set_index_function.compute_set_index( addr, self.num_sets, self.line_size_log2(), self.num_sets_log2(), - self.set_index_function, ) + // hash_function( + // addr, + // self.num_sets, + // self.line_size_log2(), + // self.num_sets_log2(), + // self.set_index_function, + // ) } #[inline] + #[must_use] pub fn tag(&self, addr: address) -> address { // For generality, the tag includes both index and tag. // This allows for more complex set index calculations that @@ -217,19 +259,21 @@ impl CacheConfig { // return addr >> (m_line_sz_log2+m_nset_log2); // return addr & ~(new_addr_type)(m_line_sz - 1); - addr & !((self.line_size - 1) as u64) + addr & !u64::from(self.line_size - 1) } /// Block address #[inline] + #[must_use] pub fn block_addr(&self, addr: address) -> address { - addr & !((self.line_size - 1) as u64) + addr & !u64::from(self.line_size - 1) } /// Mshr address #[inline] + #[must_use] pub fn mshr_addr(&self, addr: address) -> address { - addr & !((self.line_size - 1) as u64) + addr & !u64::from(self.line_size - 1) } // // detect invalid configuration @@ -268,93 +312,8 @@ impl CacheConfig { // assert(m_line_sz % m_data_port_width == 0); } -fn hash_function( - addr: address, - num_sets: usize, - line_size_log2: u32, - num_sets_log2: u32, - set_index_function: CacheSetIndexFunc, -) -> u64 { - use super::ported::set_index_function as indexing; - - let set_idx: u64 = match set_index_function { - CacheSetIndexFunc::LINEAR_SET_FUNCTION => { - // log::trace!( - // "set_index({}): LINEAR hash func: log2(line)={}, num sets={}", - // addr, - // line_size_log2, - // num_sets, - // ); - let set_index = (addr >> line_size_log2) & (num_sets as u64 - 1); - set_index - } - CacheSetIndexFunc::FERMI_HASH_SET_FUNCTION => { - // Set Indexing function from - // "A Detailed GPU Cache Model Based on Reuse - // Distance Theory" Cedric Nugteren et al. 
HPCA 2014 - - // check for incorrect number of sets - assert!( - matches!(num_sets, 32 | 64), - "bad cache config: num sets should be 32 or 64 for hashing set index function (got {})", num_sets, - ); - - let mut lower_xor = 0; - let mut upper_xor = 0; - - // lower xor value is bits 7-11 - lower_xor = (addr >> line_size_log2) & 0x1F; - - // upper xor value is bits 13, 14, 15, 17, and 19 - upper_xor = (addr & 0xE000) >> 13; // Bits 13, 14, 15 - upper_xor |= (addr & 0x20000) >> 14; // Bit 17 - upper_xor |= (addr & 0x80000) >> 15; // Bit 19 - - let mut set_index = lower_xor ^ upper_xor; - - // 48KB cache prepends the set_index with bit 12 - if num_sets == 64 { - set_index |= (addr & 0x1000) >> 7; - } - set_index - } - CacheSetIndexFunc::HASH_IPOLY_FUNCTION => { - let bits = line_size_log2 + num_sets_log2; - let higher_bits = addr >> bits; - let mut index = (addr >> line_size_log2) as usize; - index &= num_sets - 1; - indexing::ipoly_hash_function(higher_bits, index, num_sets) - } - - CacheSetIndexFunc::BITWISE_XORING_FUNCTION => { - let bits = line_size_log2 + num_sets_log2; - let higher_bits = addr >> bits; - let mut index = (addr >> line_size_log2) as usize; - index &= num_sets - 1; - indexing::bitwise_hash_function(higher_bits, index, num_sets) - } - }; - - assert!( - set_idx < num_sets as u64, - "Error: Set index out of bounds. This is caused by an incorrect or unimplemented set index function." - ); - set_idx -} - -impl std::fmt::Display for CacheConfig { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let size = human_bytes::human_bytes(self.total_bytes() as f64); - write!( - f, - "{size} ({} set, {}-way, {} byte line)", - self.num_sets, self.associativity, self.line_size - ) - } -} - /// todo: remove the copy stuff, very expensive otherwise -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug)] pub struct GPUConfig { pub linear_to_raw_adress_translation: std::sync::OnceLock, @@ -376,17 +335,7 @@ pub struct GPUConfig { /// unified banked L2 data cache config pub data_cache_l2: Option>, - /// L1D write ratio - // pub l1_cache_write_ratio: usize, - /// The number of L1 cache banks - // pub l1_banks: usize, - // /// L1 banks byte interleaving granularity - // pub l1_banks_byte_interleaving: usize, - // // L1 banks hashing function - // pub l1_banks_hashing_function: usize, - // /// L1 Hit Latency - // pub l1_latency: usize, - /// smem Latency + /// Shared memory latency pub shared_memory_latency: usize, /// SP unit max latency pub max_sp_latency: usize, @@ -626,6 +575,7 @@ pub struct GPUConfig { pub static WORD_SIZE: address = 4; +#[must_use] pub fn pad_to_multiple(n: usize, k: usize) -> usize { let rem = n % k; if rem != 0 { @@ -664,9 +614,9 @@ impl GPUConfig { mem_id + self.num_simt_clusters } - pub fn threads_per_block_padded(&self, kernel: &KernelInfo) -> usize { + pub fn threads_per_block_padded(&self, kernel: &Kernel) -> usize { let threads_per_block = kernel.threads_per_block(); - pad_to_multiple(threads_per_block as usize, self.warp_size) + pad_to_multiple(threads_per_block, self.warp_size) } /// Number of bytes transferred per read or write command. 
@@ -679,11 +629,11 @@ impl GPUConfig { /// /// Depends on the following constraints: /// - - pub fn max_blocks(&self, kernel: &KernelInfo) -> eyre::Result { + pub fn max_blocks(&self, kernel: &Kernel) -> eyre::Result { let threads_per_block = kernel.threads_per_block(); - let threads_per_block = pad_to_multiple(threads_per_block as usize, self.warp_size); + let threads_per_block = pad_to_multiple(threads_per_block, self.warp_size); // limit by n_threads/shader - let by_thread_limit = self.max_threads_per_core / threads_per_block as usize; + let by_thread_limit = self.max_threads_per_core / threads_per_block; // limit by shmem/shader let by_shared_mem_limit = if kernel.config.shared_mem_bytes > 0 { @@ -712,7 +662,7 @@ impl GPUConfig { by_register_limit, ] .into_iter() - .filter_map(|limit| limit) + .flatten() .min() .unwrap_or(usize::MAX); // result = gs_min2(result, result_shmem); @@ -733,18 +683,58 @@ impl GPUConfig { )); } - if self.adaptive_cache_config && !kernel.cache_config_set { + if self.adaptive_cache_config { // more info about adaptive cache, see // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x let total_shared_mem = kernel.config.shared_mem_bytes as usize * limit; - assert!( - total_shared_mem >= 0 - && self - .shared_memory_sizes - .last() - .map(|size| total_shared_mem <= (*size as usize)) - .unwrap_or(true) - ); + if let Some(size) = self.shared_memory_sizes.last() { + assert!(total_shared_mem <= (*size as usize)); + } + + // Unified cache config is in KB. Converting to B + // unsigned total_unified = m_L1D_config.m_unified_cache_size * 1024; + // + // bool l1d_configured = false; + // unsigned max_assoc = m_L1D_config.get_max_assoc(); + // + // for (std::vector::const_iterator it = shmem_opt_list.begin(); + // it < shmem_opt_list.end(); it++) { + // if (total_shmem <= *it) { + // float l1_ratio = 1 - ((float)*(it) / total_unified); + // // make sure the ratio is between 0 and 1 + // assert(0 <= l1_ratio && l1_ratio <= 1); + // // round to nearest instead of round down + // m_L1D_config.set_assoc(max_assoc * l1_ratio + 0.5f); + // l1d_configured = true; + // break; + // } + // } + // + // assert(l1d_configured && "no shared memory option found"); + + // if (m_L1D_config.is_streaming()) { + // // for streaming cache, if the whole memory is allocated + // // to the L1 cache, then make the allocation to be on_MISS + // // otherwise, make it ON_FILL to eliminate line allocation fails + // // i.e. 
MSHR throughput is the same, independent on the L1 cache + // // size/associativity + // if (total_shmem == 0) { + // m_L1D_config.set_allocation_policy(ON_MISS); + // + // if (gpgpu_ctx->accelsim_compat_mode) { + // printf("GPGPU-Sim: Reconfigure L1 allocation to ON_MISS\n"); + // } + // } else { + // m_L1D_config.set_allocation_policy(ON_FILL); + // if (gpgpu_ctx->accelsim_compat_mode) { + // printf("GPGPU-Sim: Reconfigure L1 allocation to ON_FILL\n"); + // } + // } + // } + // if (gpgpu_ctx->accelsim_compat_mode) { + // printf("GPGPU-Sim: Reconfigure L1 cache to %uKB\n", + // m_L1D_config.get_total_size_inKB()); + // } } Ok(limit) @@ -764,31 +754,21 @@ impl GPUConfig { | ArchOp::RET_OPS => { // integer units (latency, initiation_interval) = self.trace_opcode_latency_initiation_int; - // latency = int_latency; - // initiation_interval = int_init; } ArchOp::SP_OP => { // single precision units (latency, initiation_interval) = self.trace_opcode_latency_initiation_sp; - // latency = fp_latency; - // initiation_interval = fp_init; } ArchOp::DP_OP => { // double precision units (latency, initiation_interval) = self.trace_opcode_latency_initiation_dp; - // latency = dp_latency; - // initiation_interval = dp_init; } ArchOp::SFU_OP => { // special function units (latency, initiation_interval) = self.trace_opcode_latency_initiation_sfu; - // latency = sfu_latency; - // initiation_interval = sfu_init; } ArchOp::TENSOR_CORE_OP => { (latency, initiation_interval) = self.trace_opcode_latency_initiation_tensor; - // latency = tensor_latency; - // initiation_interval = tensor_init; } _ => {} } @@ -805,63 +785,6 @@ impl GPUConfig { } } -// void trace_config::reg_options(option_parser_t opp) { -// option_parser_register(opp, "-trace", OPT_CSTR, &g_traces_filename, -// "traces kernel file" -// "traces kernel file directory", -// "./traces/kernelslist.g"); -// -// option_parser_register(opp, "-trace_opcode_latency_initiation_int", OPT_CSTR, -// &trace_opcode_latency_initiation_int, -// "Opcode latencies and initiation for integers in " -// "trace driven mode ", -// "4,1"); -// option_parser_register(opp, "-trace_opcode_latency_initiation_sp", OPT_CSTR, -// &trace_opcode_latency_initiation_sp, -// "Opcode latencies and initiation for sp in trace " -// "driven mode ", -// "4,1"); -// option_parser_register(opp, "-trace_opcode_latency_initiation_dp", OPT_CSTR, -// &trace_opcode_latency_initiation_dp, -// "Opcode latencies and initiation for dp in trace " -// "driven mode ", -// "4,1"); -// option_parser_register(opp, "-trace_opcode_latency_initiation_sfu", OPT_CSTR, -// &trace_opcode_latency_initiation_sfu, -// "Opcode latencies and initiation for sfu in trace " -// "driven mode ", -// "4,1"); -// option_parser_register(opp, "-trace_opcode_latency_initiation_tensor", -// OPT_CSTR, &trace_opcode_latency_initiation_tensor, -// "Opcode latencies and initiation for tensor in trace " -// "driven mode ", -// "4,1"); -// -// for (unsigned j = 0; j < SPECIALIZED_UNIT_NUM; ++j) { -// std::stringstream ss; -// ss << "-trace_opcode_latency_initiation_spec_op_" << j + 1; -// option_parser_register(opp, ss.str().c_str(), OPT_CSTR, -// &trace_opcode_latency_initiation_specialized_op[j], -// "specialized unit config" -// " ", -// "4,4"); -// } -// } -// -// void trace_config::parse_config() { -// sscanf(trace_opcode_latency_initiation_int, "%u,%u", &int_latency, &int_init); -// sscanf(trace_opcode_latency_initiation_sp, "%u,%u", &fp_latency, &fp_init); -// sscanf(trace_opcode_latency_initiation_dp, "%u,%u", &dp_latency, 
&dp_init); -// sscanf(trace_opcode_latency_initiation_sfu, "%u,%u", &sfu_latency, &sfu_init); -// sscanf(trace_opcode_latency_initiation_tensor, "%u,%u", &tensor_latency, -// &tensor_init); -// -// for (unsigned j = 0; j < SPECIALIZED_UNIT_NUM; ++j) { -// sscanf(trace_opcode_latency_initiation_specialized_op[j], "%u,%u", -// &specialized_unit_latency[j], &specialized_unit_initiation[j]); -// } -// } - /// Cache set indexing function kind. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub enum CacheSetIndexFunc { @@ -875,9 +798,9 @@ pub enum CacheSetIndexFunc { /// /// Cache write-allocate policy. /// -/// For more details about difference between FETCH_ON_WRITE and WRITE +/// For more details about difference between `FETCH_ON_WRITE` and WRITE /// VALIDAE policies Read: Jouppi, Norman P. "Cache write policies and -/// performance". ISCA 93. WRITE_ALLOCATE is the old write policy in +/// performance". ISCA 93. `WRITE_ALLOCATE` is the old write policy in /// GPGPU-sim 3.x, that send WRITE and READ for every write request #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub enum CacheWriteAllocatePolicy { @@ -955,8 +878,8 @@ pub enum DRAMSchedulerKind { /// Core Scheduler policy. /// -/// If two_level_active: -/// :: +/// If `two_level_active`: +/// <`num_active_warps>:: /// /// For complete list of prioritization values see shader.h. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] @@ -986,11 +909,11 @@ impl GPUConfig { let shared_memory_sizes_string = "0"; let _shared_memory_sizes: Vec = if adaptive_cache_config { let sizes: Result, _> = shared_memory_sizes_string - .split(",") + .split(',') .map(str::parse) .collect(); let mut sizes: Vec<_> = sizes?.into_iter().map(|size| size * 1024).collect(); - sizes.sort(); + sizes.sort_unstable(); sizes } else { vec![] @@ -1004,16 +927,10 @@ impl GPUConfig { pub fn address_mapping(&self) -> &addrdec::LinearToRawAddressTranslation { self.linear_to_raw_adress_translation - .get_or_init(|| addrdec::LinearToRawAddressTranslation::new(&self).unwrap()) + .get_or_init(|| addrdec::LinearToRawAddressTranslation::new(self).unwrap()) } } -// opp, "-gpgpu_pipeline_widths", OPT_CSTR, &pipeline_widths_string, -// "Pipeline widths " -// "ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_" -// "INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE", -// "1,1,1,1,1,1,1,1,1,1,1,1,1") - impl Default for GPUConfig { fn default() -> Self { Self { @@ -1032,7 +949,8 @@ impl Default for GPUConfig { write_policy: CacheWritePolicy::READ_ONLY, allocate_policy: CacheAllocatePolicy::ON_MISS, write_allocate_policy: CacheWriteAllocatePolicy::NO_WRITE_ALLOCATE, - set_index_function: CacheSetIndexFunc::LINEAR_SET_FUNCTION, + // set_index_function: CacheSetIndexFunc::LINEAR_SET_FUNCTION, + set_index_function: Box::new(set_index::linear::SetIndex::default()), mshr_kind: mshr::Kind::TEX_FIFO, mshr_entries: 128, mshr_max_merge: 4, @@ -1052,7 +970,8 @@ impl Default for GPUConfig { write_policy: CacheWritePolicy::READ_ONLY, allocate_policy: CacheAllocatePolicy::ON_FILL, write_allocate_policy: CacheWriteAllocatePolicy::NO_WRITE_ALLOCATE, - set_index_function: CacheSetIndexFunc::LINEAR_SET_FUNCTION, + // set_index_function: CacheSetIndexFunc::LINEAR_SET_FUNCTION, + set_index_function: Box::new(set_index::linear::SetIndex::default()), mshr_kind: mshr::Kind::ASSOC, mshr_entries: 2, mshr_max_merge: 64, @@ -1072,7 +991,8 @@ impl Default for GPUConfig { write_policy: CacheWritePolicy::READ_ONLY, allocate_policy: CacheAllocatePolicy::ON_FILL, 
write_allocate_policy: CacheWriteAllocatePolicy::NO_WRITE_ALLOCATE, - set_index_function: CacheSetIndexFunc::LINEAR_SET_FUNCTION, + // set_index_function: CacheSetIndexFunc::LINEAR_SET_FUNCTION, + set_index_function: Box::new(set_index::linear::SetIndex::default()), mshr_kind: mshr::Kind::ASSOC, mshr_entries: 2, mshr_max_merge: 48, @@ -1085,8 +1005,8 @@ impl Default for GPUConfig { // {::,:::,::, | none} data_cache_l1: Some(Arc::new(L1DCacheConfig { l1_latency: 1, - l1_banks_hashing_function: CacheSetIndexFunc::LINEAR_SET_FUNCTION, - // l1_banks_hashing_function: CacheSetIndexFunc::FERMI_HASH_SET_FUNCTION, + // l1_banks_hashing_function: CacheSetIndexFunc::LINEAR_SET_FUNCTION, + l1_banks_hashing_function: Box::new(set_index::linear::SetIndex::default()), l1_banks_byte_interleaving: 32, l1_banks: 1, inner: Arc::new(CacheConfig { @@ -1098,7 +1018,8 @@ impl Default for GPUConfig { write_policy: CacheWritePolicy::LOCAL_WB_GLOBAL_WT, allocate_policy: CacheAllocatePolicy::ON_MISS, write_allocate_policy: CacheWriteAllocatePolicy::NO_WRITE_ALLOCATE, - set_index_function: CacheSetIndexFunc::FERMI_HASH_SET_FUNCTION, + // set_index_function: CacheSetIndexFunc::FERMI_HASH_SET_FUNCTION, + set_index_function: Box::new(set_index::fermi::SetIndex::default()), mshr_kind: mshr::Kind::ASSOC, mshr_entries: 128, mshr_max_merge: 8, @@ -1120,7 +1041,8 @@ impl Default for GPUConfig { write_policy: CacheWritePolicy::WRITE_BACK, allocate_policy: CacheAllocatePolicy::ON_MISS, write_allocate_policy: CacheWriteAllocatePolicy::WRITE_ALLOCATE, - set_index_function: CacheSetIndexFunc::LINEAR_SET_FUNCTION, + // set_index_function: CacheSetIndexFunc::LINEAR_SET_FUNCTION, + set_index_function: Box::new(set_index::linear::SetIndex::default()), mshr_kind: mshr::Kind::ASSOC, mshr_entries: 1024, mshr_max_merge: 1024, @@ -1277,8 +1199,8 @@ mod tests { fn parse_cache_config(config: &str) -> bindings::CacheConfig { use bindings::parse_cache_config as parse; - let cache_config = unsafe { parse(config.as_ptr().cast()) }; - cache_config + + unsafe { parse(config.as_ptr().cast()) } } #[test] @@ -1405,29 +1327,32 @@ mod tests { fn test_l1i_block_addr() { let config = super::GPUConfig::default(); let l1i_cache_config = config.inst_cache_l1.unwrap(); - assert_eq!(l1i_cache_config.block_addr(4026531848), 4026531840); + assert_eq!(l1i_cache_config.block_addr(4_026_531_848), 4_026_531_840); } #[test] fn test_l2d_block_addr() { let config = super::GPUConfig::default(); let l2d_cache_config = config.data_cache_l2.unwrap(); - assert_eq!(l2d_cache_config.inner.block_addr(34887082112), 34887082112); + assert_eq!( + l2d_cache_config.inner.block_addr(34_887_082_112), + 34_887_082_112 + ); } #[test] fn test_l1i_mshr_addr() { let config = super::GPUConfig::default(); let l1i_cache_config = config.inst_cache_l1.unwrap(); - assert_eq!(l1i_cache_config.mshr_addr(4026531848), 4026531840); - assert_eq!(l1i_cache_config.mshr_addr(4026531992), 4026531968); + assert_eq!(l1i_cache_config.mshr_addr(4_026_531_848), 4_026_531_840); + assert_eq!(l1i_cache_config.mshr_addr(4_026_531_992), 4_026_531_968); } #[test] fn test_l2d_set_index() { let config = super::GPUConfig::default(); let l2d_config = config.data_cache_l2.unwrap(); - let block_addr = 34887082112; + let block_addr = 34_887_082_112; assert_eq!(l2d_config.inner.set_index(block_addr), 1); } } diff --git a/src/lib.rs b/src/lib.rs index 5287c801..15ca43f8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,14 +2,14 @@ // #![allow(warnings)] // pub mod gpgpusim; -pub mod cache; +// pub mod cache; pub mod config; 
-pub mod dram; +// pub mod dram; pub mod ported; -#[cfg(feature = "python")] -pub mod python; -pub mod sim; +// #[cfg(feature = "python")] +// pub mod python; +// pub mod sim; -pub use cache::{Cache, Config as CacheConfig}; -pub use dram::MainMemory; -pub use sim::{DevicePtr, Kernel, Simulation, ThreadIndex}; +// pub use cache::{Cache, Config as CacheConfig}; +// pub use dram::MainMemory; +// pub use sim::{DevicePtr, Kernel, Simulation, ThreadIndex}; diff --git a/src/ported/addrdec.rs b/src/ported/addrdec.rs index 7af95c49..f26e0463 100644 --- a/src/ported/addrdec.rs +++ b/src/ported/addrdec.rs @@ -6,21 +6,21 @@ use regex::Regex; /// Base 2 logarithm of n. /// /// Effectively the minium number of bits required to store n. -pub fn logb2(n: u32) -> u32 { +#[must_use] pub fn logb2(n: u32) -> u32 { n.max(1).ilog2() } /// Compute power of two greater than or equal to n /// /// see: https://www.techiedelight.com/round-next-highest-power-2/ -pub fn next_power2(mut n: u32) -> u32 { +#[must_use] pub fn next_power2(mut n: u32) -> u32 { // avoid subtract with overflow if n == 0 { return 0; } // decrement n (handle the case when n itself is a power of 2) - n = n - 1; + n -= 1; // unset rightmost bit until only one bit is left while n > 0 && (n & (n - 1)) > 0 { @@ -32,7 +32,7 @@ pub fn next_power2(mut n: u32) -> u32 { n << 1 } -pub fn mask_limit(mask: address) -> (u8, u8) { +#[must_use] pub fn mask_limit(mask: address) -> (u8, u8) { let mut high = 64; let mut low = 0; let mut low_found = false; @@ -129,7 +129,7 @@ pub struct AddressDecodingConfig { pub burst: Mask, } -const ACCELSIM_ADDRESS_DECODE_CONFIG_REGEX: Lazy = +static ACCELSIM_ADDRESS_DECODE_CONFIG_REGEX: Lazy = Lazy::new(|| Regex::new(r"(dramid@(?P\d+))?;?(?P.*)").unwrap()); impl AddressDecodingConfig { @@ -210,7 +210,7 @@ impl AddressDecodingConfig { } impl LinearToRawAddressTranslation { - pub fn partition_address(&self, addr: address) -> address { + #[must_use] pub fn partition_address(&self, addr: address) -> address { if !self.has_gap { let mut mask = self.decode_config.chip.mask; mask |= self.sub_partition_id_mask; @@ -227,7 +227,7 @@ impl LinearToRawAddressTranslation { } } - pub fn tlx(&self, addr: address) -> DecodedAddress { + #[must_use] pub fn tlx(&self, addr: address) -> DecodedAddress { let mut tlx = DecodedAddress::default(); let num_channels = self.num_channels as u64; @@ -284,20 +284,20 @@ impl LinearToRawAddressTranslation { let num_sub_partitions_per_channel_log2 = logb2(num_sub_partitions_per_channel as u32); let mut num_chip_bits = num_channels_log2; - let gap = num_channels as i64 - 2u32.pow(num_chip_bits) as i64; + let gap = num_channels as i64 - i64::from(2u32.pow(num_chip_bits)); if gap > 0 { num_chip_bits += 1; } let mut decode_config = if let Some(ref mapping_config) = config.memory_addr_mapping { - AddressDecodingConfig::parse_accelsim_config(&mapping_config)? + AddressDecodingConfig::parse_accelsim_config(mapping_config)? 
} else { AddressDecodingConfig { addr_chip_start: Some(10), - chip: 0x0000000000001C00.into(), - bank: 0x0000000000000300.into(), - row: 0x000000000FFF0000.into(), - col: 0x000000000000E0FF.into(), - burst: 0x000000000000000F.into(), + chip: 0x0000_0000_0000_1C00.into(), + bank: 0x0000_0000_0000_0300.into(), + row: 0x0000_0000_0FFF_0000.into(), + col: 0x0000_0000_0000_E0FF.into(), + burst: 0x0000_0000_0000_000F.into(), } }; @@ -366,7 +366,7 @@ impl LinearToRawAddressTranslation { }) } - pub fn num_sub_partition_total(&self) -> usize { + #[must_use] pub fn num_sub_partition_total(&self) -> usize { self.num_channels * self.num_sub_partitions_per_channel } } @@ -386,7 +386,7 @@ fn packbits(mask: super::address, val: super::address, low: u8, high: u8) -> sup pos += 1; } } - return res; + res } #[derive(Default, Debug, Clone, Copy, Eq, PartialEq)] @@ -417,18 +417,18 @@ mod tests { #[inline] fn bit_str(n: u64) -> String { - format!("{:064b}", n) + format!("{n:064b}") } impl From for super::DecodedAddress { fn from(addr: playground::addrdec::AddrDec) -> Self { Self { - chip: addr.chip as u64, - bk: addr.bk as u64, - row: addr.row as u64, - col: addr.col as u64, - burst: addr.burst as u64, - sub_partition: addr.sub_partition as u64, + chip: u64::from(addr.chip), + bk: u64::from(addr.bk), + row: u64::from(addr.row), + col: u64::from(addr.col), + burst: u64::from(addr.burst), + sub_partition: u64::from(addr.sub_partition), } } } @@ -457,23 +457,23 @@ mod tests { dbg!(&dec_config); assert_eq!( bit_str(dec_config.chip.mask), - bit_str(0b00000000_00000000_00000000_00000000) + bit_str(0b0000_0000_0000_0000_0000_0000_0000_0000) ); assert_eq!( bit_str(dec_config.bank.mask), - bit_str(0b00000000_00000000_01110000_10000000) + bit_str(0b0000_0000_0000_0000_0111_0000_1000_0000) ); assert_eq!( bit_str(dec_config.row.mask), - bit_str(0b00001111_11111111_10000000_00000000) + bit_str(0b0000_1111_1111_1111_1000_0000_0000_0000) ); assert_eq!( bit_str(dec_config.col.mask), - bit_str(0b00000000_00000000_00001111_01111111) + bit_str(0b0000_0000_0000_0000_0000_1111_0111_1111) ); assert_eq!( bit_str(dec_config.burst.mask), - bit_str(0b00000000_00000000_00000000_00011111) + bit_str(0b0000_0000_0000_0000_0000_0000_0001_1111) ); let mut config = GPUConfig::default(); @@ -483,11 +483,11 @@ mod tests { let mapping = super::LinearToRawAddressTranslation::new(&config)?; let dec_config = mapping.decode_config; - assert_eq!(bit_str(dec_config.chip.mask), bit_str(0x0000000000000700)); - assert_eq!(bit_str(dec_config.bank.mask), bit_str(0x0000000000038080)); - assert_eq!(bit_str(dec_config.row.mask), bit_str(0x000000007ffc0000)); - assert_eq!(bit_str(dec_config.col.mask), bit_str(0x000000000000787f)); - assert_eq!(bit_str(dec_config.burst.mask), bit_str(0x000000000000001f)); + assert_eq!(bit_str(dec_config.chip.mask), bit_str(0x0000_0000_0000_0700)); + assert_eq!(bit_str(dec_config.bank.mask), bit_str(0x0000_0000_0003_8080)); + assert_eq!(bit_str(dec_config.row.mask), bit_str(0x0000_0000_7ffc_0000)); + assert_eq!(bit_str(dec_config.col.mask), bit_str(0x0000_0000_0000_787f)); + assert_eq!(bit_str(dec_config.burst.mask), bit_str(0x0000_0000_0000_001f)); assert_eq!((dec_config.chip.low, dec_config.chip.high), (8, 11)); assert_eq!((dec_config.bank.low, dec_config.bank.high), (7, 18)); @@ -503,20 +503,20 @@ mod tests { use playground::addrdec::packbits as ref_packbits; assert_eq!(packbits(0, 0, 0, 64), ref_packbits(0, 0, 0, 64)); assert_eq!( - packbits(0, 0xFFFFFFFFFFFFFFFF, 0, 64), - ref_packbits(0, 0xFFFFFFFFFFFFFFFF, 0, 64), + 
packbits(0, 0xFFFF_FFFF_FFFF_FFFF, 0, 64), + ref_packbits(0, 0xFFFF_FFFF_FFFF_FFFF, 0, 64), ); assert_eq!( - packbits(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0, 64), - ref_packbits(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0, 64), + packbits(0xFFFF_FFFF_FFFF_FFFF, 0xFFFF_FFFF_FFFF_FFFF, 0, 64), + ref_packbits(0xFFFF_FFFF_FFFF_FFFF, 0xFFFF_FFFF_FFFF_FFFF, 0, 64), ); assert_eq!( - packbits(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 64, 255), - ref_packbits(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 64, 64), + packbits(0xFFFF_FFFF_FFFF_FFFF, 0xFFFF_FFFF_FFFF_FFFF, 64, 255), + ref_packbits(0xFFFF_FFFF_FFFF_FFFF, 0xFFFF_FFFF_FFFF_FFFF, 64, 64), ); assert_eq!( - packbits(0xFFFFFFFFFFFFFFFF, 15, 0, 4), - ref_packbits(0xFFFFFFFFFFFFFFFF, 15, 0, 4), + packbits(0xFFFF_FFFF_FFFF_FFFF, 15, 0, 4), + ref_packbits(0xFFFF_FFFF_FFFF_FFFF, 15, 0, 4), ); } @@ -526,42 +526,42 @@ mod tests { config.num_memory_controllers = 8; config.num_sub_partition_per_memory_channel = 2; - let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140159034064896); + let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140_159_034_064_896); dbg!(&tlx_addr, &ref_tlx_addr); assert_eq!(ref_tlx_addr.sub_partition, 0); assert_eq!(tlx_addr.sub_partition, 0); - let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140159034065024); + let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140_159_034_065_024); dbg!(&tlx_addr, &ref_tlx_addr); assert_eq!(ref_tlx_addr.sub_partition, 1); assert_eq!(tlx_addr.sub_partition, 1); - let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140159034065120); + let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140_159_034_065_120); dbg!(&tlx_addr, &ref_tlx_addr); assert_eq!(ref_tlx_addr.sub_partition, 1); assert_eq!(tlx_addr.sub_partition, 1); - let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140159034065152); + let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140_159_034_065_152); dbg!(&tlx_addr, &ref_tlx_addr); assert_eq!(ref_tlx_addr.sub_partition, 2); assert_eq!(tlx_addr.sub_partition, 2); - let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140159034065472); + let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140_159_034_065_472); dbg!(&tlx_addr, &ref_tlx_addr); assert_eq!(ref_tlx_addr.sub_partition, 4); assert_eq!(tlx_addr.sub_partition, 4); - let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140159034066048); + let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140_159_034_066_048); dbg!(&tlx_addr, &ref_tlx_addr); assert_eq!(ref_tlx_addr.sub_partition, 9); assert_eq!(tlx_addr.sub_partition, 9); - let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140159034066432); + let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140_159_034_066_432); dbg!(&tlx_addr, &ref_tlx_addr); assert_eq!(ref_tlx_addr.sub_partition, 12); assert_eq!(tlx_addr.sub_partition, 12); - let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140159034066944); + let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140_159_034_066_944); dbg!(&tlx_addr, &ref_tlx_addr); assert_eq!(ref_tlx_addr.sub_partition, 0); assert_eq!(tlx_addr.sub_partition, 0); @@ -570,7 +570,7 @@ mod tests { #[test] fn test_tlx() { let config = GPUConfig::default(); - let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 139823420539008); + let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 139_823_420_539_008); let expected = super::DecodedAddress { chip: 0, bk: 1, @@ -587,23 +587,23 @@ mod tests { fn test_mask_limit() { use playground::addrdec::mask_limit as ref_mask_limit; - let mask = 0b0000000000000000000000000000000000000000000000000000000000000000; + let 
mask = 0b0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000; diff::assert_eq!(super::mask_limit(mask), (0, 64)); diff::assert_eq!(ref_mask_limit(mask), (0, 64)); - let mask = 0b0000000000000000000000000000000000000000000000000111000010000000; + let mask = 0b0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0111_0000_1000_0000; diff::assert_eq!(super::mask_limit(mask), (7, 15)); diff::assert_eq!(ref_mask_limit(mask), (7, 15)); - let mask = 0b0000000000000000000000000000000000001111111111111000000000000000; + let mask = 0b0000_0000_0000_0000_0000_0000_0000_0000_0000_1111_1111_1111_1000_0000_0000_0000; diff::assert_eq!(super::mask_limit(mask), (15, 28)); diff::assert_eq!(ref_mask_limit(mask), (15, 28)); - let mask = 0b0000000000000000000000000000000000000000000000000000111101111111; + let mask = 0b0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_1111_0111_1111; diff::assert_eq!(super::mask_limit(mask), (0, 12)); diff::assert_eq!(ref_mask_limit(mask), (0, 12)); - let mask = 0b0000000000000000000000000000000000000000000000000000000000011111; + let mask = 0b0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0001_1111; diff::assert_eq!(super::mask_limit(mask), (0, 5)); diff::assert_eq!(ref_mask_limit(mask), (0, 5)); } diff --git a/src/ported/arbitration.rs b/src/ported/arbitration.rs index 78f95e21..310321fd 100644 --- a/src/ported/arbitration.rs +++ b/src/ported/arbitration.rs @@ -42,7 +42,7 @@ impl ArbitrationMetadata { } /// check if a subpartition still has credit - pub fn has_credits(&self, inner_sub_partition_id: usize) -> bool { + #[must_use] pub fn has_credits(&self, inner_sub_partition_id: usize) -> bool { if self.private_credit[inner_sub_partition_id] < self.private_credit_limit { return true; } @@ -89,7 +89,7 @@ impl ArbitrationMetadata { } /// return the last subpartition that borrowed credit - pub fn last_borrower(&self) -> usize { + #[must_use] pub fn last_borrower(&self) -> usize { self.last_borrower } } diff --git a/src/ported/barrier.rs b/src/ported/barrier.rs index 1f04265b..458c81c6 100644 --- a/src/ported/barrier.rs +++ b/src/ported/barrier.rs @@ -12,7 +12,7 @@ pub struct BarrierSet { } impl BarrierSet { - pub fn new( + #[must_use] pub fn new( _max_warps_per_core: usize, _max_blocks_per_core: usize, _max_barriers_per_block: usize, diff --git a/src/ported/cache.rs b/src/ported/cache.rs index 9df952d4..7c5fd604 100644 --- a/src/ported/cache.rs +++ b/src/ported/cache.rs @@ -83,7 +83,7 @@ pub struct Event { } impl Event { - pub fn new(kind: EventKind) -> Self { + #[must_use] pub fn new(kind: EventKind) -> Self { Self { kind, evicted_block: None, @@ -92,9 +92,7 @@ impl Event { } pub trait Component { - fn cycle(&mut self) { - todo!("component: cycle"); - } + fn cycle(&mut self); } pub trait Cache: Component + CacheBandwidth { @@ -152,15 +150,8 @@ pub trait Cache: Component + CacheBandwidth { } } -// not clear if we ever need this pub trait CacheBandwidth { - fn has_free_data_port(&self) -> bool { - todo!("cache: has_free_data_port"); - false - } + fn has_free_data_port(&self) -> bool; - fn has_free_fill_port(&self) -> bool { - todo!("cache: has_free_fill_port"); - false - } + fn has_free_fill_port(&self) -> bool; } diff --git a/src/ported/cache_block.rs b/src/ported/cache_block.rs index cda68453..0e0522c6 100644 --- a/src/ported/cache_block.rs +++ b/src/ported/cache_block.rs @@ -100,7 +100,7 @@ impl Default for LineCacheBlock { } impl LineCacheBlock { - pub fn new() -> Self { + #[must_use] pub fn new() -> 
Self { Self::default() } @@ -143,7 +143,7 @@ impl LineCacheBlock { self.is_readable = true; } if self.set_byte_mask_on_fill { - self.set_byte_mask(&byte_mask) + self.set_byte_mask(byte_mask) } self.fill_time = time; @@ -171,32 +171,32 @@ impl LineCacheBlock { } #[inline] - pub fn status(&self, _mask: &mem_fetch::MemAccessSectorMask) -> Status { + #[must_use] pub fn status(&self, _mask: &mem_fetch::MemAccessSectorMask) -> Status { self.status } #[inline] - pub fn is_valid(&self) -> bool { + #[must_use] pub fn is_valid(&self) -> bool { self.status == Status::VALID } #[inline] - pub fn is_modified(&self) -> bool { + #[must_use] pub fn is_modified(&self) -> bool { self.status == Status::MODIFIED } #[inline] - pub fn is_invalid(&self) -> bool { + #[must_use] pub fn is_invalid(&self) -> bool { self.status == Status::INVALID } #[inline] - pub fn is_reserved(&self) -> bool { + #[must_use] pub fn is_reserved(&self) -> bool { self.status == Status::RESERVED } #[inline] - pub fn is_readable(&self, _mask: &mem_fetch::MemAccessSectorMask) -> bool { + #[must_use] pub fn is_readable(&self, _mask: &mem_fetch::MemAccessSectorMask) -> bool { self.is_readable } @@ -206,28 +206,28 @@ impl LineCacheBlock { } #[inline] - pub fn alloc_time(&self) -> u64 { + #[must_use] pub fn alloc_time(&self) -> u64 { self.alloc_time } #[inline] - pub fn last_access_time(&self) -> u64 { + #[must_use] pub fn last_access_time(&self) -> u64 { self.last_access_time } #[inline] - pub fn modified_size(&self) -> u32 { + #[must_use] pub fn modified_size(&self) -> u32 { // cache line size mem_sub_partition::SECTOR_CHUNCK_SIZE * mem_sub_partition::SECTOR_SIZE } #[inline] - pub fn dirty_byte_mask(&self) -> mem_fetch::MemAccessByteMask { + #[must_use] pub fn dirty_byte_mask(&self) -> mem_fetch::MemAccessByteMask { self.dirty_byte_mask } #[inline] - pub fn dirty_sector_mask(&self) -> mem_fetch::MemAccessSectorMask { + #[must_use] pub fn dirty_sector_mask(&self) -> mem_fetch::MemAccessSectorMask { if self.is_modified() { !BitArray::ZERO } else { diff --git a/src/ported/cluster.rs b/src/ported/cluster.rs index 253eff19..16a05dab 100644 --- a/src/ported/cluster.rs +++ b/src/ported/cluster.rs @@ -1,6 +1,6 @@ use super::{interconn as ic, mem_fetch, MockSimulator, Packet, SIMTCore}; use crate::config::GPUConfig; -use crate::ported; +use crate::ported::{self, Kernel}; use console::style; use std::cell::RefCell; use std::collections::VecDeque; @@ -18,7 +18,6 @@ pub struct SIMTCoreCluster { pub interconn: Arc, - // pub core_sim_order: Vec, pub core_sim_order: VecDeque, pub block_issue_next_core: Mutex, pub response_fifo: VecDeque, @@ -92,7 +91,7 @@ where .lock() .unwrap() .iter() - .map(|c| c.not_completed()) + .map(ported::core::SIMTCore::not_completed) .sum() } @@ -131,7 +130,7 @@ where self.cluster_id, self.response_fifo .iter() - .map(|fetch| fetch.to_string()) + .map(std::string::ToString::to_string) .collect::>(), )) .cyan() @@ -209,7 +208,7 @@ where }; // m_stats->m_incoming_traffic_stats->record_traffic(mf, packet_size); fetch.status = mem_fetch::Status::IN_CLUSTER_TO_SHADER_QUEUE; - self.response_fifo.push_back(fetch.clone()); + self.response_fifo.push_back(fetch); // m_stats->n_mem_to_simt[m_cluster_id] += mf->get_num_flits(false); } @@ -232,19 +231,15 @@ where log::debug!("cluster {} cycle {}", self.cluster_id, self.cycle.get()); let mut cores = self.cores.lock().unwrap(); - // for core in cores.iter_mut() { - for core_id in self.core_sim_order.iter() { - // core.cycle() + for core_id in &self.core_sim_order { cores[*core_id].cycle() } - 
// if (m_config->simt_core_sim_order == 1) { - // self.core_sim_order.rotate_left(1); - let first = self.core_sim_order.pop_front().unwrap(); - self.core_sim_order.push_back(first); - // m_core_sim_order.splice(m_core_sim_order.end(), m_core_sim_order, - // m_core_sim_order.begin()); - // } + if let ported::config::SchedulingOrder::RoundRobin = self.config.simt_core_sim_order { + self.core_sim_order.rotate_left(1); + // let first = self.core_sim_order.pop_front().unwrap(); + // self.core_sim_order.push_back(first); + } } pub fn issue_block_to_core(&self, sim: &MockSimulator) -> usize { @@ -259,37 +254,18 @@ where let mut num_blocks_issued = 0; let mut block_issue_next_core = self.block_issue_next_core.lock().unwrap(); - // dbg!(&sim.select_kernel()); for core_id in 0..num_cores { - // debug_assert_eq!(i, core.id); let core_id = (core_id + *block_issue_next_core + 1) % num_cores; let core = &mut cores[core_id]; - // let mut kernel = None; - let kernel: Option> = if self.config.concurrent_kernel_sm { - unimplemented!("concurrent kernel sm"); + let kernel: Option> = if self.config.concurrent_kernel_sm { // always select latest issued kernel // kernel = sim.select_kernel() - sim.select_kernel().map(Arc::clone) + // sim.select_kernel().map(Arc::clone); + unimplemented!("concurrent kernel sm"); } else { let mut current_kernel = core.inner.current_kernel.as_ref(); - // .map(Arc::clone); - // match core.inner.current_kernel { - // Some(current) if current.no_more_blocks_to_run() && core.not_completed() == 0 => { - // // new kernel - // sim.select_kernel() - // } - // None => { - // // new kernel - // sim.select_kernel() - // } - // - // } - // let kernel = core.inner.current_kernel; - // if let Some(current_kernel) = kernel { - // } - // kernel - let should_select_new_kernel = if let Some(ref current) = current_kernel { + let should_select_new_kernel = if let Some(current) = current_kernel { // if no more blocks left, get new kernel once current block completes current.no_more_blocks_to_run() && core.not_completed() == 0 } else { @@ -310,74 +286,24 @@ where if should_select_new_kernel { current_kernel = sim.select_kernel(); - if let Some(ref k) = current_kernel { + if let Some(k) = current_kernel { core.set_kernel(Arc::clone(k)); } } current_kernel.map(Arc::clone) - - // Select current core kernel. - // If no more cta, get a new kernel once core completed warps - // match core.inner.current_kernel { - // Some(current_kernel) - // if current_kernel.no_more_blocks_to_run() && core.not_completed() == 0 => - // { - // if should_select_new_kernel { - // kernel = sim.select_kernel(); - // if let Some(k) = kernel { - // core.set_kernel(Arc::clone(k)); - // } - // } - // } - // _ => {} - // } - // Select current core kernel. 
- // If no more cta, get a new kernel once core completed warps - // if current_kernel.no_more_blocks_to_run() && core.not_completed() == 0 { - // kernel = sim.select_kernel(); - // if let Some(k) = kernel { - // core.set_kernel(Arc::clone(k)); - // } - // } }; - // log::debug!( - // "core {}-{}: {} active warps, current kernel {:?}, more blocks={:?}", - // self.cluster_id, - // core.inner.core_id, - // core.inner.num_active_warps, - // core.inner.current_kernel.as_ref().map(|k| k.name()), - // core.inner - // .current_kernel - // .as_ref() - // .map(|k| !k.no_more_blocks_to_run()) - // ); - // log::debug!( - // "core {}-{}: selected kernel {:?}", - // self.cluster_id, - // core.inner.core_id, - // kernel.as_ref().map(|k| k.name()) - // ); if let Some(kernel) = kernel { - // let core_id = 0; log::debug!( "core {}-{}: selected kernel {} more blocks={} can issue={}", self.cluster_id, core_id, kernel, !kernel.no_more_blocks_to_run(), - core.can_issue_block(&*kernel), + core.can_issue_block(&kernel), ); - // log::debug!( - // "kernel: no more blocks to run={} can issue block {}", - // kernel.no_more_blocks_to_run(), - // core.can_issue_block(&*kernel) - // ); - // log::debug!("kernel: {:#?}", &*kernel); - - if !kernel.no_more_blocks_to_run() && core.can_issue_block(&*kernel) { - // core.issue_block(Arc::clone(kernel)); + if !kernel.no_more_blocks_to_run() && core.can_issue_block(&kernel) { core.issue_block(kernel); num_blocks_issued += 1; *block_issue_next_core = core_id; diff --git a/src/ported/core.rs b/src/ported/core.rs index 8a9defdc..c2337145 100644 --- a/src/ported/core.rs +++ b/src/ported/core.rs @@ -1,8 +1,8 @@ use super::instruction::WarpInstruction; use super::scheduler::SchedulerWarp; use super::{ - address, barrier, cache, opcodes, operand_collector as opcoll, register_set, scoreboard, - simd_function_unit as fu, KernelInfo, LoadStoreUnit, + address, barrier, cache, kernel::Kernel, opcodes, operand_collector as opcoll, register_set, + scoreboard, simd_function_unit as fu, LoadStoreUnit, }; use super::{interconn as ic, l1, mem_fetch, scheduler as sched}; use crate::config::{self, GPUConfig}; @@ -41,9 +41,9 @@ pub type WarpMask = BitArr!(for WARP_PER_CTA_MAX); /// Start of the program memory space /// /// Note: should be distinct from other memory spaces. 
-pub const PROGRAM_MEM_START: usize = 0xF0000000; +pub const PROGRAM_MEM_START: usize = 0xF000_0000; -pub const PROGRAM_MEM_ALLOC: Lazy = Lazy::new(|| super::Allocation { +pub static PROGRAM_MEM_ALLOC: Lazy = Lazy::new(|| super::Allocation { name: Some("PROGRAM_MEM".to_string()), id: 0, start_addr: PROGRAM_MEM_START as super::address, @@ -53,25 +53,19 @@ pub const PROGRAM_MEM_ALLOC: Lazy = Lazy::new(|| super::Alloc #[derive(Debug)] pub struct ThreadState { pub active: bool, - // pub block_id: usize, - // pub active: bool, pub pc: usize, } #[derive(Debug, Default)] pub struct InstrFetchBuffer { valid: bool, - pc: address, - num_bytes: usize, warp_id: usize, } impl InstrFetchBuffer { - pub fn new() -> Self { + #[must_use] pub fn new() -> Self { Self { valid: false, - pc: 0, - num_bytes: 0, warp_id: 0, } } @@ -87,12 +81,12 @@ pub struct InnerSIMTCore { pub warp_instruction_unique_uid: Arc, pub stats: Arc>, pub config: Arc, - pub current_kernel: Option>, + pub current_kernel: Option>, pub last_warp_fetched: Option, pub interconn: Arc, pub load_store_unit: Arc>>>, pub active_thread_mask: BitArr!(for MAX_THREAD_PER_SM), - pub occupied_hw_thread_ids: BitArr!(for MAX_THREAD_PER_SM), + occupied_hw_thread_ids: BitArr!(for MAX_THREAD_PER_SM), pub dynamic_warp_id: usize, pub num_active_blocks: usize, pub num_active_warps: usize, @@ -107,9 +101,8 @@ pub struct InnerSIMTCore { pub allocations: Rc>, pub instr_l1_cache: Box, pub instr_fetch_buffer: InstrFetchBuffer, - pub warps: Vec, + pub warps: Vec, pub thread_state: Vec>, - // pub thread_info: Vec>, pub scoreboard: Arc>, pub operand_collector: Rc>, pub pipeline_reg: Vec>>, @@ -134,7 +127,7 @@ pub enum Packet { impl std::fmt::Display for Packet { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { - Packet::Fetch(fetch) => write!(f, "{}", fetch), + Packet::Fetch(fetch) => write!(f, "{fetch}"), } } } @@ -146,7 +139,7 @@ where // Returns numbers of addresses in translated_addrs. 
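// Worked example (illustrative numbers, not from a real config): with
// `max_concurrent_threads = 2048` and `thread_base = 0`, the 4-byte word at
// `local_addr = 8` has word index `8 / 4 = 2` and is linearized to
// `2 * 2048 * 4 + 0 + LOCAL_GENERIC_START = 16384 + LOCAL_GENERIC_START`,
// so consecutive words of one thread are strided by
// `max_concurrent_threads * 4` bytes in the generic local space.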
// // Each addr points to a 4B (32-bit) word - pub fn translate_local_memaddr( + #[must_use] pub fn translate_local_memaddr( &self, local_addr: address, thread_id: usize, @@ -206,7 +199,7 @@ where // will overflow into next thread's space debug_assert_eq!(local_addr % 4, 0); for i in 0..num_accesses { - let local_word = local_addr / 4 + (i as u64); + let local_word = local_addr / 4 + u64::from(i); let linear_address: address = local_word * max_concurrent_threads as u64 * 4 + thread_base as u64 + super::instruction::LOCAL_GENERIC_START; @@ -218,14 +211,14 @@ where let local_word = local_addr / 4; let local_word_offset = local_addr % 4; // Make sure access doesn't overflow into next 4B chunk - debug_assert_eq!((local_addr + data_size as address - 1) / 4, local_word); + debug_assert_eq!((local_addr + u64::from(data_size) - 1) / 4, local_word); let linear_address: address = local_word * max_concurrent_threads as u64 * 4 + local_word_offset + thread_base as u64 + super::instruction::LOCAL_GENERIC_START; translated_addresses.push(linear_address); } - return translated_addresses; + translated_addresses } } @@ -390,17 +383,6 @@ where if warp.done() && warp.functional_done() { warp.ibuffer_flush(); - // note: not modeling barriers for now - // self.barriers.warp_exit(pipe_reg_ref.warp_id); - } - - // let mut warp = self.warps.get_mut(warp_id).unwrap().lock().unwrap(); - if pipe_reg_ref.opcode.category == opcodes::ArchOp::BARRIER_OP { - // m_warp[warp_id]->store_info_of_last_inst_at_barrier(*pipe_reg); - // self.barriers.warp_reaches_barrier(warp.block_id, warp_id, next_inst); - } else if pipe_reg_ref.opcode.category == opcodes::ArchOp::MEMORY_BARRIER_OP { - // m_warp[warp_id]->set_membar(); - // warp.set_membar(); } log::debug!( @@ -550,22 +532,6 @@ where }) .collect(); - // SKIPPING SPECIALIZED UNITS - // for (int j = 0; j < m_config->m_specialized_unit.size(); j++) { - // m_pipeline_reg.push_back( - // register_set(m_config->m_specialized_unit[j].id_oc_spec_reg_width, - // m_config->m_specialized_unit[j].name)); - // m_config->m_specialized_unit[j].ID_OC_SPEC_ID = m_pipeline_reg.size() - 1; - // m_specilized_dispatch_reg.push_back( - // &m_pipeline_reg[m_pipeline_reg.size() - 1]); - // } - // for (int j = 0; j < m_config->m_specialized_unit.size(); j++) { - // m_pipeline_reg.push_back( - // register_set(m_config->m_specialized_unit[j].oc_ex_spec_reg_width, - // m_config->m_specialized_unit[j].name)); - // m_config->m_specialized_unit[j].OC_EX_SPEC_ID = m_pipeline_reg.size() - 1; - // } - if config.sub_core_model { // in subcore model, each scheduler should has its own // issue register, so ensure num scheduler = reg width @@ -581,20 +547,6 @@ where config.num_schedulers_per_core, pipeline_reg[PipelineStage::ID_OC_MEM as usize].size() ); - // if (m_config->gpgpu_tensor_core_avail) - // assert(m_config->gpgpu_num_sched_per_core == - // m_pipeline_reg[ID_OC_TENSOR_CORE].get_size()); - // if (m_config->gpgpu_num_dp_units > 0) - // assert(m_config->gpgpu_num_sched_per_core == - // m_pipeline_reg[ID_OC_DP].get_size()); - // if (m_config->gpgpu_num_int_units > 0) - // assert(m_config->gpgpu_num_sched_per_core == - // m_pipeline_reg[ID_OC_INT].get_size()); - // for (int j = 0; j < m_config->m_specialized_unit.size(); j++) { - // if (m_config->m_specialized_unit[j].num_units > 0) - // assert(m_config->gpgpu_num_sched_per_core == - // m_config->m_specialized_unit[j].id_oc_spec_reg_width); - // } } let fetch_interconn = Arc::new(ic::CoreMemoryInterface { @@ -609,7 +561,7 @@ where core_id, cluster_id, 
warps.clone(), - fetch_interconn.clone(), + fetch_interconn, operand_collector.clone(), scoreboard.clone(), config.clone(), @@ -634,7 +586,7 @@ where warp_instruction_unique_uid, stats, allocations, - config: config.clone(), + config, current_kernel: None, last_warp_fetched: None, active_thread_mask: BitArray::ZERO, @@ -652,10 +604,10 @@ where instr_fetch_buffer: InstrFetchBuffer::default(), interconn, load_store_unit, - warps: warps.clone(), + warps, pipeline_reg, result_busses, - scoreboard: scoreboard.clone(), + scoreboard, operand_collector, barriers, thread_state, @@ -798,150 +750,52 @@ where } fn init_schedulers(&mut self) { - // let scheduler_kind = config::SchedulerKind::LRR; let scheduler_kind = config::SchedulerKind::GTO; self.schedulers = (0..self.inner.config.num_schedulers_per_core) - .map(|sched_id| match scheduler_kind { - // config::SchedulerKind::LRR => { - // let mem_out = &self.inner.pipeline_reg[PipelineStage::ID_OC_MEM as usize]; - // Box::new(sched::LrrScheduler::new( - // // &self.inner.warps, - // sched_id, - // self.inner.cluster_id, - // self.inner.core_id, - // self.inner.warps.clone(), - // // mem_out, - // // &self.inner, - // self.inner.scoreboard.clone(), - // self.inner.stats.clone(), - // self.inner.config.clone(), - // // m_stats, this, m_scoreboard, m_simt_stack, &m_warp, - // // &m_pipeline_reg[ID_OC_SP], &m_pipeline_reg[ID_OC_DP], - // // &m_pipeline_reg[ID_OC_SFU], &m_pipeline_reg[ID_OC_INT], - // // &m_pipeline_reg[ID_OC_TENSOR_CORE], m_specilized_dispatch_reg, - // // &m_pipeline_reg[ID_OC_MEM], i - // )) as Box - // // self.schedulers.push_back(Box::new(lrr)); - // } - config::SchedulerKind::GTO => { - Box::new(sched::GTOScheduler::new( - // &self.inner.warps, + .map(|sched_id| { + let scheduler_stats = Arc::new(Mutex::new(stats::scheduler::Scheduler::default())); + match scheduler_kind { + config::SchedulerKind::GTO => Box::new(sched::gto::Scheduler::new( sched_id, self.inner.cluster_id, self.inner.core_id, self.inner.warps.clone(), - // mem_out, - // &self.inner, self.inner.scoreboard.clone(), - self.inner.stats.clone(), + scheduler_stats, self.inner.config.clone(), - // &self.inner.pipeline_reg[PipelineStage::ID_OC_MEM as usize], - // m_stats, this, m_scoreboard, m_simt_stack, &m_warp, - // &m_pipeline_reg[ID_OC_SP], &m_pipeline_reg[ID_OC_DP], - // &m_pipeline_reg[ID_OC_SFU], &m_pipeline_reg[ID_OC_INT], - // &m_pipeline_reg[ID_OC_TENSOR_CORE], m_specilized_dispatch_reg, - // &m_pipeline_reg[ID_OC_MEM], i - - // ORIGINAL PARAMS - // m_stats, - // this, - // m_scoreboard, - // m_simt_stack, - // &m_warp, - // &m_pipeline_reg[ID_OC_SP], - // &m_pipeline_reg[ID_OC_DP], - // &m_pipeline_reg[ID_OC_SFU], - // &m_pipeline_reg[ID_OC_INT], - // &m_pipeline_reg[ID_OC_TENSOR_CORE], - // m_specilized_dispatch_reg, - // &m_pipeline_reg[ID_OC_MEM], - // i, - )) as Box - // schedulers.push_back(gto); + )) + as Box, + scheduler_kind => unimplemented!("scheduler: {:?}", &scheduler_kind), } - // SchedulerKind::TwoLevelActive => { - // Box::new(sched::TwoLevelActiveScheduler::new( - // m_stats, this, m_scoreboard, m_simt_stack, &m_warp, - // &m_pipeline_reg[ID_OC_SP], &m_pipeline_reg[ID_OC_DP], - // &m_pipeline_reg[ID_OC_SFU], &m_pipeline_reg[ID_OC_INT], - // &m_pipeline_reg[ID_OC_TENSOR_CORE], m_specilized_dispatch_reg, - // &m_pipeline_reg[ID_OC_MEM], i, m_config->gpgpu_scheduler_string); - // schedulers.push_back(tla); - // }, - other => todo!("scheduler: {:?}", &other), - // SchedulerKind::RRR => { - // let rrr = RrrScheduler::new( - // m_stats, this, 
m_scoreboard, m_simt_stack, &m_warp, - // &m_pipeline_reg[ID_OC_SP], &m_pipeline_reg[ID_OC_DP], - // &m_pipeline_reg[ID_OC_SFU], &m_pipeline_reg[ID_OC_INT], - // &m_pipeline_reg[ID_OC_TENSOR_CORE], m_specilized_dispatch_reg, - // &m_pipeline_reg[ID_OC_MEM], i); - // schedulers.push_back(rrr); - // }, - // SchedulerKind::OldestFirst => { - // let oldest = OldestScheduler::new( - // m_stats, this, m_scoreboard, m_simt_stack, &m_warp, - // &m_pipeline_reg[ID_OC_SP], &m_pipeline_reg[ID_OC_DP], - // &m_pipeline_reg[ID_OC_SFU], &m_pipeline_reg[ID_OC_INT], - // &m_pipeline_reg[ID_OC_TENSOR_CORE], m_specilized_dispatch_reg, - // &m_pipeline_reg[ID_OC_MEM], i); - // schedulers.push_back(oldest); - // }, - // SchedulerKind::WarpLimiting => { - // let swl = SwlScheduler::new( - // m_stats, this, m_scoreboard, m_simt_stack, &m_warp, - // &m_pipeline_reg[ID_OC_SP], &m_pipeline_reg[ID_OC_DP], - // &m_pipeline_reg[ID_OC_SFU], &m_pipeline_reg[ID_OC_INT], - // &m_pipeline_reg[ID_OC_TENSOR_CORE], m_specilized_dispatch_reg, - // &m_pipeline_reg[ID_OC_MEM], i, m_config->gpgpu_scheduler_string); - // schedulers.push_back(swl); - // }, }) .collect(); - // } for (i, warp) in self.inner.warps.iter().enumerate() { - // distribute i's evenly though schedulers; + // distribute warps evenly though schedulers let sched_idx = i % self.inner.config.num_schedulers_per_core; let scheduler = &mut self.schedulers[sched_idx]; - scheduler.add_supervised_warp(Rc::clone(&warp)); + scheduler.add_supervised_warp(Rc::clone(warp)); } - // for scheduler in self.schedulers.iter_mut() { - // // todo!("call done_adding_supervised_warps"); - // scheduler.done_adding_supervised_warps(); - // } - // for (unsigned i = 0; i < m_config->gpgpu_num_sched_per_core; ++i) { - // schedulers[i]->done_adding_supervised_warps(); - // } } - pub fn active(&self) -> bool { + #[must_use] pub fn active(&self) -> bool { self.inner.num_active_blocks > 0 } /// return the next pc of a thread pub fn next_pc(&mut self, thread_id: usize) -> Option { - // if (tid == -1) return -1; - // PC should already be updatd to next PC at this point (was - // set in shader_decode() last time thread ran) - self.inner - .thread_state - .get(thread_id) - .map(Option::as_ref) - .flatten() - .map(|t| t.pc) + self.inner.thread_state[thread_id].as_ref().map(|t| t.pc) } fn register_thread_in_block_exited( &mut self, block_hw_id: usize, - kernel: &Option>, + kernel: &Option>, ) { let current_kernel: &mut Option<_> = - &mut self.inner.current_kernel.as_ref().map(|k| k.as_ref()); + &mut self.inner.current_kernel.as_ref().map(std::convert::AsRef::as_ref); - // see: m_cta_status debug_assert!(block_hw_id < MAX_CTA_PER_SHADER); debug_assert!(self.inner.block_status[block_hw_id] > 0); self.inner.block_status[block_hw_id] -= 1; @@ -952,22 +806,7 @@ where // m_stats->ctas_completed++; // m_gpu->inc_completed_cta(); self.inner.num_active_blocks -= 1; - // m_barriers.deallocate_barrier(cta_num); - // shader_CTA_count_unlog(m_sid, 1); - // - // SHADER_DPRINTF( - // LIVENESS, - // "GPGPU-Sim uArch: Finished CTA #%u (%lld,%lld), %u CTAs running\n", - // cta_num, m_gpu->gpu_sim_cycle, m_gpu->gpu_tot_sim_cycle, - // m_n_active_cta); - // if self.inner.num_active_blocks == 0 { - // SHADER_DPRINTF( - // LIVENESS, - // "GPGPU-Sim uArch: Empty (last released kernel %u \'%s\').\n", - // kernel->get_uid(), kernel->name().c_str()); - // fflush(stdout); - // // Shader can only be empty when no more cta are dispatched if kernel.as_ref().map(|k| k.config.id) != current_kernel.map(|k| k.config.id) { // 
debug_assert!(current_kernel.is_none() || kernel.no_more_blocks_to_run()); @@ -975,23 +814,14 @@ where *current_kernel = None; } // - // // Jin: for concurrent kernels on sm // self.release_shader_resource_1block(cta_num, kernel); // kernel->dec_running(); if let Some(kernel) = kernel { - if kernel.no_more_blocks_to_run() { - if !kernel.running() { - // SHADER_DPRINTF(LIVENESS, - // "GPGPU-Sim uArch: GPU detected kernel %u \'%s\' " - // "finished on shader %u.\n", - // kernel->get_uid(), kernel->name().c_str(), m_sid); - // - if current_kernel.map(|k| k.config.id) == Some(kernel.config.id) { - *current_kernel = None; - } - - // m_gpu->set_kernel_done(kernel); - } + if kernel.no_more_blocks_to_run() + && !kernel.running() + && current_kernel.map(|k| k.config.id) == Some(kernel.config.id) + { + *current_kernel = None; } } } @@ -1011,7 +841,7 @@ where .cloned() .unwrap_or_default() .iter() - .map(|access| access.to_string()) + .map(std::string::ToString::to_string) .collect::>(), )) .green() @@ -1024,21 +854,16 @@ where let mut warp = warp.try_borrow_mut().unwrap(); warp.has_imiss_pending = false; - let pc = warp.pc().unwrap() as u64; self.inner.instr_fetch_buffer = InstrFetchBuffer { valid: true, - pc, - num_bytes: fetch.data_size as usize, warp_id: fetch.warp_id, }; // verify that we got the instruction we were expecting. - // TODO: this does not work because the fetch.addr() is not the same anymore? - // it gets changed to the block addr on the way and not ever changed back.. - // debug_assert_eq!( - // warp.pc(), - // Some(fetch.addr() as usize - super::PROGRAM_MEM_START) - // ); + debug_assert_eq!( + warp.pc(), + Some(fetch.addr() as usize - super::PROGRAM_MEM_START) + ); self.inner.instr_fetch_buffer.valid = true; // warp.set_last_fetch(m_gpu->gpu_sim_cycle); @@ -1060,7 +885,6 @@ where let sb = self.inner.scoreboard.read().unwrap(); let pending_writes = sb.pending_writes(warp_id); - // .clone(); // if warp.functional_done() && warp.hardware_done() && warp.done_exit() { // continue; @@ -1080,7 +904,6 @@ where ); } - // log!("\n\n"); for i in 0..max_warps { let last = self.inner.last_warp_fetched.unwrap_or(0); let warp_id = (last + 1 + i) % max_warps; @@ -1153,7 +976,6 @@ where } } self.inner.num_active_warps -= 1; - debug_assert!(self.inner.num_active_warps >= 0); } let mut warp = self.inner.warps[warp_id].try_borrow_mut().unwrap(); @@ -1197,13 +1019,14 @@ where let mut num_bytes = 16; let line_size = icache_config.line_size as usize; let offset_in_block = pc & (line_size - 1); - if offset_in_block + num_bytes > line_size as usize { - num_bytes = line_size as usize - offset_in_block; + if offset_in_block + num_bytes > line_size { + num_bytes = line_size - offset_in_block; } + let inst_alloc = &*PROGRAM_MEM_ALLOC; let access = mem_fetch::MemAccess::new( mem_fetch::AccessKind::INST_ACC_R, ppc as u64, - Some(PROGRAM_MEM_ALLOC.clone()), + Some(inst_alloc.clone()), num_bytes as u32, false, // todo: is this correct? 
@@ -1214,7 +1037,7 @@ where let fetch = mem_fetch::MemFetch::new( None, access, - &*self.inner.config, + &self.inner.config, mem_fetch::READ_PACKET_SIZE.into(), warp_id, self.inner.core_id, @@ -1222,7 +1045,6 @@ where ); let status = if self.inner.config.perfect_inst_const_cache { - // shader_cache_access_log(m_sid, INSTRUCTION, 0); cache::RequestStatus::HIT } else { let mut events = Vec::new(); @@ -1240,47 +1062,28 @@ where self.inner.last_warp_fetched = Some(warp_id); if status == cache::RequestStatus::MISS { - // let warp = self.inner.warps.get_mut(warp_id).unwrap(); - // let warp = warp.lock().unwrap(); - // .as_mut() - // .unwrap(); warp.has_imiss_pending = true; // warp.set_last_fetch(m_gpu->gpu_sim_cycle); } else if status == cache::RequestStatus::HIT { self.inner.instr_fetch_buffer = InstrFetchBuffer { valid: true, - pc: pc as u64, - num_bytes, + // pc: pc as u64, + // num_bytes, warp_id, }; // m_warp[warp_id]->set_last_fetch(m_gpu->gpu_sim_cycle); - // delete mf; } else { debug_assert_eq!(status, cache::RequestStatus::RESERVATION_FAIL); - // delete mf; } break; } - // } } } } self.inner.instr_l1_cache.cycle(); } - /// shader core decode pipeline stage - /// - /// NOTE: inst fetch buffer valid after 279 cycles - /// - /// investigate: - /// - fetch buffer becomes valid when icache has access ready - /// - icache has access ready whenm mshrs has next access - /// - mshrs has next access when mshrs::current_response queue is not empty - /// - mshrs::current_response is pushed into by mshr_table::mark_ready - /// - mshr_table::mark_ready is called by baseline_cache::fill - /// - only trace_shader_core_ctx::accept_fetch_response calls baseline_cache::fill - /// - only void simt_core_cluster::icnt_cycle() calls accept_fetch_response when there is a - /// response + /// Shader core decode fn decode(&mut self) { let InstrFetchBuffer { valid, warp_id, .. 
} = self.inner.instr_fetch_buffer; @@ -1365,7 +1168,7 @@ where instr, ); - warp.ibuffer_fill(slot, instr.clone()); + warp.ibuffer_fill(slot, instr); warp.num_instr_in_pipeline += 1; } @@ -1522,8 +1325,7 @@ where .inner .result_busses .iter_mut() - .filter(|bus| !bus[instr.latency]) - .next(); + .find(|bus| !bus[instr.latency]); log::debug!( "{} {} (partition issue={}, reg id={:?}) ready for issue to fu[{:03}]={}", @@ -1545,12 +1347,9 @@ where Some(result_bus) if schedule_wb_now => { debug_assert!(instr.latency < fu::MAX_ALU_LATENCY); result_bus.set(instr.latency, true); - // fu.issue(&mut issue_inst); - // let ready_reg = ready_reg.take(); fu.issue(ready_reg.take().unwrap()); } _ if !schedule_wb_now => { - // fu.issue(&mut issue_inst); fu.issue(ready_reg.take().unwrap()); } _ => { @@ -1617,7 +1416,7 @@ where unit.invalidate(); } - pub fn ldst_unit_response_buffer_full(&self) -> bool { + #[must_use] pub fn ldst_unit_response_buffer_full(&self) -> bool { self.inner .load_store_unit .lock() @@ -1625,7 +1424,7 @@ where .response_buffer_full() } - pub fn fetch_unit_response_buffer_full(&self) -> bool { + #[must_use] pub fn fetch_unit_response_buffer_full(&self) -> bool { false } @@ -1639,15 +1438,15 @@ where self.inner.load_store_unit.lock().unwrap().fill(fetch); } - pub fn not_completed(&self) -> usize { + #[must_use] pub fn not_completed(&self) -> usize { self.inner.num_active_threads } - pub fn is_active(&self) -> bool { + #[must_use] pub fn is_active(&self) -> bool { self.inner.num_active_blocks > 0 } - pub fn set_kernel(&mut self, kernel: Arc) { + pub fn set_kernel(&mut self, kernel: Arc) { log::debug!("kernel {} bind to core {:?}", kernel, self.id()); self.inner.current_kernel = Some(kernel); } @@ -1659,16 +1458,19 @@ where ) -> Option { let mut step = 0; while step < self.inner.config.max_threads_per_core { - let hw_thread_id = step; - while hw_thread_id < step + thread_block_size { - if self.inner.occupied_hw_thread_ids[hw_thread_id] { - break; - } - } - // consecutive non-active - if hw_thread_id == step + thread_block_size { + if self.inner.occupied_hw_thread_ids[step..(step + thread_block_size)].not_any() { + // found consecutive non-active break; } + // for hw_thread_id in step..(step + thread_block_size) { + // if self.inner.occupied_hw_thread_ids[hw_thread_id] { + // break; + // } + // } + // consecutive non-active + // if hw_thread_id == step + thread_block_size { + // break; + // } step += thread_block_size; } if step >= self.inner.config.max_threads_per_core { @@ -1676,138 +1478,42 @@ where None } else { if occupy { - for hw_thread_id in step..step + thread_block_size { - self.inner.occupied_hw_thread_ids.set(hw_thread_id, true); - } + self.inner.occupied_hw_thread_ids[step..(step + thread_block_size)].fill(true); + // for hw_thread_id in step..(step + thread_block_size) { + // self.inner.occupied_hw_thread_ids.set(hw_thread_id, true); + // } } Some(step) } } - // int shader_core_ctx::find_available_hwtid(unsigned int cta_size, bool occupy) { - // unsigned int step; - // for (step = 0; step < m_config->n_thread_per_shader; step += cta_size) { - // unsigned int hw_tid; - // for (hw_tid = step; hw_tid < step + cta_size; hw_tid++) { - // if (m_occupied_hwtid.test(hw_tid)) break; - // } - // if (hw_tid == step + cta_size) // consecutive non-active - // break; - // } - // if (step >= m_config->n_thread_per_shader) // didn't find - // return -1; - // else { - // if (occupy) { - // for (unsigned hw_tid = step; hw_tid < step + cta_size; hw_tid++) - // m_occupied_hwtid.set(hw_tid); - // 
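// Sketch of the slice-based occupancy check above (uses the same bitvec
// `BitSlice` calls as this diff; the 16-bit window size is an arbitrary
// example, not a simulator constant):
fn occupancy_window_example() {
    use bitvec::prelude::*;
    let mut occupied = bitarr![0; 16];
    if occupied[4..8].not_any() {
        // all four consecutive hardware thread slots are free: reserve them
        occupied[4..8].fill(true);
    }
    assert!(occupied[4..8].all());
}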
} - // return step; - // } - // } - - pub fn occupy_resource_for_block(&mut self, kernel: &KernelInfo, _occupy: bool) -> bool { - let thread_block_size = self.inner.config.threads_per_block_padded(kernel); - if self.inner.num_occupied_threads + thread_block_size - > self.inner.config.max_threads_per_core - { - return false; - } - if self - .find_available_hw_thread_id(thread_block_size, false) - .is_none() - { - return false; - } - unimplemented!("occupy resource for block"); - return true; - } - - // bool shader_core_ctx::occupy_shader_resource_1block(kernel_info_t &k, - // bool occupy) { - // unsigned threads_per_cta = k.threads_per_cta(); - // const class function_info *kernel = k.entry(); - // unsigned int padded_cta_size = threads_per_cta; - // unsigned int warp_size = m_config->warp_size; - // if (padded_cta_size % warp_size) - // padded_cta_size = ((padded_cta_size / warp_size) + 1) * (warp_size); - // - // if (m_occupied_n_threads + padded_cta_size > m_config->n_thread_per_shader) - // return false; - // - // if (find_available_hwtid(padded_cta_size, false) == -1) return false; - // - // const struct gpgpu_ptx_sim_info *kernel_info = ptx_sim_kernel_info(kernel); - // - // if (m_occupied_shmem + kernel_info->smem > m_config->gpgpu_shmem_size) - // return false; - // - // unsigned int used_regs = padded_cta_size * ((kernel_info->regs + 3) & ~3); - // if (m_occupied_regs + used_regs > m_config->gpgpu_shader_registers) - // return false; - // - // if (m_occupied_ctas + 1 > m_config->max_cta_per_core) return false; - // - // if (occupy) { - // m_occupied_n_threads += padded_cta_size; - // m_occupied_shmem += kernel_info->smem; - // m_occupied_regs += (padded_cta_size * ((kernel_info->regs + 3) & ~3)); - // m_occupied_ctas++; - // - // SHADER_DPRINTF(LIVENESS, - // "GPGPU-Sim uArch: Occupied %u threads, %u shared mem, %u " - // "registers, %u ctas, on shader %d\n", - // m_occupied_n_threads, m_occupied_shmem, m_occupied_regs, - // m_occupied_ctas, m_sid); - // } - // - // return true; - // } - pub fn can_issue_block(&mut self, kernel: &KernelInfo) -> bool { + pub fn can_issue_block(&mut self, kernel: &Kernel) -> bool { let max_blocks = self.inner.config.max_blocks(kernel).unwrap(); if self.inner.config.concurrent_kernel_sm { - unimplemented!("concurrent kernel sm model"); if max_blocks < 1 { return false; } - self.occupy_resource_for_block(kernel, false) + // self.occupy_resource_for_block(kernel, false); + unimplemented!("concurrent kernel sm model"); } else { self.inner.num_active_blocks < max_blocks } } - /// m_not_completed - // pub fn active_warps(&self) -> usize { - // 0 - // } - - fn set_max_blocks(&mut self, kernel: &KernelInfo) -> eyre::Result<()> { + fn set_max_blocks(&mut self, kernel: &Kernel) -> eyre::Result<()> { // calculate the max cta count and cta size for local memory address mapping self.inner.max_blocks_per_shader = self.inner.config.max_blocks(kernel)?; self.inner.thread_block_size = self.inner.config.threads_per_block_padded(kernel); Ok(()) } - pub fn id(&self) -> (usize, usize) { + #[must_use] pub fn id(&self) -> (usize, usize) { (self.inner.cluster_id, self.inner.core_id) } - // pub fn init_warps_from_traces( - // &mut self, - // kernel: &KernelInfo, - // start_thread: usize, - // end_thread: usize, - // ) { - // let start_warp = start_thread / self.inner.config.warp_size; - // let end_warp = (end_thread / self.inner.config.warp_size) - // + if end_thread % self.inner.config.warp_size != 0 { - // 1 - // } else { - // 0 - // }; - pub fn init_warps_from_traces( 
&mut self, - kernel: &Arc, + kernel: &Arc, start_warp: usize, end_warp: usize, ) { @@ -1835,27 +1541,13 @@ where end_thread: usize, block_id: u64, thread_block_size: usize, - kernel: Arc, + kernel: Arc, ) { - // log::debug!( - // "core {:?}: init warps (threads {}..{}) for block {} (hw {})", - // self.id(), - // start_thread, - // end_thread, - // block_id, - // block_hw_id - // ); - log::debug!("kernel: {}", &kernel); - let start_pc = self.next_pc(start_thread); let start_warp = start_thread / self.inner.config.warp_size; let _warp_per_cta = thread_block_size / self.inner.config.warp_size; let end_warp = end_thread / self.inner.config.warp_size - + if end_thread % self.inner.config.warp_size == 0 { - 0 - } else { - 1 - }; + + usize::from(end_thread % self.inner.config.warp_size != 0); for warp_id in start_warp..end_warp { let mut num_active = 0; @@ -1897,17 +1589,11 @@ where pub fn reinit(&mut self, start_thread: usize, end_thread: usize, reset_not_completed: bool) { if reset_not_completed { + self.inner.num_active_warps = 0; self.inner.num_active_threads = 0; self.inner.active_thread_mask.fill(false); - - // Jin: for concurrent kernels on a SM - // m_occupied_n_threads = 0; - // m_occupied_shmem = 0; - // m_occupied_regs = 0; - // m_occupied_ctas = 0; - // m_occupied_hwtid.reset(); - // m_occupied_cta_to_hwtid.clear(); - self.inner.num_active_warps = 0; + self.inner.occupied_block_to_hw_thread_id.clear(); + self.inner.occupied_hw_thread_ids.fill(false); } for t in start_thread..end_thread { self.inner.thread_state[t] = None; @@ -1925,20 +1611,18 @@ where ); for w in start_warp..end_warp { - // log::debug!("reset warp = {}/{}", w + 1, self.inner.warps.len()); self.inner.warps[w].try_borrow_mut().unwrap().reset(); - // simt_stack[i]->reset(); } } - pub fn issue_block(&mut self, kernel: Arc) -> () { + pub fn issue_block(&mut self, kernel: Arc) { log::debug!("core {:?}: issue block", self.id()); if self.inner.config.concurrent_kernel_sm { - let occupied = self.occupy_resource_for_block(&*kernel, true); - assert!(occupied); + // let occupied = self.occupy_resource_for_block(&*kernel, true); + // assert!(occupied); unimplemented!("concurrent kernel sm"); } else { - self.set_max_blocks(&*kernel).unwrap(); + self.set_max_blocks(&kernel).unwrap(); } // kernel.inc_running(); @@ -1956,13 +1640,12 @@ where self.inner.block_status ); let free_block_hw_id = (0..max_blocks_per_core) - .filter(|i| self.inner.block_status[*i] == 0) - .next() + .find(|i| self.inner.block_status[*i] == 0) .unwrap(); // determine hardware threads and warps that will be used for this block let thread_block_size = kernel.threads_per_block(); - let padded_thread_block_size = self.inner.config.threads_per_block_padded(&*kernel); + let padded_thread_block_size = self.inner.config.threads_per_block_padded(&kernel); // hw warp id = hw thread id mod warp size, so we need to find a range // of hardware thread ids corresponding to an integral number of hardware diff --git a/src/ported/deadlock.rs b/src/ported/deadlock.rs new file mode 100644 index 00000000..271179ea --- /dev/null +++ b/src/ported/deadlock.rs @@ -0,0 +1,103 @@ +use super::{core, interconn as ic, mem_fetch, operand_collector, register_set}; + +#[derive(Debug, PartialEq, Eq)] +pub struct State { + pub interconn_to_l2_queue: Vec>, + pub l2_to_interconn_queue: Vec>, + pub l2_to_dram_queue: Vec>, + pub dram_to_l2_queue: Vec>, + pub dram_latency_queue: Vec>, + pub functional_unit_pipelines: Vec>, + // pub operand_collectors: Vec>, + // pub schedulers: Vec>, + // 
functional_unit_pipelines + // schedulers + // operand_collectors +} + +impl State { + #[must_use] + pub fn new(total_cores: usize, num_mem_partitions: usize, num_sub_partitions: usize) -> Self { + Self { + // per sub partition + interconn_to_l2_queue: vec![vec![]; num_sub_partitions], + l2_to_interconn_queue: vec![vec![]; num_sub_partitions], + l2_to_dram_queue: vec![vec![]; num_sub_partitions], + dram_to_l2_queue: vec![vec![]; num_sub_partitions], + // per partition + dram_latency_queue: vec![vec![]; num_mem_partitions], + // per core + functional_unit_pipelines: vec![vec![]; total_cores], + // operand_collectors: vec![None; total_cores], + // schedulers: vec![vec![]; total_cores], + } + } +} + +impl super::MockSimulator +where + I: ic::Interconnect + 'static, +{ + pub fn gather_state(&self) -> State { + let total_cores = self.config.total_cores(); + let num_partitions = self.mem_partition_units.len(); + let num_sub_partitions = self.mem_sub_partitions.len(); + + let mut state = State::new(total_cores, num_partitions, num_sub_partitions); + + for (cluster_id, cluster) in self.clusters.iter().enumerate() { + for (core_id, core) in cluster.cores.lock().unwrap().iter().enumerate() { + let global_core_id = cluster_id * self.config.num_cores_per_simt_cluster + core_id; + assert_eq!(core.inner.core_id, global_core_id); + + // this is the one we will use (unless the assertion is ever false) + let core_id = core.inner.core_id; + + // core: functional units + for (fu_id, fu) in core.functional_units.iter().enumerate() { + let _fu = fu.lock().unwrap(); + let issue_port = core.issue_ports[fu_id]; + let issue_reg: register_set::RegisterSet = core.inner.pipeline_reg + [issue_port as usize] + .borrow() + .clone(); + assert_eq!(issue_port, issue_reg.stage); + + state.functional_unit_pipelines[core_id].push(issue_reg); + } + // core: operand collector + // state.operand_collectors[core_id] = + // Some(core.inner.operand_collector.borrow().clone()); + // core: schedulers + // state.schedulers[core_id].extend(core.schedulers.iter().map(Into::into)); + } + } + for (partition_id, partition) in self.mem_partition_units.iter().enumerate() { + state.dram_latency_queue[partition_id] + .extend(partition.dram_latency_queue.clone().into_iter()); + } + for (sub_id, sub) in self.mem_sub_partitions.iter().enumerate() { + for (dest_queue, src_queue) in [ + ( + &mut state.interconn_to_l2_queue[sub_id], + &sub.borrow().interconn_to_l2_queue, + ), + ( + &mut state.l2_to_interconn_queue[sub_id], + &sub.borrow().l2_to_interconn_queue, + ), + ( + &mut state.l2_to_dram_queue[sub_id], + &sub.borrow().l2_to_dram_queue.lock().unwrap(), + ), + ( + &mut state.dram_to_l2_queue[sub_id], + &sub.borrow().dram_to_l2_queue, + ), + ] { + dest_queue.extend(src_queue.clone().into_iter()); + } + } + state + } +} diff --git a/src/ported/deprecated/core.rs b/src/ported/deprecated/core.rs new file mode 100644 index 00000000..96d42b04 --- /dev/null +++ b/src/ported/deprecated/core.rs @@ -0,0 +1,15 @@ +// pub fn occupy_resource_for_block(&mut self, kernel: &KernelInfo, _occupy: bool) -> bool { +// let thread_block_size = self.inner.config.threads_per_block_padded(kernel); +// if self.inner.num_occupied_threads + thread_block_size +// > self.inner.config.max_threads_per_core +// { +// return false; +// } +// if self +// .find_available_hw_thread_id(thread_block_size, false) +// .is_none() +// { +// return false; +// } +// unimplemented!("occupy resource for block"); +// } diff --git a/src/ported/deprecated/scheduler.rs 
b/src/ported/deprecated/scheduler.rs new file mode 100644 index 00000000..0420fdca --- /dev/null +++ b/src/ported/deprecated/scheduler.rs @@ -0,0 +1,199 @@ +#[derive(Debug)] +pub struct LrrScheduler { + inner: BaseSchedulerUnit, +} + +impl SchedulerUnit for LrrScheduler { + // impl<'a> SchedulerUnit for LrrScheduler<'a> { + fn order_warps( + &mut self, + // out: &mut VecDeque, + // warps: &mut Vec, + // last_issued_warps: &Vec, + // num_warps_to_add: usize, + ) { + self.inner.order_lrr(); + // let num_warps_to_add = self.inner.supervised_warps.len(); + // order_lrr( + // &mut self.inner.next_cycle_prioritized_warps, + // &mut self.inner.supervised_warps, + // &mut self.inner.last_supervised_issued_idx, + // // &mut self.inner.last_supervised_issued(), + // num_warps_to_add, + // ); + } + + fn add_supervised_warp(&mut self, warp: CoreWarp) { + self.inner.supervised_warps.push_back(warp); + // self.inner.add_supervised_warp_id(warp_id); + } + + fn prioritized_warps(&self) -> &VecDeque { + self.inner.prioritized_warps() + } + + // fn add_supervised_warp_id(&mut self, warp_id: usize) { + // self.inner.add_supervised_warp_id(warp_id); + // } + + // fn done_adding_supervised_warps(&mut self) { + // self.inner.last_supervised_issued_idx = self.inner.supervised_warps.len(); + // } + + // fn cycle(&mut self, core: &mut super::core::InnerSIMTCore) { + // fn cycle(&mut self, core: ()) { + fn cycle(&mut self, issuer: &mut dyn super::core::WarpIssuer) { + self.order_warps(); + self.inner.cycle(issuer); + } +} + +// impl<'a> LrrScheduler<'a> { +impl LrrScheduler { + // fn order_warps( + // &self, + // out: &mut VecDeque, + // warps: &mut Vec, + // last_issued_warps: &Vec, + // num_warps_to_add: usize, + // ) { + // todo!("scheduler unit: order warps") + // } + + // pub fn new( + // id: usize, + // // warps: &'a Vec, + // warps: Vec, + // // warps: &'a Vec>, + // // mem_out: &'a register_set::RegisterSet, + // // core: &'a super::core::InnerSIMTCore, + // scoreboard: Arc>, + // stats: Arc>, + // config: Arc, + // ) -> Self { + // // todo!("lrr scheduler: new"); + // let inner = BaseSchedulerUnit::new( + // id, // mem_out, core, + // warps, scoreboard, stats, config, + // ); + // Self { inner } + // } + + // lrr_scheduler(shader_core_stats *stats, shader_core_ctx *shader, + // Scoreboard *scoreboard, simt_stack **simt, + // std::vector *warp, register_set *sp_out, + // register_set *dp_out, register_set *sfu_out, + // register_set *int_out, register_set *tensor_core_out, + // std::vector &spec_cores_out, + // register_set *mem_out, int id) + // : scheduler_unit(stats, shader, scoreboard, simt, warp, sp_out, dp_out, + // sfu_out, int_out, tensor_core_out, spec_cores_out, + // mem_out, id) {} + + // virtual void order_warps(); +} + +fn order_rrr( + &mut self, + // out: &mut VecDeque, + // warps: &mut Vec, + // std::vector &result_list, const typename std::vector &input_list, + // const typename std::vector::const_iterator &last_issued_from_input, + // unsigned num_warps_to_add) +) { + unimplemented!("order rrr is untested"); + let num_warps_to_add = self.supervised_warps.len(); + let out = &mut self.next_cycle_prioritized_warps; + // order_lrr( + // &mut self.inner.next_cycle_prioritized_warps, + // &mut self.inner.supervised_warps, + // &mut self.inner.last_supervised_issued_idx, + // // &mut self.inner.last_supervised_issued(), + // num_warps_to_add, + // ); + + out.clear(); + + let current_turn_warp_ref = self.warps.get(self.current_turn_warp).unwrap(); + let current_turn_warp = 
current_turn_warp_ref.try_borrow().unwrap(); + // .as_ref() + // .unwrap(); + + if self.num_issued_last_cycle > 0 + || current_turn_warp.done_exit() + || current_turn_warp.waiting() + { + // std::vector::const_iterator iter = + // (last_issued_from_input == input_list.end()) ? + // input_list.begin() : last_issued_from_input + 1; + + let mut iter = self + .supervised_warps + .iter() + .skip(self.last_supervised_issued_idx + 1) + .chain(self.supervised_warps.iter()); + + for w in iter.take(num_warps_to_add) { + let warp = w.try_borrow().unwrap(); + let warp_id = warp.warp_id; + if !warp.done_exit() && !warp.waiting() { + out.push_back(w.clone()); + self.current_turn_warp = warp_id; + break; + } + } + // for (unsigned count = 0; count < num_warps_to_add; ++iter, ++count) { + // if (iter == input_list.end()) { + // iter = input_list.begin(); + // } + // unsigned warp_id = (*iter)->get_warp_id(); + // if (!(*iter)->done_exit() && !(*iter)->waiting()) { + // result_list.push_back(*iter); + // m_current_turn_warp = warp_id; + // break; + // } + // } + } else { + out.push_back(current_turn_warp_ref.clone()); + } +} + +fn order_lrr( + &mut self, + // out: &mut VecDeque, + // warps: &mut Vec, + // // last_issued_warps: &Vec, + // // last_issued_warps: impl Iterator, + // // last_issued_warps: &mut std::slice::Iter<'_, SchedulerWarp>, + // // last_issued_warps: impl Iterator, + // last_issued_warp_idx: &mut usize, + // num_warps_to_add: usize, +) { + unimplemented!("order lrr is not tested"); + let num_warps_to_add = self.supervised_warps.len(); + let out = &mut self.next_cycle_prioritized_warps; + + debug_assert!(num_warps_to_add <= self.warps.len()); + out.clear(); + // if last_issued_warps + // typename std::vector::const_iterator iter = (last_issued_from_input == input_list.end()) ? input_list.begin() + // : last_issued_from_input + 1; + // + let mut last_issued_iter = self.warps.iter().skip(self.last_supervised_issued_idx); + + let mut iter = last_issued_iter.chain(self.warps.iter()); + // .filter_map(|x| x.as_ref()); + // .filter_map(|x| x.as_ref()); + + out.extend(iter.take(num_warps_to_add).cloned()); + // for count in 0..num_warps_to_add { + // let Some(warp) = iter.next() else { + // return; + // }; + // // if (iter == input_list.end()) { + // // iter = input_list.begin(); + // // } + // out.push_back(warp.clone()); + // } + // todo!("order lrr: order warps") +} diff --git a/src/ported/set_index_function.rs b/src/ported/deprecated/set_index_function.rs similarity index 95% rename from src/ported/set_index_function.rs rename to src/ported/deprecated/set_index_function.rs index 66b6e059..a4e2219e 100644 --- a/src/ported/set_index_function.rs +++ b/src/ported/deprecated/set_index_function.rs @@ -1,10 +1,6 @@ use super::address; use color_eyre::eyre; -pub fn bitwise_hash_function(higher_bits: address, index: usize, bank_set_num: usize) -> u64 { - index as u64 ^ (higher_bits & (bank_set_num as u64 - 1)) -} - /// Set Indexing function from "Pseudo-randomly interleaved memory." /// Rau, B. R et al. /// ISCA 1991 @@ -30,6 +26,7 @@ pub fn bitwise_hash_function(higher_bits: address, index: usize, bank_set_num: u /// IPOLY hashing guarantees conflict-free for all 2^n strides which widely /// exit in GPGPU applications and also show good performance for other /// strides. 
+#[must_use] pub fn ipoly_hash_function(_higher_bits: address, _index: usize, _bank_set_num: usize) -> u64 { todo!("ipoly_hash_function"); } diff --git a/src/ported/dram.rs b/src/ported/dram.rs index bfab2c7c..b8d5d235 100644 --- a/src/ported/dram.rs +++ b/src/ported/dram.rs @@ -55,7 +55,7 @@ impl DRAM { /// DRAM access /// /// Here, we do nothing except logging statistics - /// see: memory_stats_t::memlatstat_dram_access() + /// see: `memory_stats_t::memlatstat_dram_access`() pub fn access(&mut self, fetch: &mem_fetch::MemFetch) { let dram_id = fetch.tlx_addr.chip as usize; let bank = fetch.tlx_addr.bk as usize; @@ -92,7 +92,7 @@ impl DRAM { // todo!("dram: return_queue_top"); // } // - pub fn full(&self, _is_write: bool) -> bool { + #[must_use] pub fn full(&self, _is_write: bool) -> bool { false // let write_queue_size = self.config.dram_frfcfs_write_queue_size; // let sched_queue_size = self.config.dram_frfcfs_sched_queue_size; diff --git a/src/ported/fifo.rs b/src/ported/fifo.rs index a9f28743..a09920e6 100644 --- a/src/ported/fifo.rs +++ b/src/ported/fifo.rs @@ -50,7 +50,7 @@ where .map(|max| max.to_string()) .as_deref() .unwrap_or(""), - self.inner.iter().map(|i| i.to_string()).collect::>() // .join(", ") + self.inner.iter().map(std::string::ToString::to_string).collect::>() // .join(", ") ) // f.debug_list() // .entries(self.inner.iter().map(|i| i)) // i.to_string())) @@ -59,7 +59,7 @@ where } impl FifoQueue { - pub fn iter(&self) -> std::collections::vec_deque::Iter { + #[must_use] pub fn iter(&self) -> std::collections::vec_deque::Iter { self.inner.iter() } } diff --git a/src/ported/instruction.rs b/src/ported/instruction.rs index 66f1eed8..23d90d3a 100644 --- a/src/ported/instruction.rs +++ b/src/ported/instruction.rs @@ -1,3 +1,4 @@ +use super::kernel::Kernel; use super::mem_fetch::{AccessKind, BitString, MemAccess}; use super::opcodes::{ArchOp, Op, Opcode}; use super::{address, mem_fetch, operand_collector as opcoll, scheduler as sched}; @@ -53,12 +54,6 @@ struct TransactionInfo { active_mask: sched::ThreadActiveMask, } -impl TransactionInfo { - pub fn test_bytes(&self, start_bit: usize, end_bit: usize) -> bool { - self.byte_mask[start_bit..end_bit].any() - } -} - pub const MAX_ACCESSES_PER_INSN_PER_THREAD: usize = 8; #[derive(Debug, Default, Clone, PartialEq, Eq, Hash)] @@ -96,7 +91,7 @@ fn line_size_based_tag_func(addr: address, line_size: u64) -> u64 { addr & !(line_size - 1) } -pub const GLOBAL_HEAP_START: u64 = 0xC0000000; +pub const GLOBAL_HEAP_START: u64 = 0xC000_0000; // Volta max shmem size is 96kB pub const SHARED_MEM_SIZE_MAX: u64 = 96 * (1 << 10); // Volta max local mem is 16kB @@ -113,8 +108,6 @@ pub const TOTAL_LOCAL_MEM: u64 = pub const SHARED_GENERIC_START: u64 = GLOBAL_HEAP_START - TOTAL_SHARED_MEM; pub const LOCAL_GENERIC_START: u64 = SHARED_GENERIC_START - TOTAL_LOCAL_MEM; -// const MAX_REG_OPERANDS: usize = 32; - #[derive(Clone, PartialEq, Eq, Hash)] pub struct WarpInstruction { /// Globally unique id for this warp instruction. @@ -125,7 +118,6 @@ pub struct WarpInstruction { /// The ID of the scheduler unit that issued this instruction. pub scheduler_id: Option, pub pc: usize, - // todo: keep? 
pub trace_idx: usize, pub opcode: Opcode, pub active_mask: sched::ThreadActiveMask, @@ -157,7 +149,6 @@ impl std::fmt::Debug for WarpInstruction { f.debug_struct("WarpInstruction") .field("opcode", &self.opcode) .field("warp_id", &self.warp_id) - // .field("empty", &self.empty) .field("pc", &self.pc) .field("active_mask", &self.active_mask.to_bit_string()) .field("memory_space", &self.memory_space) @@ -172,55 +163,26 @@ impl std::fmt::Display for WarpInstruction { } } -// impl Default for WarpInstruction { -// fn default() -> Self { -// let mut threads = [(); 32].map(|_| PerThreadInfo::default()); -// Self { -// uid: 0, -// warp_id: 0, -// scheduler_id: 0, -// opcode: Opcode { -// op: Op::NOP, -// category: ArchOp::NO_OP, -// }, -// pc: 0, -// threads, -// memory_space: MemorySpace::None, -// is_atomic: false, -// active_mask: BitArray::ZERO, -// cache_operator: CacheOperator::UNDEFINED, -// latency: 0, // todo -// initiation_interval: 0, // todo -// data_size: 0, -// empty: true, -// mem_access_queue: VecDeque::new(), -// outputs: [0; 8], -// in_count: 0, -// inputs: [0; 24], -// out_count: 0, -// } -// } -// } - -pub static MAX_WARP_SIZE: usize = 32; +pub const MAX_WARP_SIZE: usize = 32; fn is_number(s: &str) -> bool { !s.is_empty() && s.chars().all(char::is_numeric) } -fn get_data_width_from_opcode(opcode: &str) -> Result { - let opcode_tokens: Vec<_> = opcode - .split(".") - .map(|t| t.trim()) +fn opcode_tokens(opcode: &str) -> impl Iterator { + opcode + .split('.') + .map(str::trim) .filter(|t| !t.is_empty()) - .collect(); +} - for token in opcode_tokens { +fn get_data_width_from_opcode(opcode: &str) -> Result { + for token in opcode_tokens(opcode) { assert!(!token.is_empty()); if is_number(token) { return Ok(token.parse::()? / 8); - } else if let Some('U') = token.chars().nth(0) { + } else if let Some('U') = token.chars().next() { if is_number(&token[1..token.len()]) { // handle the U* case return Ok(token[1..token.len()].parse::()? 
/ 8); @@ -233,10 +195,7 @@ fn get_data_width_from_opcode(opcode: &str) -> Result Self { - // let mut threads = [(); config.warp_size].map(|_| PerThreadInfo::default()); - let threads = (0..config.warp_size) - .map(|_| PerThreadInfo::default()) - .collect(); + let threads = vec![PerThreadInfo::default(); config.warp_size]; Self { uid: 0, warp_id: 0, @@ -252,30 +211,21 @@ impl WarpInstruction { is_atomic: false, active_mask: BitArray::ZERO, cache_operator: CacheOperator::UNDEFINED, - latency: 1, // TODO: used to be one - initiation_interval: 1, // TODO: used to be one - issue_cycle: None, // TODO: used to be one + latency: 1, + initiation_interval: 1, + issue_cycle: None, dispatch_delay_cycles: 0, data_size: 0, instr_width: 16, - // empty: true, mem_access_queue: VecDeque::new(), outputs: [None; 8], - // in_count: 0, inputs: [None; 24], - // out_count: 0, - // src_arch_reg: [(); opcoll::MAX_REG_OPERANDS].map(|_| None), src_arch_reg: [None; opcoll::MAX_REG_OPERANDS], dest_arch_reg: [None; opcoll::MAX_REG_OPERANDS], - // dest_arch_reg: [(); opcoll::MAX_REG_OPERANDS].map(|_| None), - // for (unsigned i = 0; i < MAX_REG_OPERANDS; i++) { - // arch_reg.src[i] = -1; - // arch_reg.dst[i] = -1; - // } } } - pub fn from_trace(kernel: &super::KernelInfo, trace: trace::MemAccessTraceEntry) -> Self { + pub fn from_trace(kernel: &Kernel, trace: trace::MemAccessTraceEntry) -> Self { // fill active mask let mut active_mask = BitArray::ZERO; active_mask.store(trace.active_mask); @@ -289,12 +239,12 @@ impl WarpInstruction { let mut dest_arch_reg = [None; opcoll::MAX_REG_OPERANDS]; // get the opcode - let opcode_tokens: Vec<_> = trace.instr_opcode.split(".").collect(); + let opcode_tokens: Vec<_> = trace.instr_opcode.split('.').collect(); debug_assert!(!opcode_tokens.is_empty()); let opcode1 = opcode_tokens[0]; let Some(&opcode) = kernel.opcodes.get(opcode1) else { - panic!("undefined opcode {}", opcode1); + panic!("undefined opcode {opcode1}"); }; // fill regs information @@ -335,23 +285,23 @@ impl WarpInstruction { // handle special cases and fill memory space - let mut memory_op: Option = None; + // let mut memory_op: Option = None; let mut is_atomic = false; - let mut const_cache_operand = false; + // let mut const_cache_operand = false; let mut cache_operator = CacheOperator::UNDEFINED; // TODO: convert to none? 
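// Illustrative mapping (example opcode string, derived from the match arms
// below): a trace opcode such as "LDG.E.64" parses to `Op::LDG`, gets
// `data_size = 64 / 8 = 8` from `get_data_width_from_opcode`, and ends up
// with `MemorySpace::Global` and `CacheOperator::ALL`.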
let mut memory_space = None; match opcode.op { Op::LDC => { - memory_op = Some(MemOp::Load); + // memory_op = Some(MemOp::Load); data_size = 4; - const_cache_operand = true; + // const_cache_operand = true; memory_space = Some(MemorySpace::Constant); cache_operator = CacheOperator::ALL; } Op::LDG | Op::LDL => { assert!(data_size > 0); - memory_op = Some(MemOp::Load); + // memory_op = Some(MemOp::Load); cache_operator = CacheOperator::ALL; memory_space = if opcode.op == Op::LDL { Some(MemorySpace::Local) @@ -365,7 +315,7 @@ impl WarpInstruction { } Op::STG | Op::STL => { assert!(data_size > 0); - memory_op = Some(MemOp::Store); + // memory_op = Some(MemOp::Store); cache_operator = CacheOperator::ALL; memory_space = if opcode.op == Op::STL { Some(MemorySpace::Local) @@ -375,7 +325,7 @@ impl WarpInstruction { } Op::ATOM | Op::RED | Op::ATOMG => { assert!(data_size > 0); - memory_op = Some(MemOp::Load); + // memory_op = Some(MemOp::Load); // op = Op::LOAD; memory_space = Some(MemorySpace::Global); is_atomic = true; @@ -384,18 +334,18 @@ impl WarpInstruction { } Op::LDS => { assert!(data_size > 0); - memory_op = Some(MemOp::Load); + // memory_op = Some(MemOp::Load); memory_space = Some(MemorySpace::Shared); } Op::STS => { assert!(data_size > 0); - memory_op = Some(MemOp::Store); + // memory_op = Some(MemOp::Store); memory_space = Some(MemorySpace::Shared); } Op::ATOMS => { assert!(data_size > 0); is_atomic = true; - memory_op = Some(MemOp::Load); + // memory_op = Some(MemOp::Load); memory_space = Some(MemorySpace::Shared); } Op::LDSM => { @@ -405,11 +355,11 @@ impl WarpInstruction { Op::ST | Op::LD => { assert!(data_size > 0); is_atomic = true; - memory_op = Some(if opcode.op == Op::LD { - MemOp::Load - } else { - MemOp::Store - }); + // memory_op = Some(if opcode.op == Op::LD { + // MemOp::Load + // } else { + // MemOp::Store + // }); // resolve generic loads let trace::KernelLaunch { shared_mem_base_addr, @@ -443,7 +393,7 @@ impl WarpInstruction { Self { uid: 0, - warp_id: trace.warp_id_in_block as usize, // todo: block or sm? + warp_id: trace.warp_id_in_block as usize, scheduler_id: None, opcode, pc: trace.instr_offset as usize, @@ -474,7 +424,7 @@ impl WarpInstruction { } } - pub fn has_dispatch_delay(&self) -> bool { + #[must_use] pub fn has_dispatch_delay(&self) -> bool { self.dispatch_delay_cycles > 0 } @@ -525,21 +475,21 @@ impl WarpInstruction { // m_uid = ++(m_config->gpgpu_ctx->warp_inst_sm_next_uid); - pub fn active_thread_count(&self) -> usize { + #[must_use] pub fn active_thread_count(&self) -> usize { self.active_mask.count_ones() } - pub fn is_load(&self) -> bool { + #[must_use] pub fn is_load(&self) -> bool { let op = self.opcode.category; matches!(op, ArchOp::LOAD_OP | ArchOp::TENSOR_CORE_LOAD_OP) } - pub fn is_store(&self) -> bool { + #[must_use] pub fn is_store(&self) -> bool { let op = self.opcode.category; matches!(op, ArchOp::STORE_OP | ArchOp::TENSOR_CORE_STORE_OP) } - pub fn is_atomic(&self) -> bool { + #[must_use] pub fn is_atomic(&self) -> bool { let op = self.opcode.op; matches!( op, @@ -547,11 +497,11 @@ impl WarpInstruction { ) } - pub fn addr(&self) -> Option
<address> { + #[must_use] pub fn addr(&self) -> Option<address>
{ self.mem_access_queue.front().map(|access| access.addr) } - pub fn access_kind(&self) -> Option { + #[must_use] pub fn access_kind(&self) -> Option { let is_write = self.is_store(); match self.memory_space { Some(MemorySpace::Constant) => Some(AccessKind::CONST_ACC_R), @@ -586,7 +536,7 @@ impl WarpInstruction { let is_write = self.is_store(); // Calculate memory accesses generated by this warp - let mut cache_block_size_bytes = 0; + // let mut cache_block_size_bytes = 0; // Number of portions a warp is divided into for // shared memory bank conflict check @@ -597,9 +547,6 @@ impl WarpInstruction { Some(MemorySpace::Shared) => { let subwarp_size = config.warp_size / warp_parts; let mut total_accesses = 0; - // dbg!(&warp_parts); - // dbg!(&config.warp_size); - // dbg!(&subwarp_size); let mut banks = Vec::new(); let mut words = Vec::new(); @@ -613,17 +560,14 @@ impl WarpInstruction { if !self.active_mask[thread] { continue; } - // dbg!(&thread); - // dbg!(&self.threads[thread].mem_req_addr); let Some(addr) = self.threads[thread].mem_req_addr.first() else { continue; }; // FIXME: deferred allocation of shared memory should not accumulate // across kernel launches - // assert( addr < m_config->gpgpu_shmem_size ); let bank = config.shared_mem_bank(*addr); // line_size_based_tag_func - let word = line_size_based_tag_func(*addr, config::WORD_SIZE as u64); + let word = line_size_based_tag_func(*addr, config::WORD_SIZE); let accesses = bank_accesses.entry(bank).or_default(); *accesses.entry(word).or_default() += 1; @@ -634,7 +578,6 @@ impl WarpInstruction { // dbg!(&bank_accesses); if config.shared_memory_limited_broadcast { - panic!("shmem limited broadcast is used"); // step 2: look for and select a broadcast bank/word if one occurs let mut broadcast_detected = false; let mut broadcast_word_addr = None; @@ -659,7 +602,7 @@ impl WarpInstruction { let mut max_bank_accesses = 0; for (bank, accesses) in &bank_accesses { let mut bank_accesses = 0; - for (_addr, num_accesses) in accesses { + for num_accesses in accesses.values() { bank_accesses += num_accesses; if broadcast_detected && broadcast_bank.is_some_and(|b| b == bank) { for (addr, num_accesses) in accesses { @@ -676,19 +619,15 @@ impl WarpInstruction { } } // step 4: accumulate - total_accesses += max_bank_accesses; + // total_accesses += max_bank_accesses; + unimplemented!("shmem limited broadcast is used"); } else { - // step 2: look for the bank with the maximum number of different - // words accessed + // step 2: look for the bank with the most unique words accessed let max_bank_accesses = bank_accesses .values() - .map(|accesses| accesses.len()) + .map(std::collections::HashMap::len) .max() .unwrap_or(0); - // let mut max_bank_accesses = 0; - // for (bank, accesses) in &bank_accesses.values() { - // max_bank_accesses = max_bank_accesses.max(accesses.len()); - // } // step 3: accumulate total_accesses += max_bank_accesses; } @@ -700,24 +639,23 @@ impl WarpInstruction { debug_assert!(total_accesses > 0); debug_assert!(total_accesses <= config.warp_size); - // panic!("shared mem request"); // shared memory conflicts modeled as larger initiation interval self.dispatch_delay_cycles = total_accesses; - // TODO: shared mem does not generate mem accesses? + // shared mem does not generate mem accesses? 
None } Some(MemorySpace::Texture) => { - if let Some(l1_tex) = &config.tex_cache_l1 { - cache_block_size_bytes = l1_tex.line_size; - } + // if let Some(l1_tex) = &config.tex_cache_l1 { + // cache_block_size_bytes = l1_tex.line_size; + // } None } Some(MemorySpace::Constant) => { - if let Some(l1_const) = &config.const_cache_l1 { - cache_block_size_bytes = l1_const.line_size; - } + // if let Some(l1_const) = &config.const_cache_l1 { + // cache_block_size_bytes = l1_const.line_size; + // } None } Some(MemorySpace::Global | MemorySpace::Local) => { @@ -728,7 +666,7 @@ impl WarpInstruction { unimplemented!("atomics not supported for now"); } else { // here, we return the memory accesses - let accesses = self.memory_coalescing_arch(is_write, access_kind, &config); + let accesses = self.memory_coalescing_arch(is_write, access_kind, config); Some(accesses) } } else { @@ -738,36 +676,10 @@ impl WarpInstruction { ); } } - None => panic!("generate mem accesses but dont have mem space"), - // other => todo!("generate mem accesses[{other:?}]: not yet implemented"), + None => panic!("generate mem accesses for instruction without mem space"), } } - /// this just sets values - // pub fn issue( - // &mut self, - // mask: sched::ThreadActiveMask, - // warp_id: usize, - // cycle: u64, - // dynamic_warp_id: usize, - // scheduler_id: usize, - // ) { - // // assert_eq!(self.active_mask, mask); - // // assert_eq!(self.warp_id, warp_id); - // // assert_eq!(self.scheduler_id, scheduler_id); - // - // self.active_mask = mask; - // self.active_mask = mask; - // // self.id = ++(m_config->gpgpu_ctx->warp_inst_sm_next_uid); - // self.warp_id = warp_id; - // // self.dynamic_warp_id = dynamic_warp_id; - // // self.issue_cycle = cycle; - // // self.cycles = self.initiation_interval; - // // self.cache_hit = false; - // // self.empty = false; - // self.scheduler_id = Some(scheduler_id; - // } - fn memory_coalescing_arch( &self, is_write: bool, @@ -776,20 +688,14 @@ impl WarpInstruction { ) -> Vec { // see the CUDA manual where it discusses coalescing rules // before reading this - // let segment_size = 0; let warp_parts = config.shared_memory_warp_parts; - // let sector_segment_size = false; let coalescing_arch = config.coalescing_arch as usize; - let sector_segment_size = if coalescing_arch >= 20 && coalescing_arch < 39 { + let sector_segment_size = if (20..39).contains(&coalescing_arch) { // Fermi and Kepler, L1 is normal and L2 is sector config.global_mem_skip_l1_data_cache || self.cache_operator == CacheOperator::GLOBAL - } else if coalescing_arch >= 40 { - // Maxwell, Pascal and Volta, L1 and L2 are sectors - // all requests should be 32 bytes - true } else { - false + coalescing_arch >= 40 }; let segment_size = match self.data_size { @@ -852,10 +758,6 @@ impl WarpInstruction { // chunk does this thread access? 
let tx = subwarp_transactions.entry(block_addr).or_default(); // can only write to one segment - // it seems like in trace driven, - // a thread can write to more than one segment - // - // assert(block_address == line_size_based_tag_func(addr+data_size_coales-1,segment_size)); tx.chunk_mask.set(chunk as usize, true); tx.active_mask.set(thread_id, true); @@ -870,7 +772,7 @@ impl WarpInstruction { // it seems like in trace driven, a thread can write to more than one // segment handle this special case - let coalesc_end_addr = addr + data_size_coales as u64 - 1; + let coalesc_end_addr = addr + u64::from(data_size_coales) - 1; if block_addr != line_size_based_tag_func(coalesc_end_addr, segment_size) { let block_addr = line_size_based_tag_func(coalesc_end_addr, segment_size); let chunk = (coalesc_end_addr & 127) / 32; @@ -926,9 +828,6 @@ impl WarpInstruction { mut addr: address, segment_size: u64, ) -> MemAccess { - // dbg!(&tx); - // dbg!(&tx.chunk_mask.to_string()); - debug_assert_eq!(addr & (segment_size - 1), 0); debug_assert!(tx.chunk_mask.count_ones() >= 1); // halves (used to check if 64 byte segment can be @@ -943,23 +842,11 @@ impl WarpInstruction { // only lower 64 bytes used req_size_bytes = 64; halves |= &tx.chunk_mask[0..2]; - // if tx.chunk_mask[0] { - // halves.set(0, true); - // } - // if tx.chunk_mask[1] { - // halves.set(1, true); - // } } else if !lower_half_used && upper_half_used { // only upper 64 bytes used - addr = addr + 64; + addr += 64; req_size_bytes = 64; halves |= &tx.chunk_mask[2..4]; - // if tx.chunk_mask[2] { - // halves.set(0, true); - // } - // if tx.chunk_mask[3] { - // halves.set(1, true); - // } } else { assert!(lower_half_used && upper_half_used); } @@ -967,13 +854,9 @@ impl WarpInstruction { // need to set halves if addr % 128 == 0 { halves |= &tx.chunk_mask[0..2]; - // if (q[0]) h.set(0); - // if (q[1]) h.set(1); } else { debug_assert_eq!(addr % 128, 64); halves |= &tx.chunk_mask[2..4]; - // if (q[2]) h.set(0); - // if (q[3]) h.set(1); } } @@ -983,14 +866,14 @@ impl WarpInstruction { if lower_half_used && !upper_half_used { req_size_bytes = 32; } else if !lower_half_used && upper_half_used { - addr = addr + 32; + addr += 32; req_size_bytes = 32; } else { assert!(lower_half_used && upper_half_used); } } - let access = MemAccess::new( + MemAccess::new( access_kind, addr, None, // we cannot know the allocation start address in this context @@ -999,8 +882,7 @@ impl WarpInstruction { tx.active_mask, tx.byte_mask, tx.chunk_mask, - ); - access + ) } pub fn set_addr(&mut self, thread_id: usize, addr: address) { @@ -1008,44 +890,10 @@ impl WarpInstruction { thread.mem_req_addr[0] = addr; } - // fn set_addresses(&mut self, thread_id: usize, addrs: &[address], count: usize) { pub fn set_addresses(&mut self, thread_id: usize, addresses: Vec
) { let thread = &mut self.threads[thread_id]; for (i, addr) in addresses.into_iter().enumerate() { thread.mem_req_addr[i] = addr; } - - // let max_count = thread.mem_req_addr.len(); - // debug_assert!(count <= max_count); - // let count = count.min(max_count).min(addrs.len()); - // for i in 0..count { - // thread.mem_req_addr[i] = addrs[i]; - // } - } - - // pub fn is_active(&self, thread: usize) -> bool { - // self.active_mask[thread] - // } -} - -pub fn opcode_tokens(opcode: &str) -> Vec<&str> { - opcode - .split(".") - .map(|t| t.trim()) - .filter(|t| !t.is_empty()) - .collect() -} - -pub fn datawidth_for_opcode(opcode: &str) -> u32 { - let tokens = opcode_tokens(opcode); - for t in tokens { - if let Ok(num) = t.parse::() { - return num / 8; - } else if t.chars().nth(0) == Some('U') { - if let Ok(num) = t[1..].parse::() { - return num / 8; - } - } } - 4 // default is 4 bytes } diff --git a/src/ported/interconn.rs b/src/ported/interconn.rs index 30b493a4..6b566e27 100644 --- a/src/ported/interconn.rs +++ b/src/ported/interconn.rs @@ -29,14 +29,14 @@ pub trait Interconnect

<P> { #[derive(Debug)] pub struct ToyInterconnect<P>

{ - pub capacity: Option<usize>, + // pub capacity: Option<usize>, pub num_cores: usize, pub num_mems: usize, pub num_subnets: usize, pub num_nodes: usize, pub num_classes: usize, round_robin_turn: Vec<Vec<Mutex<usize>>>, - input_queue: Vec<Vec<Vec<Mutex<VecDeque<P>>>>>, + // input_queue: Vec<Vec<Vec<Mutex<VecDeque<P>>>>>, output_queue: Vec<Vec<Vec<Mutex<VecDeque<P>>>>>, // deviceID to icntID map // deviceID : Starts from 0 for shaders and then continues until mem nodes @@ -45,41 +45,42 @@ pub struct ToyInterconnect<P>

{ } impl<P>

ToyInterconnect<P>

{ - pub fn new(num_cores: usize, num_mems: usize, capacity: Option<usize>) -> ToyInterconnect<P>

{ + #[must_use] + pub fn new(num_cores: usize, num_mems: usize) -> ToyInterconnect<P>

{ let num_subnets = 2; let num_nodes = num_cores + num_mems; let num_classes = 1; - let mut input_queue: Vec>>>> = Vec::new(); + // let mut input_queue: Vec>>>> = Vec::new(); let mut output_queue: Vec>>>> = Vec::new(); let mut round_robin_turn: Vec>> = Vec::new(); for subnet in 0..num_subnets { - input_queue.push(Vec::new()); + // input_queue.push(Vec::new()); output_queue.push(Vec::new()); round_robin_turn.push(Vec::new()); for node in 0..num_nodes { - input_queue[subnet].push(Vec::new()); + // input_queue[subnet].push(Vec::new()); output_queue[subnet].push(Vec::new()); round_robin_turn[subnet].push(Mutex::new(0)); for _class in 0..num_classes { - input_queue[subnet][node].push(Mutex::new(VecDeque::new())); + // input_queue[subnet][node].push(Mutex::new(VecDeque::new())); output_queue[subnet][node].push(Mutex::new(VecDeque::new())); } } } Self { - capacity, + // capacity, num_cores, num_mems, num_subnets, num_nodes, num_classes, - input_queue, - output_queue, round_robin_turn, + // input_queue, + output_queue, } } } @@ -92,8 +93,8 @@ where // todo: this is not efficient, could keep track of this with a variable self.output_queue .iter() - .flat_map(|x| x) - .flat_map(|x| x) + .flatten() + .flatten() .any(|reqs: &Mutex>| !reqs.lock().unwrap().is_empty()) } @@ -101,10 +102,10 @@ where assert!(self.has_buffer(src_device, size)); let is_memory_node = self.num_subnets > 1 && dest_device >= self.num_cores; - let subnet = if is_memory_node { 1 } else { 0 }; + let subnet = usize::from(is_memory_node); log::debug!( "{}: {size} bytes from device {src_device} to {dest_device} (subnet {subnet})", - style(format!("INTERCONN PUSH {}", packet)).bold(), + style(format!("INTERCONN PUSH {packet}")).bold(), ); let mut queue = self.output_queue[subnet][dest_device][0].lock().unwrap(); @@ -113,7 +114,7 @@ where fn pop(&self, device: usize) -> Option

{ let icnt_id = device; - let subnet = if device >= self.num_cores { 1 } else { 0 }; + let subnet = usize::from(device >= self.num_cores); let mut lock = self.round_robin_turn[subnet][icnt_id].lock().unwrap(); let mut turn = *lock; @@ -137,15 +138,16 @@ where // do nothing } - fn has_buffer(&self, device: usize, _size: u32) -> bool { - let Some(capacity) = self.capacity else { - return true; - }; - - // TODO: using input queue makes no sense as we push into output directly - let subnet = if device >= self.num_cores { 1 } else { 0 }; - let queue = self.input_queue[subnet][device][0].lock().unwrap(); - queue.len() <= capacity + fn has_buffer(&self, _device: usize, _size: u32) -> bool { + true + // let Some(capacity) = self.capacity else { + // return true; + // }; + // + // // TODO: using input queue makes no sense as we push into output directly + // let subnet = usize::from(device >= self.num_cores); + // let queue = self.input_queue[subnet][device][0].lock().unwrap(); + // queue.len() <= capacity } } @@ -180,7 +182,7 @@ impl MemFetchInterface for CoreMemoryInterface { let request_size = if write { size } else { - mem_fetch::READ_PACKET_SIZE as u32 + u32::from(mem_fetch::READ_PACKET_SIZE) }; !self.interconn.has_buffer(self.cluster_id, request_size) } @@ -197,9 +199,7 @@ impl MemFetchInterface for CoreMemoryInterface { } let dest_sub_partition_id = fetch.sub_partition_id(); - let mem_dest = self - .config - .mem_id_to_device_id(dest_sub_partition_id as usize); + let mem_dest = self.config.mem_id_to_device_id(dest_sub_partition_id); log::debug!( "cluster {} icnt_inject_request_packet({}) dest sub partition id={} dest mem node={}", @@ -269,7 +269,7 @@ mod tests { let config = IntersimConfig::from_file(&config_file)?; - assert_eq!(config.get_bool("use_map"), false); + assert!(!config.get_bool("use_map")); assert_eq!(config.get_int("num_vcs"), 1); // this means vc can only ever be zero assert_eq!(config.get_int("ejection_buffer_size"), 0); assert_eq!(config.get_string("sim_type"), "gpgpusim"); diff --git a/src/ported/kernel.rs b/src/ported/kernel.rs new file mode 100644 index 00000000..ecf0797d --- /dev/null +++ b/src/ported/kernel.rs @@ -0,0 +1,212 @@ +use super::{instruction, opcodes, scheduler as sched}; +use color_eyre::{ + eyre::{self}, + Help, +}; +use std::collections::HashSet; +use std::path::Path; +use std::sync::{Mutex, RwLock}; +use std::time::Instant; +use trace_model::{KernelLaunch, MemAccessTraceEntry, Point}; + +pub fn read_trace(path: impl AsRef) -> eyre::Result> { + use serde::Deserializer; + + let reader = utils::fs::open_readable(path.as_ref())?; + let mut reader = rmp_serde::Deserializer::new(reader); + let mut trace = vec![]; + let decoder = nvbit_io::Decoder::new(|access: MemAccessTraceEntry| { + trace.push(access); + }); + reader.deserialize_seq(decoder).suggestion("maybe the traces does not match the most recent binary trace format, try re-generating the traces.")?; + Ok(trace) +} + +/// Kernel represents a kernel. +/// +/// This includes its launch configuration, +/// as well as its state of execution. 
+#[derive(Debug)] +pub struct Kernel { + pub opcodes: &'static opcodes::OpcodeMap, + pub config: KernelLaunch, + trace: Vec, + trace_pos: RwLock, + launched: Mutex, + num_cores_running: usize, +} + +impl PartialEq for Kernel { + fn eq(&self, other: &Self) -> bool { + self.id() == other.id() + } +} + +impl std::fmt::Display for Kernel { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Kernel") + .field("name", &self.name()) + .field("id", &self.id()) + .finish() + } +} + +impl Kernel { + pub fn from_trace(traces_dir: impl AsRef, config: KernelLaunch) -> Self { + let start = Instant::now(); + log::info!( + "parsing kernel for launch {:?} from {}", + &config, + &config.trace_file + ); + let trace_path = traces_dir + .as_ref() + .join(&config.trace_file) + .with_extension("msgpack"); + + let trace = read_trace(trace_path).unwrap(); + + // sanity check + assert!(trace_model::is_valid_trace(&trace)); + + // check if grid size is equal to the number of unique blocks in the trace + let all_blocks: HashSet<_> = trace.iter().map(|t| &t.block_id).collect(); + log::info!( + "parsed kernel trace for {:?}: {}/{} blocks in {:?}", + config.name, + all_blocks.len(), + config.grid.size(), + start.elapsed() + ); + assert_eq!(config.grid.size(), all_blocks.len() as u64); + + let opcodes = opcodes::get_opcode_map(&config).unwrap(); + + Self { + config, + trace, + trace_pos: RwLock::new(0), + opcodes, + launched: Mutex::new(false), + num_cores_running: 0, + } + } + + pub fn shared_memory_bytes_human_readable(&self) -> String { + human_bytes::human_bytes(f64::from(self.config.shared_mem_bytes)) + } + + pub fn set_launched(&self) { + *self.launched.lock().unwrap() = true; + } + + pub fn launched(&self) -> bool { + *self.launched.lock().unwrap() + } + + pub fn id(&self) -> u64 { + self.config.id + } + + pub fn next_threadblock_traces(&self, warps: &mut [sched::WarpRef]) { + let mut trace_pos = self.trace_pos.write().unwrap(); + + let mut instructions = 0; + let trace_size = self.trace.len(); + + if *trace_pos + 1 >= trace_size || trace_size == 0 { + // no more threadblocks + log::info!("blocks done: no more threadblock traces"); + return; + } + let next_block = &self.trace[*trace_pos + 1].block_id; + + while *trace_pos < trace_size { + let entry = &self.trace[*trace_pos]; + if entry.block_id != *next_block { + // get instructions until new block + break; + } + + let warp_id = entry.warp_id_in_block as usize; + let instr = instruction::WarpInstruction::from_trace(self, entry.clone()); + let warp = warps.get_mut(warp_id).unwrap(); + let mut warp = warp.try_borrow_mut().unwrap(); + warp.push_trace_instruction(instr); + + instructions += 1; + *trace_pos += 1; + } + + log::debug!( + "added {instructions} instructions ({} per warp) for block {next_block}", + instructions / warps.len() + ); + debug_assert!(instructions > 0); + // debug_assert!(instructions % 32 == 0); + // dbg!(warps + // .iter() + // .map(|w| w.try_borrow().unwrap().trace_instructions.len()) + // .collect::>()); + // debug_assert!( + // warps + // .iter() + // .map(|w| w.try_borrow().unwrap().trace_instructions.len()) + // .collect::>() + // .len() + // == 1, + // "all warps have the same number of instructions" + // ); + // dbg!(warps + // .iter() + // .map(|w| w.try_borrow().unwrap().trace_instructions.len()) + // .collect::>()); + + debug_assert!( + warps + .iter() + .all(|w| !w.try_borrow().unwrap().trace_instructions.is_empty()), + "all warps have at least one instruction (need at least an EXIT)" + ); + } + + 
pub fn inc_running(&mut self) { + self.num_cores_running += 1; + } + + pub fn name(&self) -> &str { + &self.config.name + } + + pub fn was_launched(&self) -> bool { + *self.launched.lock().unwrap() + } + + pub fn running(&self) -> bool { + self.num_cores_running > 0 + } + + pub fn current_block(&self) -> Option { + let traces_pos = self.trace_pos.read().unwrap(); + let trace = self.trace.get(*traces_pos)?; + Some(Point::new(trace.block_id.clone(), self.config.grid.clone())) + } + + pub fn done(&self) -> bool { + self.no_more_blocks_to_run() && !self.running() + } + + pub fn num_blocks(&self) -> usize { + let grid = &self.config.grid; + grid.x as usize * grid.y as usize * grid.z as usize + } + + pub fn threads_per_block(&self) -> usize { + let block = &self.config.block; + block.x as usize * block.y as usize * block.z as usize + } + + pub fn no_more_blocks_to_run(&self) -> bool { + self.current_block().is_none() + } +} diff --git a/src/ported/l1/base.rs b/src/ported/l1/base.rs index 02433e63..e8251491 100644 --- a/src/ported/l1/base.rs +++ b/src/ported/l1/base.rs @@ -38,18 +38,18 @@ impl BandwidthManager { } /// Use the data port based on the outcome and - /// events generated by the mem_fetch request + /// events generated by the `mem_fetch` request pub fn use_data_port( &mut self, data_size: u32, access_status: cache::RequestStatus, - events: &mut Vec, + events: &mut [cache::Event], ) { let port_width = self.config.data_port_width() as u32; match access_status { cache::RequestStatus::HIT => { let mut data_cycles = data_size / port_width; - data_cycles += if data_size % port_width > 0 { 1 } else { 0 }; + data_cycles += u32::from(data_size % port_width > 0); self.data_port_occupied_cycles += data_cycles as usize; } cache::RequestStatus::HIT_RESERVED | cache::RequestStatus::MISS => { @@ -139,7 +139,7 @@ impl PendingRequest {} /// Base cache /// -/// Implements common functions for read_only_cache and data_cache +/// Implements common functions for `read_only_cache` and `data_cache` /// Each subclass implements its own 'access' function #[derive()] pub struct Base @@ -198,12 +198,8 @@ impl Base { config: Arc, cache_config: Arc, ) -> Self { - // for now we initialize the tag array and mshr - - // m_tag_array(new tag_array(config, core_id, type_id)), - let tag_array = tag_array::TagArray::new(core_id, 0, cache_config.clone()); + let tag_array = tag_array::TagArray::new(cache_config.clone()); - // m_mshrs(config.m_mshr_entries, config.m_mshr_max_merge), debug_assert!(matches!( cache_config.mshr_kind, mshr::Kind::ASSOC | mshr::Kind::SECTOR_ASSOC @@ -226,15 +222,12 @@ impl Base { pending: HashMap::new(), miss_queue: VecDeque::new(), miss_queue_status: mem_fetch::Status::INITIALIZED, - // write_alloc_type: mem_fetch::AccessKind::L1_WR_ALLOC_R, - // write_back_type: mem_fetch::AccessKind::L1_WRBK_ACC, } } /// Checks whether this request can be handled in this cycle. /// - /// `n` equals the number of misses to be handled on - /// this cycle. + /// `n` equals the number of misses to be handled in this cycle. pub fn miss_queue_can_fit(&self, n: usize) -> bool { self.miss_queue.len() + n < self.cache_config.miss_queue_size } @@ -248,7 +241,7 @@ impl Base { /// Checks if fetch is waiting to be filled by lower memory level pub fn waiting_for_fill(&self, fetch: &mem_fetch::MemFetch) -> bool { - self.pending.contains_key(&fetch) + self.pending.contains_key(fetch) } /// Are any (accepted) accesses that had to wait for memory now ready? 
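The `use_data_port` and `miss_queue_can_fit` changes above reduce to two small invariants: a HIT occupies the data port for a ceiling-divided number of cycles, and a cycle's misses are only accepted if they all fit under the miss-queue capacity. A tiny sketch of both, using assumed free-standing signatures rather than the crate's real types:

```rust
/// Cycles a cache HIT occupies the data port: integer division of the
/// transferred bytes by the port width, plus one extra cycle for any
/// remainder (the branch-free `u32::from(..)` form used in the diff).
fn data_port_cycles(data_size: u32, port_width: u32) -> u32 {
    data_size / port_width + u32::from(data_size % port_width > 0)
}

/// A cycle's `n` new misses are only accepted if they all fit below the
/// configured miss-queue capacity (strict `<`, mirroring the original).
fn miss_queue_can_fit(queued: usize, n: usize, capacity: usize) -> bool {
    queued + n < capacity
}

fn main() {
    // a 32-byte sector over a 16-byte port takes 2 cycles, 40 bytes take 3
    assert_eq!(data_port_cycles(32, 16), 2);
    assert_eq!(data_port_cycles(40, 16), 3);

    // with capacity 8 and 6 requests queued, one more miss fits, two do not
    assert!(miss_queue_can_fit(6, 1, 8));
    assert!(!miss_queue_can_fit(6, 2, 8));
}
```

Replacing `if rem > 0 { 1 } else { 0 }` with `u32::from(rem > 0)` keeps the same ceiling-division arithmetic, just without the branch.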
@@ -355,7 +348,7 @@ impl Base { ); // change address to mshr block address - fetch.data_size = self.cache_config.atom_size() as u32; + fetch.data_size = self.cache_config.atom_size(); fetch.access.addr = mshr_addr; self.mshrs.add(mshr_addr, fetch.clone()); @@ -467,8 +460,8 @@ where other => unimplemented!("cache allocate policy {:?} is not implemented", other), } - let access_sector_mask = fetch.access_sector_mask().clone(); - let access_byte_mask = fetch.access_byte_mask().clone(); + let access_sector_mask = *fetch.access_sector_mask(); + let access_byte_mask = *fetch.access_byte_mask(); let has_atomic = self .mshrs @@ -518,9 +511,8 @@ mod tests { let cache_stats = Arc::new(Mutex::new(stats::Cache::default())); let cache_config = config.data_cache_l1.clone().unwrap(); - let stats = Arc::new(Mutex::new(stats::Stats::from_config(&*config))); - let interconn: Arc> = - Arc::new(ic::ToyInterconnect::new(0, 0, None)); + let stats = Arc::new(Mutex::new(stats::Stats::from_config(&config))); + let interconn: Arc> = Arc::new(ic::ToyInterconnect::new(0, 0)); let port = Arc::new(ic::CoreMemoryInterface { interconn, cluster_id: 0, diff --git a/src/ported/l1/data.rs b/src/ported/l1/data.rs index d7373043..e476f064 100644 --- a/src/ported/l1/data.rs +++ b/src/ported/l1/data.rs @@ -50,7 +50,7 @@ where } } - pub fn cache_config(&self) -> &Arc { + #[must_use] pub fn cache_config(&self) -> &Arc { &self.inner.cache_config } @@ -61,7 +61,7 @@ where cache_index: Option, fetch: mem_fetch::MemFetch, time: u64, - _events: &mut Vec, + _events: &mut [cache::Event], _probe_status: cache::RequestStatus, ) -> cache::RequestStatus { debug_assert_eq!(addr, fetch.addr()); @@ -119,7 +119,7 @@ where // cache_index: usize, fetch: mem_fetch::MemFetch, time: u64, - _events: &mut Vec, + _events: &mut [cache::Event], _probe_status: cache::RequestStatus, ) -> cache::RequestStatus { let super::base::Base { @@ -143,7 +143,7 @@ where tag_array.num_dirty += 1; } } - return cache::RequestStatus::HIT; + cache::RequestStatus::HIT } /// Sends write request to lower level memory (write or writeback) @@ -168,11 +168,9 @@ where &mut self, addr: address, cache_index: Option, - // cache_index: usize, fetch: mem_fetch::MemFetch, time: u64, events: &mut Vec, - // events: &[cache::Event], _probe_status: cache::RequestStatus, ) -> cache::RequestStatus { if !self.inner.miss_queue_can_fit(1) { @@ -213,25 +211,22 @@ where // (already modified lower level) if writeback && writeback_policy != config::CacheWritePolicy::WRITE_THROUGH { if let Some(evicted) = evicted { - let debug_fetch = fetch.to_string(); - let is_write = true; let writeback_access = mem_fetch::MemAccess::new( self.write_back_type, evicted.block_addr, evicted.allocation.clone(), - evicted.modified_size as u32, + evicted.modified_size, is_write, *fetch.access_warp_mask(), evicted.byte_mask, evicted.sector_mask, ); - // dbg!(&writeback_access); let mut writeback_fetch = mem_fetch::MemFetch::new( - fetch.instr, + fetch.instr.clone(), writeback_access, - &*self.inner.config, + &self.inner.config, if is_write { ported::WRITE_PACKET_SIZE } else { @@ -242,7 +237,6 @@ where 0, 0, ); - // dbg!(&writeback_fetch); // the evicted block may have wrong chip id when // advanced L2 hashing is used, so set the right chip @@ -256,7 +250,7 @@ where log::trace!( "handling READ MISS for {}: => sending writeback {}", - debug_fetch, + fetch, writeback_fetch ); @@ -266,7 +260,7 @@ where return cache::RequestStatus::MISS; } - return cache::RequestStatus::RESERVATION_FAIL; + 
cache::RequestStatus::RESERVATION_FAIL } fn write_miss_no_write_allocate( @@ -297,7 +291,6 @@ where } // on miss, generate write through - // (no write buffering -- too many threads for that) let event = cache::Event { kind: cache::EventKind::WRITE_REQUEST_SENT, evicted_block: None, @@ -331,7 +324,7 @@ where log::debug!("handling write miss for {} (block addr={}, mshr addr={}, mshr hit={} mshr avail={}, miss queue full={})", &fetch, block_addr, mshr_addr, mshr_hit, mshr_free, self.inner.miss_queue_can_fit(2)); - if !self.inner.miss_queue_can_fit(2) || (!(mshr_hit && mshr_free) && !mshr_miss_but_free) { + if !self.inner.miss_queue_can_fit(2) || !(mshr_miss_but_free || mshr_hit && mshr_free) { // check what is the exact failure reason let failure = if !self.inner.miss_queue_can_fit(2) { cache::ReservationFailure::MISS_QUEUE_FULL @@ -492,7 +485,7 @@ where cache_index: Option, fetch: mem_fetch::MemFetch, time: u64, - events: &mut Vec, + events: &mut [cache::Event], probe_status: cache::RequestStatus, ) -> cache::RequestStatus { let func = match self.inner.cache_config.write_policy { @@ -556,24 +549,19 @@ where 1, ); } + } else if probe_status == cache::RequestStatus::HIT { + access_status = self.read_hit(addr, cache_index, fetch, time, events, probe_status); + } else if probe_status != cache::RequestStatus::RESERVATION_FAIL { + access_status = self.read_miss(addr, cache_index, fetch, time, events, probe_status); } else { - if probe_status == cache::RequestStatus::HIT { - access_status = self.read_hit(addr, cache_index, fetch, time, events, probe_status); - } else if probe_status != cache::RequestStatus::RESERVATION_FAIL { - access_status = - self.read_miss(addr, cache_index, fetch, time, events, probe_status); - } else { - // the only reason for reservation fail here is LINE_ALLOC_FAIL - // (i.e all lines are reserved) - let mut stats = self.inner.stats.lock().unwrap(); - stats.inc( - *fetch.access_kind(), - cache::AccessStat::ReservationFailure( - cache::ReservationFailure::LINE_ALLOC_FAIL, - ), - 1, - ); - } + // the only reason for reservation fail here is LINE_ALLOC_FAIL + // (i.e all lines are reserved) + let mut stats = self.inner.stats.lock().unwrap(); + stats.inc( + *fetch.access_kind(), + cache::AccessStat::ReservationFailure(cache::ReservationFailure::LINE_ALLOC_FAIL), + 1, + ); } self.inner @@ -742,7 +730,7 @@ mod tests { use super::Data; use crate::config; use crate::ported::{ - self, cache::Cache, instruction, interconn as ic, mem_fetch, parse_commands, + self, cache::Cache, instruction, interconn as ic, kernel::Kernel, mem_fetch, parse_commands, }; use std::collections::VecDeque; use std::path::PathBuf; @@ -808,7 +796,6 @@ mod tests { #[test] fn test_data_l1_full_trace() { let _control_size = 0; - // let warp_id = 0; let core_id = 0; let cluster_id = 0; @@ -824,7 +811,7 @@ mod tests { cycle, interconn, stats.clone(), - config.clone(), + config, Arc::clone(&cache_config.inner), mem_fetch::AccessKind::L1_WR_ALLOC_R, mem_fetch::AccessKind::L1_WRBK_ACC, @@ -832,34 +819,22 @@ mod tests { let trace_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("test-apps/vectoradd/traces/vectoradd-100-32-trace/"); - // let command_traces_path = - // traces_dircommands.json"); dbg!(&trace_dir); let commands: Vec = parse_commands(&trace_dir.join("commands.json")).expect("parse trace commands"); dbg!(&commands); - // let mut kernels: VecDeque> = VecDeque::new(); let mut kernels: VecDeque<_> = VecDeque::new(); for cmd in commands { match cmd { Command::MemcpyHtoD { .. } => {} Command::MemAlloc { .. 
} => {} - // Command::MemcpyHtoD { - // allocation_name, - // dest_device_addr, - // num_bytes, - // } => { - // // sim.memcopy_to_gpu(*dest_device_addr, *num_bytes, allocation_name); - // } Command::KernelLaunch(launch) => { - let kernel = ported::KernelInfo::from_trace(&trace_dir, launch.clone()); - // kernels.push_back(Arc::new(kernel)); + let kernel = Kernel::from_trace(&trace_dir, launch.clone()); kernels.push_back(kernel); } } } - // dbg!(&kernels); // for kernel in &mut kernels { // let mut block_iter = kernel.next_block_iter.lock().unwrap(); @@ -1011,11 +986,11 @@ mod tests { num_registers: 8, binary_version: 61, stream_id: 0, - shared_mem_base_addr: 140663786045440, - local_mem_base_addr: 140663752491008, + shared_mem_base_addr: 140_663_786_045_440, + local_mem_base_addr: 140_663_752_491_008, nvbit_version: "1.5.5".to_string(), }; - let kernel = crate::ported::KernelInfo::from_trace(trace_dir, launch); + let kernel = Kernel::from_trace(trace_dir, launch); let trace_instr = trace_model::MemAccessTraceEntry { cuda_ctx: 0, @@ -1048,10 +1023,10 @@ mod tests { num_src_regs: 0, addrs: concat( [ - 140663086646144, - 140663086646148, - 140663086646152, - 140663086646156, + 140_663_086_646_144, + 140_663_086_646_148, + 140_663_086_646_152, + 140_663_086_646_156, ], [0; 32 - 4], ) @@ -1062,7 +1037,7 @@ mod tests { let mut instr = instruction::WarpInstruction::from_trace(&kernel, trace_instr); dbg!(&instr); let mut accesses = instr - .generate_mem_accesses(&*config) + .generate_mem_accesses(&config) .expect("generated acceseses"); assert_eq!(accesses.len(), 1); diff --git a/src/ported/ldst_unit.rs b/src/ported/ldst_unit.rs index 04c061b5..bb2739af 100644 --- a/src/ported/ldst_unit.rs +++ b/src/ported/ldst_unit.rs @@ -32,7 +32,7 @@ fn new_mem_fetch( mem_fetch::MemFetch::new( Some(instr), access, - &config, + config, control_size, warp_id, core_id, @@ -46,7 +46,7 @@ pub struct LoadStoreUnit { cluster_id: usize, next_writeback: Option, response_fifo: VecDeque, - warps: Vec, + warps: Vec, pub data_l1: Option>, config: Arc, pub stats: Arc>, @@ -91,6 +91,7 @@ enum WritebackClient { } #[derive(strum::EnumCount, strum::FromRepr, Hash, PartialEq, Eq, Clone, Copy, Debug)] +#[allow(dead_code)] #[repr(usize)] enum MemStageAccessKind { C_MEM, @@ -104,6 +105,7 @@ enum MemStageAccessKind { } #[derive(strum::EnumCount, strum::FromRepr, Hash, PartialEq, Eq, Clone, Copy, Debug)] +#[allow(dead_code)] #[repr(usize)] enum MemStageStallKind { NO_RC_FAIL = 0, @@ -125,7 +127,7 @@ where id: usize, core_id: usize, cluster_id: usize, - warps: Vec, + warps: Vec, fetch_interconn: Arc, operand_collector: Rc>, scoreboard: Arc>, @@ -157,7 +159,7 @@ where // initialize l1 data cache let cache_stats = Arc::new(Mutex::new(stats::Cache::default())); Some(Box::new(l1::Data::new( - format!("ldst-unit-{}-{}-L1-DATA-CACHE", cluster_id, core_id), + format!("ldst-unit-{cluster_id}-{core_id}-L1-DATA-CACHE"), core_id, cluster_id, Rc::clone(&cycle), @@ -193,7 +195,7 @@ where } } - pub fn response_buffer_full(&self) -> bool { + #[must_use] pub fn response_buffer_full(&self) -> bool { self.response_fifo.len() >= self.config.num_ldst_response_buffer_size } @@ -341,7 +343,7 @@ where "{}", style(format!( "ldst unit writeback: has global {:?} ({})", - &next_global.instr.as_ref().map(|i| i.to_string()), + &next_global.instr.as_ref().map(std::string::ToString::to_string), &next_global.addr() )) .magenta(), @@ -423,7 +425,7 @@ where _rc_fail: &mut MemStageStallKind, _kind: &mut MemStageAccessKind, ) -> bool { - false + true } fn texture_cycle( 
@@ -431,7 +433,7 @@ where _rc_fail: &mut MemStageStallKind, _kind: &mut MemStageAccessKind, ) -> bool { - false + true } fn memory_cycle( @@ -491,7 +493,7 @@ where } else { mem_fetch::READ_PACKET_SIZE }; - let size = access.req_size_bytes + control_size as u32; + let size = access.req_size_bytes + u32::from(control_size); if self.fetch_interconn.full( size, @@ -669,12 +671,13 @@ where } } + #[allow(dead_code)] fn process_cache_access( &mut self, _cache: (), _addr: address, instr: &mut WarpInstruction, - events: &mut Vec, + events: &mut [cache::Event], fetch: mem_fetch::MemFetch, status: cache::RequestStatus, ) -> MemStageStallKind { @@ -774,7 +777,7 @@ where ); if *still_pending > 0 { - pending.remove(&out_reg); + pending.remove(out_reg); log::trace!("l1 latency queue release registers"); self.scoreboard .write() @@ -834,13 +837,13 @@ where } } - fn pending_writes(&self, warp_id: usize, reg_id: u32) -> Option { + #[must_use] pub fn pending_writes(&self, warp_id: usize, reg_id: u32) -> Option { let pending = self.pending_writes.get(&warp_id)?; let pending = pending.get(®_id)?; Some(*pending) } - fn pending_writes_mut(&mut self, warp_id: usize, reg_id: u32) -> &mut usize { + pub fn pending_writes_mut(&mut self, warp_id: usize, reg_id: u32) -> &mut usize { let pending = self.pending_writes.entry(warp_id).or_default(); pending.entry(reg_id).or_default() } @@ -935,13 +938,13 @@ where self.pipelined_simd_unit .pipeline_reg .iter() - .map(|reg| reg.as_ref().map(|r| r.to_string())) + .map(|reg| reg.as_ref().map(std::string::ToString::to_string)) .collect::>(), self.pipelined_simd_unit.num_active_instr_in_pipeline(), self.pipelined_simd_unit.pipeline_reg.len(), self.response_fifo .iter() - .map(|t| t.to_string()) + .map(std::string::ToString::to_string) .collect::>(), ); @@ -968,9 +971,7 @@ where } } - drop(simd_unit); - - if let Some(ref fetch) = self.response_fifo.front() { + if let Some(fetch) = self.response_fifo.front() { match fetch.access_kind() { mem_fetch::AccessKind::TEXTURE_ACC_R => { todo!("ldst unit: tex access"); @@ -993,7 +994,7 @@ where if fetch.kind == mem_fetch::Kind::WRITE_ACK || (self.config.perfect_mem && fetch.is_write()) { - self.store_ack(&fetch); + self.store_ack(fetch); self.response_fifo.pop_front(); } else { // L1 cache is write evict: @@ -1051,11 +1052,10 @@ where let mut access_kind = MemStageAccessKind::C_MEM; let mut done = true; done &= self.shared_cycle(&mut stall_kind, &mut access_kind); - // done &= self.constant_cycle(&mut stall_kind, &mut access_kind); - // done &= self.texture_cycle(&mut stall_kind, &mut access_kind); + done &= self.constant_cycle(&mut stall_kind, &mut access_kind); + done &= self.texture_cycle(&mut stall_kind, &mut access_kind); done &= self.memory_cycle(&mut stall_kind, &mut access_kind); - // let mut num_stall_scheduler_mem = 0; if !done { // log stall types and return debug_assert_ne!(stall_kind, MemStageStallKind::NO_RC_FAIL); diff --git a/src/ported/mem_fetch.rs b/src/ported/mem_fetch.rs index 8fb15a9d..546abd89 100644 --- a/src/ported/mem_fetch.rs +++ b/src/ported/mem_fetch.rs @@ -21,15 +21,11 @@ pub type MemAccessSectorMask = BitArr!(for mem_sub_partition::SECTOR_CHUNCK_SIZE pub enum Kind { READ_REQUEST = 0, WRITE_REQUEST, - READ_REPLY, // send to shader + READ_REPLY, WRITE_ACK, - // Atomic, - // Const, - // Tex, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -#[allow(clippy::incorrect_ident_case)] pub enum Status { INITIALIZED, IN_L1I_MISS_QUEUE, @@ -179,7 +175,7 @@ impl std::fmt::Display for MemAccess { impl MemAccess { /// 
todo: where is this initialized - pub fn new( + #[must_use] pub fn new( kind: AccessKind, addr: address, allocation: Option, @@ -196,31 +192,22 @@ impl MemAccess { if let Some(ref alloc) = allocation { debug_assert!(alloc.start_addr <= addr); } - Self { - warp_mask, - byte_mask, - sector_mask, - req_size_bytes, - is_write, - kind, - addr, - allocation, - } + Self { addr, allocation, is_write, req_size_bytes, kind, warp_mask, byte_mask, sector_mask } } #[inline] - pub fn relative_addr(&self) -> Option { + #[must_use] pub fn relative_addr(&self) -> Option { self.allocation .as_ref() .map(|alloc| alloc.start_addr) .and_then(|start| self.addr.checked_sub(start)) } - pub fn control_size(&self) -> u32 { + #[must_use] pub fn control_size(&self) -> u32 { if self.is_write { - WRITE_PACKET_SIZE as u32 + u32::from(WRITE_PACKET_SIZE) } else { - READ_PACKET_SIZE as u32 + u32::from(READ_PACKET_SIZE) } } @@ -286,7 +273,7 @@ impl std::fmt::Display for MemFetch { if let Some(ref alloc) = self.access.allocation { write!(f, "@{}+{})", alloc.id, addr - alloc.start_addr) } else { - write!(f, "@{})", addr) + write!(f, "@{addr})") } } } @@ -403,25 +390,25 @@ impl MemFetch { .map_or(false, WarpInstruction::is_atomic) } - pub fn is_texture(&self) -> bool { + #[must_use] pub fn is_texture(&self) -> bool { self.instr .as_ref() .map_or(false, |i| i.memory_space == Some(MemorySpace::Texture)) } - pub fn is_write(&self) -> bool { + #[must_use] pub fn is_write(&self) -> bool { self.access.is_write } - pub fn addr(&self) -> address { + #[must_use] pub fn addr(&self) -> address { self.access.addr } - pub fn relative_addr(&self) -> Option

<address> { + #[must_use] pub fn relative_addr(&self) -> Option<address>
{ self.access.relative_addr() } - pub fn size(&self) -> u32 { + #[must_use] pub fn size(&self) -> u32 { self.data_size + self.control_size } @@ -429,23 +416,23 @@ impl MemFetch { // self.instr.cache_op // } - pub fn access_byte_mask(&self) -> &MemAccessByteMask { + #[must_use] pub fn access_byte_mask(&self) -> &MemAccessByteMask { &self.access.byte_mask } - pub fn access_warp_mask(&self) -> &ThreadActiveMask { + #[must_use] pub fn access_warp_mask(&self) -> &ThreadActiveMask { &self.access.warp_mask } - pub fn access_sector_mask(&self) -> &MemAccessSectorMask { + #[must_use] pub fn access_sector_mask(&self) -> &MemAccessSectorMask { &self.access.sector_mask } - pub fn sub_partition_id(&self) -> usize { + #[must_use] pub fn sub_partition_id(&self) -> usize { self.tlx_addr.sub_partition as usize } - pub fn access_kind(&self) -> &AccessKind { + #[must_use] pub fn access_kind(&self) -> &AccessKind { &self.access.kind } @@ -454,7 +441,7 @@ impl MemFetch { self.last_status_change = Some(time); } - pub fn is_reply(&self) -> bool { + #[must_use] pub fn is_reply(&self) -> bool { matches!(self.kind, Kind::READ_REPLY | Kind::WRITE_ACK) } diff --git a/src/ported/mem_partition_unit.rs b/src/ported/mem_partition_unit.rs index ac1a160b..5998bebd 100644 --- a/src/ported/mem_partition_unit.rs +++ b/src/ported/mem_partition_unit.rs @@ -1,9 +1,7 @@ use super::mem_fetch::BitString; use crate::config::GPUConfig; use crate::ported::{ - self, address, - cache::Cache, - dram, + self, address, dram, fifo::{FifoQueue, Queue}, mem_fetch, mem_sub_partition::MemorySubPartition, @@ -23,6 +21,7 @@ pub struct MemoryPartitionUnit { pub arbitration_metadata: super::arbitration::ArbitrationMetadata, config: Arc, + #[allow(dead_code)] stats: Arc>, } @@ -38,19 +37,19 @@ impl MemoryPartitionUnit { .map(|i| { let sub_id = id * num_sub_partitions + i; - let sub = Rc::new(RefCell::new(MemorySubPartition::new( + + Rc::new(RefCell::new(MemorySubPartition::new( sub_id, id, Rc::clone(&cycle), Arc::clone(&config), Arc::clone(&stats), - ))); - sub + ))) }) .collect(); let dram = dram::DRAM::new(config.clone(), stats.clone()); - let arbitration_metadata = super::arbitration::ArbitrationMetadata::new(&*config); + let arbitration_metadata = super::arbitration::ArbitrationMetadata::new(&config); Self { id, config, @@ -62,7 +61,7 @@ impl MemoryPartitionUnit { } } - pub fn busy(&self) -> bool { + #[must_use] pub fn busy(&self) -> bool { self.sub_partitions .iter() .any(|sub| sub.try_borrow().unwrap().busy()) @@ -93,7 +92,7 @@ impl MemoryPartitionUnit { } pub fn cache_cycle(&mut self, cycle: u64) { - for mem_sub in self.sub_partitions.iter_mut() { + for mem_sub in &mut self.sub_partitions { mem_sub.borrow_mut().cache_cycle(cycle); } } @@ -216,7 +215,7 @@ impl MemoryPartitionUnit { let dram_latency_queue: Vec<_> = self .dram_latency_queue .iter() - .map(|f| f.to_string()) + .map(std::string::ToString::to_string) .collect(); log::debug!( "\t dram latency queue ({:3}) = {:?}", diff --git a/src/ported/mem_sub_partition.rs b/src/ported/mem_sub_partition.rs index 152005bf..224c6e33 100644 --- a/src/ported/mem_sub_partition.rs +++ b/src/ported/mem_sub_partition.rs @@ -1,7 +1,6 @@ use crate::config::{self, GPUConfig}; use crate::ported::{ self, address, cache, - cache::Cache, fifo::{FifoQueue, Queue}, interconn as ic, l2, mem_fetch, }; @@ -19,25 +18,25 @@ pub const SECTOR_CHUNCK_SIZE: u32 = 4; /// Sector size is 32 bytes width pub const SECTOR_SIZE: u32 = 32; -pub fn was_write_sent(events: &[cache::Event]) -> bool { +#[must_use] pub fn 
was_write_sent(events: &[cache::Event]) -> bool { events .iter() .any(|event| event.kind == cache::EventKind::WRITE_REQUEST_SENT) } -pub fn was_writeback_sent(events: &[cache::Event]) -> Option<&cache::Event> { +#[must_use] pub fn was_writeback_sent(events: &[cache::Event]) -> Option<&cache::Event> { events .iter() .find(|event| event.kind == cache::EventKind::WRITE_BACK_REQUEST_SENT) } -pub fn was_read_sent(events: &[cache::Event]) -> bool { +#[must_use] pub fn was_read_sent(events: &[cache::Event]) -> bool { events .iter() .any(|event| event.kind == cache::EventKind::READ_REQUEST_SENT) } -pub fn was_writeallocate_sent(events: &[cache::Event]) -> bool { +#[must_use] pub fn was_writeallocate_sent(events: &[cache::Event]) -> bool { events .iter() .any(|event| event.kind == cache::EventKind::WRITE_ALLOCATE_SENT) @@ -271,7 +270,6 @@ where } pub fn push(&mut self, fetch: mem_fetch::MemFetch) { - // todo!("mem sub partition: push"); // m_stats->memlatstat_icnt2mem_pop(m_req); let mut requests = Vec::new(); let l2_config = self.config.data_cache_l2.as_ref().unwrap(); @@ -322,11 +320,7 @@ where } pub fn flush_l2(&mut self) -> Option { - if let Some(l2) = &mut self.l2_cache { - Some(l2.flush()) - } else { - None - } + self.l2_cache.as_mut().map(|l2| l2.flush()) } pub fn invalidate_l2(&mut self) { @@ -341,13 +335,8 @@ where let fetch = self.l2_to_interconn_queue.dequeue()?; // self.request_tracker.remove(fetch); if fetch.is_atomic() { - // fetch.do_atomic(); unimplemented!("atomic memory operation"); } - // panic!( - // "l2 to dram queue fetch: access kind = {:?}", - // fetch.access_kind(), - // ); match fetch.access_kind() { // writeback accesses not counted AccessKind::L2_WRBK_ACC | AccessKind::L1_WRBK_ACC => None, @@ -357,30 +346,19 @@ where pub fn top(&mut self) -> Option<&mem_fetch::MemFetch> { use super::AccessKind; - match self + if let Some(AccessKind::L2_WRBK_ACC | AccessKind::L1_WRBK_ACC) = self .l2_to_interconn_queue .first() - .map(|fetch| fetch.access_kind()) + .map(ported::mem_fetch::MemFetch::access_kind) { - Some(AccessKind::L2_WRBK_ACC | AccessKind::L1_WRBK_ACC) => { - self.l2_to_interconn_queue.dequeue(); - // self.request_tracker.remove(fetch); - return None; - } - _ => {} + self.l2_to_interconn_queue.dequeue(); + // self.request_tracker.remove(fetch); + return None; } self.l2_to_interconn_queue.first() } - // pub fn full(&self) -> bool { - // self.interconn_to_l2_queue.full() - // } - // - // pub fn has_available_size(&self, size: usize) -> bool { - // self.interconn_to_l2_queue.has_available_size(size) - // } - pub fn set_done(&mut self, fetch: &mem_fetch::MemFetch) { self.request_tracker.remove(fetch); } @@ -408,7 +386,7 @@ where log_line, self.rop_queue .iter() - .map(|f| f.to_string()) + .map(std::string::ToString::to_string) .collect::>(), self.interconn_to_l2_queue, self.l2_to_interconn_queue, @@ -434,10 +412,8 @@ where // todo: move config into l2 let l2_config = self.config.data_cache_l2.as_ref().unwrap(); - // if !l2_config.disabled {} if l2_cache.has_ready_accesses() && !queue_full { let mut fetch = l2_cache.next_access().unwrap(); - // panic!("fetch from l2 cache ready"); // Don't pass write allocate read request back to upper level cache if fetch.access_kind() != &AccessKind::L2_WR_ALLOC_R { @@ -445,20 +421,16 @@ where fetch.set_status(Status::IN_PARTITION_L2_TO_ICNT_QUEUE, 0); // m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); self.l2_to_interconn_queue.enqueue(fetch); - } else { - if l2_config.inner.write_allocate_policy - == 
CacheWriteAllocatePolicy::FETCH_ON_WRITE - { - let mut original_write_fetch = *fetch.original_fetch.unwrap(); - original_write_fetch.set_reply(); - original_write_fetch - .set_status(mem_fetch::Status::IN_PARTITION_L2_TO_ICNT_QUEUE, 0); - // m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); - self.l2_to_interconn_queue.enqueue(original_write_fetch); - todo!("fetch on write: l2 to icnt queue"); - } - // self.request_tracker.remove(fetch); - // delete mf; + } else if l2_config.inner.write_allocate_policy + == CacheWriteAllocatePolicy::FETCH_ON_WRITE + { + let mut original_write_fetch = *fetch.original_fetch.unwrap(); + original_write_fetch.set_reply(); + original_write_fetch + .set_status(mem_fetch::Status::IN_PARTITION_L2_TO_ICNT_QUEUE, 0); + // m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); + self.l2_to_interconn_queue.enqueue(original_write_fetch); + todo!("fetch on write: l2 to icnt queue"); } } } @@ -468,14 +440,12 @@ where // DRAM to L2 (texture) and icnt (not texture) if let Some(reply) = self.dram_to_l2_queue.first() { match self.l2_cache { - Some(ref mut l2_cache) if l2_cache.waiting_for_fill(&reply) => { + Some(ref mut l2_cache) if l2_cache.waiting_for_fill(reply) => { if l2_cache.has_free_fill_port() { let mut reply = self.dram_to_l2_queue.dequeue().unwrap(); log::debug!("filling L2 with {}", &reply); reply.set_status(mem_fetch::Status::IN_PARTITION_L2_FILL_QUEUE, 0); - // dbg!(cycle, self.memcpy_cycle_offset); l2_cache.fill(reply, time); - // l2_cache.fill(&mut reply) // reply will be gone forever at this point // m_dram_L2_queue->pop(); } else { @@ -509,16 +479,13 @@ where if !self.l2_to_dram_queue.lock().unwrap().full() { if let Some(fetch) = self.interconn_to_l2_queue.first() { if let Some(ref mut l2_cache) = self.l2_cache { - if (self.config.data_cache_l2_texture_only && fetch.is_texture()) - || !self.config.data_cache_l2_texture_only - { + if !self.config.data_cache_l2_texture_only || fetch.is_texture() { // L2 is enabled and access is for L2 let output_full = self.l2_to_interconn_queue.full(); let port_free = l2_cache.has_free_data_port(); if !output_full && port_free { let mut events = Vec::new(); - // dbg!(cycle, self.memcpy_cycle_offset); let status = l2_cache.access(fetch.addr(), fetch.clone(), &mut events, time); let write_sent = was_write_sent(&events); @@ -545,17 +512,13 @@ where ); self.l2_to_interconn_queue.enqueue(fetch); } - // m_icnt_L2_queue->pop(); } else { assert!(write_sent); - // m_icnt_L2_queue->pop(); } } else if status != cache::RequestStatus::RESERVATION_FAIL { // L2 cache accepted request let mut fetch = self.interconn_to_l2_queue.dequeue().unwrap(); let wa_policy = l2_cache.write_allocate_policy(); - // let is_fetch_on_write = l2_cache.write_allocate_policy() - // == config::CacheWriteAllocatePolicy::FETCH_ON_WRITE; let should_fetch = matches!( wa_policy, config::CacheWriteAllocatePolicy::FETCH_ON_WRITE diff --git a/src/ported/mod.rs b/src/ported/mod.rs index b098109e..547b1100 100644 --- a/src/ported/mod.rs +++ b/src/ported/mod.rs @@ -1,3 +1,10 @@ +#![allow( + clippy::too_many_arguments, + clippy::missing_panics_doc, + clippy::missing_errors_doc, + clippy::too_many_lines +)] + pub mod addrdec; pub mod arbitration; pub mod barrier; @@ -5,10 +12,12 @@ pub mod cache; pub mod cache_block; pub mod cluster; pub mod core; +pub mod deadlock; pub mod dram; pub mod fifo; pub mod instruction; pub mod interconn; +pub mod kernel; pub mod l1; pub mod l2; pub mod ldst_unit; @@ -21,7 +30,7 @@ pub mod operand_collector; pub mod register_set; pub mod scheduler; pub 
mod scoreboard; -pub mod set_index_function; +pub mod set_index; pub mod simd_function_unit; pub mod sp_unit; pub mod tag_array; @@ -29,15 +38,18 @@ pub mod tag_array; #[cfg(test)] pub mod testing; -use self::cluster::*; -use self::core::*; -use addrdec::*; -use color_eyre::Help; +use self::cluster::SIMTCoreCluster; +use self::core::{ + warp_inst_complete, Packet, PipelineStage, SIMTCore, WarpMask, MAX_THREAD_PER_SM, + PROGRAM_MEM_START, +}; +use addrdec::DecodedAddress; use fifo::Queue; use interconn as ic; -use ldst_unit::*; -use mem_fetch::*; -use sp_unit::*; +use kernel::Kernel; +use ldst_unit::LoadStoreUnit; +use mem_fetch::{AccessKind, BitString, READ_PACKET_SIZE, WRITE_PACKET_SIZE}; +use sp_unit::SPUnit; use stats::Stats; use crate::config; @@ -45,231 +57,16 @@ use bitvec::array::BitArray; use color_eyre::eyre::{self}; use console::style; use std::cell::RefCell; -use std::collections::HashSet; use std::collections::{HashMap, VecDeque}; use std::ops::Deref; use std::path::{Path, PathBuf}; use std::rc::Rc; -use std::sync::{atomic, Arc, Mutex, RwLock}; +use std::sync::{atomic, Arc, Mutex}; use std::time::Instant; -use trace_model::{Command, KernelLaunch, MemAccessTraceEntry, Point}; +use trace_model::Command; pub type address = u64; -/// KernelInfo represents a kernel. -/// -/// This includes its launch configuration, -/// as well as its state of execution. -#[derive()] -pub struct KernelInfo { - pub opcodes: &'static opcodes::OpcodeMap, - pub config: KernelLaunch, - trace: Vec, - trace_pos: RwLock, - launched: Mutex, - num_cores_running: usize, - - pub cache_config_set: bool, -} - -impl PartialEq for KernelInfo { - fn eq(&self, other: &Self) -> bool { - self.id() == other.id() - } -} - -impl std::fmt::Debug for KernelInfo { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("KernelInfo") - .field("name", &self.name()) - .field("id", &self.id()) - .field("instructions", &self.trace.len()) - .field("launched", &self.launched) - .field("grid", &self.config.grid) - .field("block", &self.config.block) - .field("stream", &self.config.stream_id) - .field( - "shared_mem", - &human_bytes::human_bytes(self.config.shared_mem_bytes as f64), - ) - .field("registers", &self.config.num_registers) - // .field("block", &self.current_block()) - // .field("thread", &self.next_block_iter.lock().unwrap().peek()) - .finish() - } -} - -impl std::fmt::Display for KernelInfo { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("KernelInfo") - .field("name", &self.name()) - .field("id", &self.id()) - .finish() - } -} - -pub fn read_trace(path: impl AsRef) -> eyre::Result> { - use serde::Deserializer; - - let reader = utils::fs::open_readable(path.as_ref())?; - let mut reader = rmp_serde::Deserializer::new(reader); - let mut trace = vec![]; - let decoder = nvbit_io::Decoder::new(|access: MemAccessTraceEntry| { - trace.push(access); - }); - reader.deserialize_seq(decoder).suggestion("maybe the traces does not match the most recent binary trace format, try re-generating the traces.")?; - Ok(trace) -} - -impl KernelInfo { - pub fn from_trace(traces_dir: impl AsRef, config: KernelLaunch) -> Self { - let start = Instant::now(); - log::info!( - "parsing kernel for launch {:?} from {}", - &config, - &config.trace_file - ); - let trace_path = traces_dir - .as_ref() - .join(&config.trace_file) - .with_extension("msgpack"); - - let trace = read_trace(&trace_path).unwrap(); - - // sanity check - assert!(trace_model::is_valid_trace(&trace)); - - // 
check if grid size is equal to the number of unique blocks in the trace - let all_blocks: HashSet<_> = trace.iter().map(|t| &t.block_id).collect(); - log::info!( - "parsed kernel trace for {:?}: {}/{} blocks in {:?}", - config.name, - all_blocks.len(), - config.grid.size(), - start.elapsed() - ); - assert_eq!(config.grid.size(), all_blocks.len() as u64); - - let opcodes = opcodes::get_opcode_map(&config).unwrap(); - - Self { - config, - trace, - trace_pos: RwLock::new(0), - opcodes, - launched: Mutex::new(false), - num_cores_running: 0, - cache_config_set: false, - } - } - - pub fn id(&self) -> u64 { - self.config.id - } - - pub fn next_threadblock_traces(&self, warps: &mut [scheduler::CoreWarp]) { - let mut trace_pos = self.trace_pos.write().unwrap(); - - let mut instructions = 0; - let trace_size = self.trace.len(); - - if *trace_pos + 1 >= trace_size || trace_size == 0 { - // no more threadblocks - log::info!("blocks done: no more threadblock traces"); - return; - } - let next_block = &self.trace[*trace_pos + 1].block_id; - - while *trace_pos < trace_size { - let entry = &self.trace[*trace_pos]; - if entry.block_id != *next_block { - // get instructions until new block - break; - } - - let warp_id = entry.warp_id_in_block as usize; - let instr = instruction::WarpInstruction::from_trace(&self, entry.clone()); - let warp = warps.get_mut(warp_id).unwrap(); - let mut warp = warp.try_borrow_mut().unwrap(); - warp.push_trace_instruction(instr); - - instructions += 1; - *trace_pos += 1; - } - - log::debug!( - "added {instructions} instructions ({} per warp) for block {next_block}", - instructions / warps.len() - ); - debug_assert!(instructions > 0); - // debug_assert!(instructions % 32 == 0); - // dbg!(warps - // .iter() - // .map(|w| w.try_borrow().unwrap().trace_instructions.len()) - // .collect::>()); - // debug_assert!( - // warps - // .iter() - // .map(|w| w.try_borrow().unwrap().trace_instructions.len()) - // .collect::>() - // .len() - // == 1, - // "all warps have the same number of instructions" - // ); - // dbg!(warps - // .iter() - // .map(|w| w.try_borrow().unwrap().trace_instructions.len()) - // .collect::>()); - - debug_assert!( - warps - .iter() - .all(|w| !w.try_borrow().unwrap().trace_instructions.is_empty()), - "all warps have at least one instruction (need at least an EXIT)" - ); - } - - pub fn inc_running(&mut self) { - self.num_cores_running += 1; - } - - pub fn name(&self) -> &str { - &self.config.name - } - - pub fn was_launched(&self) -> bool { - *self.launched.lock().unwrap() - } - - pub fn running(&self) -> bool { - self.num_cores_running > 0 - } - - pub fn current_block(&self) -> Option { - let traces_pos = self.trace_pos.read().unwrap(); - let trace = self.trace.get(*traces_pos)?; - Some(Point::new(trace.block_id.clone(), self.config.grid.clone())) - } - - pub fn done(&self) -> bool { - self.no_more_blocks_to_run() && !self.running() - } - - pub fn num_blocks(&self) -> usize { - let grid = &self.config.grid; - grid.x as usize * grid.y as usize * grid.z as usize - } - - pub fn threads_per_block(&self) -> usize { - let block = &self.config.block; - block.x as usize * block.y as usize * block.z as usize - } - - pub fn no_more_blocks_to_run(&self) -> bool { - self.current_block().is_none() - } -} - pub fn parse_commands(path: impl AsRef) -> eyre::Result> { let reader = utils::fs::open_readable(path.as_ref())?; let commands = serde_json::from_reader(reader)?; @@ -316,18 +113,19 @@ impl std::ops::Deref for Allocations { impl Allocations { pub fn insert(&mut self, range: 
std::ops::Range<address>
, name: Option) { // check for intersections - if self.0.overlaps(&range) { - panic!("overlapping memory allocation {:?}", &range); - } + assert!( + !self.0.overlaps(&range), + "overlapping memory allocation {:?}", + &range + ); let id = self.0.len() + 1; // zero is reserved for instructions let start_addr = range.start; let end_addr = Some(range.end); self.0.insert( range, Allocation { - name, - // avoid joining of allocations using the id and range id, + name, start_addr, end_addr, }, @@ -343,7 +141,7 @@ pub struct MockSimulator { mem_sub_partitions: Vec< Rc>>>, >, - running_kernels: Vec>>, + running_kernels: Vec>>, executed_kernels: Mutex>, clusters: Vec>, #[allow(dead_code)] @@ -359,19 +157,20 @@ pub struct MockSimulator { traces_dir: PathBuf, commands: Vec, command_idx: usize, - kernels: VecDeque>, + kernels: VecDeque>, kernel_window_size: usize, busy_streams: VecDeque, cycle_limit: Option, log_after_cycle: Option, // gpu_stall_icnt2sh: usize, - // partition_replies_in_parallel: usize, + partition_replies_in_parallel: usize, } #[derive(Debug, Default)] pub struct AtomicCycle(std::sync::atomic::AtomicU64); impl AtomicCycle { + #[must_use] pub fn new(cycle: u64) -> Self { Self(std::sync::atomic::AtomicU64::new(cycle)) } @@ -403,14 +202,10 @@ impl FromConfig for stats::Stats { } } -// impl MockSimulator { -// impl<'a> MockSimulator<'a> { impl MockSimulator where - // I: ic::MemFetchInterface + 'static, I: ic::Interconnect + 'static, { - // see new trace_gpgpu_sim pub fn new( interconn: Arc, config: Arc, @@ -419,7 +214,7 @@ where ) -> Self { let _start = Instant::now(); let traces_dir = traces_dir.as_ref(); - let stats = Arc::new(Mutex::new(Stats::from_config(&*config))); + let stats = Arc::new(Mutex::new(Stats::from_config(&config))); let num_mem_units = config.num_memory_controllers; let num_sub_partitions = config.num_sub_partition_per_memory_channel; @@ -481,15 +276,14 @@ where // todo: make this a hashset? 
let busy_streams: VecDeque = VecDeque::new(); - let mut kernels: VecDeque> = VecDeque::new(); + let mut kernels: VecDeque> = VecDeque::new(); kernels.reserve_exact(window_size); let cycle_limit: Option = std::env::var("CYCLES") .ok() .as_deref() .map(str::parse) - .map(Result::ok) - .flatten(); + .and_then(Result::ok); // this causes first launch to use simt cluster let last_cluster_issue = config.num_simt_clusters - 1; @@ -516,6 +310,7 @@ where busy_streams, cycle_limit, log_after_cycle: None, + partition_replies_in_parallel: 0, } } @@ -523,7 +318,7 @@ where /// /// Todo: used hack to allow selecting the kernel from the shader core, /// but we could maybe refactor - pub fn select_kernel(&self) -> Option<&Arc> { + pub fn select_kernel(&self) -> Option<&Arc> { let mut executed_kernels = self.executed_kernels.lock().unwrap(); if let Some(k) = &self.running_kernels[self.last_issued_kernel] { if !k.no_more_blocks_to_run() @@ -590,8 +385,8 @@ where }) } - pub fn launch(&mut self, kernel: Arc) -> eyre::Result<()> { - *kernel.launched.lock().unwrap() = true; + pub fn launch(&mut self, kernel: Arc) -> eyre::Result<()> { + kernel.set_launched(); let threads_per_block = kernel.threads_per_block(); let max_threads_per_block = self.config.max_threads_per_core; if threads_per_block > max_threads_per_block { @@ -630,7 +425,7 @@ where pub fn set_cycle(&self, cycle: u64) { let mut stats = self.stats.lock().unwrap(); stats.sim.cycles = cycle; - self.cycle.set(cycle) + self.cycle.set(cycle); } pub fn cycle(&mut self) { @@ -641,8 +436,6 @@ where cluster.interconn_cycle(); } - let mut partition_replies_in_parallel_per_cycle = 0; - log::debug!( "POP from {} memory sub partitions", self.mem_sub_partitions.len() @@ -678,7 +471,7 @@ where let dram_latency_queue: Vec<_> = partition .dram_latency_queue .iter() - .map(|f| f.to_string()) + .map(std::string::ToString::to_string) .collect(); log::debug!( "\t dram latency queue ({:3}) = {:?}", @@ -705,13 +498,12 @@ where // drop(fetch); self.interconn .push(device, cluster_id, packet, response_packet_size); - partition_replies_in_parallel_per_cycle += 1; + self.partition_replies_in_parallel += 1; } else { // self.gpu_stall_icnt2sh += 1; } } } - // self.partition_replies_in_parallel += partition_replies_in_parallel_per_cycle; // dram log::debug!("cycle for {} drams", self.mem_partition_units.len()); @@ -731,8 +523,8 @@ where "moving mem requests from interconn to {} mem partitions", self.mem_sub_partitions.len() ); - let mut parallel_mem_partition_reqs_per_cycle = 0; - let mut stall_dram_full = 0; + // let mut parallel_mem_partition_reqs_per_cycle = 0; + // let mut stall_dram_full = 0; for (i, mem_sub) in self.mem_sub_partitions.iter_mut().enumerate() { let mut mem_sub = mem_sub.try_borrow_mut().unwrap(); // move memory request from interconnect into memory partition @@ -755,11 +547,11 @@ where ); mem_sub.push(fetch); - parallel_mem_partition_reqs_per_cycle += 1; + // self.parallel_mem_partition_reqs += 1; } } else { log::debug!("SKIP sub partition {} ({}): DRAM full stall", i, device); - stall_dram_full += 1; + self.stats.lock().unwrap().stall_dram_full += 1; } // we borrow all of sub here, which is a problem for the cyclic reference in l2 // interface @@ -774,17 +566,17 @@ where // self.interconn_transfer(); - let mut active_sms = 0; + // let mut active_sms = 0; for cluster in &mut self.clusters { let cores_completed = cluster.not_completed() == 0; let kernels_completed = self .running_kernels .iter() - .filter_map(|k| k.as_ref()) + 
.filter_map(std::option::Option::as_ref) .all(|k| k.no_more_blocks_to_run()); if !cores_completed || !kernels_completed { cluster.cycle(); - active_sms += cluster.num_active_sms(); + // active_sms += cluster.num_active_sms(); } } @@ -795,7 +587,7 @@ where // once all of threads are completed. let mut all_threads_complete = true; if self.config.flush_l1_cache { - for cluster in self.clusters.iter_mut() { + for cluster in &mut self.clusters { if cluster.not_completed() == 0 { cluster.cache_invalidate(); } else { @@ -806,7 +598,7 @@ where if self.config.flush_l2_cache { if !self.config.flush_l1_cache { - for cluster in self.clusters.iter_mut() { + for cluster in &mut self.clusters { if cluster.not_completed() > 0 { all_threads_complete = false; break; @@ -833,7 +625,7 @@ where } } - pub fn gpu_mem_alloc(&mut self, addr: address, num_bytes: u64, name: Option) { + pub fn gpu_mem_alloc(&mut self, addr: address, num_bytes: u64, name: Option<&str>) { log::info!( "memalloc: {:<20} {:>15} ({:>5} f32) at address {addr:>20}", name.as_deref().unwrap_or(""), @@ -858,13 +650,13 @@ where self.allocations .try_borrow_mut() .unwrap() - .insert(alloc_range.clone(), name); + .insert(alloc_range, name); if self.config.fill_l2_on_memcopy { let num_sub_partitions = self.config.num_sub_partition_per_memory_channel; let mut transfered = 0; while transfered < num_bytes { - let write_addr = addr + transfered as u64; + let write_addr = addr + transfered; let tlx_addr = self.config.address_mapping().tlx(write_addr); let partition_id = tlx_addr.sub_partition / num_sub_partitions as u64; @@ -932,7 +724,7 @@ where /// Process commands /// /// Take as many commands as possible until we have collected as many kernels to fill - /// the window_size or processed every command. + /// the `window_size` or processed every command. 
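    ///
    /// Each `MemAlloc` is registered via `gpu_mem_alloc`, and each
    /// `KernelLaunch` is turned into a `Kernel` and pushed onto the kernel
    /// window, so at most `kernel_window_size` kernels are buffered at a time;
    /// `run_to_completion` calls this again on every outer iteration to keep
    /// the window topped up.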
pub fn process_commands(&mut self) { while self.kernels.len() < self.kernel_window_size && self.command_idx < self.commands.len() { @@ -947,9 +739,11 @@ where allocation_name, device_ptr, num_bytes, - }) => self.gpu_mem_alloc(*device_ptr, *num_bytes, allocation_name.clone()), + }) => { + self.gpu_mem_alloc(*device_ptr, *num_bytes, allocation_name.clone().as_deref()) + } Command::KernelLaunch(launch) => { - let kernel = KernelInfo::from_trace(&self.traces_dir, launch.clone()); + let kernel = Kernel::from_trace(&self.traces_dir, launch.clone()); self.kernels.push_back(Arc::new(kernel)); } } @@ -971,7 +765,7 @@ where /// Launch all kernels within window that are on a stream that isn't already running pub fn launch_kernels(&mut self) { log::trace!("launching kernels"); - let mut launch_queue: Vec> = Vec::new(); + let mut launch_queue: Vec> = Vec::new(); for kernel in &self.kernels { let stream_busy = self .busy_streams @@ -991,19 +785,17 @@ where .transpose() .unwrap(); if let Some(up_to_kernel) = up_to_kernel { - if kernel.config.id > up_to_kernel { - panic!("launching kernel {}", kernel); - } + assert!( + kernel.config.id <= up_to_kernel, + "launching kernel {kernel}" + ); } - self.launch(kernel); + self.launch(kernel).unwrap(); } } pub fn reached_limit(&self, cycle: u64) -> bool { - match self.cycle_limit { - Some(limit) if cycle >= limit => true, - _ => false, - } + matches!(self.cycle_limit, Some(limit) if cycle >= limit) } pub fn commands_left(&self) -> bool { @@ -1020,7 +812,7 @@ where deadlock_check: bool, ) -> eyre::Result<()> { let mut cycle: u64 = 0; - let mut last_state_change: Option<(DeadlockCheckState, u64)> = None; + let mut last_state_change: Option<(deadlock::State, u64)> = None; while (self.commands_left() || self.kernels_left()) && !self.reached_limit(cycle) { self.process_commands(); @@ -1048,10 +840,11 @@ where match self.log_after_cycle { Some(ref log_after_cycle) if cycle >= *log_after_cycle => { - println!("initializing logging after cycle {}", cycle); + use std::io::Write; + + println!("initializing logging after cycle {cycle}"); let mut log_builder = env_logger::Builder::new(); - use std::io::Write; log_builder.format(|buf, record| { writeln!( buf, @@ -1105,7 +898,7 @@ where match &mut last_state_change { Some((last_state, update_cycle)) if &state == last_state => { - panic!("deadlock after cycle {}", update_cycle); + panic!("deadlock after cycle {update_cycle}"); } Some((ref mut last_state, ref mut update_cycle)) => { // log::info!("deadlock check: updated state in cycle {}", cycle); @@ -1120,7 +913,7 @@ where } if let Some(kernel) = finished_kernel { - self.cleanup_finished_kernel(&*kernel); + self.cleanup_finished_kernel(&kernel); } log::trace!( @@ -1133,7 +926,7 @@ where Ok(()) } - // pub fn set_kernel_done(&mut self, kernel: &mut KernelInfo) { + // pub fn set_kernel_done(&mut self, kernel: &mut Kernel) { // self.finished_kernels // .borrow_mut() // .push_back(kernel.config.id); @@ -1146,16 +939,11 @@ where // self.running_kernels.remove(running_kernel_idx); // } - fn finished_kernel(&mut self) -> Option> { + fn finished_kernel(&mut self) -> Option> { // check running kernels let _active = self.active(); - let finished_kernel: Option<&mut Option>> = self - .running_kernels - .iter_mut() - // .filter_map(|k| k.as_ref()) - // .filter_map(|k| k) - // .filter(|k| { - .find(|k| { + let finished_kernel: Option<&mut Option>> = + self.running_kernels.iter_mut().find(|k| { if let Some(k) = k { // TODO: could also check here if !self.active() k.no_more_blocks_to_run() && 
!k.running() && k.was_launched() @@ -1169,32 +957,9 @@ where } else { None } - // for running_kernel in self.running_kernels.iter().filter_map(|k| k.as_ref()) { - // if running_kernel.no_more_blocks_to_run() && !running_kernel.running() { - // // SHADER_DPRINTF(LIVENESS, - // // "GPGPU-Sim uArch: GPU detected kernel %u \'%s\' " - // // "finished on shader %u.\n", - // // kernel->get_uid(), kernel->name().c_str(), m_sid); - // // - // // if current_kernel.map(|k| k.config.id) == Some(kernel.config.id) { - // // *current_kernel = None; - // // } - // - // // m_gpu->set_kernel_done(kernel); - // todo!("kernel {} done", &running_kernel); - // } - // } - - // self.finished_kernels.borrow_mut().pop_front() } - fn cleanup_finished_kernel(&mut self, kernel: &KernelInfo) { - // if !self.reached_limit() && self.active()) { - // return; - // } - // trace_kernel_info_t *k = NULL; - // let finished_kernel_idx = self.kernels.iter().position(|k| k.config.id == id).unwrap(); - // let finished_kernel = &self.kernels[finished_kernel_idx]; + fn cleanup_finished_kernel(&mut self, kernel: &Kernel) { log::debug!( "cleanup finished kernel with id={}: {}", kernel.id(), @@ -1209,152 +974,6 @@ where // m_gpgpu_sim->update_stats(); // m_gpgpu_context->print_simulation_time(); // } - - // if let Some(stream_idx) = self - // .busy_streams - // .iter() - // .position(|stream| *stream == finished_kernel.config.stream_id) - // { - // self.busy_streams.remove(stream_idx); - // } - - // self.kernels.remove(finished_kernel_idx); - - // tracer->kernel_finalizer(k->get_trace_info()); - // delete k->entry(); - // delete k; - // kernels_info.erase(kernels_info.begin() + j); - // if (!limit_reached() && active()) break; - - // for stream in self.busy_streams.iter() { - // if stream = finished_kernel.config.stream_id - // if (busy_streams.at(l) == k->get_cuda_stream_id()) { - // busy_streams.erase(busy_streams.begin() + l); - // break; - // } - // } - // tracer->kernel_finalizer(k->get_trace_info()); - // delete k->entry(); - // delete k; - // kernels_info.erase(kernels_info.begin() + j); - // if (!limit_reached() && active()) break; - - // for (unsigned j = 0; j < kernels_info.size(); j++) { - // k = kernels_info.at(j); - // if (k->get_uid() == finished_kernel_uid || limit_reached() || !active()) { - // for (unsigned l = 0; l < busy_streams.size(); l++) { - // if (busy_streams.at(l) == k->get_cuda_stream_id()) { - // busy_streams.erase(busy_streams.begin() + l); - // break; - // } - // } - // tracer->kernel_finalizer(k->get_trace_info()); - // delete k->entry(); - // delete k; - // kernels_info.erase(kernels_info.begin() + j); - // if (!limit_reached() && active()) break; - // } - // } - // // make sure kernel was found and removed - // assert(k); - // // if (!silent) m_gpgpu_sim->print_stats(); - // } - } - - fn gather_state(&self) -> DeadlockCheckState { - let total_cores = self.config.total_cores(); - let num_partitions = self.mem_partition_units.len(); - let num_sub_partitions = self.mem_sub_partitions.len(); - - let mut state = DeadlockCheckState::new(total_cores, num_partitions, num_sub_partitions); - - for (cluster_id, cluster) in self.clusters.iter().enumerate() { - for (core_id, core) in cluster.cores.lock().unwrap().iter().enumerate() { - let global_core_id = cluster_id * self.config.num_cores_per_simt_cluster + core_id; - assert_eq!(core.inner.core_id, global_core_id); - - // this is the one we will use (unless the assertion is ever false) - let core_id = core.inner.core_id; - - // core: functional units - for (fu_id, 
fu) in core.functional_units.iter().enumerate() { - let _fu = fu.lock().unwrap(); - let issue_port = core.issue_ports[fu_id]; - let issue_reg: register_set::RegisterSet = core.inner.pipeline_reg - [issue_port as usize] - .borrow() - .clone(); - assert_eq!(issue_port, issue_reg.stage); - - state.functional_unit_pipelines[core_id].push(issue_reg); - } - // core: operand collector - state.operand_collectors[core_id] = - Some(core.inner.operand_collector.borrow().clone()); - // core: schedulers - // state.schedulers[core_id].extend(core.schedulers.iter().map(Into::into)); - } - } - for (partition_id, partition) in self.mem_partition_units.iter().enumerate() { - state.dram_latency_queue[partition_id] - .extend(partition.dram_latency_queue.clone().into_iter()); - } - for (sub_id, sub) in self.mem_sub_partitions.iter().enumerate() { - for (dest_queue, src_queue) in [ - ( - &mut state.interconn_to_l2_queue[sub_id], - &sub.borrow().interconn_to_l2_queue, - ), - ( - &mut state.l2_to_interconn_queue[sub_id], - &sub.borrow().l2_to_interconn_queue, - ), - ( - &mut state.l2_to_dram_queue[sub_id], - &sub.borrow().l2_to_dram_queue.lock().unwrap(), - ), - ( - &mut state.dram_to_l2_queue[sub_id], - &sub.borrow().dram_to_l2_queue, - ), - ] { - dest_queue.extend(src_queue.clone().into_iter()); - } - } - state - } -} - -#[derive(Debug, PartialEq, Eq)] -struct DeadlockCheckState { - pub interconn_to_l2_queue: Vec>, - pub l2_to_interconn_queue: Vec>, - pub l2_to_dram_queue: Vec>, - pub dram_to_l2_queue: Vec>, - pub dram_latency_queue: Vec>, - pub functional_unit_pipelines: Vec>, - pub operand_collectors: Vec>, - // pub schedulers: Vec>, - // functional_unit_pipelines - // schedulers - // operand_collectors -} - -impl DeadlockCheckState { - pub fn new(total_cores: usize, num_mem_partitions: usize, num_sub_partitions: usize) -> Self { - Self { - // per sub partition - interconn_to_l2_queue: vec![vec![]; num_sub_partitions], - l2_to_interconn_queue: vec![vec![]; num_sub_partitions], - l2_to_dram_queue: vec![vec![]; num_sub_partitions], - dram_to_l2_queue: vec![vec![]; num_sub_partitions], - // per partition - dram_latency_queue: vec![vec![]; num_mem_partitions], - // per core - functional_unit_pipelines: vec![vec![]; total_cores], - operand_collectors: vec![None; total_cores], - // schedulers: vec![vec![]; total_cores], - } } } @@ -1379,8 +998,6 @@ pub fn accelmain( traces_dir: impl AsRef, log_after_cycle: Option, ) -> eyre::Result { - log::info!("box version {}", 0); - let traces_dir = traces_dir.as_ref(); let (traces_dir, commands_path) = if traces_dir.is_dir() { (traces_dir.to_path_buf(), traces_dir.join("commands.json")) @@ -1397,26 +1014,21 @@ pub fn accelmain( }; // debugging config - let mut config = config::GPUConfig::default(); - - config.num_simt_clusters = 20; // 20 - config.num_cores_per_simt_cluster = 4; // 1 - config.num_schedulers_per_core = 2; // 1 - - config.num_memory_controllers = 8; // 8 - config.num_sub_partition_per_memory_channel = 2; // 2 - config.fill_l2_on_memcopy = true; // true - - let config = Arc::new(config); + let config = Arc::new(config::GPUConfig { + num_simt_clusters: 20, // 20 + num_cores_per_simt_cluster: 4, // 1 + num_schedulers_per_core: 2, // 1 + num_memory_controllers: 8, // 8 + num_sub_partition_per_memory_channel: 2, // 2 + fill_l2_on_memcopy: true, // true + ..config::GPUConfig::default() + }); let interconn = Arc::new(ic::ToyInterconnect::new( config.num_simt_clusters, config.num_memory_controllers * config.num_sub_partition_per_memory_channel, - // 
config.num_simt_clusters * config.num_cores_per_simt_cluster, - // config.num_mem_units, - Some(9), // found by printf debugging gpgusim )); - let mut sim = MockSimulator::new(interconn, Arc::clone(&config), &traces_dir, &commands_path); + let mut sim = MockSimulator::new(interconn, Arc::clone(&config), &traces_dir, commands_path); sim.log_after_cycle = log_after_cycle; @@ -1424,7 +1036,8 @@ pub fn accelmain( .unwrap_or_default() .to_lowercase() == "yes"; - sim.run_to_completion(&traces_dir, deadlock_check); + + sim.run_to_completion(&traces_dir, deadlock_check)?; let stats = sim.stats(); @@ -1435,12 +1048,7 @@ pub fn accelmain( mod tests { use crate::{ config, - ported::{ - self, - fifo::{self, Queue}, - interconn as ic, testing, - testing::diff, - }, + ported::{self, fifo, interconn as ic, testing, testing::diff}, }; use color_eyre::eyre; use itertools::Itertools; @@ -1568,7 +1176,7 @@ mod tests { .map(Into::into), ); box_sim_state.dram_arbitration_per_partition[partition_id] = - testing::state::ArbitrationState { + testing::state::Arbitration { last_borrower: partition.arbitration_metadata.last_borrower, shared_credit: partition.arbitration_metadata.shared_credit, private_credit: partition.arbitration_metadata.private_credit.clone().into(), @@ -1618,7 +1226,7 @@ mod tests { play_sim_state.last_cluster_issue = play_sim.last_cluster_issue() as usize; for (core_id, core) in play_sim.cores().enumerate() { - for regs in core.functional_unit_issue_register_sets().into_iter() { + for regs in core.functional_unit_issue_register_sets() { play_sim_state.functional_unit_pipelines_per_core[core_id].push(regs.into()); } let valid_units: HashSet<_> = box_sim_state.functional_unit_pipelines_per_core[core_id] @@ -1685,7 +1293,7 @@ mod tests { partitions_added += 1; play_sim_state.dram_arbitration_per_partition[partition_id] = - testing::state::ArbitrationState { + testing::state::Arbitration { last_borrower: partition.last_borrower(), shared_credit: partition.shared_credit(), private_credit: partition.private_credit().into(), @@ -1997,6 +1605,8 @@ mod tests { // } fn run_lockstep(trace_dir: &Path, trace_provider: TraceProvider) -> eyre::Result<()> { + use accelsim::tracegen::reader::Command as AccelsimCommand; + let manifest_dir = PathBuf::from(std::env!("CARGO_MANIFEST_DIR")); let box_trace_dir = trace_dir.join("trace"); @@ -2027,7 +1637,6 @@ mod tests { let accelsim_commands = accelsim::tracegen::reader::read_commands(&accelsim_trace_dir, reader)?; - use accelsim::tracegen::reader::Command as AccelsimCommand; let commands: Vec<_> = accelsim_commands .into_iter() .map(|cmd| match cmd { @@ -2037,7 +1646,7 @@ mod tests { AccelsimCommand::KernelLaunch((mut kernel, metadata)) => { // transform kernel instruction trace let kernel_trace_path = accelsim_trace_dir.join(&kernel.trace_file); - let reader = utils::fs::open_readable(&kernel_trace_path)?; + let reader = utils::fs::open_readable(kernel_trace_path)?; let parsed_trace = accelsim::tracegen::reader::read_trace_instructions( reader, metadata.trace_version, @@ -2130,7 +1739,6 @@ mod tests { dbg!(&box_commands_path); dbg!(&accelsim_kernelslist_path); - // assert!(false); let gpgpusim_config = manifest_dir.join("accelsim/gtx1080/gpgpusim.config"); let trace_config = manifest_dir.join("accelsim/gtx1080/gpgpusim.trace.config"); let inter_config = manifest_dir.join("accelsim/gtx1080/config_fermi_islip.icnt"); @@ -2143,37 +1751,29 @@ mod tests { assert!(trace_config.is_file()); assert!(inter_config.is_file()); - // let start = std::time::Instant::now(); - // 
let box_stats = super::accelmain(&vec_add_trace_dir.join("trace"), None)?; - // debugging config - let mut box_config = config::GPUConfig::default(); - box_config.num_simt_clusters = 20; // 20 - box_config.num_cores_per_simt_cluster = 4; // 1 - box_config.num_schedulers_per_core = 2; // 2 - box_config.num_memory_controllers = 8; // 8 - box_config.num_sub_partition_per_memory_channel = 2; // 2 - box_config.fill_l2_on_memcopy = true; // true - - let box_config = Arc::new(box_config); + let box_config = Arc::new(config::GPUConfig { + num_simt_clusters: 20, // 20 + num_cores_per_simt_cluster: 4, // 1 + num_schedulers_per_core: 2, // 2 + num_memory_controllers: 8, // 8 + num_sub_partition_per_memory_channel: 2, // 2 + fill_l2_on_memcopy: true, // true + ..config::GPUConfig::default() + }); let box_interconn = Arc::new(ic::ToyInterconnect::new( box_config.num_simt_clusters, box_config.num_memory_controllers * box_config.num_sub_partition_per_memory_channel, - // config.num_simt_clusters * config.num_cores_per_simt_cluster, - // config.num_mem_units, - Some(9), // found by printf debugging gpgusim )); let mut box_sim = super::MockSimulator::new( box_interconn, - box_config.clone(), + box_config, &box_trace_dir, &box_commands_path, ); - // let box_dur = start.elapsed(); - // let start = std::time::Instant::now(); let args = vec![ "-trace", accelsim_kernelslist_path.as_os_str().to_str().unwrap(), @@ -2189,10 +1789,6 @@ mod tests { let play_config = playground::Config::default(); let mut play_sim = playground::Accelsim::new(&play_config, &args)?; - // accelsim.run_to_completion(); - // let ref_stats = accelsim.stats().clone(); - // let ref_stats = playground::run(&config, &args)?; - // let mut play_time_cycle = std::time::Duration::ZERO; let mut play_time_other = std::time::Duration::ZERO; let mut box_time_cycle = std::time::Duration::ZERO; @@ -2225,15 +1821,15 @@ mod tests { .unwrap_or(200); assert!(check_every >= 1); - let _num_schedulers = box_sim.config.num_schedulers_per_core; - let num_clusters = box_sim.config.num_simt_clusters; - let cores_per_cluster = box_sim.config.num_cores_per_simt_cluster; - assert_eq!( - box_sim.config.total_cores(), - num_clusters * cores_per_cluster - ); - let _num_partitions = box_sim.mem_partition_units.len(); - let _num_sub_partitions = box_sim.mem_sub_partitions.len(); + // let _num_schedulers = box_sim.config.num_schedulers_per_core; + // let num_clusters = box_sim.config.num_simt_clusters; + // let cores_per_cluster = box_sim.config.num_cores_per_simt_cluster; + // assert_eq!( + // box_sim.config.total_cores(), + // num_clusters * cores_per_cluster + // ); + // let _num_partitions = box_sim.mem_partition_units.len(); + // let _num_sub_partitions = box_sim.mem_sub_partitions.len(); // // let mut box_sim_state = testing::state::Simulation::new( // num_clusters, @@ -2420,7 +2016,7 @@ mod tests { // dbg!(sub_id, box_icnt_l2_queue); // } } - println!("checking for diff after cycle {}", cycle); + println!("checking for diff after cycle {cycle}"); if use_full_diff { full_diff::assert_eq!(&box_sim_state, &play_sim_state); @@ -2441,7 +2037,7 @@ mod tests { } if let Some(kernel) = box_sim.finished_kernel() { - box_sim.cleanup_finished_kernel(&*kernel); + box_sim.cleanup_finished_kernel(&kernel); } box_time_other += start.elapsed(); @@ -2743,7 +2339,7 @@ mod tests { playground_bin.display() ) }) - .with_suggestion(|| format!("make sure to build playground with `cargo build -p playground` for the {:?} target", target))?; + .with_suggestion(|| format!("make sure to build 
playground with `cargo build -p playground` for the {target:?} target"))?; let gpgpu_sim_config = sim_config.config().unwrap(); let trace_config = sim_config.trace_config().unwrap(); @@ -2799,8 +2395,6 @@ mod tests { kernelslist: &Path, sim_config: &accelsim::SimConfig, ) -> eyre::Result<()> { - use std::io::Write; - dbg!(&traces_dir); dbg!(&kernelslist); dbg!(&sim_config); @@ -2849,24 +2443,21 @@ mod tests { let filter_func = |((_name, _kernel, stat_name), _value): &((String, u16, String), f64)| -> bool { // we ignore rates and other stats that can vary per run - match stat_name.as_str() { + !matches!( + stat_name.as_str(), "gpgpu_silicon_slowdown" - | "gpgpu_simulation_rate" - | "gpgpu_simulation_time_sec" - | "gpu_ipc" - | "gpu_occupancy" - | "gpu_tot_ipc" - | "l1_inst_cache_total_miss_rate" - | "l2_bandwidth_gbps" => false, - _ => true, - } + | "gpgpu_simulation_rate" + | "gpgpu_simulation_time_sec" + | "gpu_ipc" + | "gpu_occupancy" + | "gpu_tot_ipc" + | "l1_inst_cache_total_miss_rate" + | "l2_bandwidth_gbps" + ) }; - let cmp_play_stats: accelsim::Stats = playground_stats - .clone() - .into_iter() - .filter(filter_func) - .collect(); + let cmp_play_stats: accelsim::Stats = + playground_stats.into_iter().filter(filter_func).collect(); let cmp_accel_stats: accelsim::Stats = accelsim_stats .clone() diff --git a/src/ported/mshr.rs b/src/ported/mshr.rs index 5a49b6af..d58d116a 100644 --- a/src/ported/mshr.rs +++ b/src/ported/mshr.rs @@ -24,7 +24,6 @@ pub struct MshrTable { num_entries: usize, max_merged: usize, data: Table, - pending_lines: LineTable, /// If the current response is ready /// /// it may take several cycles to process the merged requests @@ -33,25 +32,23 @@ pub struct MshrTable { } impl MshrTable { - pub fn new(num_entries: usize, max_merged: usize) -> Self { + #[must_use] pub fn new(num_entries: usize, max_merged: usize) -> Self { let data = HashMap::with_capacity(2 * num_entries); Self { num_entries, max_merged, data, - pending_lines: HashMap::new(), current_response: VecDeque::new(), - // current_response_ready: false, } } /// Checks if there is a pending request to the lower memory level already - pub fn probe(&self, block_addr: address) -> bool { + #[must_use] pub fn probe(&self, block_addr: address) -> bool { self.data.contains_key(&block_addr) } /// Checks if there is space for tracking a new memory access - pub fn full(&self, block_addr: address) -> bool { + #[must_use] pub fn full(&self, block_addr: address) -> bool { match self.data.get(&block_addr) { Some(entry) => entry.list.len() >= self.max_merged, None => self.data.len() >= self.num_entries, @@ -109,16 +106,16 @@ impl MshrTable { } /// Returns true if ready accesses exist - pub fn has_ready_accesses(&self) -> bool { + #[must_use] pub fn has_ready_accesses(&self) -> bool { !self.current_response.is_empty() } /// Returns next ready accesses - pub fn ready_accesses(&self) -> Option<&VecDeque> { + #[must_use] pub fn ready_accesses(&self) -> Option<&VecDeque> { let Some(block_addr) = self.current_response.front() else { return None; }; - let Some(entry) = self.data.get(&block_addr) else { + let Some(entry) = self.data.get(block_addr) else { return None; }; Some(&entry.list) @@ -129,7 +126,7 @@ impl MshrTable { let Some(block_addr) = self.current_response.front() else { return None; }; - let Some(entry) = self.data.get_mut(&block_addr) else { + let Some(entry) = self.data.get_mut(block_addr) else { return None; }; Some(&mut entry.list) @@ -143,7 +140,7 @@ impl MshrTable { return None; }; - let Some(entry) = 
self.data.get_mut(&block_addr) else { + let Some(entry) = self.data.get_mut(block_addr) else { return None; }; @@ -152,7 +149,7 @@ impl MshrTable { let should_remove = entry.list.is_empty(); if should_remove { - self.data.remove(&block_addr); + self.data.remove(block_addr); self.current_response.pop_front(); } fetch @@ -165,7 +162,6 @@ mod tests { use crate::config; use crate::ported::{mem_fetch, scheduler::ThreadActiveMask}; use mem_fetch::{AccessKind, MemAccess, MemFetch}; - #[test] fn test_mshr_table() { @@ -173,7 +169,7 @@ mod tests { let cache_config = config.inst_cache_l1.as_ref().unwrap(); let mut mshrs = MshrTable::new(cache_config.mshr_entries, cache_config.mshr_max_merge); - let fetch_addr = 4026531848; + let fetch_addr = 4_026_531_848; let access = MemAccess::new( AccessKind::INST_ACC_R, fetch_addr, @@ -186,11 +182,11 @@ mod tests { ); let fetch = MemFetch::new(None, access, &config, 0, 0, 0, 0); let mshr_addr = cache_config.mshr_addr(fetch_addr); - assert_eq!(mshrs.probe(mshr_addr), false); - assert_eq!(mshrs.probe(mshr_addr), false); + assert!(!mshrs.probe(mshr_addr)); + assert!(!mshrs.probe(mshr_addr)); mshrs.add(mshr_addr, fetch); - assert_eq!(mshrs.probe(mshr_addr), true); + assert!(mshrs.probe(mshr_addr)); // TODO: test against bridge here } diff --git a/src/ported/operand_collector.rs b/src/ported/operand_collector.rs index 6a9d712e..4b6bcca7 100644 --- a/src/ported/operand_collector.rs +++ b/src/ported/operand_collector.rs @@ -44,6 +44,7 @@ pub struct Operand { } impl Operand { + #[must_use] pub fn new( warp_id: Option, cu_id: usize, @@ -62,6 +63,7 @@ impl Operand { } } + #[must_use] pub fn warp_id(&self) -> Option { self.warp_id } @@ -127,6 +129,7 @@ impl CollectorUnit { } // looks ok + #[must_use] pub fn ready(&self) -> bool { if self.free { return false; @@ -302,18 +305,22 @@ impl Default for Allocation { } impl Allocation { + #[must_use] pub fn new(kind: AllocationKind, op: Option) -> Self { Self { kind, op } } + #[must_use] pub fn is_read(&self) -> bool { self.kind == AllocationKind::READ_ALLOC } + #[must_use] pub fn is_write(&self) -> bool { self.kind == AllocationKind::WRITE_ALLOC } + #[must_use] pub fn is_free(&self) -> bool { self.kind == AllocationKind::NO_ALLOC } @@ -433,14 +440,13 @@ impl Arbiter { log::trace!("request: {:?}", &Self::compat(&request[bank])); } - log::trace!("inmatch: {:?}", &Self::compat(&inmatch)); + log::trace!("inmatch: {:?}", &Self::compat(inmatch)); // wavefront allocator from booksim // loop through diagonals of request matrix - let mut output = 0; for p in 0.._square { - output = (_pri + p) % _outputs; + let mut output = (_pri + p) % _outputs; // step through the current diagonal for input in 0.._inputs { @@ -464,8 +470,8 @@ impl Arbiter { } } - log::trace!("inmatch: {:?}", &Self::compat(&inmatch)); - log::trace!("outmatch: {:?}", &Self::compat(&outmatch)); + log::trace!("inmatch: {:?}", &Self::compat(inmatch)); + log::trace!("outmatch: {:?}", &Self::compat(outmatch)); // Round-robin the priority diagonal _pri = (_pri + 1) % _outputs; @@ -500,14 +506,13 @@ impl Arbiter { } pub fn add_read_requests(&mut self, cu: &CollectorUnit) { - for src_op in &cu.src_operands { - if let Some(src_op) = src_op { - let bank = src_op.bank; - self.queue[bank].push_back(src_op.clone()); - } + for src_op in cu.src_operands.iter().flatten() { + let bank = src_op.bank; + self.queue[bank].push_back(src_op.clone()); } } + #[must_use] pub fn bank_idle(&self, bank: usize) -> bool { self.allocated_banks[bank].is_free() } @@ -539,6 +544,7 @@ pub struct 
DispatchUnit { } impl DispatchUnit { + #[must_use] pub fn new(kind: OperandCollectorUnitKind) -> Self { Self { kind, @@ -603,6 +609,7 @@ pub struct InputPort { } impl InputPort { + #[must_use] pub fn new( in_ports: PortVec, out_ports: PortVec, @@ -633,7 +640,7 @@ pub enum OperandCollectorUnitKind { pub type CuSets = HashMap>>>; // operand collector based register file unit -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone)] pub struct OperandCollectorRegisterFileUnit { pub config: Arc, @@ -711,7 +718,7 @@ impl OperandCollectorRegisterFileUnit { debug_assert!(cu.id == cu_id); } - for dispatch_unit in self.dispatch_units.iter_mut() { + for dispatch_unit in &mut self.dispatch_units { dispatch_unit.init(self.sub_core_model, self.num_warp_schedulers); } self.initialized = true; @@ -760,7 +767,7 @@ impl OperandCollectorRegisterFileUnit { } log::debug!("allocating {} reads ({:?})", read_ops.len(), &read_ops); - for (_bank, read) in &read_ops { + for read in read_ops.values() { assert!(read.collector_unit_id < self.collector_units.len()); let mut cu = self.collector_units[read.collector_unit_id].borrow_mut(); if let Some(operand) = read.operand { @@ -824,17 +831,17 @@ impl OperandCollectorRegisterFileUnit { debug_assert!(cu_upper_bound <= cu_set.len()); } - for k in cu_lower_bound..cu_upper_bound { - let mut collector_unit = cu_set[k].try_borrow_mut().unwrap(); + for collector_unit in &cu_set[cu_lower_bound..cu_upper_bound] { + let mut collector_unit = collector_unit.try_borrow_mut().unwrap(); if collector_unit.free { log::debug!( "{} cu={:?}", - style(format!("operand collector::allocate()")).green(), + style("operand collector::allocate()".to_string()).green(), collector_unit.kind ); - allocated = collector_unit.allocate(&input_port, &output_port); + allocated = collector_unit.allocate(input_port, output_port); self.arbiter.add_read_requests(&collector_unit); break; } @@ -1045,8 +1052,8 @@ mod test { let arbiter = (&opcoll.arbiter).into(); Self { ports, - dispatch_units, collector_units, + dispatch_units, arbiter, } } diff --git a/src/ported/register_set.rs b/src/ported/register_set.rs index ef6f7d59..3f4af0df 100644 --- a/src/ported/register_set.rs +++ b/src/ported/register_set.rs @@ -9,9 +9,9 @@ pub struct RegisterSet { } impl RegisterSet { - pub fn new(stage: super::PipelineStage, size: usize, id: usize) -> Self { + #[must_use] pub fn new(stage: super::PipelineStage, size: usize, id: usize) -> Self { let regs = (0..size).map(|_| None).collect(); - Self { regs, stage, id } + Self { stage, regs, id } } pub fn has_free(&self) -> bool { @@ -23,7 +23,7 @@ impl RegisterSet { } // pub fn has_free_sub_core(&self, sub_core_model: bool, reg_id: usize) -> bool { - pub fn has_free_sub_core(&self, reg_id: usize) -> bool { + #[must_use] pub fn has_free_sub_core(&self, reg_id: usize) -> bool { // in subcore model, each sched has a one specific // reg to use (based on sched id) // if !sub_core_model { @@ -66,7 +66,7 @@ impl RegisterSet { // } pub fn scheduler_id(&self, reg_id: usize) -> Option { - match self.regs.get(reg_id).map(Option::as_ref).flatten() { + match self.regs.get(reg_id).and_then(Option::as_ref) { Some(r) => { // debug_assert!(!r.empty()); r.scheduler_id @@ -94,7 +94,7 @@ impl RegisterSet { self.regs.iter().any(Option::is_some) } - pub fn get_ready(&self) -> Option<(usize, &Option)> { + #[must_use] pub fn get_ready(&self) -> Option<(usize, &Option)> { let mut ready: Option<(usize, &Option)> = None; for free in self.iter_occupied() { match (&ready, free) { @@ -193,8 +193,7 @@ impl 
RegisterSet { pub fn get_instruction_mut(&mut self) -> Option<&mut WarpInstruction> { self.get_ready_mut() .map(|(_, r)| r) - .map(Option::as_mut) - .flatten() + .and_then(Option::as_mut) } // pub fn get_ready_mut(&mut self) -> Option<&mut WarpInstruction> { @@ -212,7 +211,7 @@ impl RegisterSet { // ready // } - pub fn get_ready_sub_core(&self, reg_id: usize) -> Option<&Option> { + #[must_use] pub fn get_ready_sub_core(&self, reg_id: usize) -> Option<&Option> { debug_assert!(reg_id < self.regs.len()); self.regs.get(reg_id) } @@ -227,12 +226,12 @@ impl RegisterSet { pub fn get_instruction_sub_core(&self, reg_id: usize) -> Option<&WarpInstruction> { debug_assert!(reg_id < self.regs.len()); - self.regs.get(reg_id).map(Option::as_ref).flatten() + self.regs.get(reg_id).and_then(Option::as_ref) } pub fn get_instruction_sub_core_mut(&mut self, reg_id: usize) -> Option<&mut WarpInstruction> { debug_assert!(reg_id < self.regs.len()); - self.regs.get_mut(reg_id).map(Option::as_mut).flatten() + self.regs.get_mut(reg_id).and_then(Option::as_mut) } pub fn iter_occupied(&self) -> impl Iterator)> { @@ -249,11 +248,11 @@ impl RegisterSet { } pub fn iter_instructions(&self) -> impl Iterator { - self.regs.iter().map(Option::as_ref).filter_map(|r| r) + self.regs.iter().filter_map(Option::as_ref) } pub fn iter_instructions_mut(&mut self) -> impl Iterator { - self.regs.iter_mut().map(Option::as_mut).filter_map(|r| r) + self.regs.iter_mut().filter_map(Option::as_mut) } pub fn iter_free(&self) -> impl Iterator> { @@ -287,18 +286,15 @@ impl RegisterSet { // in subcore model, each sched has a one specific reg // to use (based on sched id) debug_assert!(reg_id < self.regs.len()); - match self.regs.get_mut(reg_id) { - Some(r) => Some((reg_id, r)), - None => None, - } + self.regs.get_mut(reg_id).map(|r| (reg_id, r)) // .and_then(Option::as_ref) .filter(|r| r.empty()) } - pub fn size(&self) -> usize { + #[must_use] pub fn size(&self) -> usize { self.regs.len() } - pub fn empty(&self) -> bool { + #[must_use] pub fn empty(&self) -> bool { todo!("RegisterSet::empty") } @@ -335,8 +331,7 @@ impl RegisterSet { let ready: Option = self .get_ready_mut() .map(|(_, r)| r) - .map(Option::take) - .flatten(); + .and_then(Option::take); // let msg = format!( // "register set moving out from ready={:?} to {:?}", // ready.as_ref().map(ToString::to_string), @@ -354,8 +349,7 @@ impl RegisterSet { ) { let ready: Option = self .get_ready_sub_core_mut(reg_id) - .map(Option::take) - .flatten(); + .and_then(Option::take); // let msg = format!( // "register set moving out to sub core from ready={:?} to {:?}", // ready.as_ref().map(ToString::to_string), @@ -370,7 +364,7 @@ impl std::fmt::Display for RegisterSet { let instructions = self .regs .iter() - .map(|inst| inst.as_ref().map(|i| i.to_string())); + .map(|inst| inst.as_ref().map(std::string::ToString::to_string)); f.debug_list().entries(instructions).finish() } } diff --git a/src/ported/scheduler.rs b/src/ported/scheduler.rs deleted file mode 100644 index 6b83b4b5..00000000 --- a/src/ported/scheduler.rs +++ /dev/null @@ -1,1273 +0,0 @@ -use std::cell::RefCell; -use std::collections::VecDeque; -use std::rc::Rc; -use std::sync::{Arc, Mutex, RwLock}; - -use super::core::PipelineStage; -use super::{instruction::WarpInstruction, opcodes, scoreboard}; -use crate::config::GPUConfig; -use bitvec::{array::BitArray, BitArr}; -use console::style; - -pub type ThreadActiveMask = BitArr!(for 32, in u32); - -pub type CoreWarp = Rc>; - -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] 
-#[allow(dead_code)] -enum ExecUnitKind { - NONE = 0, - SP = 1, - SFU = 2, - MEM = 3, - DP = 4, - INT = 5, - TENSOR = 6, - SPECIALIZED = 7, -} - -#[derive(Debug)] -pub struct SchedulerWarp { - pub block_id: u64, - pub dynamic_warp_id: usize, - pub warp_id: usize, - pub kernel: Option>, - - pub trace_pc: usize, - pub active_mask: ThreadActiveMask, - pub trace_instructions: VecDeque, - - // state - pub done_exit: bool, - pub num_instr_in_pipeline: usize, - pub num_outstanding_stores: usize, - pub num_outstanding_atomics: usize, - pub has_imiss_pending: bool, - pub instr_buffer: Vec>, - pub next: usize, -} - -impl PartialEq for SchedulerWarp { - fn eq(&self, other: &Self) -> bool { - self.kernel == other.kernel - && self.block_id == other.block_id - && self.warp_id == other.warp_id - && self.dynamic_warp_id == other.dynamic_warp_id - } -} - -const IBUFFER_SIZE: usize = 2; - -impl Default for SchedulerWarp { - fn default() -> Self { - let instr_buffer = vec![None; IBUFFER_SIZE]; - Self { - block_id: 0, - dynamic_warp_id: u32::MAX as usize, - warp_id: u32::MAX as usize, - kernel: None, - trace_pc: 0, - trace_instructions: VecDeque::new(), - active_mask: BitArray::ZERO, - done_exit: false, - num_instr_in_pipeline: 0, - num_outstanding_stores: 0, - num_outstanding_atomics: 0, - has_imiss_pending: false, - instr_buffer, - next: 0, - } - } -} - -impl SchedulerWarp { - pub fn init( - &mut self, - _start_pc: Option, - block_id: u64, - warp_id: usize, - dynamic_warp_id: usize, - active_mask: ThreadActiveMask, - kernel: Arc, - ) { - self.block_id = block_id; - self.warp_id = warp_id; - self.dynamic_warp_id = dynamic_warp_id; - self.done_exit = false; - self.kernel = Some(kernel); - self.active_mask = active_mask; - } - - pub fn reset(&mut self) { - debug_assert_eq!(self.num_outstanding_stores, 0); - debug_assert_eq!(self.num_instr_in_pipeline, 0); - self.has_imiss_pending = false; - self.warp_id = u32::MAX as usize; - self.dynamic_warp_id = u32::MAX as usize; - - self.active_mask.fill(false); - self.done_exit = true; - self.next = 0; - } - - pub fn current_instr(&self) -> Option<&WarpInstruction> { - self.trace_instructions.get(self.trace_pc) - } - - pub fn push_trace_instruction(&mut self, instr: WarpInstruction) { - self.trace_instructions.push_back(instr); - } - - pub fn next_trace_inst(&mut self) -> Option<&WarpInstruction> { - let trace_instr = self.trace_instructions.get(self.trace_pc)?; - self.trace_pc += 1; - Some(trace_instr) - } - - pub fn instruction_count(&self) -> usize { - self.trace_instructions.len() - } - - pub fn pc(&self) -> Option { - debug_assert!(self.trace_pc <= self.instruction_count()); - self.trace_instructions - .get(self.trace_pc) - .map(|instr| instr.pc) - } - - pub fn done(&self) -> bool { - self.trace_pc == self.instruction_count() - } - - pub fn clear(&mut self) { - self.trace_pc = 0; - self.trace_instructions.clear(); - } - - pub fn ibuffer_fill(&mut self, slot: usize, instr: WarpInstruction) { - debug_assert!(slot < self.instr_buffer.len()); - self.instr_buffer[slot] = Some(instr); - self.next = 0; - } - - pub fn ibuffer_size(&self) -> usize { - self.instr_buffer.iter().filter(|x| x.is_some()).count() - } - - pub fn ibuffer_empty(&self) -> bool { - self.instr_buffer.iter().all(Option::is_none) - } - - pub fn ibuffer_flush(&mut self) { - for i in self.instr_buffer.iter_mut() { - if i.is_some() { - self.num_instr_in_pipeline -= 1; - } - *i = None; - } - } - - pub fn ibuffer_peek(&self) -> Option<&WarpInstruction> { - self.instr_buffer[self.next].as_ref() - } - - pub fn 
ibuffer_take(&mut self) -> Option { - self.instr_buffer[self.next].take() - } - - pub fn ibuffer_step(&mut self) { - self.next = (self.next + 1) % IBUFFER_SIZE; - } - - pub fn done_exit(&self) -> bool { - self.done_exit - } - - pub fn hardware_done(&self) -> bool { - self.functional_done() && self.stores_done() && self.num_instr_in_pipeline == 0 - } - - pub fn has_instr_in_pipeline(&self) -> bool { - self.num_instr_in_pipeline > 0 - } - - pub fn stores_done(&self) -> bool { - self.num_outstanding_stores == 0 - } - - pub fn num_completed(&self) -> usize { - self.active_mask.count_zeros() - } - - pub fn set_thread_completed(&mut self, thread_id: usize) { - self.active_mask.set(thread_id, false); - } - - pub fn functional_done(&self) -> bool { - self.active_mask.not_any() - } - - pub fn waiting(&self) -> bool { - if self.functional_done() { - // waiting to be initialized with a kernel - true - // } else if core.warp_waiting_at_barrier(self.warp_id) { - // // waiting for other warps in block to reach barrier - // true - // } else if core.warp_waiting_at_mem_barrier(self.warp_id) { - // // waiting for memory barrier - // true - } else if self.num_outstanding_atomics > 0 { - // waiting for atomic operation to complete at memory: - // this stall is not required for accurate timing model, - // but rather we stall here since if a call/return - // instruction occurs in the meantime the functional - // execution of the atomic when it hits DRAM can cause - // the wrong register to be read. - true - } else { - false - } - } - - pub fn dynamic_warp_id(&self) -> usize { - self.dynamic_warp_id - } -} - -fn sort_warps_by_oldest_dynamic_id(lhs: &CoreWarp, rhs: &CoreWarp) -> std::cmp::Ordering { - let lhs = lhs.try_borrow().unwrap(); - let rhs = rhs.try_borrow().unwrap(); - if lhs.done_exit() || lhs.waiting() { - std::cmp::Ordering::Greater - } else if rhs.done_exit() || rhs.waiting() { - std::cmp::Ordering::Less - } else { - lhs.dynamic_warp_id().cmp(&rhs.dynamic_warp_id()) - } -} - -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -pub enum Ordering { - // The item that issued last is prioritized first then the - // sorted result of the priority_function - GREEDY_THEN_PRIORITY_FUNC = 0, - // No greedy scheduling based on last to issue. - // - // Only the priority function determines priority - PRIORITY_FUNC_ONLY, - // NUM_ORDERING, -} - -#[derive(Debug)] -pub struct BaseSchedulerUnit { - id: usize, - cluster_id: usize, - core_id: usize, - /// This is the prioritized warp list that is looped over each cycle to - /// determine which warp gets to issue. - next_cycle_prioritized_warps: VecDeque, - // Supervised warps keeps all warps this scheduler can arbitrate between. - // - // This is useful in systems where there is more than one warp scheduler. - // In a single scheduler system, this is simply all the warps - // assigned to this core. 
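-    //
-    // The GTO scheduler below rebuilds next_cycle_prioritized_warps from this
-    // list every cycle: the warp at last_supervised_issued_idx is placed first
-    // (greedy), followed by the remaining supervised warps sorted oldest
-    // dynamic warp id first.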
- supervised_warps: VecDeque, - /// This is the iterator pointer to the last supervised warp issued - last_supervised_issued_idx: usize, - - warps: Vec, - num_issued_last_cycle: usize, - current_turn_warp: usize, - - scoreboard: Arc>, - config: Arc, - stats: Arc>, -} - -impl BaseSchedulerUnit { - pub fn new( - id: usize, - cluster_id: usize, - core_id: usize, - warps: Vec, - scoreboard: Arc>, - stats: Arc>, - config: Arc, - ) -> Self { - let supervised_warps = VecDeque::new(); - Self { - id, - cluster_id, - core_id, - next_cycle_prioritized_warps: VecDeque::new(), - supervised_warps, - last_supervised_issued_idx: 0, - warps, - num_issued_last_cycle: 0, - current_turn_warp: 0, - scoreboard, - config, - stats, - } - } - - fn prioritized_warps(&self) -> &VecDeque { - &self.next_cycle_prioritized_warps - } - - fn cycle(&mut self, issuer: &mut dyn super::core::WarpIssuer) { - log::debug!("{}: cycle", style("base scheduler").yellow()); - - // there was one warp with a valid instruction to issue - // (didn't require flush due to control hazard) - let mut valid_inst = false; - // of the valid instructions, there was one not waiting for pending register writes - let mut ready_inst = false; - // of these we issued one - let mut issued_inst = false; - - // dbg!(&self.next_cycle_prioritized_warps.len()); - // dbg!(&self.supervised_warps.len()); - // dbg!(&self.last_supervised_issued_idx); - // - // dbg!(&self - // .warps - // .iter() - // .map(|w| w.lock().unwrap().instruction_count()) - // .sum::()); - // dbg!(&self - // .supervised_warps - // .iter() - // .map(|w| w.lock().unwrap().instruction_count()) - // .sum::()); - // - // dbg!(&self - // .next_cycle_prioritized_warps - // .iter() - // .map(|w| w.lock().unwrap().instruction_count()) - // .sum::()); - - // log::debug!( - // "supervised warps: {:#?}", - // self.supervised_warps - // .iter() - // .map(|w| w.lock().unwrap().instruction_count()) - // .filter(|&c| c > 0) - // .collect::>() - // ); - // log::debug!( - // "next_cycle_prioritized_warps: {:#?}", - // self.next_cycle_prioritized_warps - // .iter() - // .map(|w| w.lock().unwrap().instruction_count()) - // .filter(|&c| c > 0) - // .collect::>() - // ); - - // log::debug!("next cycle prio warp"); - for next_warp_rc in &self.next_cycle_prioritized_warps { - // don't consider warps that are not yet valid - let next_warp = next_warp_rc.try_borrow().unwrap(); - let (warp_id, dyn_warp_id) = (next_warp.warp_id, next_warp.dynamic_warp_id); - // log::debug!("locked next warp = {}", warp_id); - - if next_warp.done_exit() { - continue; - } - let inst_count = next_warp.instruction_count(); - if inst_count == 0 { - log::debug!("next warp: {:#?}", &next_warp); - } - assert!(inst_count > 0); - if inst_count > 1 { - log::debug!( - "core[{}][{}] scheduler[{}]: \n\t => testing (warp_id={}, dynamic_warp_id={}, trace_pc={}, pc={:?}, ibuffer={:?}, {} instructions)", - self.cluster_id, - self.core_id, - self.id, - warp_id, dyn_warp_id, - next_warp.trace_pc, - next_warp.pc(), - next_warp.instr_buffer.iter().filter_map(Option::as_ref).map(|i| i.pc).collect::>(), inst_count, - ); - } - let mut checked = 0; - let mut issued = 0; - - let mut prev_issued_exec_unit = ExecUnitKind::NONE; - let max_issue = self.config.max_instruction_issue_per_warp; - // In tis mode, we only allow dual issue to diff execution - // units (as in Maxwell and Pascal) - let diff_exec_units = self.config.dual_issue_diff_exec_units; - - if inst_count > 1 { - if next_warp.ibuffer_empty() { - log::debug!( - "warp (warp_id={}, dynamic_warp_id={}) 
fails as ibuffer_empty", - warp_id, - dyn_warp_id - ); - } - - if next_warp.waiting() { - log::debug!( - "warp (warp_id={}, dynamic_warp_id={}) is waiting for completion", - warp_id, - dyn_warp_id - ); - } - } - - let warp = self.warps.get(warp_id).unwrap(); - - // todo: what is the difference? why dont we just use next_warp? - debug_assert!(Rc::ptr_eq(warp, next_warp_rc)); - drop(next_warp); - - // log::debug!("locking warp = {}", warp_id); - let mut warp = warp.try_borrow_mut().unwrap(); - // log::debug!("locked warp {}", warp_id); - // .as_mut() - // .as_ref() - // .unwrap(); - while !warp.waiting() - && !warp.ibuffer_empty() - && checked < max_issue - && checked <= issued - && issued < max_issue - { - // let valid = warp.ibuffer_next_valid(); - let mut warp_inst_issued = false; - - if let Some(instr) = warp.ibuffer_peek() { - // let (pc, rpc) = get_pdom_stack_top_info(warp_id, instr); - log::debug!( - "Warp (warp_id={}, dynamic_warp_id={}) instruction buffer[{}] has valid instruction {}", - warp_id, dyn_warp_id, warp.next, instr, - ); - - // In trace-driven mode, we assume no control hazard, meaning - // that `pc == rpc == instr.pc` - // if pc != instr.pc { - // log::debug!( - // "Warp (warp_id {}, dynamic_warp_id {}) control hazard instruction flush", - // warp_id, dyn_warp_id); - // // control hazard - // warp.set_next_pc(pc); - // warp.ibuffer_flush(); - // } else { - valid_inst = true; - if !self - .scoreboard - .read() - .unwrap() - .has_collision(warp_id, instr) - { - log::debug!( - "Warp (warp_id={}, dynamic_warp_id={}) {}", - warp_id, - dyn_warp_id, - style("passes scoreboard").yellow(), - ); - ready_inst = true; - - // let active_mask = core.active_mask(warp_id, instr); - - debug_assert!(warp.has_instr_in_pipeline()); - - use opcodes::ArchOp; - match instr.opcode.category { - ArchOp::LOAD_OP - | ArchOp::STORE_OP - | ArchOp::MEMORY_BARRIER_OP - | ArchOp::TENSOR_CORE_LOAD_OP - | ArchOp::TENSOR_CORE_STORE_OP => { - // if warp.warp_id == 3 { - // super::debug_break(format!( - // "scheduled mem instr for warp id 3: {}", - // instr - // )); - // } - let mem_stage = PipelineStage::ID_OC_MEM; - - let free_register = issuer.has_free_register(mem_stage, self.id); - - if free_register - && (!diff_exec_units - || prev_issued_exec_unit != ExecUnitKind::MEM) - { - let instr = warp.ibuffer_take().unwrap(); - debug_assert_eq!(warp_id, warp.warp_id); - issuer.issue_warp(mem_stage, &mut warp, instr, self.id); - // .issue_warp(mem_stage, &mut warp, instr, warp_id, self.id); - issued += 1; - issued_inst = true; - warp_inst_issued = true; - prev_issued_exec_unit = ExecUnitKind::MEM; - } else { - log::debug!("issue failed: no free mem port register"); - } - } - // ArchOp::EXIT_OPS => {} - op => { - if op != ArchOp::TENSOR_CORE_OP - && op != ArchOp::SFU_OP - && op != ArchOp::DP_OP - && (op as usize) < opcodes::SPEC_UNIT_START_ID - { - let mut execute_on_sp = false; - let mut execute_on_int = false; - - let sp_pipe_avail = self.config.num_sp_units > 0 - && issuer - .has_free_register(PipelineStage::ID_OC_SP, self.id); - let int_pipe_avail = self.config.num_int_units > 0 - && issuer - .has_free_register(PipelineStage::ID_OC_INT, self.id); - - // if INT unit pipline exist, then execute ALU and INT - // operations on INT unit and SP-FPU on SP unit (like in Volta) - // if INT unit pipline does not exist, then execute all ALU, INT - // and SP operations on SP unit (as in Fermi, Pascal GPUs) - if int_pipe_avail - && op != ArchOp::SP_OP - && !(diff_exec_units - && prev_issued_exec_unit == ExecUnitKind::INT) 
- { - execute_on_int = true; - } else if sp_pipe_avail - && (self.config.num_int_units == 0 - || (self.config.num_int_units > 0 - && op == ArchOp::SP_OP)) - && !(diff_exec_units - && prev_issued_exec_unit == ExecUnitKind::SP) - { - execute_on_sp = true; - } - - log::debug!( - "execute on INT={} execute on SP={}", - execute_on_int, - execute_on_sp - ); - - let issue_target = if execute_on_sp { - Some((PipelineStage::ID_OC_SP, ExecUnitKind::SP)) - } else if execute_on_int { - Some((PipelineStage::ID_OC_INT, ExecUnitKind::INT)) - } else { - None - }; - - if let Some((stage, unit)) = issue_target { - let instr = warp.ibuffer_take().unwrap(); - debug_assert_eq!(warp.warp_id, warp_id); - issuer.issue_warp(stage, &mut warp, instr, self.id); - // .issue_warp(stage, &mut warp, instr, warp_id, self.id); - issued += 1; - issued_inst = true; - warp_inst_issued = true; - prev_issued_exec_unit = unit; - } - } - // else if ((m_shader->m_config->gpgpu_num_dp_units > 0) && - // (pI->op == DP_OP) && - // !(diff_exec_units && previous_issued_inst_exec_type == - // exec_unit_type_t::DP)) { - // } else if (((m_shader->m_config->gpgpu_num_dp_units == 0 && - // pI->op == DP_OP) || - // (pI->op == SFU_OP) || (pI->op == ALU_SFU_OP)) && - // !(diff_exec_units && previous_issued_inst_exec_type == - // exec_unit_type_t::SFU)) { - // } else if ((pI->op == TENSOR_CORE_OP) && - // !(diff_exec_units && previous_issued_inst_exec_type == - // exec_unit_type_t::TENSOR)) { - // } else if ((pI->op >= SPEC_UNIT_START_ID) && - // !(diff_exec_units && - // previous_issued_inst_exec_type == - // exec_unit_type_t::SPECIALIZED)) { - // } - } // op => unimplemented!("op {:?} not implemented", op), - } - } else { - log::debug!( - "Warp (warp_id={}, dynamic_warp_id={}) {}", - warp_id, - dyn_warp_id, - style("fails scoreboard").yellow(), - ); - } - // } - } - // else if (valid) { - // // this case can happen after a return instruction in diverged warp - // SCHED_DPRINTF( - // "Warp (warp_id %u, dynamic_warp_id %u) return from diverged warp " - // "flush\n", - // (*iter)->get_warp_id(), (*iter)->get_dynamic_warp_id()); - // warp(warp_id).set_next_pc(pc); - // warp(warp_id).ibuffer_flush(); - // } - if warp_inst_issued { - log::debug!( - "Warp (warp_id={}, dynamic_warp_id={}) issued {} instructions", - warp_id, - dyn_warp_id, - issued - ); - // m_stats->event_warp_issued(m_shader->get_sid(), warp_id, num_issued, warp(warp_id).get_dynamic_warp_id()); - warp.ibuffer_step(); - } - checked += 1; - } - // drop(next_warp); - drop(warp); - if issued > 0 { - // This might be a bit inefficient, but we need to maintain - // two ordered list for proper scheduler execution. - // We could remove the need for this loop by associating a - // supervised_is index with each entry in the - // m_next_cycle_prioritized_warps vector. 
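-            // A sketch of that alternative (illustrative only, not implemented
-            // here): store the supervised index alongside each prioritized
-            // warp, e.g.
-            //
-            //     next_cycle_prioritized_warps: VecDeque<(usize, CoreWarp)>
-            //
-            // push `(sup_idx, Rc::clone(warp))` while ordering, and after
-            // issuing set `last_supervised_issued_idx` from that stored index
-            // instead of scanning supervised_warps for a matching entry.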
- // For now, just run through until you find the right warp_id - for (sup_idx, supervised) in self.supervised_warps.iter().enumerate() { - // if *next_warp == *supervised.lock().unwrap().warp_id { - // log::debug!("locking supervised[{}]", sup_idx); - // if dynamicwarp_id == supervised.try_borrow().unwrap().warp_id { - // if warp.borrow() == supervised.borrow() { - if *next_warp_rc.try_borrow().unwrap() == *supervised.try_borrow().unwrap() { - // test - self.last_supervised_issued_idx = sup_idx; - } - } - self.num_issued_last_cycle = issued; - if issued == 1 { - // m_stats->single_issue_nums[m_id]++; - } else if issued > 1 { - // m_stats->dual_issue_nums[m_id]++; - } - break; - } - } - - // issue stall statistics: - if !valid_inst { - // idle or control hazard - // m_stats.shader_cycle_distro[0]++; - } else if !ready_inst { - // waiting for RAW hazards (possibly due to memory) - // m_stats.shader_cycle_distro[1]++; - } else if !issued_inst { - // pipeline stalled - // m_stats.shader_cycle_distro[2]++; - } - - // todo!("base scheduler unit: cycle"); - } -} - -pub trait SchedulerUnit { - fn cycle(&mut self, _core: &mut dyn super::core::WarpIssuer) { - // fn cycle(&mut self, core: ()) { - // fn cycle(&mut self) { - todo!("scheduler unit: cycle"); - } - - // fn done_adding_supervised_warps(&mut self) { - // todo!("scheduler unit: done_adding_supervised_warps"); - // } - - fn add_supervised_warp(&mut self, _warp: CoreWarp) { - todo!("scheduler unit: add supervised warp id"); - } - - fn prioritized_warps(&self) -> &VecDeque; - - // self.scheduler - // self.inner.supervised_warps - - // fn add_supervised_warp_id(&mut self, warp_id: usize) { - // todo!("scheduler unit: add supervised warp id"); - // } - - /// Order warps based on scheduling policy. - /// - /// Derived classes can override this function to populate - /// m_supervised_warps with their scheduling policies - fn order_warps( - &mut self, - // out: &mut VecDeque, - // warps: &mut Vec, - // last_issued_warps: &Vec, - // num_warps_to_add: usize, - ) { - todo!("scheduler unit: order warps") - } -} - -#[derive(Debug)] -pub struct LrrScheduler { - inner: BaseSchedulerUnit, -} - -pub fn all_different(values: &[Rc>]) -> bool { - for (vi, v) in values.iter().enumerate() { - for (vii, vv) in values.iter().enumerate() { - let should_be_equal = vi == vii; - let are_equal = Rc::ptr_eq(v, vv); - if should_be_equal && !are_equal { - return false; - } - if !should_be_equal && are_equal { - return false; - } - } - } - true -} - -// pub struct LrrScheduler<'a> { -// inner: BaseSchedulerUnit<'a>, -// } - -// impl<'a> BaseSchedulerUnit<'a> { -impl BaseSchedulerUnit { - fn order_by_priority(&mut self, ordering: Ordering, priority_func: F) - where - F: FnMut(&CoreWarp, &CoreWarp) -> std::cmp::Ordering, - { - // todo!("base scheduler unit: order by priority"); - let num_warps_to_add = self.supervised_warps.len(); - let out = &mut self.next_cycle_prioritized_warps; - - debug_assert!(num_warps_to_add <= self.warps.len()); - out.clear(); - - debug_assert!(all_different(&self.supervised_warps.make_contiguous())); - - // let mut last_issued_iter = self.warps.iter().skip(self.last_supervised_issued_idx); - let mut last_issued_iter = self - .supervised_warps - .iter() - .skip(self.last_supervised_issued_idx); - debug_assert!(all_different(&self.warps)); - - // TODO: maybe we actually should make a copy of the supervised warps to not actually - // reorder those for stability - - let mut supervised_warps_sorted: Vec<_> = - 
self.supervised_warps.clone().into_iter().collect(); - supervised_warps_sorted.sort_by(priority_func); - - debug_assert!(all_different(&supervised_warps_sorted)); - - // dbg!(&supervised_warps_sorted.len()); - // dbg!(&supervised_warps_sorted - // .iter() - // .map(|w| w.borrow().dynamic_warp_id) - // .collect::>()); - - // self.supervised_warps - // .make_contiguous() - // .sort_by(priority_func); - - match ordering { - Ordering::GREEDY_THEN_PRIORITY_FUNC => { - let greedy_value = last_issued_iter.next(); - if let Some(greedy) = greedy_value { - out.push_back(Rc::clone(greedy)); - } - - log::debug!( - "added greedy warp (last supervised issued idx={}): {:?}", - self.last_supervised_issued_idx, - &greedy_value.map(|w| w.borrow().dynamic_warp_id) - ); - - // dbg!(&greedy_value); - - // self.supervised_warps - // .make_contiguous() - // .sort_by(priority_func); - - // self.supervised_warpsself.supervised_warps.any( .iter() - - out.extend( - supervised_warps_sorted - .into_iter() - .take(num_warps_to_add) - .filter(|warp| { - if let Some(greedy) = greedy_value { - // log::debug!( - // "greedy@{:?} warp@{:?}", - // Rc::as_ptr(greedy), - // Rc::as_ptr(warp) - // ); - let already_added = Rc::ptr_eq(greedy, warp); - !already_added - } else { - true - } - }), - // .map(Rc::clone), - ); - } - Ordering::PRIORITY_FUNC_ONLY => { - // self.supervised_warps - // .make_contiguous() - // .sort_by(priority_func); - out.extend(supervised_warps_sorted.into_iter().take(num_warps_to_add)); - } - } - // dbg!(num_warps_to_add, out.len()); - assert_eq!( - num_warps_to_add, - out.len(), - "either too few supervised warps or greedy warp not in supervised warps" - ); - } - - fn order_rrr( - &mut self, - // out: &mut VecDeque, - // warps: &mut Vec, - // std::vector &result_list, const typename std::vector &input_list, - // const typename std::vector::const_iterator &last_issued_from_input, - // unsigned num_warps_to_add) - ) { - unimplemented!("order rrr is untested"); - let num_warps_to_add = self.supervised_warps.len(); - let out = &mut self.next_cycle_prioritized_warps; - // order_lrr( - // &mut self.inner.next_cycle_prioritized_warps, - // &mut self.inner.supervised_warps, - // &mut self.inner.last_supervised_issued_idx, - // // &mut self.inner.last_supervised_issued(), - // num_warps_to_add, - // ); - - out.clear(); - - let current_turn_warp_ref = self.warps.get(self.current_turn_warp).unwrap(); - let current_turn_warp = current_turn_warp_ref.try_borrow().unwrap(); - // .as_ref() - // .unwrap(); - - if self.num_issued_last_cycle > 0 - || current_turn_warp.done_exit() - || current_turn_warp.waiting() - { - // std::vector::const_iterator iter = - // (last_issued_from_input == input_list.end()) ? 
- // input_list.begin() : last_issued_from_input + 1; - - let mut iter = self - .supervised_warps - .iter() - .skip(self.last_supervised_issued_idx + 1) - .chain(self.supervised_warps.iter()); - - for w in iter.take(num_warps_to_add) { - let warp = w.try_borrow().unwrap(); - let warp_id = warp.warp_id; - if !warp.done_exit() && !warp.waiting() { - out.push_back(w.clone()); - self.current_turn_warp = warp_id; - break; - } - } - // for (unsigned count = 0; count < num_warps_to_add; ++iter, ++count) { - // if (iter == input_list.end()) { - // iter = input_list.begin(); - // } - // unsigned warp_id = (*iter)->get_warp_id(); - // if (!(*iter)->done_exit() && !(*iter)->waiting()) { - // result_list.push_back(*iter); - // m_current_turn_warp = warp_id; - // break; - // } - // } - } else { - out.push_back(current_turn_warp_ref.clone()); - } - } - - fn order_lrr( - &mut self, - // out: &mut VecDeque, - // warps: &mut Vec, - // // last_issued_warps: &Vec, - // // last_issued_warps: impl Iterator, - // // last_issued_warps: &mut std::slice::Iter<'_, SchedulerWarp>, - // // last_issued_warps: impl Iterator, - // last_issued_warp_idx: &mut usize, - // num_warps_to_add: usize, - ) { - unimplemented!("order lrr is not tested"); - let num_warps_to_add = self.supervised_warps.len(); - let out = &mut self.next_cycle_prioritized_warps; - - debug_assert!(num_warps_to_add <= self.warps.len()); - out.clear(); - // if last_issued_warps - // typename std::vector::const_iterator iter = (last_issued_from_input == input_list.end()) ? input_list.begin() - // : last_issued_from_input + 1; - // - let mut last_issued_iter = self.warps.iter().skip(self.last_supervised_issued_idx); - - let mut iter = last_issued_iter.chain(self.warps.iter()); - // .filter_map(|x| x.as_ref()); - // .filter_map(|x| x.as_ref()); - - out.extend(iter.take(num_warps_to_add).cloned()); - // for count in 0..num_warps_to_add { - // let Some(warp) = iter.next() else { - // return; - // }; - // // if (iter == input_list.end()) { - // // iter = input_list.begin(); - // // } - // out.push_back(warp.clone()); - // } - // todo!("order lrr: order warps") - } -} - -impl SchedulerUnit for LrrScheduler { - // impl<'a> SchedulerUnit for LrrScheduler<'a> { - fn order_warps( - &mut self, - // out: &mut VecDeque, - // warps: &mut Vec, - // last_issued_warps: &Vec, - // num_warps_to_add: usize, - ) { - self.inner.order_lrr(); - // let num_warps_to_add = self.inner.supervised_warps.len(); - // order_lrr( - // &mut self.inner.next_cycle_prioritized_warps, - // &mut self.inner.supervised_warps, - // &mut self.inner.last_supervised_issued_idx, - // // &mut self.inner.last_supervised_issued(), - // num_warps_to_add, - // ); - } - - fn add_supervised_warp(&mut self, warp: CoreWarp) { - self.inner.supervised_warps.push_back(warp); - // self.inner.add_supervised_warp_id(warp_id); - } - - fn prioritized_warps(&self) -> &VecDeque { - self.inner.prioritized_warps() - } - - // fn add_supervised_warp_id(&mut self, warp_id: usize) { - // self.inner.add_supervised_warp_id(warp_id); - // } - - // fn done_adding_supervised_warps(&mut self) { - // self.inner.last_supervised_issued_idx = self.inner.supervised_warps.len(); - // } - - // fn cycle(&mut self, core: &mut super::core::InnerSIMTCore) { - // fn cycle(&mut self, core: ()) { - fn cycle(&mut self, issuer: &mut dyn super::core::WarpIssuer) { - self.order_warps(); - self.inner.cycle(issuer); - } -} - -// impl<'a> LrrScheduler<'a> { -impl LrrScheduler { - // fn order_warps( - // &self, - // out: &mut VecDeque, - // warps: &mut 
Vec, - // last_issued_warps: &Vec, - // num_warps_to_add: usize, - // ) { - // todo!("scheduler unit: order warps") - // } - - // pub fn new( - // id: usize, - // // warps: &'a Vec, - // warps: Vec, - // // warps: &'a Vec>, - // // mem_out: &'a register_set::RegisterSet, - // // core: &'a super::core::InnerSIMTCore, - // scoreboard: Arc>, - // stats: Arc>, - // config: Arc, - // ) -> Self { - // // todo!("lrr scheduler: new"); - // let inner = BaseSchedulerUnit::new( - // id, // mem_out, core, - // warps, scoreboard, stats, config, - // ); - // Self { inner } - // } - - // lrr_scheduler(shader_core_stats *stats, shader_core_ctx *shader, - // Scoreboard *scoreboard, simt_stack **simt, - // std::vector *warp, register_set *sp_out, - // register_set *dp_out, register_set *sfu_out, - // register_set *int_out, register_set *tensor_core_out, - // std::vector &spec_cores_out, - // register_set *mem_out, int id) - // : scheduler_unit(stats, shader, scoreboard, simt, warp, sp_out, dp_out, - // sfu_out, int_out, tensor_core_out, spec_cores_out, - // mem_out, id) {} - - // virtual void order_warps(); -} - -#[derive(Debug)] -pub struct GTOScheduler { - inner: BaseSchedulerUnit, -} - -impl GTOScheduler { - pub fn new( - id: usize, - cluster_id: usize, - core_id: usize, - warps: Vec, - scoreboard: Arc>, - stats: Arc>, - config: Arc, - ) -> Self { - let inner = BaseSchedulerUnit::new( - id, // mem_out, core, - cluster_id, core_id, warps, scoreboard, stats, config, - ); - Self { inner } - } -} - -impl GTOScheduler { - fn debug_warp_ids(&self) -> Vec { - self.inner - .next_cycle_prioritized_warps - .iter() - .map(|w| w.borrow().warp_id) - .collect() - } - - fn debug_dynamic_warp_ids(&self) -> Vec { - self.inner - .next_cycle_prioritized_warps - .iter() - .map(|w| w.borrow().dynamic_warp_id()) - .collect() - } -} - -impl SchedulerUnit for GTOScheduler { - fn order_warps(&mut self) { - // order_by_priority( - // m_next_cycle_prioritized_warps, - // m_supervised_warps, - // m_last_supervised_issued, - // m_supervised_warps.size(), - // ORDERING_GREEDY_THEN_PRIORITY_FUNC, - // scheduler_unit::sort_warps_by_oldest_dynamic_id, - // ); - //x - - // let before = self.inner.next_cycle_prioritized_warps.len(); - self.inner.order_by_priority( - Ordering::GREEDY_THEN_PRIORITY_FUNC, - sort_warps_by_oldest_dynamic_id, - ); - // let after = self.inner.next_cycle_prioritized_warps.len(); - // assert_eq!(before, after); - } - - fn add_supervised_warp(&mut self, warp: CoreWarp) { - self.inner.supervised_warps.push_back(warp); - } - - fn prioritized_warps(&self) -> &VecDeque { - self.inner.prioritized_warps() - } - - // fn done_adding_supervised_warps(&mut self) { - // // self.inner.last_supervised_issued_idx = self.inner.supervised_warps.len(); - // self.inner.last_supervised_issued_idx = 0; - // } - - // fn cycle(&mut self, core: ()) { - fn cycle(&mut self, issuer: &mut dyn super::core::WarpIssuer) { - log::debug!( - "gto scheduler[{}]: BEFORE: prioritized warp ids: {:?}", - self.inner.id, - self.debug_warp_ids() - ); - log::debug!( - "gto scheduler[{}]: BEFORE: prioritized dynamic warp ids: {:?}", - self.inner.id, - self.debug_dynamic_warp_ids() - ); - - self.order_warps(); - - log::debug!( - "gto scheduler[{}]: AFTER: prioritized warp ids: {:?}", - self.inner.id, - self.debug_warp_ids() - ); - log::debug!( - "gto scheduler[{}]: AFTER: prioritized dynamic warp ids: {:?}", - self.inner.id, - self.debug_dynamic_warp_ids() - ); - - self.inner.cycle(issuer); - } -} - -impl GTOScheduler { - pub fn order_warps( - &self, - out: 
&mut VecDeque, - warps: &mut Vec, - _last_issued_warps: &Vec, - num_warps_to_add: usize, - ) { - // let mut next_cycle_prioritized_warps = Vec::new(); - // - // let mut supervised_warps = Vec::new(); // input - // let mut last_issued_from_input = Vec::new(); // last issued - // let num_warps_to_add = supervised_warps.len(); - debug_assert!(num_warps_to_add <= warps.len()); - - // scheduler_unit::sort_warps_by_oldest_dynamic_id - - // ORDERING_GREEDY_THEN_PRIORITY_FUNC - out.clear(); - // let greedy_value = last_issued_warps.first(); - // if let Some(greedy_value) = greedy_value { - // out.push_back(greedy_value.clone()); - // } - // - // warps.sort_by(sort_warps_by_oldest_dynamic_id); - // out.extend( - // warps - // .iter() - // .take_while(|w| match greedy_value { - // None => true, - // Some(val) => *w != val, - // }) - // .take(num_warps_to_add) - // .cloned(), - // ); - - // typename std::vector::iterator iter = temp.begin(); - // for (unsigned count = 0; count < num_warps_to_add; ++count, ++iter) { - // if (*iter != greedy_value) { - // result_list.push_back(*iter); - // } - // } - - // result_list.clear(); - // typename std::vector temp = input_list; - // - // if (ORDERING_GREEDY_THEN_PRIORITY_FUNC == ordering) { - // T greedy_value = *last_issued_from_input; - // result_list.push_back(greedy_value); - // - // std::sort(temp.begin(), temp.end(), priority_func); - // typename std::vector::iterator iter = temp.begin(); - // for (unsigned count = 0; count < num_warps_to_add; ++count, ++iter) { - // if (*iter != greedy_value) { - // result_list.push_back(*iter); - // } - // } - // } else if (ORDERED_PRIORITY_FUNC_ONLY == ordering) { - // std::sort(temp.begin(), temp.end(), priority_func); - // typename std::vector::iterator iter = temp.begin(); - // for (unsigned count = 0; count < num_warps_to_add; ++count, ++iter) { - // result_list.push_back(*iter); - // } - // } else { - // fprintf(stderr, "Unknown ordering - %d\n", ordering); - // abort(); - // } - - // order by priority - // (m_next_cycle_prioritized_warps, m_supervised_warps, - // m_last_supervised_issued, m_supervised_warps.size(), - // ORDERING_GREEDY_THEN_PRIORITY_FUNC, - // scheduler_unit::sort_warps_by_oldest_dynamic_id); - } -} - -#[cfg(test)] -mod tests { - use crate::ported::testing; - - use std::ptr; - - #[ignore = "todo"] - #[test] - fn test_shd_warp() { - use playground::types::trace_shd_warp::new_trace_shd_warp; - let core = ptr::null_mut(); - let warp_size = 32; - let mut warp = unsafe { new_trace_shd_warp(core, warp_size) }; - warp.pin_mut().reset(); - dbg!(&warp.get_n_completed()); - dbg!(&warp.hardware_done()); - dbg!(&warp.functional_done()); - assert!(false); - } - - #[test] - fn test_skip_iterator_indexing() { - let issued_warp_id = 3; - let supervised_warp_ids = vec![1, 2, 3, 4, 5]; - let mut last_supervised_idx = 0; - - for (idx, id) in supervised_warp_ids.iter().enumerate() { - if *id == issued_warp_id { - last_supervised_idx = idx; - } - } - assert_eq!( - supervised_warp_ids.iter().skip(last_supervised_idx).next(), - Some(&issued_warp_id) - ); - } - - impl From<&Box> for testing::state::Scheduler { - fn from(scheduler: &Box) -> Self { - // let prioritized_warps = ; - let prioritized_warp_ids: Vec<_> = scheduler - .prioritized_warps() - .iter() - .map(|warp| (warp.borrow().warp_id, warp.borrow().dynamic_warp_id())) - .collect(); - // let prioritized_warp_ids: Vec<_> = prioritized_warps - // .clone() - // .map(|warp| warp.borrow().warp_id) - // .collect(); - // let prioritized_dynamic_warp_ids: Vec<_> = 
prioritized_warps - // .clone() - // .map(|warp| warp.borrow().dynamic_warp_id()) - // .collect(); - // - // assert_eq!( - // prioritized_warp_ids.len(), - // prioritized_dynamic_warp_ids.len() - // ); - - Self { - prioritized_warp_ids, - // prioritized_warp_ids - // prioritized_dynamic_warp_ids, - } - } - } -} diff --git a/src/ported/scheduler/gto.rs b/src/ported/scheduler/gto.rs new file mode 100644 index 00000000..b3623d90 --- /dev/null +++ b/src/ported/scheduler/gto.rs @@ -0,0 +1,89 @@ +use super::{BaseSchedulerUnit, SchedulerUnit, WarpRef}; +use crate::config::GPUConfig; +use crate::ported::scoreboard::Scoreboard; +use std::collections::VecDeque; +use std::sync::{Arc, Mutex, RwLock}; + +#[derive(Debug)] +pub struct Scheduler { + inner: BaseSchedulerUnit, +} + +impl Scheduler { + pub fn new( + id: usize, + cluster_id: usize, + core_id: usize, + warps: Vec, + scoreboard: Arc>, + stats: Arc>, + config: Arc, + ) -> Self { + let inner = + BaseSchedulerUnit::new(id, cluster_id, core_id, warps, scoreboard, stats, config); + Self { inner } + } +} + +impl Scheduler { + fn debug_warp_ids(&self) -> Vec { + self.inner + .next_cycle_prioritized_warps + .iter() + .map(|w| w.borrow().warp_id) + .collect() + } + + fn debug_dynamic_warp_ids(&self) -> Vec { + self.inner + .next_cycle_prioritized_warps + .iter() + .map(|w| w.borrow().dynamic_warp_id()) + .collect() + } +} + +impl SchedulerUnit for Scheduler { + fn order_warps(&mut self) { + self.inner.order_by_priority( + super::ordering::Ordering::GREEDY_THEN_PRIORITY_FUNC, + super::ordering::sort_warps_by_oldest_dynamic_id, + ); + } + + fn add_supervised_warp(&mut self, warp: WarpRef) { + self.inner.supervised_warps.push_back(warp); + } + + fn prioritized_warps(&self) -> &VecDeque { + self.inner.prioritized_warps() + } + + fn cycle(&mut self, issuer: &mut dyn crate::ported::core::WarpIssuer) { + log::debug!( + "gto scheduler[{}]: BEFORE: prioritized warp ids: {:?}", + self.inner.id, + self.debug_warp_ids() + ); + log::debug!( + "gto scheduler[{}]: BEFORE: prioritized dynamic warp ids: {:?}", + self.inner.id, + self.debug_dynamic_warp_ids() + ); + + self.order_warps(); + + log::debug!( + "gto scheduler[{}]: AFTER: prioritized warp ids: {:?}", + self.inner.id, + self.debug_warp_ids() + ); + log::debug!( + "gto scheduler[{}]: AFTER: prioritized dynamic warp ids: {:?}", + self.inner.id, + self.debug_dynamic_warp_ids() + ); + + self.inner.cycle(issuer); + } +} diff --git a/src/ported/scheduler/mod.rs b/src/ported/scheduler/mod.rs new file mode 100644 index 00000000..faec15ea --- /dev/null +++ b/src/ported/scheduler/mod.rs @@ -0,0 +1,396 @@ +pub mod gto; +pub mod ordering; +pub mod warp; + +use super::core::PipelineStage; +use super::{opcodes, scoreboard}; +use crate::config::GPUConfig; +use console::style; +use std::cell::RefCell; +use std::collections::VecDeque; +use std::rc::Rc; +use std::sync::{Arc, Mutex, RwLock}; + +pub use warp::{SchedulerWarp, ThreadActiveMask}; + +pub type WarpRef = Rc>; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +enum ExecUnitKind { + NONE = 0, + SP = 1, + #[allow(dead_code)] + SFU = 2, + MEM = 3, + #[allow(dead_code)] + DP = 4, + INT = 5, + #[allow(dead_code)] + TENSOR = 6, + #[allow(dead_code)] + SPECIALIZED = 7, +} + +pub trait SchedulerUnit { + fn cycle(&mut self, _core: &mut dyn super::core::WarpIssuer); + + fn add_supervised_warp(&mut self, warp: WarpRef); + + fn prioritized_warps(&self) -> &VecDeque; + + /// Order warps based on scheduling policy. 
+ fn order_warps(&mut self); +} + +#[derive(Debug)] +pub struct BaseSchedulerUnit { + id: usize, + cluster_id: usize, + core_id: usize, + + /// This is the prioritized warp list that is looped over each cycle to + /// determine which warp gets to issue. + next_cycle_prioritized_warps: VecDeque, + + // The supervised warps list keeps all warps this scheduler can arbitrate between. + // + // This is useful in systems where there is more than one warp scheduler. + // In a single scheduler system, this is simply all the warps + // assigned to this core. + supervised_warps: VecDeque, + warps: Vec, + + /// This is the iterator pointer to the last supervised warp issued + last_supervised_issued_idx: usize, + num_issued_last_cycle: usize, + + scoreboard: Arc>, + + config: Arc, + stats: Arc>, +} + +impl BaseSchedulerUnit { + pub fn new( + id: usize, + cluster_id: usize, + core_id: usize, + warps: Vec, + scoreboard: Arc>, + stats: Arc>, + config: Arc, + ) -> Self { + let supervised_warps = VecDeque::new(); + Self { + id, + cluster_id, + core_id, + next_cycle_prioritized_warps: VecDeque::new(), + supervised_warps, + last_supervised_issued_idx: 0, + warps, + num_issued_last_cycle: 0, + stats, + scoreboard, + config, + } + } + + fn prioritized_warps(&self) -> &VecDeque { + &self.next_cycle_prioritized_warps + } + + fn cycle(&mut self, issuer: &mut dyn super::core::WarpIssuer) { + log::debug!("{}: cycle", style("base scheduler").yellow()); + + let mut valid_inst = false; + let mut ready_inst = false; + let mut issued_inst = false; + + for next_warp_rc in &self.next_cycle_prioritized_warps { + // don't consider warps that are not yet valid + let next_warp = next_warp_rc.try_borrow().unwrap(); + let (warp_id, dyn_warp_id) = (next_warp.warp_id, next_warp.dynamic_warp_id); + + if next_warp.done_exit() { + continue; + } + let inst_count = next_warp.instruction_count(); + if inst_count == 0 { + log::debug!("next warp: {:#?}", &next_warp); + } + assert!(inst_count > 0); + if inst_count > 1 { + log::debug!( + "core[{}][{}] scheduler[{}]: \n\t => testing (warp_id={}, dynamic_warp_id={}, trace_pc={}, pc={:?}, ibuffer={:?}, {} instructions)", + self.cluster_id, + self.core_id, + self.id, + warp_id, dyn_warp_id, + next_warp.trace_pc, + next_warp.pc(), + next_warp.instr_buffer.iter().filter_map(Option::as_ref).map(|i| i.pc).collect::>(), inst_count, + ); + } + let mut checked = 0; + let mut issued = 0; + + let mut prev_issued_exec_unit = ExecUnitKind::NONE; + let max_issue = self.config.max_instruction_issue_per_warp; + // In this mode, we only allow dual issue to different execution + // units (as in Maxwell and Pascal) + let diff_exec_units = self.config.dual_issue_diff_exec_units; + + if inst_count > 1 { + if next_warp.ibuffer_empty() { + log::debug!( + "warp (warp_id={}, dynamic_warp_id={}) fails as ibuffer_empty", + warp_id, + dyn_warp_id + ); + } + + if next_warp.waiting() { + log::debug!( + "warp (warp_id={}, dynamic_warp_id={}) is waiting for completion", + warp_id, + dyn_warp_id + ); + } + } + + let warp = self.warps.get(warp_id).unwrap(); + + // todo: what is the difference? why don't we just use next_warp? 
+ debug_assert!(Rc::ptr_eq(warp, next_warp_rc)); + drop(next_warp); + + let mut warp = warp.try_borrow_mut().unwrap(); + while !warp.waiting() + && !warp.ibuffer_empty() + && checked < max_issue + && checked <= issued + && issued < max_issue + { + let mut warp_inst_issued = false; + + if let Some(instr) = warp.ibuffer_peek() { + log::debug!( + "Warp (warp_id={}, dynamic_warp_id={}) instruction buffer[{}] has valid instruction {}", + warp_id, dyn_warp_id, warp.next, instr, + ); + + valid_inst = true; + if !self + .scoreboard + .read() + .unwrap() + .has_collision(warp_id, instr) + { + log::debug!( + "Warp (warp_id={}, dynamic_warp_id={}) {}", + warp_id, + dyn_warp_id, + style("passes scoreboard").yellow(), + ); + ready_inst = true; + + debug_assert!(warp.has_instr_in_pipeline()); + + use opcodes::ArchOp; + match instr.opcode.category { + ArchOp::LOAD_OP + | ArchOp::STORE_OP + | ArchOp::MEMORY_BARRIER_OP + | ArchOp::TENSOR_CORE_LOAD_OP + | ArchOp::TENSOR_CORE_STORE_OP => { + let mem_stage = PipelineStage::ID_OC_MEM; + + let free_register = issuer.has_free_register(mem_stage, self.id); + + if free_register + && (!diff_exec_units + || prev_issued_exec_unit != ExecUnitKind::MEM) + { + let instr = warp.ibuffer_take().unwrap(); + debug_assert_eq!(warp_id, warp.warp_id); + issuer.issue_warp(mem_stage, &mut warp, instr, self.id); + issued += 1; + issued_inst = true; + warp_inst_issued = true; + prev_issued_exec_unit = ExecUnitKind::MEM; + } else { + log::debug!("issue failed: no free mem port register"); + } + } + op => { + if op != ArchOp::TENSOR_CORE_OP + && op != ArchOp::SFU_OP + && op != ArchOp::DP_OP + && (op as usize) < opcodes::SPEC_UNIT_START_ID + { + let mut execute_on_sp = false; + let mut execute_on_int = false; + + let sp_pipe_avail = self.config.num_sp_units > 0 + && issuer + .has_free_register(PipelineStage::ID_OC_SP, self.id); + let int_pipe_avail = self.config.num_int_units > 0 + && issuer + .has_free_register(PipelineStage::ID_OC_INT, self.id); + + // if INT unit pipline exist, then execute ALU and INT + // operations on INT unit and SP-FPU on SP unit (like in Volta) + // if INT unit pipline does not exist, then execute all ALU, INT + // and SP operations on SP unit (as in Fermi, Pascal GPUs) + if int_pipe_avail + && op != ArchOp::SP_OP + && !(diff_exec_units + && prev_issued_exec_unit == ExecUnitKind::INT) + { + execute_on_int = true; + } else if sp_pipe_avail + && (self.config.num_int_units == 0 + || (self.config.num_int_units > 0 + && op == ArchOp::SP_OP)) + && !(diff_exec_units + && prev_issued_exec_unit == ExecUnitKind::SP) + { + execute_on_sp = true; + } + + log::debug!( + "execute on INT={} execute on SP={}", + execute_on_int, + execute_on_sp + ); + + let issue_target = if execute_on_sp { + Some((PipelineStage::ID_OC_SP, ExecUnitKind::SP)) + } else if execute_on_int { + Some((PipelineStage::ID_OC_INT, ExecUnitKind::INT)) + } else { + None + }; + + if let Some((stage, unit)) = issue_target { + let instr = warp.ibuffer_take().unwrap(); + debug_assert_eq!(warp.warp_id, warp_id); + issuer.issue_warp(stage, &mut warp, instr, self.id); + // .issue_warp(stage, &mut warp, instr, warp_id, self.id); + issued += 1; + issued_inst = true; + warp_inst_issued = true; + prev_issued_exec_unit = unit; + } + } + } // op => unimplemented!("op {:?} not implemented", op), + } + } else { + log::debug!( + "Warp (warp_id={}, dynamic_warp_id={}) {}", + warp_id, + dyn_warp_id, + style("fails scoreboard").yellow(), + ); + } + } + if warp_inst_issued { + log::debug!( + "Warp (warp_id={}, 
dynamic_warp_id={}) issued {} instructions", + warp_id, + dyn_warp_id, + issued + ); + warp.ibuffer_step(); + } + checked += 1; + } + drop(warp); + if issued > 0 { + // This might be a bit inefficient, but we need to maintain + // two ordered list for proper scheduler execution. + // We could remove the need for this loop by associating a + // supervised_is index with each entry in the + // m_next_cycle_prioritized_warps vector. + // For now, just run through until you find the right warp_id + for (sup_idx, supervised) in self.supervised_warps.iter().enumerate() { + if *next_warp_rc.try_borrow().unwrap() == *supervised.try_borrow().unwrap() { + self.last_supervised_issued_idx = sup_idx; + } + } + self.num_issued_last_cycle = issued; + let mut stats = self.stats.lock().unwrap(); + if issued == 1 { + stats.num_single_issue += 1; + } else { + stats.num_dual_issue += 1; + } + break; + } + } + + // issue stall statistics + let mut stats = self.stats.lock().unwrap(); + if !valid_inst { + // idle or control hazard + stats.issue_raw_hazard_stall += 1; + } else if !ready_inst { + // waiting for RAW hazards (possibly due to memory) + stats.issue_control_hazard_stall += 1; + } else if !issued_inst { + // pipeline stalled + stats.issue_pipeline_stall += 1; + } + } +} + +#[cfg(test)] +mod tests { + use crate::ported::testing; + use std::ptr; + + #[ignore = "todo"] + #[test] + fn test_shd_warp() { + use playground::types::trace_shd_warp::new_trace_shd_warp; + let core = ptr::null_mut(); + let warp_size = 32; + let mut warp = unsafe { new_trace_shd_warp(core, warp_size) }; + warp.pin_mut().reset(); + dbg!(&warp.get_n_completed()); + dbg!(&warp.hardware_done()); + dbg!(&warp.functional_done()); + assert!(false); + } + + #[test] + fn test_skip_iterator_indexing() { + let issued_warp_id = 3; + let supervised_warp_ids = vec![1, 2, 3, 4, 5]; + let mut last_supervised_idx = 0; + + for (idx, id) in supervised_warp_ids.iter().enumerate() { + if *id == issued_warp_id { + last_supervised_idx = idx; + } + } + assert_eq!( + supervised_warp_ids.iter().nth(last_supervised_idx), + Some(&issued_warp_id) + ); + } + + impl From<&Box> for testing::state::Scheduler { + fn from(scheduler: &Box) -> Self { + let prioritized_warp_ids: Vec<_> = scheduler + .prioritized_warps() + .iter() + .map(|warp| (warp.borrow().warp_id, warp.borrow().dynamic_warp_id())) + .collect(); + Self { + prioritized_warp_ids, + } + } + } +} diff --git a/src/ported/scheduler/ordering.rs b/src/ported/scheduler/ordering.rs new file mode 100644 index 00000000..e7b28785 --- /dev/null +++ b/src/ported/scheduler/ordering.rs @@ -0,0 +1,109 @@ +use super::{BaseSchedulerUnit, WarpRef}; + +use std::cell::RefCell; +use std::rc::Rc; + +pub fn all_different(values: &[Rc>]) -> bool { + for (vi, v) in values.iter().enumerate() { + for (vii, vv) in values.iter().enumerate() { + let should_be_equal = vi == vii; + let are_equal = Rc::ptr_eq(v, vv); + if should_be_equal && !are_equal { + return false; + } + if !should_be_equal && are_equal { + return false; + } + } + } + true +} + +pub fn sort_warps_by_oldest_dynamic_id(lhs: &WarpRef, rhs: &WarpRef) -> std::cmp::Ordering { + let lhs = lhs.try_borrow().unwrap(); + let rhs = rhs.try_borrow().unwrap(); + if lhs.done_exit() || lhs.waiting() { + std::cmp::Ordering::Greater + } else if rhs.done_exit() || rhs.waiting() { + std::cmp::Ordering::Less + } else { + lhs.dynamic_warp_id().cmp(&rhs.dynamic_warp_id()) + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum Ordering { + // The item that issued last is 
prioritized first then the + // sorted result of the priority_function + GREEDY_THEN_PRIORITY_FUNC = 0, + // No greedy scheduling based on last to issue. + // + // Only the priority function determines priority + PRIORITY_FUNC_ONLY, + // NUM_ORDERING, +} + +impl BaseSchedulerUnit { + pub fn order_by_priority(&mut self, ordering: Ordering, priority_func: F) + where + F: FnMut(&WarpRef, &WarpRef) -> std::cmp::Ordering, + { + let num_warps_to_add = self.supervised_warps.len(); + let out = &mut self.next_cycle_prioritized_warps; + + debug_assert!(num_warps_to_add <= self.warps.len()); + out.clear(); + + debug_assert!(all_different(self.supervised_warps.make_contiguous())); + + let mut last_issued_iter = self + .supervised_warps + .iter() + .skip(self.last_supervised_issued_idx); + debug_assert!(all_different(&self.warps)); + + // sort a copy of the supervised warps reorder those for stability + let mut supervised_warps_sorted: Vec<_> = + self.supervised_warps.clone().into_iter().collect(); + supervised_warps_sorted.sort_by(priority_func); + + debug_assert!(all_different(&supervised_warps_sorted)); + + match ordering { + Ordering::GREEDY_THEN_PRIORITY_FUNC => { + let greedy_value = last_issued_iter.next(); + if let Some(greedy) = greedy_value { + out.push_back(Rc::clone(greedy)); + } + + log::debug!( + "added greedy warp (last supervised issued idx={}): {:?}", + self.last_supervised_issued_idx, + &greedy_value.map(|w| w.borrow().dynamic_warp_id) + ); + + out.extend( + supervised_warps_sorted + .into_iter() + .take(num_warps_to_add) + .filter(|warp| { + if let Some(greedy) = greedy_value { + let already_added = Rc::ptr_eq(greedy, warp); + !already_added + } else { + true + } + }), + ); + } + Ordering::PRIORITY_FUNC_ONLY => { + out.extend(supervised_warps_sorted.into_iter().take(num_warps_to_add)); + } + } + assert_eq!( + num_warps_to_add, + out.len(), + "either too few supervised warps or greedy warp not in supervised warps" + ); + } +} diff --git a/src/ported/scheduler/warp.rs b/src/ported/scheduler/warp.rs new file mode 100644 index 00000000..a9fd2bea --- /dev/null +++ b/src/ported/scheduler/warp.rs @@ -0,0 +1,207 @@ +use crate::ported::{instruction::WarpInstruction, kernel::Kernel}; +use bitvec::{array::BitArray, BitArr}; +use std::collections::VecDeque; +use std::sync::Arc; + +pub type ThreadActiveMask = BitArr!(for 32, in u32); + +#[derive(Debug)] +pub struct SchedulerWarp { + pub block_id: u64, + pub dynamic_warp_id: usize, + pub warp_id: usize, + pub kernel: Option>, + + pub trace_pc: usize, + pub active_mask: ThreadActiveMask, + pub trace_instructions: VecDeque, + + // state + pub done_exit: bool, + pub num_instr_in_pipeline: usize, + pub num_outstanding_stores: usize, + pub num_outstanding_atomics: usize, + pub has_imiss_pending: bool, + pub instr_buffer: Vec>, + pub next: usize, +} + +impl PartialEq for SchedulerWarp { + fn eq(&self, other: &Self) -> bool { + self.kernel == other.kernel + && self.block_id == other.block_id + && self.warp_id == other.warp_id + && self.dynamic_warp_id == other.dynamic_warp_id + } +} + +const IBUFFER_SIZE: usize = 2; + +impl Default for SchedulerWarp { + fn default() -> Self { + let instr_buffer = vec![None; IBUFFER_SIZE]; + Self { + block_id: 0, + dynamic_warp_id: u32::MAX as usize, + warp_id: u32::MAX as usize, + kernel: None, + trace_pc: 0, + trace_instructions: VecDeque::new(), + active_mask: BitArray::ZERO, + done_exit: false, + num_instr_in_pipeline: 0, + num_outstanding_stores: 0, + num_outstanding_atomics: 0, + has_imiss_pending: false, + 
instr_buffer, + next: 0, + } + } +} + +impl SchedulerWarp { + pub fn init( + &mut self, + _start_pc: Option, + block_id: u64, + warp_id: usize, + dynamic_warp_id: usize, + active_mask: ThreadActiveMask, + kernel: Arc, + ) { + self.block_id = block_id; + self.warp_id = warp_id; + self.dynamic_warp_id = dynamic_warp_id; + self.done_exit = false; + self.kernel = Some(kernel); + self.active_mask = active_mask; + } + + pub fn reset(&mut self) { + debug_assert_eq!(self.num_outstanding_stores, 0); + debug_assert_eq!(self.num_instr_in_pipeline, 0); + self.has_imiss_pending = false; + self.warp_id = u32::MAX as usize; + self.dynamic_warp_id = u32::MAX as usize; + + self.active_mask.fill(false); + self.done_exit = true; + self.next = 0; + } + + #[must_use] pub fn current_instr(&self) -> Option<&WarpInstruction> { + self.trace_instructions.get(self.trace_pc) + } + + pub fn push_trace_instruction(&mut self, instr: WarpInstruction) { + self.trace_instructions.push_back(instr); + } + + pub fn next_trace_inst(&mut self) -> Option<&WarpInstruction> { + let trace_instr = self.trace_instructions.get(self.trace_pc)?; + self.trace_pc += 1; + Some(trace_instr) + } + + #[must_use] pub fn instruction_count(&self) -> usize { + self.trace_instructions.len() + } + + #[must_use] pub fn pc(&self) -> Option { + debug_assert!(self.trace_pc <= self.instruction_count()); + self.trace_instructions + .get(self.trace_pc) + .map(|instr| instr.pc) + } + + #[must_use] pub fn done(&self) -> bool { + self.trace_pc == self.instruction_count() + } + + pub fn clear(&mut self) { + self.trace_pc = 0; + self.trace_instructions.clear(); + } + + pub fn ibuffer_fill(&mut self, slot: usize, instr: WarpInstruction) { + debug_assert!(slot < self.instr_buffer.len()); + self.instr_buffer[slot] = Some(instr); + self.next = 0; + } + + #[must_use] pub fn ibuffer_size(&self) -> usize { + self.instr_buffer.iter().filter(|x| x.is_some()).count() + } + + pub fn ibuffer_empty(&self) -> bool { + self.instr_buffer.iter().all(Option::is_none) + } + + pub fn ibuffer_flush(&mut self) { + for i in &mut self.instr_buffer { + if i.is_some() { + self.num_instr_in_pipeline -= 1; + } + *i = None; + } + } + + #[must_use] pub fn ibuffer_peek(&self) -> Option<&WarpInstruction> { + self.instr_buffer[self.next].as_ref() + } + + pub fn ibuffer_take(&mut self) -> Option { + self.instr_buffer[self.next].take() + } + + pub fn ibuffer_step(&mut self) { + self.next = (self.next + 1) % IBUFFER_SIZE; + } + + #[must_use] pub fn done_exit(&self) -> bool { + self.done_exit + } + + #[must_use] pub fn hardware_done(&self) -> bool { + self.functional_done() && self.stores_done() && self.num_instr_in_pipeline == 0 + } + + #[must_use] pub fn has_instr_in_pipeline(&self) -> bool { + self.num_instr_in_pipeline > 0 + } + + #[must_use] pub fn stores_done(&self) -> bool { + self.num_outstanding_stores == 0 + } + + #[must_use] pub fn num_completed(&self) -> usize { + self.active_mask.count_zeros() + } + + pub fn set_thread_completed(&mut self, thread_id: usize) { + self.active_mask.set(thread_id, false); + } + + #[must_use] pub fn functional_done(&self) -> bool { + self.active_mask.not_any() + } + + #[must_use] pub fn waiting(&self) -> bool { + if self.functional_done() { + // waiting to be initialized with a kernel + true + // } else if core.warp_waiting_at_barrier(self.warp_id) { + // // waiting for other warps in block to reach barrier + // true + // } else if core.warp_waiting_at_mem_barrier(self.warp_id) { + // // waiting for memory barrier + // true + } else { + 
self.num_outstanding_atomics > 0 + } + } + + #[must_use] pub fn dynamic_warp_id(&self) -> usize { + self.dynamic_warp_id + } +} diff --git a/src/ported/scoreboard.rs b/src/ported/scoreboard.rs index 467138fc..232ed6a5 100644 --- a/src/ported/scoreboard.rs +++ b/src/ported/scoreboard.rs @@ -1,4 +1,4 @@ -use super::instruction::{MemorySpace, WarpInstruction}; +use super::instruction::WarpInstruction; use std::collections::HashSet; /// Scoreboard implementation @@ -6,25 +6,19 @@ use std::collections::HashSet; /// This should however not be needed in trace driven mode.. #[derive(Debug, Default)] pub struct Scoreboard { - core_id: usize, - cluster_id: usize, - max_warps: usize, + pub core_id: usize, + pub cluster_id: usize, pub register_table: Vec>, - /// Register that depend on a long operation (global, local or tex memory) - long_op_registers: Vec>, } impl Scoreboard { - pub fn new(core_id: usize, cluster_id: usize, max_warps: usize) -> Self { + #[must_use] pub fn new(core_id: usize, cluster_id: usize, max_warps: usize) -> Self { let register_table: Vec<_> = (0..max_warps).map(|_| HashSet::new()).collect(); - let long_op_registers = register_table.clone(); Self { core_id, cluster_id, - max_warps, register_table, - long_op_registers, } } @@ -33,7 +27,7 @@ impl Scoreboard { /// # Returns /// true if WAW or RAW hazard (no WAR since in-order issue) /// - pub fn has_collision(&self, warp_id: usize, instr: &WarpInstruction) -> bool { + #[must_use] pub fn has_collision(&self, warp_id: usize, instr: &WarpInstruction) -> bool { use itertools::Itertools; // Get list of all input and output registers @@ -49,17 +43,6 @@ impl Scoreboard { instr_registers.iter().sorted().collect::>(), ); - // ar1 = 0; - // ar2 = 0; - - // predicate register number - // if instr.pred > 0 - // inst_regs.insert(inst->pred); - // if (inst->ar1 > 0) - // inst_regs.insert(inst->ar1); - // if (inst->ar2 > 0) - // inst_regs.insert(inst->ar2); - // get the intersection of reserved registers and instruction registers let Some(reserved) = self.register_table.get(warp_id) else { return false; @@ -69,12 +52,11 @@ impl Scoreboard { warp_id, reserved.iter().sorted().collect::>(), ); - let mut intersection = instr_registers.intersection(&reserved); + let mut intersection = instr_registers.intersection(reserved); intersection.next().is_some() - // todo!("scoreboard: check collision"); } - pub fn pending_writes(&self, warp_id: usize) -> &HashSet { + #[must_use] pub fn pending_writes(&self, warp_id: usize) -> &HashSet { &self.register_table[warp_id] } @@ -92,16 +74,13 @@ impl Scoreboard { pub fn release_registers(&mut self, instr: &WarpInstruction) { for &out_reg in instr.outputs() { self.release_register(instr.warp_id, out_reg); - self.long_op_registers[instr.warp_id].remove(&out_reg); } } pub fn reserve_register(&mut self, warp_id: usize, reg_num: u32) { let warp_registers = &mut self.register_table[warp_id]; - if warp_registers.contains(®_num) { - panic!("trying to reserve an already reserved register (core_id={}, warp_id={}, reg_num={})", + assert!(!warp_registers.contains(®_num), "trying to reserve an already reserved register (core_id={}, warp_id={}, reg_num={})", self.core_id, warp_id, reg_num); - } log::trace!( "scoreboard: warp {} reserves register: {}", warp_id, @@ -114,21 +93,5 @@ impl Scoreboard { for &out_reg in instr.outputs() { self.reserve_register(instr.warp_id, out_reg); } - - // Keep track of long operations - if instr.is_load() - && matches!( - instr.memory_space, - Some(MemorySpace::Global | MemorySpace::Local | 
MemorySpace::Texture) - ) - { - // inst->space.get_type() == local_space || - // inst->space.get_type() == param_space_kernel || - // inst->space.get_type() == param_space_local || - // inst->space.get_type() == param_space_unclassified || - for &out_reg in instr.outputs() { - self.long_op_registers[instr.warp_id].insert(out_reg); - } - } } } diff --git a/src/ported/set_index/mod.rs b/src/ported/set_index/mod.rs new file mode 100644 index 00000000..e38d4b91 --- /dev/null +++ b/src/ported/set_index/mod.rs @@ -0,0 +1,157 @@ +use super::address; + +pub trait SetIndexFunction: std::fmt::Debug { + /// Compute set index using + fn compute_set_index( + &self, + addr: address, + num_sets: usize, + line_size_log2: u32, + num_sets_log2: u32, + ) -> u64; +} + +pub mod fermi { + // Set Indexing function from + // "A Detailed GPU Cache Model Based on Reuse + // Distance Theory" Cedric Nugteren et al. HPCA 2014 + #[derive(Default, Debug, PartialEq, Eq, Hash)] + pub struct SetIndex {} + impl super::SetIndexFunction for SetIndex { + fn compute_set_index( + &self, + addr: super::address, + num_sets: usize, + line_size_log2: u32, + num_sets_log2: u32, + ) -> u64 { + // check for incorrect number of sets + assert!( + matches!(num_sets, 32 | 64), + "bad cache config: num sets should be 32 or 64 for fermi set index function (got {num_sets})", + ); + + // lower xor value is bits 7-11 + let lower_xor = (addr >> line_size_log2) & 0x1F; + + // upper xor value is bits 13, 14, 15, 17, and 19 + let mut upper_xor = (addr & 0xE000) >> 13; // Bits 13, 14, 15 + upper_xor |= (addr & 0x20000) >> 14; // Bit 17 + upper_xor |= (addr & 0x80000) >> 15; // Bit 19 + + let mut set_idx = lower_xor ^ upper_xor; + + // 48KB cache prepends the set_index with bit 12 + if num_sets == 64 { + set_idx |= (addr & 0x1000) >> 7; + } + assert!(set_idx < num_sets as u64, "set index out of bounds"); + set_idx + } + } +} + +pub mod bitwise_xor { + #[must_use] + pub fn bitwise_hash_function( + higher_bits: super::address, + index: usize, + bank_set_num: usize, + ) -> u64 { + index as u64 ^ (higher_bits & (bank_set_num as u64 - 1)) + } + + #[derive(Default, Debug, PartialEq, Eq, Hash)] + pub struct SetIndex {} + impl super::SetIndexFunction for SetIndex { + fn compute_set_index( + &self, + addr: super::address, + num_sets: usize, + line_size_log2: u32, + num_sets_log2: u32, + ) -> u64 { + let bits = line_size_log2 + num_sets_log2; + let higher_bits = addr >> bits; + let mut index = (addr >> line_size_log2) as usize; + index &= num_sets - 1; + let set_idx = bitwise_hash_function(higher_bits, index, num_sets); + assert!(set_idx < num_sets as u64, "set index out of bounds"); + set_idx + } + } +} + +pub mod ipoly { + /// Set Indexing function from "Pseudo-randomly interleaved memory." + /// Rau, B. R et al. + /// ISCA 1991 + /// http://citeseerx.ist.psu.edu/viewdoc/download;jsessionid=348DEA37A3E440473B3C075EAABC63B6?doi=10.1.1.12.7149&rep=rep1&type=pdf + /// + /// equations are corresponding to IPOLY(37) and are adopted from: + /// "Sacat: streaming-aware conflict-avoiding thrashing-resistant gpgpu + /// cache management scheme." Khairy et al. IEEE TPDS 2017. 
+ /// + /// equations for 16 banks are corresponding to IPOLY(5) + /// equations for 32 banks are corresponding to IPOLY(37) + /// equations for 64 banks are corresponding to IPOLY(67) + /// To see all the IPOLY equations for all the degrees, see + /// http://wireless-systems.ece.gatech.edu/6604/handouts/Peterson's%20Table.pdf + /// + /// We generate these equations using GF(2) arithmetic: + /// http://www.ee.unb.ca/cgi-bin/tervo/calc.pl?num=&den=&f=d&e=1&m=1 + /// + /// We go through all the strides 128 (10000000), 256 (100000000),... and + /// do modular arithmetic in GF(2) Then, we create the H-matrix and group + /// each bit together, for more info read the ISCA 1991 paper + /// + /// IPOLY hashing guarantees conflict-free for all 2^n strides which widely + /// exit in GPGPU applications and also show good performance for other + /// strides. + #[must_use] + pub fn ipoly_hash_function( + _higher_bits: super::address, + _index: usize, + _bank_set_num: usize, + ) -> u64 { + todo!("ipoly_hash_function"); + } + + #[derive(Default, Debug, PartialEq, Eq, Hash)] + pub struct SetIndex {} + impl super::SetIndexFunction for SetIndex { + fn compute_set_index( + &self, + addr: super::address, + num_sets: usize, + line_size_log2: u32, + num_sets_log2: u32, + ) -> u64 { + let bits = line_size_log2 + num_sets_log2; + let higher_bits = addr >> bits; + let mut index = (addr >> line_size_log2) as usize; + index &= num_sets - 1; + let set_idx = ipoly_hash_function(higher_bits, index, num_sets); + assert!(set_idx < num_sets as u64, "set index out of bounds"); + set_idx + } + } +} + +pub mod linear { + #[derive(Default, Debug, PartialEq, Eq, Hash)] + pub struct SetIndex {} + impl super::SetIndexFunction for SetIndex { + fn compute_set_index( + &self, + addr: super::address, + num_sets: usize, + line_size_log2: u32, + num_sets_log2: u32, + ) -> u64 { + let set_idx = (addr >> line_size_log2) & (num_sets as u64 - 1); + assert!(set_idx < num_sets as u64, "set index out of bounds"); + set_idx + } + } +} diff --git a/src/ported/simd_function_unit.rs b/src/ported/simd_function_unit.rs index b6df759b..13791bd7 100644 --- a/src/ported/simd_function_unit.rs +++ b/src/ported/simd_function_unit.rs @@ -93,16 +93,9 @@ impl PipelinedSimdUnitImpl { impl SimdFunctionUnit for PipelinedSimdUnitImpl { fn active_lanes_in_pipeline(&self) -> usize { let mut active_lanes: sched::ThreadActiveMask = BitArray::ZERO; - // if self.config. 
- for stage in &self.pipeline_reg { - if let Some(stage) = stage { - active_lanes |= stage.active_mask; - } + for stage in self.pipeline_reg.iter().flatten() { + active_lanes |= stage.active_mask; } - // for (unsigned stage = 0; (stage + 1) < m_pipeline_depth; stage++) { - // if (!m_pipeline_reg[stage]->empty()) - // active_lanes |= m_pipeline_reg[stage]->get_active_mask(); - // } active_lanes.count_ones() } @@ -122,7 +115,7 @@ impl SimdFunctionUnit for PipelinedSimdUnitImpl { self.cycle.get(), self.pipeline_reg .iter() - .map(|reg| reg.as_ref().map(|r| r.to_string())) + .map(|reg| reg.as_ref().map(std::string::ToString::to_string)) .collect::>(), self.num_active_instr_in_pipeline(), self.pipeline_reg.len(), diff --git a/src/ported/sp_unit.rs b/src/ported/sp_unit.rs index 8736103e..8a90dda0 100644 --- a/src/ported/sp_unit.rs +++ b/src/ported/sp_unit.rs @@ -1,6 +1,5 @@ use super::{ - instruction::WarpInstruction, opcodes, register_set::RegisterSet, - simd_function_unit as fu, + instruction::WarpInstruction, opcodes, register_set::RegisterSet, simd_function_unit as fu, }; use crate::config::GPUConfig; use std::cell::RefCell; @@ -8,15 +7,11 @@ use std::rc::Rc; use std::sync::{Arc, Mutex}; #[derive()] -// pub struct SPUnit { pub struct SPUnit { - // core_id: usize, - // cluster_id: usize, config: Arc, pipelined_simd_unit: fu::PipelinedSimdUnitImpl, } -// impl SPUnit { impl SPUnit { pub fn new( id: usize, @@ -56,23 +51,19 @@ impl std::fmt::Debug for SPUnit { } } -// impl fu::SimdFunctionUnit for SPUnit -impl fu::SimdFunctionUnit for SPUnit -// where -// I: ic::Interconnect, -{ +impl fu::SimdFunctionUnit for SPUnit { fn can_issue(&self, instr: &WarpInstruction) -> bool { + use opcodes::ArchOp; match instr.opcode.category { - opcodes::ArchOp::SFU_OP => false, - opcodes::ArchOp::LOAD_OP => false, - opcodes::ArchOp::TENSOR_CORE_LOAD_OP => false, - opcodes::ArchOp::STORE_OP => false, - opcodes::ArchOp::TENSOR_CORE_STORE_OP => false, - opcodes::ArchOp::MEMORY_BARRIER_OP => false, - opcodes::ArchOp::DP_OP => false, + ArchOp::SFU_OP + | ArchOp::LOAD_OP + | ArchOp::TENSOR_CORE_LOAD_OP + | ArchOp::STORE_OP + | ArchOp::TENSOR_CORE_STORE_OP + | ArchOp::MEMORY_BARRIER_OP + | ArchOp::DP_OP => false, _ => self.pipelined_simd_unit.can_issue(instr), } - // todo!("load store unit: can issue"); } fn pipeline(&self) -> &Vec> { diff --git a/src/ported/tag_array.rs b/src/ported/tag_array.rs index eb2bb26f..d6107004 100644 --- a/src/ported/tag_array.rs +++ b/src/ported/tag_array.rs @@ -28,13 +28,6 @@ pub struct TagArray { /// nbanks x nset x assoc lines in total pub lines: Vec, phantom: std::marker::PhantomData, - access: usize, - miss: usize, - pending_hit: usize, - res_fail: usize, - sector_miss: usize, - core_id: usize, - type_id: usize, is_used: bool, num_access: usize, num_miss: usize, @@ -48,32 +41,15 @@ pub struct TagArray { impl TagArray { #[must_use] - pub fn new(core_id: usize, type_id: usize, config: Arc) -> Self { + pub fn new(config: Arc) -> Self { let num_cache_lines = config.max_num_lines(); let lines = (0..num_cache_lines) .map(|_| cache_block::LineCacheBlock::new()) .collect(); - // if (config.m_cache_type == NORMAL) { - // for (unsigned i = 0; i < cache_lines_num; ++i) - // m_lines[i] = new line_cache_block(); - // } else if (config.m_cache_type == SECTOR) { - // for (unsigned i = 0; i < cache_lines_num; ++i) - // m_lines[i] = new sector_cache_block(); - // } else - // assert(0); - // - // init(core_id, type_id); Self { lines, phantom: std::marker::PhantomData, - access: 0, - miss: 0, - pending_hit: 
0, - res_fail: 0, - sector_miss: 0, - core_id, - type_id, is_used: false, num_access: 0, num_miss: 0, @@ -100,7 +76,6 @@ impl TagArray { let mut writeback = false; let mut evicted = None; - // shader_cache_access_log(m_core_id, m_type_id, 0); let (index, status) = self.probe(addr, fetch, fetch.is_write(), false); match status { cache::RequestStatus::HIT | cache::RequestStatus::HIT_RESERVED => { @@ -156,10 +131,8 @@ impl TagArray { } } cache::RequestStatus::SECTOR_MISS => { - unimplemented!("no sector miss"); debug_assert!(self.config.kind == config::CacheKind::Sector); self.num_sector_miss += 1; - // shader_cache_access_log(m_core_id, m_type_id, 1); if self.config.allocate_policy == config::CacheAllocatePolicy::ON_MISS { let index = index.expect("hit has idx"); let line = &mut self.lines[index]; @@ -169,12 +142,13 @@ impl TagArray { self.num_dirty -= 1; } } + unimplemented!("sector miss"); } cache::RequestStatus::RESERVATION_FAIL => { self.num_reservation_fail += 1; } - status => { - panic!("tag_array access: unknown cache request status {status:?}"); + status @ cache::RequestStatus::MSHR_HIT => { + panic!("tag_array access: status {status:?} should never be returned"); } } AccessStatus { @@ -189,10 +163,10 @@ impl TagArray { /// /// # Returns /// A tuple with the cache index `Option` and cache request status. + #[must_use] pub fn probe( &self, block_addr: address, - // cache_idx: Option, fetch: &mem_fetch::MemFetch, is_write: bool, is_probe: bool, @@ -202,19 +176,17 @@ impl TagArray { fetch.access_sector_mask(), is_write, is_probe, - fetch.to_string(), + Some(fetch), ) } pub fn probe_masked( &self, block_addr: address, - // cache_idx: Option, mask: &mem_fetch::MemAccessSectorMask, is_write: bool, _is_probe: bool, - fetch: String, - // fetch: &mem_fetch::MemFetch, + fetch: Option<&mem_fetch::MemFetch>, ) -> (Option, cache::RequestStatus) { let set_index = self.config.set_index(block_addr) as usize; let tag = self.config.tag(block_addr); @@ -231,8 +203,8 @@ impl TagArray { let dirty_line_percent = (dirty_line_percent * 100f64) as usize; log::trace!( - "tag_array::probe({}) set_idx = {}, tag = {}, assoc = {} dirty lines = {}%", - fetch, + "tag_array::probe({:?}) set_idx = {}, tag = {}, assoc = {} dirty lines = {}%", + fetch.map(ToString::to_string), set_index, tag, self.config.associativity, @@ -244,15 +216,15 @@ impl TagArray { let idx = set_index * self.config.associativity + way; let line = &self.lines[idx]; log::trace!( - "tag_array::probe({}) => checking cache index {} (tag={}, status={:?}, last_access={})", - fetch, + "tag_array::probe({:?}) => checking cache index {} (tag={}, status={:?}, last_access={})", + fetch.map(ToString::to_string), idx, line.tag, - line.status(&mask), + line.status(mask), line.last_access_time() ); if line.tag == tag { - match line.status(&mask) { + match line.status(mask) { cache_block::Status::RESERVED => { return (Some(idx), cache::RequestStatus::HIT_RESERVED); } @@ -260,11 +232,17 @@ impl TagArray { return (Some(idx), cache::RequestStatus::HIT); } cache_block::Status::MODIFIED => { - if (!is_write && line.is_readable(mask)) || is_write { - return (Some(idx), cache::RequestStatus::HIT); + let status = if is_write || line.is_readable(mask) { + cache::RequestStatus::HIT } else { - return (Some(idx), cache::RequestStatus::SECTOR_MISS); - } + cache::RequestStatus::SECTOR_MISS + }; + // let status = match is_write { + // true => cache::RequestStatus::HIT, + // false if line.is_readable(mask) => cache::RequestStatus::HIT, + // _ => 
cache::RequestStatus::SECTOR_MISS, + // }; + return (Some(idx), status); } cache_block::Status::INVALID if line.is_valid() => { return (Some(idx), cache::RequestStatus::SECTOR_MISS); @@ -292,18 +270,17 @@ impl TagArray { } } else if self.config.replacement_policy == config::CacheReplacementPolicy::FIFO + && line.alloc_time() < valid_time { - if line.alloc_time() < valid_time { - valid_time = line.alloc_time(); - valid_line = Some(idx); - } + valid_time = line.alloc_time(); + valid_line = Some(idx); } } } } } - log::trace!("tag_array::probe({}) => all reserved={} invalid_line={:?} valid_line={:?} ({:?} policy)", fetch, all_reserved, invalid_line, valid_line, self.config.replacement_policy); + log::trace!("tag_array::probe({:?}) => all reserved={} invalid_line={:?} valid_line={:?} ({:?} policy)", fetch.map(ToString::to_string), all_reserved, invalid_line, valid_line, self.config.replacement_policy); if all_reserved { debug_assert_eq!( @@ -323,16 +300,6 @@ impl TagArray { panic!("found neither a valid nor invalid cache line"); } }; - // let cache_idx = if invalid_line.is_some() { - // invalid_line - // } else if valid_line.is_some() { - // valid_line - // } else { - // // if an unreserved block exists, - // // it is either invalid or replaceable - // panic!("found neither a valid nor invalid cache line"); - // }; - (Some(cache_idx), cache::RequestStatus::MISS) } @@ -363,17 +330,12 @@ impl TagArray { time: u64, ) { let is_probe = false; - let (cache_index, probe_status) = self.probe_masked( - addr, - §or_mask, - is_write, - is_probe, - "".to_string(), - ); + let (cache_index, probe_status) = + self.probe_masked(addr, §or_mask, is_write, is_probe, None); log::trace!( "tag_array::fill(cache={}, tag={}, addr={}) (on fill) status={:?}", - cache_index.map(|i| i as i64).unwrap_or(-1), + cache_index.map_or(-1, |i| i as i64), self.config.tag(addr), addr, probe_status, @@ -424,7 +386,7 @@ impl TagArray { log::trace!( "tag_array::fill(cache={}, tag={}, addr={}) (on fill) status={:?}", - cache_index.map(|i| i as i64).unwrap_or(-1), + cache_index.map_or(-1, |i| i as i64), self.config.tag(fetch.addr()), fetch.addr(), probe_status, @@ -473,6 +435,7 @@ impl TagArray { todo!("invalidate tag array"); } + #[must_use] pub fn size(&self) -> usize { self.config.max_num_lines() } @@ -481,12 +444,12 @@ impl TagArray { &mut self.lines[idx] } + #[must_use] pub fn get_block(&self, idx: usize) -> &cache_block::LineCacheBlock { &self.lines[idx] } pub fn add_pending_line(&mut self, fetch: &mem_fetch::MemFetch) { - // log::debug!("tag_array::add_pending_line({})", fetch.addr()); let addr = self.config.block_addr(fetch.addr()); let instr = fetch.instr.as_ref().unwrap(); if self.pending_lines.contains_key(&addr) { @@ -495,42 +458,9 @@ impl TagArray { } pub fn remove_pending_line(&mut self, fetch: &mem_fetch::MemFetch) { - // log::debug!("tag_array::remove_pending_line({})", fetch.addr()); let addr = self.config.block_addr(fetch.addr()); self.pending_lines.remove(&addr); } - - // pub fn from_block( - // config: GenericCacheConfig, - // core_id: usize, - // type_id: usize, - // block: CacheBlock, - // ) -> Self { - // Self { - // // config, - // lines: Vec::new(), - // } - // } - - // pub fn from_config(config: GenericCacheConfig, core_id: usize, type_id: usize) -> Self { - // config.max_lines; - // let lines = - // Self { - // // config, - // lines: Vec::new(), - // } - // // unsigned cache_lines_num = config.get_max_num_lines(); - // // m_lines = new cache_block_t *[cache_lines_num]; - // // if (config.m_cache_type == 
NORMAL) { - // // for (unsigned i = 0; i < cache_lines_num; ++i) - // // m_lines[i] = new line_cache_block(); - // // } else if (config.m_cache_type == SECTOR) { - // // for (unsigned i = 0; i < cache_lines_num; ++i) - // // m_lines[i] = new sector_cache_block(); - // // } else - // // assert(0); - // } - // todo: update config (GenericCacheConfig) } #[cfg(test)] @@ -543,7 +473,7 @@ mod tests { #[test] fn test_tag_array() { let config = GPUConfig::default().data_cache_l1.unwrap(); - let _tag_array: TagArray = TagArray::new(0, 0, Arc::clone(&config.inner)); + let _tag_array: TagArray = TagArray::new(Arc::clone(&config.inner)); assert!(false); } } diff --git a/src/ported/testing/state.rs b/src/ported/testing/state.rs index a2b1a256..2e712274 100644 --- a/src/ported/testing/state.rs +++ b/src/ported/testing/state.rs @@ -31,7 +31,7 @@ impl From for ported::mem_fetch::AccessKind { mem_access_type::L1_WR_ALLOC_R => AccessKind::L1_WR_ALLOC_R, mem_access_type::L2_WR_ALLOC_R => AccessKind::L2_WR_ALLOC_R, other @ mem_access_type::NUM_MEM_ACCESS_TYPE => { - panic!("bad mem access kind: {:?}", other) + panic!("bad mem access kind: {other:?}") } } } @@ -100,8 +100,8 @@ impl From for CacheBlock { } } -impl<'a> From<&'a playground::cache::cache_block_t> for CacheBlock { - fn from(block: &'a playground::cache::cache_block_t) -> Self { +impl From<&playground::cache::cache_block_t> for CacheBlock { + fn from(block: &playground::cache::cache_block_t) -> Self { let status = if block.is_valid_line() { CacheBlockStatus::VALID } else if block.is_invalid_line() { @@ -162,12 +162,17 @@ pub struct RegisterSet { } impl RegisterSet { + #[must_use] pub fn is_empty(&self) -> bool { self.num_instructions_in_pipeline() == 0 } + #[must_use] pub fn num_instructions_in_pipeline(&self) -> usize { - self.pipeline.iter().filter_map(|x| x.as_ref()).count() + self.pipeline + .iter() + .filter_map(std::option::Option::as_ref) + .count() } } @@ -182,10 +187,7 @@ impl From for RegisterSet { let pipeline = reg .regs .into_iter() - .map(|instr| match instr { - Some(instr) => Some(instr.into()), - None => None, - }) + .map(|instr| instr.map(std::convert::Into::into)) .collect(); Self { name: format!("{:?}", ®.stage), @@ -253,7 +255,7 @@ pub struct DispatchUnit { pub kind: OperandCollectorUnitKind, } -impl<'a> From<&playground::operand_collector::dispatch_unit_t> for DispatchUnit { +impl From<&playground::operand_collector::dispatch_unit_t> for DispatchUnit { fn from(unit: &playground::operand_collector::dispatch_unit_t) -> Self { Self { last_cu: unit.get_last_cu() as usize, @@ -320,8 +322,8 @@ impl<'a> From> for CollectorUnit { } } -impl<'a> From<&'a playground::operand_collector::arbiter_t> for Arbiter { - fn from(arbiter: &'a playground::operand_collector::arbiter_t) -> Self { +impl From<&playground::operand_collector::arbiter_t> for Arbiter { + fn from(arbiter: &playground::operand_collector::arbiter_t) -> Self { Self { last_cu: arbiter.get_last_cu() as usize, } @@ -411,7 +413,7 @@ impl std::fmt::Debug for MemFetch { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "{:?}({:?}", self.kind, self.access_kind)?; if let Some((alloc_id, rel_addr)) = self.relative_addr { - write!(f, "@{}+{}", alloc_id, rel_addr)?; + write!(f, "@{alloc_id}+{rel_addr}")?; } write!(f, ")") } @@ -465,7 +467,7 @@ impl From<&playground::core::pending_register_writes> for PendingRegisterWrites } #[derive(Debug, Clone, PartialEq, Eq, Serialize)] -pub struct ArbitrationState { +pub struct Arbitration { pub last_borrower: usize, pub 
shared_credit: usize, pub private_credit: Box<[usize]>, @@ -482,7 +484,7 @@ pub struct Simulation { pub l2_cache_per_sub: Box<[Option]>, // per partition pub dram_latency_queue_per_partition: Box<[Vec]>, - pub dram_arbitration_per_partition: Box<[ArbitrationState]>, + pub dram_arbitration_per_partition: Box<[Arbitration]>, // per cluster pub core_sim_order_per_cluster: Box<[Box<[usize]>]>, // per core @@ -493,6 +495,7 @@ pub struct Simulation { } impl Simulation { + #[must_use] pub fn new( num_clusters: usize, cores_per_cluster: usize, @@ -514,7 +517,7 @@ impl Simulation { // per partition dram_latency_queue_per_partition: vec![vec![]; num_mem_partitions].into_boxed_slice(), dram_arbitration_per_partition: vec![ - ArbitrationState { + Arbitration { last_borrower: 0, shared_credit: 0, private_credit: vec![0; num_sub_partitions].into_boxed_slice(), diff --git a/stats/src/lib.rs b/stats/src/lib.rs index 60c1e96d..59c074f6 100644 --- a/stats/src/lib.rs +++ b/stats/src/lib.rs @@ -4,6 +4,7 @@ pub mod cache; pub mod dram; pub mod instructions; pub mod mem; +pub mod scheduler; pub mod sim; pub use cache::{Cache, PerCache}; @@ -55,6 +56,8 @@ pub struct Stats { pub l1t_stats: PerCache, pub l1d_stats: PerCache, pub l2d_stats: PerCache, + // where should those go? + pub stall_dram_full: u64, } impl Stats { @@ -70,6 +73,7 @@ impl Stats { l1t_stats: PerCache::default(), l1d_stats: PerCache::default(), l2d_stats: PerCache::default(), + stall_dram_full: 0, } } } diff --git a/stats/src/scheduler.rs b/stats/src/scheduler.rs new file mode 100644 index 00000000..49efcd87 --- /dev/null +++ b/stats/src/scheduler.rs @@ -0,0 +1,10 @@ +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct Scheduler { + pub num_single_issue: u64, + pub num_dual_issue: u64, + pub issue_raw_hazard_stall: u64, + pub issue_control_hazard_stall: u64, + pub issue_pipeline_stall: u64, +}
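
The greedy-then-oldest policy in `src/ported/scheduler/ordering.rs` (`Ordering::GREEDY_THEN_PRIORITY_FUNC` together with `sort_warps_by_oldest_dynamic_id`) puts the last-issued warp first and appends the remaining supervised warps sorted by oldest (smallest) dynamic warp id, with finished or waiting warps pushed to the back. The following is a standalone sketch of that ordering, not part of the patch: `Warp` is a made-up stand-in for the real `SchedulerWarp`/`WarpRef` types and only models the fields the ordering needs.

```rust
// Standalone sketch of greedy-then-oldest warp ordering.
#[derive(Clone, Debug, PartialEq, Eq)]
struct Warp {
    dynamic_warp_id: usize,
    done_exit: bool,
    waiting: bool,
}

/// Greedy-then-priority: the last-issued warp goes first, the rest follow
/// sorted by oldest dynamic warp id, finished or waiting warps go last.
fn order_gto(supervised: &[Warp], last_issued_idx: usize) -> Vec<Warp> {
    let greedy = supervised[last_issued_idx].clone();
    let mut out = vec![greedy.clone()];

    let mut rest = supervised.to_vec();
    rest.sort_by(|lhs, rhs| {
        if lhs.done_exit || lhs.waiting {
            std::cmp::Ordering::Greater
        } else if rhs.done_exit || rhs.waiting {
            std::cmp::Ordering::Less
        } else {
            lhs.dynamic_warp_id.cmp(&rhs.dynamic_warp_id)
        }
    });
    // skip the greedy warp so it is not added twice
    out.extend(rest.into_iter().filter(|warp| *warp != greedy));
    out
}

fn main() {
    let warps: Vec<Warp> = (0..4)
        .map(|id| Warp { dynamic_warp_id: id, done_exit: false, waiting: false })
        .collect();
    // warp 2 issued last, so it stays first; the rest follow oldest-first
    let ids: Vec<usize> = order_gto(&warps, 2).iter().map(|w| w.dynamic_warp_id).collect();
    assert_eq!(ids, vec![2, 0, 1, 3]);
    println!("{ids:?}");
}
```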
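In `BaseSchedulerUnit::cycle`, the `dual_issue_diff_exec_units` config only constrains the second instruction of a dual issue: it must target a different execution unit than the first. The sketch below isolates that one rule under heavy simplification; the real loop also checks the scoreboard, port availability, and the instruction buffer, and the unit names here only loosely mirror `ExecUnitKind`.

```rust
// Reduced sketch of the dual-issue constraint; everything but the
// unit-kind rule is stripped away.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ExecUnitKind {
    None,
    Sp,
    Int,
}

/// Issue up to `max_issue` instructions; with `diff_exec_units` enabled,
/// a second instruction may only go to a different unit than the first.
fn issue_units(
    candidates: &[ExecUnitKind],
    max_issue: usize,
    diff_exec_units: bool,
) -> Vec<ExecUnitKind> {
    let mut issued = Vec::new();
    let mut prev = ExecUnitKind::None;
    for &unit in candidates {
        if issued.len() >= max_issue {
            break;
        }
        if diff_exec_units && unit == prev {
            // the next instruction would reuse the same unit: stop issuing
            break;
        }
        issued.push(unit);
        prev = unit;
    }
    issued
}

fn main() {
    use ExecUnitKind::{Int, Sp};
    // two SP instructions back to back: only one issues while the rule is on
    assert_eq!(issue_units(&[Sp, Sp], 2, true), vec![Sp]);
    // SP followed by INT targets different units: both issue
    assert_eq!(issue_units(&[Sp, Int], 2, true), vec![Sp, Int]);
    // rule disabled: dual issue to the same unit is allowed
    assert_eq!(issue_units(&[Sp, Sp], 2, false), vec![Sp, Sp]);
    println!("dual issue sketch ok");
}
```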
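The per-warp instruction buffer in `src/ported/scheduler/warp.rs` is a fixed two-slot buffer (`IBUFFER_SIZE`) that the issue loop peeks, takes from, and steps through round-robin; filling a slot resets the read pointer. A stripped-down sketch of just that behaviour, with instructions reduced to a made-up pc value:

```rust
// Stripped-down two-slot instruction buffer; instructions are reduced to a pc.
struct IBuffer {
    slots: Vec<Option<u64>>,
    next: usize,
}

impl IBuffer {
    fn new(size: usize) -> Self {
        Self { slots: vec![None; size], next: 0 }
    }

    /// Filling a slot resets the read pointer, mirroring `ibuffer_fill`.
    fn fill(&mut self, slot: usize, pc: u64) {
        self.slots[slot] = Some(pc);
        self.next = 0;
    }

    fn peek(&self) -> Option<u64> {
        self.slots[self.next]
    }

    fn take(&mut self) -> Option<u64> {
        self.slots[self.next].take()
    }

    /// Advance round-robin to the other slot after an issue attempt.
    fn step(&mut self) {
        self.next = (self.next + 1) % self.slots.len();
    }

    fn is_empty(&self) -> bool {
        self.slots.iter().all(Option::is_none)
    }
}

fn main() {
    let mut buf = IBuffer::new(2);
    buf.fill(0, 0x100);
    buf.fill(1, 0x108);

    assert_eq!(buf.peek(), Some(0x100));
    assert_eq!(buf.take(), Some(0x100));
    buf.step();
    assert_eq!(buf.take(), Some(0x108));
    buf.step();
    assert!(buf.is_empty());
    println!("ibuffer sketch ok");
}
```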
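The slimmed-down `Scoreboard` in `src/ported/scoreboard.rs` reduces hazard detection to set operations: issued instructions reserve their destination registers per warp, and `has_collision` reports a RAW/WAW hazard whenever an instruction's registers intersect the warp's reserved set. A self-contained sketch of the same idea, using plain `u32` register numbers instead of `WarpInstruction` (the register values in `main` are made up):

```rust
use std::collections::HashSet;

/// Minimal per-warp scoreboard: the registers reserved by in-flight instructions.
struct Scoreboard {
    register_table: Vec<HashSet<u32>>,
}

impl Scoreboard {
    fn new(max_warps: usize) -> Self {
        Self {
            register_table: (0..max_warps).map(|_| HashSet::new()).collect(),
        }
    }

    /// RAW/WAW hazard check: any overlap between an instruction's registers
    /// and the warp's reserved registers is a collision.
    fn has_collision(&self, warp_id: usize, instr_registers: &HashSet<u32>) -> bool {
        self.register_table[warp_id]
            .intersection(instr_registers)
            .next()
            .is_some()
    }

    /// Reserve the destination registers of an issued instruction.
    fn reserve(&mut self, warp_id: usize, out_regs: &[u32]) {
        self.register_table[warp_id].extend(out_regs.iter().copied());
    }

    /// Release a destination register once its writeback completes.
    fn release(&mut self, warp_id: usize, reg: u32) {
        self.register_table[warp_id].remove(&reg);
    }
}

fn main() {
    let mut scoreboard = Scoreboard::new(2);
    scoreboard.reserve(0, &[5]);

    // r5 is still pending for warp 0, so an instruction touching r5 stalls ...
    assert!(scoreboard.has_collision(0, &HashSet::from([5, 7])));
    // ... while another warp or independent registers issue freely.
    assert!(!scoreboard.has_collision(1, &HashSet::from([5])));
    assert!(!scoreboard.has_collision(0, &HashSet::from([6])));

    scoreboard.release(0, 5);
    assert!(!scoreboard.has_collision(0, &HashSet::from([5])));
    println!("scoreboard sketch ok");
}
```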
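`src/ported/set_index/mod.rs` hides several set-index hashes behind one trait. The linear and bitwise-XOR variants are simple enough to restate end to end; the sketch below recomputes both for a single address. The address and the 64-set, 128-byte-line shape are made up for illustration, and the helper names are not the ones in the patch.

```rust
type Address = u64;

/// Linear mapping: the set is just the address bits above the line offset.
fn linear_set_index(addr: Address, num_sets: usize, line_size_log2: u32) -> u64 {
    (addr >> line_size_log2) & (num_sets as u64 - 1)
}

/// Bitwise-XOR hash: fold the bits above the (offset + index) field back onto
/// the index so that power-of-two strides spread across sets.
fn bitwise_xor_set_index(
    addr: Address,
    num_sets: usize,
    line_size_log2: u32,
    num_sets_log2: u32,
) -> u64 {
    let higher_bits = addr >> (line_size_log2 + num_sets_log2);
    let index = (addr >> line_size_log2) & (num_sets as u64 - 1);
    index ^ (higher_bits & (num_sets as u64 - 1))
}

fn main() {
    // illustrative cache shape: 64 sets of 128-byte lines
    let (num_sets, line_size_log2, num_sets_log2) = (64usize, 7u32, 6u32);
    let addr: Address = 0x0012_3480;

    let linear = linear_set_index(addr, num_sets, line_size_log2);
    let xored = bitwise_xor_set_index(addr, num_sets, line_size_log2, num_sets_log2);
    assert!(linear < num_sets as u64 && xored < num_sets as u64);
    println!("linear set = {linear}, xor set = {xored}");
}
```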