diff --git a/WIP.md b/WIP.md index abeeb975..c8567d1c 100644 --- a/WIP.md +++ b/WIP.md @@ -2,22 +2,38 @@ - today: - - test flush caches using config options - - less important: + - todos - - perf: investigate if the many small allocations of msg for move in / move warp etc are problematic - - perf: investigate the performance overhead for finding the allocation ids + - use gpu_mem_alloc for the allocations but still allow smart comparison with play, whose traces do not include allocations + + - refactor + + - join core and inner core + - flatten ported submodule + - lint + - factor into multiple files + - some minor todos + - remove dead code + - instantiate the entire GPU in one file to find a good API + - factor out traits + + - generate plots and correlation stuff etc + + - less important: - - fix: investigate lockstep performance and see if we can reduce allocations? - fix: remove global statics to allow running tests in parallel - parse accelsim config files + - with defaults for compatibility + - test flush caches using config options + - perf: investigate if the many small allocations of msg for move in / move warp etc are problematic + - perf: investigate the performance overhead for finding the allocation ids + - perf: investigate lockstep performance and see if we can reduce allocations? + - allow basic configurations for the playground bridge - - - FIX: add l2 set index back in - - generate plots and correlation stuff etc - DONE: multiple memories - DONE: lockstep with multiple cores and clusters diff --git a/accelsim/src/stats.rs b/accelsim/src/stats.rs index 426ca526..1ce0f0a1 100644 --- a/accelsim/src/stats.rs +++ b/accelsim/src/stats.rs @@ -170,6 +170,7 @@ impl TryFrom for stats::Stats { l1c_stats: stats::PerCache::default(), l1d_stats: stats::PerCache::default(), l2d_stats, + stall_dram_full: 0, // todo }) } } diff --git a/accelsim/src/tracegen/reader.rs b/accelsim/src/tracegen/reader.rs index 4bbe84ed..168b889b 100644 --- a/accelsim/src/tracegen/reader.rs +++ b/accelsim/src/tracegen/reader.rs @@ -332,8 +332,6 @@ pub fn parse_trace_instruction( // parse addresses if mem_width > 0 { - // let width = super::get_data_width_from_opcode(&opcode)?; - let address_format: usize = parse_decimal(values.pop_front(), "mem address format")?; let address_format = AddressFormat::from_repr(address_format) .ok_or_else(|| eyre::eyre!("unknown mem address format: {:?}", address_format))?; diff --git a/benches/vectoradd.rs b/benches/vectoradd.rs index 2db45bcb..3680c965 100644 --- a/benches/vectoradd.rs +++ b/benches/vectoradd.rs @@ -1,3 +1,5 @@ +#![allow(clippy::missing_errors_doc, clippy::missing_panics_doc)] + use color_eyre::eyre; use criterion::{black_box, Criterion}; use validate::materialize::{BenchmarkConfig, Benchmarks}; @@ -21,8 +23,8 @@ fn get_bench_config(benchmark_name: &str, input_idx: usize) -> eyre::Result eyre::Result<()> { - let _stats = validate::simulate::simulate_bench_config(&bench_config)?; +pub fn run_box(bench_config: &BenchmarkConfig) -> eyre::Result<()> { + let _stats = validate::simulate::simulate_bench_config(bench_config)?; Ok(()) } @@ -31,8 +33,8 @@ pub async fn run_accelsim(bench_config: BenchmarkConfig) -> eyre::Result<()> { Ok(()) } -pub fn run_playground(bench_config: BenchmarkConfig) -> eyre::Result<()> { - let _stats = validate::playground::simulate_bench_config(&bench_config); +pub fn run_playground(bench_config: &BenchmarkConfig) -> eyre::Result<()> { + let _stats = validate::playground::simulate_bench_config(bench_config); Ok(()) } @@ -48,7 +50,7 @@
pub fn accelsim_benchmark(c: &mut Criterion) { group.bench_function("vectoradd/10000", |b| { b.to_async(&runtime) - .iter(|| run_accelsim(black_box(get_bench_config("vectorAdd", 2).unwrap()))) + .iter(|| run_accelsim(black_box(get_bench_config("vectorAdd", 2).unwrap()))); }); // group.bench_function("transpose/256/naive", |b| { // b.iter(|| run_accelsim(black_box(get_bench_config("transpose", 0).unwrap()))) @@ -61,7 +63,7 @@ pub fn play_benchmark(c: &mut Criterion) { group.sampling_mode(criterion::SamplingMode::Flat); group.bench_function("vectoradd/10000", |b| { - b.iter(|| run_playground(black_box(get_bench_config("vectorAdd", 2).unwrap()))) + b.iter(|| run_playground(&black_box(get_bench_config("vectorAdd", 2).unwrap()))); }); // group.bench_function("transpose/256/naive", |b| { // b.iter(|| run_playground(black_box(get_bench_config("transpose", 0).unwrap()))) @@ -74,7 +76,7 @@ pub fn box_benchmark(c: &mut Criterion) { group.sampling_mode(criterion::SamplingMode::Flat); group.bench_function("vectoradd/10000", |b| { - b.iter(|| run_box(black_box(get_bench_config("vectorAdd", 2).unwrap()))) + b.iter(|| run_box(&black_box(get_bench_config("vectorAdd", 2).unwrap()))); }); // group.bench_function("transpose/256/naive", |b| { // b.iter(|| run_box(black_box(get_bench_config("transpose", 0).unwrap()))) @@ -82,30 +84,29 @@ pub fn box_benchmark(c: &mut Criterion) { } criterion::criterion_group!(benches, box_benchmark, play_benchmark, accelsim_benchmark); -criterion::criterion_main!(benches); +// criterion::criterion_main!(benches); #[allow(dead_code)] -fn custom() -> eyre::Result<()> { +fn main() -> eyre::Result<()> { use std::time::Instant; let runtime = tokio::runtime::Builder::new_multi_thread() .enable_all() - .build() - .expect("build tokio runtime"); + .build()?; let mut start = Instant::now(); - let _ = run_box(black_box(get_bench_config("transpose", 0)?)); + let _ = run_box(&black_box(get_bench_config("transpose", 0)?)); println!("box took:\t\t{:?}", start.elapsed()); start = Instant::now(); - let _ = run_playground(black_box(get_bench_config("transpose", 0)?)); + let _ = run_playground(&black_box(get_bench_config("transpose", 0)?)); println!("play took:\t\t{:?}", start.elapsed()); start = Instant::now(); - let _ = runtime.block_on(async { - let _ = run_accelsim(black_box(get_bench_config("transpose", 0)?)).await?; + runtime.block_on(async { + run_accelsim(black_box(get_bench_config("transpose", 0)?)).await?; Ok::<(), eyre::Report>(()) - }); + })?; println!("accel took:\t\t{:?}", start.elapsed()); Ok(()) diff --git a/examples/pycachesim.rs b/examples/pycachesim.rs index d2b70829..0a489ef8 100644 --- a/examples/pycachesim.rs +++ b/examples/pycachesim.rs @@ -1,57 +1,58 @@ -#![allow(warnings)] - -use casimu::{cache::LRU, Cache, CacheConfig, MainMemory, Simulation}; -use std::sync::Arc; - -const CACHELINE_SIZE: usize = 64; - -fn main() { - let mut mem = MainMemory::new(); - let l3 = Arc::new(Cache::new(CacheConfig { - name: "L3".to_string(), - sets: 20480, - ways: 16, - line_size: CACHELINE_SIZE, - replacement_policy: LRU {}, - write_back: true, - write_allocate: true, - store_to: None, - load_from: None, - victims_to: None, - swap_on_load: false, - })); - mem.set_load_to(l3.clone()); - mem.set_store_from(l3.clone()); - - let l2 = Arc::new(Cache::new(CacheConfig { - name: "L2".to_string(), - sets: 512, - ways: 8, - line_size: CACHELINE_SIZE, - replacement_policy: LRU {}, - write_back: true, - write_allocate: true, - store_to: Some(l3.clone()), - load_from: Some(l3), - victims_to: None, - 
swap_on_load: false, - })); - let l1 = Arc::new(Cache::new(CacheConfig { - name: "L1".to_string(), - sets: 64, - ways: 8, - line_size: CACHELINE_SIZE, - replacement_policy: LRU {}, - write_back: true, - write_allocate: true, - store_to: Some(l2.clone()), - load_from: Some(l2), - victims_to: None, - swap_on_load: false, // incl/excl does not matter in first level - })); - - // let mut sim = Simulation::new(l1.clone(), mem); - // sim.load(23) - // cv = CacheVisualizer(cs, [10, 16]) - // sim.dump_state() -} +// #![allow(warnings)] +// +// use casimu::{cache::LRU, Cache, CacheConfig, MainMemory, Simulation}; +// use std::sync::Arc; +// +// const CACHELINE_SIZE: usize = 64; +// +// fn main() { +// let mut mem = MainMemory::new(); +// let l3 = Arc::new(Cache::new(CacheConfig { +// name: "L3".to_string(), +// sets: 20480, +// ways: 16, +// line_size: CACHELINE_SIZE, +// replacement_policy: LRU {}, +// write_back: true, +// write_allocate: true, +// store_to: None, +// load_from: None, +// victims_to: None, +// swap_on_load: false, +// })); +// mem.set_load_to(l3.clone()); +// mem.set_store_from(l3.clone()); +// +// let l2 = Arc::new(Cache::new(CacheConfig { +// name: "L2".to_string(), +// sets: 512, +// ways: 8, +// line_size: CACHELINE_SIZE, +// replacement_policy: LRU {}, +// write_back: true, +// write_allocate: true, +// store_to: Some(l3.clone()), +// load_from: Some(l3), +// victims_to: None, +// swap_on_load: false, +// })); +// let l1 = Arc::new(Cache::new(CacheConfig { +// name: "L1".to_string(), +// sets: 64, +// ways: 8, +// line_size: CACHELINE_SIZE, +// replacement_policy: LRU {}, +// write_back: true, +// write_allocate: true, +// store_to: Some(l2.clone()), +// load_from: Some(l2), +// victims_to: None, +// swap_on_load: false, // incl/excl does not matter in first level +// })); +// +// // let mut sim = Simulation::new(l1.clone(), mem); +// // sim.load(23) +// // cv = CacheVisualizer(cs, [10, 16]) +// // sim.dump_state() +// } +fn main() {} diff --git a/examples/vectoradd.rs b/examples/vectoradd.rs index f27007a7..bf3f916c 100644 --- a/examples/vectoradd.rs +++ b/examples/vectoradd.rs @@ -3,92 +3,92 @@ #![allow(clippy::cast_sign_loss)] use color_eyre::eyre; -use num_traits::{Float, NumCast, Zero}; - -#[derive(Debug)] -struct VecAdd<'s, 'a, T> { - d_a: &'a mut casimu::DevicePtr<'s, 'a, Vec>, - d_b: &'a mut casimu::DevicePtr<'s, 'a, Vec>, - d_c: &'a mut casimu::DevicePtr<'s, 'a, Vec>, - n: usize, -} - -impl<'s, 'a, T> casimu::Kernel for VecAdd<'s, 'a, T> -where - T: Float + std::fmt::Debug, -{ - type Error = std::convert::Infallible; - - fn run(&mut self, idx: &casimu::ThreadIndex) -> Result<(), Self::Error> { - // Get our global thread ID - // int id = blockIdx.x * blockDim.x + threadIdx.x; - let id: usize = (idx.block_idx.x * idx.block_dim.x + idx.thread_idx.x) as usize; - - // Make sure we do not go out of bounds - // if (id < n) c[id] = a[id] + b[id]; - // let test2: &(dyn std::ops::IndexMut) = self.d_a; - if id < self.n { - self.d_c[id] = self.d_a[id] + self.d_b[id]; - } - Ok(()) - } -} - -// Number of threads in each thread block -const BLOCK_SIZE: u32 = 1024; - -fn vectoradd(n: usize) -> eyre::Result<()> -where - T: Float + Zero + NumCast + std::iter::Sum + std::fmt::Display + std::fmt::Debug, -{ - // create host vectors - let mut a: Vec = vec![T::zero(); n]; - let mut b: Vec = vec![T::zero(); n]; - let mut c: Vec = vec![T::zero(); n]; - - // initialize vectors - for i in 0..n { - let angle = T::from(i).unwrap(); - a[i] = angle.sin() * angle.sin(); - b[i] = angle.cos() * 
angle.cos(); - c[i] = T::zero(); - } - - let sim = casimu::Simulation::new(); - - // allocate memory for each vector on simulated GPU device - let a_size = a.len() * std::mem::size_of::(); - let b_size = b.len() * std::mem::size_of::(); - let c_size = c.len() * std::mem::size_of::(); - let mut d_a = sim.allocate(&mut a, a_size as u64); - let mut d_b = sim.allocate(&mut b, b_size as u64); - let mut d_c = sim.allocate(&mut c, c_size as u64); - - // number of thread blocks in grid - let grid_size = (n as f64 / >::from(BLOCK_SIZE)).ceil() as u32; - - let kernel: VecAdd = VecAdd { - d_a: &mut d_a, - d_b: &mut d_b, - d_c: &mut d_c, - n, - }; - sim.launch_kernel(grid_size, BLOCK_SIZE, kernel)?; - - // sum up vector c and print result divided by n. - // this should equal 1 within - let total_sum: T = c.into_iter().sum(); - println!( - "Final sum = {total_sum}; sum/n = {} (should be ~1)\n", - total_sum / T::from(n).unwrap() - ); - - dbg!(&sim.stats.lock().unwrap()); - Ok(()) -} +// use num_traits::{Float, NumCast, Zero}; + +// #[derive(Debug)] +// struct VecAdd<'s, 'a, T> { +// d_a: &'a mut casimu::DevicePtr<'s, 'a, Vec>, +// d_b: &'a mut casimu::DevicePtr<'s, 'a, Vec>, +// d_c: &'a mut casimu::DevicePtr<'s, 'a, Vec>, +// n: usize, +// } +// +// impl<'s, 'a, T> casimu::Kernel for VecAdd<'s, 'a, T> +// where +// T: Float + std::fmt::Debug, +// { +// type Error = std::convert::Infallible; +// +// fn run(&mut self, idx: &casimu::ThreadIndex) -> Result<(), Self::Error> { +// // Get our global thread ID +// // int id = blockIdx.x * blockDim.x + threadIdx.x; +// let id: usize = (idx.block_idx.x * idx.block_dim.x + idx.thread_idx.x) as usize; +// +// // Make sure we do not go out of bounds +// // if (id < n) c[id] = a[id] + b[id]; +// // let test2: &(dyn std::ops::IndexMut) = self.d_a; +// if id < self.n { +// self.d_c[id] = self.d_a[id] + self.d_b[id]; +// } +// Ok(()) +// } +// } +// +// // Number of threads in each thread block +// const BLOCK_SIZE: u32 = 1024; +// +// fn vectoradd(n: usize) -> eyre::Result<()> +// where +// T: Float + Zero + NumCast + std::iter::Sum + std::fmt::Display + std::fmt::Debug, +// { +// // create host vectors +// let mut a: Vec = vec![T::zero(); n]; +// let mut b: Vec = vec![T::zero(); n]; +// let mut c: Vec = vec![T::zero(); n]; +// +// // initialize vectors +// for i in 0..n { +// let angle = T::from(i).unwrap(); +// a[i] = angle.sin() * angle.sin(); +// b[i] = angle.cos() * angle.cos(); +// c[i] = T::zero(); +// } +// +// let sim = casimu::Simulation::new(); +// +// // allocate memory for each vector on simulated GPU device +// let a_size = a.len() * std::mem::size_of::(); +// let b_size = b.len() * std::mem::size_of::(); +// let c_size = c.len() * std::mem::size_of::(); +// let mut d_a = sim.allocate(&mut a, a_size as u64); +// let mut d_b = sim.allocate(&mut b, b_size as u64); +// let mut d_c = sim.allocate(&mut c, c_size as u64); +// +// // number of thread blocks in grid +// let grid_size = (n as f64 / >::from(BLOCK_SIZE)).ceil() as u32; +// +// let kernel: VecAdd = VecAdd { +// d_a: &mut d_a, +// d_b: &mut d_b, +// d_c: &mut d_c, +// n, +// }; +// sim.launch_kernel(grid_size, BLOCK_SIZE, kernel)?; +// +// // sum up vector c and print result divided by n. 
+// // this should equal 1 within +// let total_sum: T = c.into_iter().sum(); +// println!( +// "Final sum = {total_sum}; sum/n = {} (should be ~1)\n", +// total_sum / T::from(n).unwrap() +// ); +// +// dbg!(&sim.stats.lock().unwrap()); +// Ok(()) +// } fn main() -> eyre::Result<()> { - vectoradd::(100)?; + // vectoradd::(100)?; Ok(()) } diff --git a/playground/sys/build.rs b/playground/sys/build.rs index 3a550b2e..513d7e77 100644 --- a/playground/sys/build.rs +++ b/playground/sys/build.rs @@ -41,23 +41,6 @@ fn configure_debug_mode(build: &mut cc::Build) { } } -#[allow(dead_code)] -#[deprecated = "redundant when compiling the bridge"] -fn build(sources: &[PathBuf]) -> eyre::Result<()> { - let mut build = cc::Build::new(); - build - .cpp(true) - .static_flag(true) - .files(sources) - .flag("-std=c++14") - .warnings(false); - - configure_debug_mode(&mut build); - enable_diagnostics_color(&mut build); - build.try_compile("playground")?; - Ok(()) -} - #[derive(Debug)] struct ParseCallbacks {} @@ -329,6 +312,7 @@ fn generate_bridge( build .cpp(true) .static_flag(true) + .pic(true) .warnings(false) .include(include_dir) .include(parser_include_dir) diff --git a/playground/sys/src/ref/box_interconnect.cc b/playground/sys/src/ref/box_interconnect.cc index 3ccfab49..87eed118 100644 --- a/playground/sys/src/ref/box_interconnect.cc +++ b/playground/sys/src/ref/box_interconnect.cc @@ -5,16 +5,17 @@ #include "mem_fetch.hpp" bool BoxInterconnect::HasBuffer(unsigned deviceID, unsigned int size) const { - unsigned icntID = _node_map.find(deviceID)->second; - assert(icntID == deviceID); - - // request is subnet 0 and reply is subnet 1 - bool is_memory_node = ((_subnets > 1) && deviceID >= _n_shader); - unsigned subnet = is_memory_node ? 1 : 0; - bool has_buffer = - simple_input_queue[subnet][icntID][0].size() <= _input_buffer_capacity; - - return has_buffer; + return true; + // unsigned icntID = _node_map.find(deviceID)->second; + // assert(icntID == deviceID); + // + // // request is subnet 0 and reply is subnet 1 + // bool is_memory_node = ((_subnets > 1) && deviceID >= _n_shader); + // unsigned subnet = is_memory_node ? 
1 : 0; + // bool has_buffer = + // simple_input_queue[subnet][icntID][0].size() <= _input_buffer_capacity; + // + // return has_buffer; } void BoxInterconnect::Advance() { @@ -93,7 +94,6 @@ void BoxInterconnect::Push(unsigned input_deviceID, unsigned output_deviceID, mem_fetch_ptr(mf), size, input_icntID, output_icntID, subnet); } - // simple_input_queue[subnet][input_icntID][0].push_back(data); simple_output_queue[subnet][output_icntID][0].push_back(data); } @@ -103,13 +103,13 @@ void BoxInterconnect::Init() { unsigned nodes = _net[0]->NumNodes(); unsigned classes = _icnt_config->GetInt("classes"); - simple_input_queue.resize(_subnets); + // simple_input_queue.resize(_subnets); simple_output_queue.resize(_subnets); for (int subnet = 0; subnet < _subnets; ++subnet) { - simple_input_queue[subnet].resize(nodes); + // simple_input_queue[subnet].resize(nodes); simple_output_queue[subnet].resize(nodes); for (unsigned node = 0; node < nodes; ++node) { - simple_input_queue[subnet][node].resize(classes); + // simple_input_queue[subnet][node].resize(classes); simple_output_queue[subnet][node].resize(classes); } } diff --git a/playground/sys/src/ref/box_interconnect.hpp b/playground/sys/src/ref/box_interconnect.hpp index 765034a3..631c7f25 100644 --- a/playground/sys/src/ref/box_interconnect.hpp +++ b/playground/sys/src/ref/box_interconnect.hpp @@ -26,7 +26,8 @@ class BoxInterconnect : public InterconnectInterface { std::shared_ptr logger; protected: - std::vector>>> simple_input_queue; + // std::vector>>> + // simple_input_queue; std::vector>>> simple_output_queue; }; diff --git a/src/cache.rs b/src/cache.rs index ae7a5abc..25914fa6 100644 --- a/src/cache.rs +++ b/src/cache.rs @@ -1,4 +1,3 @@ - use std::sync::Arc; pub trait ReplacementPolicy {} diff --git a/src/config/accelsim.rs b/src/config/accelsim.rs index cae629d8..997539a4 100644 --- a/src/config/accelsim.rs +++ b/src/config/accelsim.rs @@ -619,7 +619,7 @@ static ARGUMENT_REGEX: Lazy = Lazy::new(|| { .unwrap() }); -pub fn extract_arguments<'a>(config: &'a str) -> impl Iterator + '_ { +pub fn extract_arguments(config: &str) -> impl Iterator { ARGUMENT_REGEX.captures_iter(config).filter_map(|cap| { let key = cap.get(1)?.as_str().trim(); let value = cap.get(2)?.as_str().trim(); @@ -630,7 +630,7 @@ pub fn extract_arguments<'a>(config: &'a str) -> impl Iterator) -> eyre::Result { let args = extract_arguments(config.as_ref()) - .flat_map(|(key, value)| [format!("--{}", key), value.to_string()]); + .flat_map(|(key, value)| [format!("--{key}"), value.to_string()]); let args: Vec = ["test".to_string()].into_iter().chain(args).collect(); dbg!(&args); let config = Self::try_parse_from(&args)?; @@ -655,8 +655,8 @@ mod tests { # --gpgpu_shader_core_pipeline 2048:32 # --gpgpu_simd_model 1 "; - let args = super::extract_arguments(&config) - .flat_map(|(key, value)| [format!("--{}", key), value.to_string()]); + let args = super::extract_arguments(config) + .flat_map(|(key, value)| [format!("--{key}"), value.to_string()]); let mut args: std::collections::VecDeque = args.collect(); args.push_front("test".to_string()); diff --git a/src/config/mod.rs b/src/config/mod.rs index f2435adf..b1825b26 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -1,7 +1,8 @@ pub mod accelsim; use super::ported::{ - addrdec, address, core::PipelineStage, mem_sub_partition, mshr, opcodes, KernelInfo, + addrdec, address, core::PipelineStage, kernel::Kernel, mem_sub_partition, mshr, opcodes, + set_index, }; use color_eyre::eyre; use std::collections::HashMap; @@ -29,13 +30,14 
@@ pub enum CacheReplacementPolicy { FIFO, // F } -#[derive(Debug, PartialEq, Eq, Hash)] +#[derive(Debug)] pub struct L2DCacheConfig { pub inner: Arc, } impl L2DCacheConfig { #[inline] + #[must_use] pub fn set_index(&self, addr: address) -> u64 { let partition_addr = addr; @@ -48,12 +50,13 @@ impl L2DCacheConfig { } } -#[derive(Debug, PartialEq, Eq, Hash)] +#[derive(Debug)] pub struct L1DCacheConfig { /// L1 Hit Latency pub l1_latency: usize, // 1 /// l1 banks hashing function - pub l1_banks_hashing_function: CacheSetIndexFunc, // 0 + pub l1_banks_hashing_function: Box, // 0 + // pub l1_banks_hashing_function: CacheSetIndexFunc, // 0 /// l1 banks byte interleaving granularity pub l1_banks_byte_interleaving: usize, // 32 /// The number of L1 cache banks @@ -64,16 +67,19 @@ pub struct L1DCacheConfig { impl L1DCacheConfig { #[inline] + #[must_use] pub fn l1_banks_log2(&self) -> u32 { addrdec::logb2(self.l1_banks as u32) } #[inline] + #[must_use] pub fn l1_banks_byte_interleaving_log2(&self) -> u32 { addrdec::logb2(self.l1_banks_byte_interleaving as u32) } #[inline] + #[must_use] pub fn compute_set_bank(&self, addr: address) -> u64 { log::trace!( "computing set bank for address {} ({} l1 banks) using hashing function {:?}", @@ -85,20 +91,26 @@ impl L1DCacheConfig { // For sector cache, we select one sector per bank (sector interleaving) // This is what was found in Volta (one sector per bank, sector // interleaving) otherwise, line interleaving - hash_function( + + self.l1_banks_hashing_function.compute_set_index( addr, self.l1_banks, self.l1_banks_byte_interleaving_log2(), self.l1_banks_log2(), - self.l1_banks_hashing_function, ) + + // hash_function( + // addr, + // self.l1_banks, + // self.l1_banks_byte_interleaving_log2(), + // self.l1_banks_log2(), + // self.l1_banks_hashing_function, + // ) } } /// CacheConfig -/// -/// :::,::::,::,:** -#[derive(Debug, PartialEq, Eq, Hash)] +#[derive(Debug)] pub struct CacheConfig { pub kind: CacheKind, pub num_sets: usize, @@ -109,7 +121,8 @@ pub struct CacheConfig { pub write_policy: CacheWritePolicy, pub allocate_policy: CacheAllocatePolicy, pub write_allocate_policy: CacheWriteAllocatePolicy, - pub set_index_function: CacheSetIndexFunc, + // pub set_index_function: CacheSetIndexFunc, + pub set_index_function: Box, pub mshr_kind: mshr::Kind, pub mshr_entries: usize, @@ -126,6 +139,17 @@ pub struct CacheConfig { // pub disabled: bool, } +impl std::fmt::Display for CacheConfig { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let size = human_bytes::human_bytes(self.total_bytes() as f64); + write!( + f, + "{size} ({} set, {}-way, {} byte line)", + self.num_sets, self.associativity, self.line_size + ) + } +} + pub static MAX_DEFAULT_CACHE_SIZE_MULTIPLIER: u8 = 4; /// TODO: use a builder here so we can fill in the remaining values @@ -135,6 +159,7 @@ impl CacheConfig { /// /// todo: this can be replaced with the builder? #[inline] + #[must_use] pub fn data_port_width(&self) -> usize { // default granularity is line size let width = self.data_port_width.unwrap_or(self.line_size as usize); @@ -144,49 +169,58 @@ impl CacheConfig { /// The total size of the cache in bytes. #[inline] + #[must_use] pub fn total_bytes(&self) -> usize { self.line_size as usize * self.num_sets * self.associativity } /// Number of lines in total. #[inline] + #[must_use] pub fn total_lines(&self) -> usize { self.num_sets * self.associativity } /// Maximum number of lines. 
#[inline] + #[must_use] pub fn max_num_lines(&self) -> usize { self.max_cache_multiplier() as usize * self.num_sets * self.associativity } /// this is virtual (possibly different) #[inline] + #[must_use] pub fn max_cache_multiplier(&self) -> u8 { MAX_DEFAULT_CACHE_SIZE_MULTIPLIER } #[inline] + #[must_use] pub fn line_size_log2(&self) -> u32 { - addrdec::logb2(self.line_size as u32) + addrdec::logb2(self.line_size) } #[inline] + #[must_use] pub fn num_sets_log2(&self) -> u32 { addrdec::logb2(self.num_sets as u32) } #[inline] + #[must_use] pub fn sector_size(&self) -> u32 { mem_sub_partition::SECTOR_SIZE } #[inline] + #[must_use] pub fn sector_size_log2(&self) -> u32 { addrdec::logb2(self.sector_size()) } #[inline] + #[must_use] pub fn atom_size(&self) -> u32 { if self.kind == CacheKind::Sector { mem_sub_partition::SECTOR_SIZE @@ -197,17 +231,25 @@ impl CacheConfig { // do not use enabled but options #[inline] + #[must_use] pub fn set_index(&self, addr: address) -> u64 { - hash_function( + self.set_index_function.compute_set_index( addr, self.num_sets, self.line_size_log2(), self.num_sets_log2(), - self.set_index_function, ) + // hash_function( + // addr, + // self.num_sets, + // self.line_size_log2(), + // self.num_sets_log2(), + // self.set_index_function, + // ) } #[inline] + #[must_use] pub fn tag(&self, addr: address) -> address { // For generality, the tag includes both index and tag. // This allows for more complex set index calculations that @@ -217,19 +259,21 @@ impl CacheConfig { // return addr >> (m_line_sz_log2+m_nset_log2); // return addr & ~(new_addr_type)(m_line_sz - 1); - addr & !((self.line_size - 1) as u64) + addr & !u64::from(self.line_size - 1) } /// Block address #[inline] + #[must_use] pub fn block_addr(&self, addr: address) -> address { - addr & !((self.line_size - 1) as u64) + addr & !u64::from(self.line_size - 1) } /// Mshr address #[inline] + #[must_use] pub fn mshr_addr(&self, addr: address) -> address { - addr & !((self.line_size - 1) as u64) + addr & !u64::from(self.line_size - 1) } // // detect invalid configuration @@ -268,93 +312,8 @@ impl CacheConfig { // assert(m_line_sz % m_data_port_width == 0); } -fn hash_function( - addr: address, - num_sets: usize, - line_size_log2: u32, - num_sets_log2: u32, - set_index_function: CacheSetIndexFunc, -) -> u64 { - use super::ported::set_index_function as indexing; - - let set_idx: u64 = match set_index_function { - CacheSetIndexFunc::LINEAR_SET_FUNCTION => { - // log::trace!( - // "set_index({}): LINEAR hash func: log2(line)={}, num sets={}", - // addr, - // line_size_log2, - // num_sets, - // ); - let set_index = (addr >> line_size_log2) & (num_sets as u64 - 1); - set_index - } - CacheSetIndexFunc::FERMI_HASH_SET_FUNCTION => { - // Set Indexing function from - // "A Detailed GPU Cache Model Based on Reuse - // Distance Theory" Cedric Nugteren et al. 
HPCA 2014 - - // check for incorrect number of sets - assert!( - matches!(num_sets, 32 | 64), - "bad cache config: num sets should be 32 or 64 for hashing set index function (got {})", num_sets, - ); - - let mut lower_xor = 0; - let mut upper_xor = 0; - - // lower xor value is bits 7-11 - lower_xor = (addr >> line_size_log2) & 0x1F; - - // upper xor value is bits 13, 14, 15, 17, and 19 - upper_xor = (addr & 0xE000) >> 13; // Bits 13, 14, 15 - upper_xor |= (addr & 0x20000) >> 14; // Bit 17 - upper_xor |= (addr & 0x80000) >> 15; // Bit 19 - - let mut set_index = lower_xor ^ upper_xor; - - // 48KB cache prepends the set_index with bit 12 - if num_sets == 64 { - set_index |= (addr & 0x1000) >> 7; - } - set_index - } - CacheSetIndexFunc::HASH_IPOLY_FUNCTION => { - let bits = line_size_log2 + num_sets_log2; - let higher_bits = addr >> bits; - let mut index = (addr >> line_size_log2) as usize; - index &= num_sets - 1; - indexing::ipoly_hash_function(higher_bits, index, num_sets) - } - - CacheSetIndexFunc::BITWISE_XORING_FUNCTION => { - let bits = line_size_log2 + num_sets_log2; - let higher_bits = addr >> bits; - let mut index = (addr >> line_size_log2) as usize; - index &= num_sets - 1; - indexing::bitwise_hash_function(higher_bits, index, num_sets) - } - }; - - assert!( - set_idx < num_sets as u64, - "Error: Set index out of bounds. This is caused by an incorrect or unimplemented set index function." - ); - set_idx -} - -impl std::fmt::Display for CacheConfig { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let size = human_bytes::human_bytes(self.total_bytes() as f64); - write!( - f, - "{size} ({} set, {}-way, {} byte line)", - self.num_sets, self.associativity, self.line_size - ) - } -} - /// todo: remove the copy stuff, very expensive otherwise -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug)] pub struct GPUConfig { pub linear_to_raw_adress_translation: std::sync::OnceLock, @@ -376,17 +335,7 @@ pub struct GPUConfig { /// unified banked L2 data cache config pub data_cache_l2: Option>, - /// L1D write ratio - // pub l1_cache_write_ratio: usize, - /// The number of L1 cache banks - // pub l1_banks: usize, - // /// L1 banks byte interleaving granularity - // pub l1_banks_byte_interleaving: usize, - // // L1 banks hashing function - // pub l1_banks_hashing_function: usize, - // /// L1 Hit Latency - // pub l1_latency: usize, - /// smem Latency + /// Shared memory latency pub shared_memory_latency: usize, /// SP unit max latency pub max_sp_latency: usize, @@ -626,6 +575,7 @@ pub struct GPUConfig { pub static WORD_SIZE: address = 4; +#[must_use] pub fn pad_to_multiple(n: usize, k: usize) -> usize { let rem = n % k; if rem != 0 { @@ -664,9 +614,9 @@ impl GPUConfig { mem_id + self.num_simt_clusters } - pub fn threads_per_block_padded(&self, kernel: &KernelInfo) -> usize { + pub fn threads_per_block_padded(&self, kernel: &Kernel) -> usize { let threads_per_block = kernel.threads_per_block(); - pad_to_multiple(threads_per_block as usize, self.warp_size) + pad_to_multiple(threads_per_block, self.warp_size) } /// Number of bytes transferred per read or write command. 
@@ -679,11 +629,11 @@ impl GPUConfig { /// /// Depends on the following constraints: /// - - pub fn max_blocks(&self, kernel: &KernelInfo) -> eyre::Result { + pub fn max_blocks(&self, kernel: &Kernel) -> eyre::Result { let threads_per_block = kernel.threads_per_block(); - let threads_per_block = pad_to_multiple(threads_per_block as usize, self.warp_size); + let threads_per_block = pad_to_multiple(threads_per_block, self.warp_size); // limit by n_threads/shader - let by_thread_limit = self.max_threads_per_core / threads_per_block as usize; + let by_thread_limit = self.max_threads_per_core / threads_per_block; // limit by shmem/shader let by_shared_mem_limit = if kernel.config.shared_mem_bytes > 0 { @@ -712,7 +662,7 @@ impl GPUConfig { by_register_limit, ] .into_iter() - .filter_map(|limit| limit) + .flatten() .min() .unwrap_or(usize::MAX); // result = gs_min2(result, result_shmem); @@ -733,18 +683,58 @@ impl GPUConfig { )); } - if self.adaptive_cache_config && !kernel.cache_config_set { + if self.adaptive_cache_config { // more info about adaptive cache, see // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x let total_shared_mem = kernel.config.shared_mem_bytes as usize * limit; - assert!( - total_shared_mem >= 0 - && self - .shared_memory_sizes - .last() - .map(|size| total_shared_mem <= (*size as usize)) - .unwrap_or(true) - ); + if let Some(size) = self.shared_memory_sizes.last() { + assert!(total_shared_mem <= (*size as usize)); + } + + // Unified cache config is in KB. Converting to B + // unsigned total_unified = m_L1D_config.m_unified_cache_size * 1024; + // + // bool l1d_configured = false; + // unsigned max_assoc = m_L1D_config.get_max_assoc(); + // + // for (std::vector::const_iterator it = shmem_opt_list.begin(); + // it < shmem_opt_list.end(); it++) { + // if (total_shmem <= *it) { + // float l1_ratio = 1 - ((float)*(it) / total_unified); + // // make sure the ratio is between 0 and 1 + // assert(0 <= l1_ratio && l1_ratio <= 1); + // // round to nearest instead of round down + // m_L1D_config.set_assoc(max_assoc * l1_ratio + 0.5f); + // l1d_configured = true; + // break; + // } + // } + // + // assert(l1d_configured && "no shared memory option found"); + + // if (m_L1D_config.is_streaming()) { + // // for streaming cache, if the whole memory is allocated + // // to the L1 cache, then make the allocation to be on_MISS + // // otherwise, make it ON_FILL to eliminate line allocation fails + // // i.e. 
MSHR throughput is the same, independent on the L1 cache + // // size/associativity + // if (total_shmem == 0) { + // m_L1D_config.set_allocation_policy(ON_MISS); + // + // if (gpgpu_ctx->accelsim_compat_mode) { + // printf("GPGPU-Sim: Reconfigure L1 allocation to ON_MISS\n"); + // } + // } else { + // m_L1D_config.set_allocation_policy(ON_FILL); + // if (gpgpu_ctx->accelsim_compat_mode) { + // printf("GPGPU-Sim: Reconfigure L1 allocation to ON_FILL\n"); + // } + // } + // } + // if (gpgpu_ctx->accelsim_compat_mode) { + // printf("GPGPU-Sim: Reconfigure L1 cache to %uKB\n", + // m_L1D_config.get_total_size_inKB()); + // } } Ok(limit) @@ -764,31 +754,21 @@ impl GPUConfig { | ArchOp::RET_OPS => { // integer units (latency, initiation_interval) = self.trace_opcode_latency_initiation_int; - // latency = int_latency; - // initiation_interval = int_init; } ArchOp::SP_OP => { // single precision units (latency, initiation_interval) = self.trace_opcode_latency_initiation_sp; - // latency = fp_latency; - // initiation_interval = fp_init; } ArchOp::DP_OP => { // double precision units (latency, initiation_interval) = self.trace_opcode_latency_initiation_dp; - // latency = dp_latency; - // initiation_interval = dp_init; } ArchOp::SFU_OP => { // special function units (latency, initiation_interval) = self.trace_opcode_latency_initiation_sfu; - // latency = sfu_latency; - // initiation_interval = sfu_init; } ArchOp::TENSOR_CORE_OP => { (latency, initiation_interval) = self.trace_opcode_latency_initiation_tensor; - // latency = tensor_latency; - // initiation_interval = tensor_init; } _ => {} } @@ -805,63 +785,6 @@ impl GPUConfig { } } -// void trace_config::reg_options(option_parser_t opp) { -// option_parser_register(opp, "-trace", OPT_CSTR, &g_traces_filename, -// "traces kernel file" -// "traces kernel file directory", -// "./traces/kernelslist.g"); -// -// option_parser_register(opp, "-trace_opcode_latency_initiation_int", OPT_CSTR, -// &trace_opcode_latency_initiation_int, -// "Opcode latencies and initiation for integers in " -// "trace driven mode ", -// "4,1"); -// option_parser_register(opp, "-trace_opcode_latency_initiation_sp", OPT_CSTR, -// &trace_opcode_latency_initiation_sp, -// "Opcode latencies and initiation for sp in trace " -// "driven mode ", -// "4,1"); -// option_parser_register(opp, "-trace_opcode_latency_initiation_dp", OPT_CSTR, -// &trace_opcode_latency_initiation_dp, -// "Opcode latencies and initiation for dp in trace " -// "driven mode ", -// "4,1"); -// option_parser_register(opp, "-trace_opcode_latency_initiation_sfu", OPT_CSTR, -// &trace_opcode_latency_initiation_sfu, -// "Opcode latencies and initiation for sfu in trace " -// "driven mode ", -// "4,1"); -// option_parser_register(opp, "-trace_opcode_latency_initiation_tensor", -// OPT_CSTR, &trace_opcode_latency_initiation_tensor, -// "Opcode latencies and initiation for tensor in trace " -// "driven mode ", -// "4,1"); -// -// for (unsigned j = 0; j < SPECIALIZED_UNIT_NUM; ++j) { -// std::stringstream ss; -// ss << "-trace_opcode_latency_initiation_spec_op_" << j + 1; -// option_parser_register(opp, ss.str().c_str(), OPT_CSTR, -// &trace_opcode_latency_initiation_specialized_op[j], -// "specialized unit config" -// " ", -// "4,4"); -// } -// } -// -// void trace_config::parse_config() { -// sscanf(trace_opcode_latency_initiation_int, "%u,%u", &int_latency, &int_init); -// sscanf(trace_opcode_latency_initiation_sp, "%u,%u", &fp_latency, &fp_init); -// sscanf(trace_opcode_latency_initiation_dp, "%u,%u", &dp_latency, 
&dp_init); -// sscanf(trace_opcode_latency_initiation_sfu, "%u,%u", &sfu_latency, &sfu_init); -// sscanf(trace_opcode_latency_initiation_tensor, "%u,%u", &tensor_latency, -// &tensor_init); -// -// for (unsigned j = 0; j < SPECIALIZED_UNIT_NUM; ++j) { -// sscanf(trace_opcode_latency_initiation_specialized_op[j], "%u,%u", -// &specialized_unit_latency[j], &specialized_unit_initiation[j]); -// } -// } - /// Cache set indexing function kind. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub enum CacheSetIndexFunc { @@ -875,9 +798,9 @@ pub enum CacheSetIndexFunc { /// /// Cache write-allocate policy. /// -/// For more details about difference between FETCH_ON_WRITE and WRITE +/// For more details about difference between `FETCH_ON_WRITE` and WRITE /// VALIDAE policies Read: Jouppi, Norman P. "Cache write policies and -/// performance". ISCA 93. WRITE_ALLOCATE is the old write policy in +/// performance". ISCA 93. `WRITE_ALLOCATE` is the old write policy in /// GPGPU-sim 3.x, that send WRITE and READ for every write request #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub enum CacheWriteAllocatePolicy { @@ -955,8 +878,8 @@ pub enum DRAMSchedulerKind { /// Core Scheduler policy. /// -/// If two_level_active: -/// :: +/// If `two_level_active`: +/// <`num_active_warps>:: /// /// For complete list of prioritization values see shader.h. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] @@ -986,11 +909,11 @@ impl GPUConfig { let shared_memory_sizes_string = "0"; let _shared_memory_sizes: Vec = if adaptive_cache_config { let sizes: Result, _> = shared_memory_sizes_string - .split(",") + .split(',') .map(str::parse) .collect(); let mut sizes: Vec<_> = sizes?.into_iter().map(|size| size * 1024).collect(); - sizes.sort(); + sizes.sort_unstable(); sizes } else { vec![] @@ -1004,16 +927,10 @@ impl GPUConfig { pub fn address_mapping(&self) -> &addrdec::LinearToRawAddressTranslation { self.linear_to_raw_adress_translation - .get_or_init(|| addrdec::LinearToRawAddressTranslation::new(&self).unwrap()) + .get_or_init(|| addrdec::LinearToRawAddressTranslation::new(self).unwrap()) } } -// opp, "-gpgpu_pipeline_widths", OPT_CSTR, &pipeline_widths_string, -// "Pipeline widths " -// "ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_" -// "INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE", -// "1,1,1,1,1,1,1,1,1,1,1,1,1") - impl Default for GPUConfig { fn default() -> Self { Self { @@ -1032,7 +949,8 @@ impl Default for GPUConfig { write_policy: CacheWritePolicy::READ_ONLY, allocate_policy: CacheAllocatePolicy::ON_MISS, write_allocate_policy: CacheWriteAllocatePolicy::NO_WRITE_ALLOCATE, - set_index_function: CacheSetIndexFunc::LINEAR_SET_FUNCTION, + // set_index_function: CacheSetIndexFunc::LINEAR_SET_FUNCTION, + set_index_function: Box::new(set_index::linear::SetIndex::default()), mshr_kind: mshr::Kind::TEX_FIFO, mshr_entries: 128, mshr_max_merge: 4, @@ -1052,7 +970,8 @@ impl Default for GPUConfig { write_policy: CacheWritePolicy::READ_ONLY, allocate_policy: CacheAllocatePolicy::ON_FILL, write_allocate_policy: CacheWriteAllocatePolicy::NO_WRITE_ALLOCATE, - set_index_function: CacheSetIndexFunc::LINEAR_SET_FUNCTION, + // set_index_function: CacheSetIndexFunc::LINEAR_SET_FUNCTION, + set_index_function: Box::new(set_index::linear::SetIndex::default()), mshr_kind: mshr::Kind::ASSOC, mshr_entries: 2, mshr_max_merge: 64, @@ -1072,7 +991,8 @@ impl Default for GPUConfig { write_policy: CacheWritePolicy::READ_ONLY, allocate_policy: CacheAllocatePolicy::ON_FILL, 
write_allocate_policy: CacheWriteAllocatePolicy::NO_WRITE_ALLOCATE, - set_index_function: CacheSetIndexFunc::LINEAR_SET_FUNCTION, + // set_index_function: CacheSetIndexFunc::LINEAR_SET_FUNCTION, + set_index_function: Box::new(set_index::linear::SetIndex::default()), mshr_kind: mshr::Kind::ASSOC, mshr_entries: 2, mshr_max_merge: 48, @@ -1085,8 +1005,8 @@ impl Default for GPUConfig { // {::,:::,::, | none} data_cache_l1: Some(Arc::new(L1DCacheConfig { l1_latency: 1, - l1_banks_hashing_function: CacheSetIndexFunc::LINEAR_SET_FUNCTION, - // l1_banks_hashing_function: CacheSetIndexFunc::FERMI_HASH_SET_FUNCTION, + // l1_banks_hashing_function: CacheSetIndexFunc::LINEAR_SET_FUNCTION, + l1_banks_hashing_function: Box::new(set_index::linear::SetIndex::default()), l1_banks_byte_interleaving: 32, l1_banks: 1, inner: Arc::new(CacheConfig { @@ -1098,7 +1018,8 @@ impl Default for GPUConfig { write_policy: CacheWritePolicy::LOCAL_WB_GLOBAL_WT, allocate_policy: CacheAllocatePolicy::ON_MISS, write_allocate_policy: CacheWriteAllocatePolicy::NO_WRITE_ALLOCATE, - set_index_function: CacheSetIndexFunc::FERMI_HASH_SET_FUNCTION, + // set_index_function: CacheSetIndexFunc::FERMI_HASH_SET_FUNCTION, + set_index_function: Box::new(set_index::fermi::SetIndex::default()), mshr_kind: mshr::Kind::ASSOC, mshr_entries: 128, mshr_max_merge: 8, @@ -1120,7 +1041,8 @@ impl Default for GPUConfig { write_policy: CacheWritePolicy::WRITE_BACK, allocate_policy: CacheAllocatePolicy::ON_MISS, write_allocate_policy: CacheWriteAllocatePolicy::WRITE_ALLOCATE, - set_index_function: CacheSetIndexFunc::LINEAR_SET_FUNCTION, + // set_index_function: CacheSetIndexFunc::LINEAR_SET_FUNCTION, + set_index_function: Box::new(set_index::linear::SetIndex::default()), mshr_kind: mshr::Kind::ASSOC, mshr_entries: 1024, mshr_max_merge: 1024, @@ -1277,8 +1199,8 @@ mod tests { fn parse_cache_config(config: &str) -> bindings::CacheConfig { use bindings::parse_cache_config as parse; - let cache_config = unsafe { parse(config.as_ptr().cast()) }; - cache_config + + unsafe { parse(config.as_ptr().cast()) } } #[test] @@ -1405,29 +1327,32 @@ mod tests { fn test_l1i_block_addr() { let config = super::GPUConfig::default(); let l1i_cache_config = config.inst_cache_l1.unwrap(); - assert_eq!(l1i_cache_config.block_addr(4026531848), 4026531840); + assert_eq!(l1i_cache_config.block_addr(4_026_531_848), 4_026_531_840); } #[test] fn test_l2d_block_addr() { let config = super::GPUConfig::default(); let l2d_cache_config = config.data_cache_l2.unwrap(); - assert_eq!(l2d_cache_config.inner.block_addr(34887082112), 34887082112); + assert_eq!( + l2d_cache_config.inner.block_addr(34_887_082_112), + 34_887_082_112 + ); } #[test] fn test_l1i_mshr_addr() { let config = super::GPUConfig::default(); let l1i_cache_config = config.inst_cache_l1.unwrap(); - assert_eq!(l1i_cache_config.mshr_addr(4026531848), 4026531840); - assert_eq!(l1i_cache_config.mshr_addr(4026531992), 4026531968); + assert_eq!(l1i_cache_config.mshr_addr(4_026_531_848), 4_026_531_840); + assert_eq!(l1i_cache_config.mshr_addr(4_026_531_992), 4_026_531_968); } #[test] fn test_l2d_set_index() { let config = super::GPUConfig::default(); let l2d_config = config.data_cache_l2.unwrap(); - let block_addr = 34887082112; + let block_addr = 34_887_082_112; assert_eq!(l2d_config.inner.set_index(block_addr), 1); } } diff --git a/src/lib.rs b/src/lib.rs index 5287c801..15ca43f8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,14 +2,14 @@ // #![allow(warnings)] // pub mod gpgpusim; -pub mod cache; +// pub mod cache; pub mod config; 
-pub mod dram; +// pub mod dram; pub mod ported; -#[cfg(feature = "python")] -pub mod python; -pub mod sim; +// #[cfg(feature = "python")] +// pub mod python; +// pub mod sim; -pub use cache::{Cache, Config as CacheConfig}; -pub use dram::MainMemory; -pub use sim::{DevicePtr, Kernel, Simulation, ThreadIndex}; +// pub use cache::{Cache, Config as CacheConfig}; +// pub use dram::MainMemory; +// pub use sim::{DevicePtr, Kernel, Simulation, ThreadIndex}; diff --git a/src/ported/addrdec.rs b/src/ported/addrdec.rs index 7af95c49..f26e0463 100644 --- a/src/ported/addrdec.rs +++ b/src/ported/addrdec.rs @@ -6,21 +6,21 @@ use regex::Regex; /// Base 2 logarithm of n. /// /// Effectively the minium number of bits required to store n. -pub fn logb2(n: u32) -> u32 { +#[must_use] pub fn logb2(n: u32) -> u32 { n.max(1).ilog2() } /// Compute power of two greater than or equal to n /// /// see: https://www.techiedelight.com/round-next-highest-power-2/ -pub fn next_power2(mut n: u32) -> u32 { +#[must_use] pub fn next_power2(mut n: u32) -> u32 { // avoid subtract with overflow if n == 0 { return 0; } // decrement n (handle the case when n itself is a power of 2) - n = n - 1; + n -= 1; // unset rightmost bit until only one bit is left while n > 0 && (n & (n - 1)) > 0 { @@ -32,7 +32,7 @@ pub fn next_power2(mut n: u32) -> u32 { n << 1 } -pub fn mask_limit(mask: address) -> (u8, u8) { +#[must_use] pub fn mask_limit(mask: address) -> (u8, u8) { let mut high = 64; let mut low = 0; let mut low_found = false; @@ -129,7 +129,7 @@ pub struct AddressDecodingConfig { pub burst: Mask, } -const ACCELSIM_ADDRESS_DECODE_CONFIG_REGEX: Lazy = +static ACCELSIM_ADDRESS_DECODE_CONFIG_REGEX: Lazy = Lazy::new(|| Regex::new(r"(dramid@(?P\d+))?;?(?P.*)").unwrap()); impl AddressDecodingConfig { @@ -210,7 +210,7 @@ impl AddressDecodingConfig { } impl LinearToRawAddressTranslation { - pub fn partition_address(&self, addr: address) -> address { + #[must_use] pub fn partition_address(&self, addr: address) -> address { if !self.has_gap { let mut mask = self.decode_config.chip.mask; mask |= self.sub_partition_id_mask; @@ -227,7 +227,7 @@ impl LinearToRawAddressTranslation { } } - pub fn tlx(&self, addr: address) -> DecodedAddress { + #[must_use] pub fn tlx(&self, addr: address) -> DecodedAddress { let mut tlx = DecodedAddress::default(); let num_channels = self.num_channels as u64; @@ -284,20 +284,20 @@ impl LinearToRawAddressTranslation { let num_sub_partitions_per_channel_log2 = logb2(num_sub_partitions_per_channel as u32); let mut num_chip_bits = num_channels_log2; - let gap = num_channels as i64 - 2u32.pow(num_chip_bits) as i64; + let gap = num_channels as i64 - i64::from(2u32.pow(num_chip_bits)); if gap > 0 { num_chip_bits += 1; } let mut decode_config = if let Some(ref mapping_config) = config.memory_addr_mapping { - AddressDecodingConfig::parse_accelsim_config(&mapping_config)? + AddressDecodingConfig::parse_accelsim_config(mapping_config)? 
} else { AddressDecodingConfig { addr_chip_start: Some(10), - chip: 0x0000000000001C00.into(), - bank: 0x0000000000000300.into(), - row: 0x000000000FFF0000.into(), - col: 0x000000000000E0FF.into(), - burst: 0x000000000000000F.into(), + chip: 0x0000_0000_0000_1C00.into(), + bank: 0x0000_0000_0000_0300.into(), + row: 0x0000_0000_0FFF_0000.into(), + col: 0x0000_0000_0000_E0FF.into(), + burst: 0x0000_0000_0000_000F.into(), } }; @@ -366,7 +366,7 @@ impl LinearToRawAddressTranslation { }) } - pub fn num_sub_partition_total(&self) -> usize { + #[must_use] pub fn num_sub_partition_total(&self) -> usize { self.num_channels * self.num_sub_partitions_per_channel } } @@ -386,7 +386,7 @@ fn packbits(mask: super::address, val: super::address, low: u8, high: u8) -> sup pos += 1; } } - return res; + res } #[derive(Default, Debug, Clone, Copy, Eq, PartialEq)] @@ -417,18 +417,18 @@ mod tests { #[inline] fn bit_str(n: u64) -> String { - format!("{:064b}", n) + format!("{n:064b}") } impl From for super::DecodedAddress { fn from(addr: playground::addrdec::AddrDec) -> Self { Self { - chip: addr.chip as u64, - bk: addr.bk as u64, - row: addr.row as u64, - col: addr.col as u64, - burst: addr.burst as u64, - sub_partition: addr.sub_partition as u64, + chip: u64::from(addr.chip), + bk: u64::from(addr.bk), + row: u64::from(addr.row), + col: u64::from(addr.col), + burst: u64::from(addr.burst), + sub_partition: u64::from(addr.sub_partition), } } } @@ -457,23 +457,23 @@ mod tests { dbg!(&dec_config); assert_eq!( bit_str(dec_config.chip.mask), - bit_str(0b00000000_00000000_00000000_00000000) + bit_str(0b0000_0000_0000_0000_0000_0000_0000_0000) ); assert_eq!( bit_str(dec_config.bank.mask), - bit_str(0b00000000_00000000_01110000_10000000) + bit_str(0b0000_0000_0000_0000_0111_0000_1000_0000) ); assert_eq!( bit_str(dec_config.row.mask), - bit_str(0b00001111_11111111_10000000_00000000) + bit_str(0b0000_1111_1111_1111_1000_0000_0000_0000) ); assert_eq!( bit_str(dec_config.col.mask), - bit_str(0b00000000_00000000_00001111_01111111) + bit_str(0b0000_0000_0000_0000_0000_1111_0111_1111) ); assert_eq!( bit_str(dec_config.burst.mask), - bit_str(0b00000000_00000000_00000000_00011111) + bit_str(0b0000_0000_0000_0000_0000_0000_0001_1111) ); let mut config = GPUConfig::default(); @@ -483,11 +483,11 @@ mod tests { let mapping = super::LinearToRawAddressTranslation::new(&config)?; let dec_config = mapping.decode_config; - assert_eq!(bit_str(dec_config.chip.mask), bit_str(0x0000000000000700)); - assert_eq!(bit_str(dec_config.bank.mask), bit_str(0x0000000000038080)); - assert_eq!(bit_str(dec_config.row.mask), bit_str(0x000000007ffc0000)); - assert_eq!(bit_str(dec_config.col.mask), bit_str(0x000000000000787f)); - assert_eq!(bit_str(dec_config.burst.mask), bit_str(0x000000000000001f)); + assert_eq!(bit_str(dec_config.chip.mask), bit_str(0x0000_0000_0000_0700)); + assert_eq!(bit_str(dec_config.bank.mask), bit_str(0x0000_0000_0003_8080)); + assert_eq!(bit_str(dec_config.row.mask), bit_str(0x0000_0000_7ffc_0000)); + assert_eq!(bit_str(dec_config.col.mask), bit_str(0x0000_0000_0000_787f)); + assert_eq!(bit_str(dec_config.burst.mask), bit_str(0x0000_0000_0000_001f)); assert_eq!((dec_config.chip.low, dec_config.chip.high), (8, 11)); assert_eq!((dec_config.bank.low, dec_config.bank.high), (7, 18)); @@ -503,20 +503,20 @@ mod tests { use playground::addrdec::packbits as ref_packbits; assert_eq!(packbits(0, 0, 0, 64), ref_packbits(0, 0, 0, 64)); assert_eq!( - packbits(0, 0xFFFFFFFFFFFFFFFF, 0, 64), - ref_packbits(0, 0xFFFFFFFFFFFFFFFF, 0, 64), + 
packbits(0, 0xFFFF_FFFF_FFFF_FFFF, 0, 64), + ref_packbits(0, 0xFFFF_FFFF_FFFF_FFFF, 0, 64), ); assert_eq!( - packbits(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0, 64), - ref_packbits(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0, 64), + packbits(0xFFFF_FFFF_FFFF_FFFF, 0xFFFF_FFFF_FFFF_FFFF, 0, 64), + ref_packbits(0xFFFF_FFFF_FFFF_FFFF, 0xFFFF_FFFF_FFFF_FFFF, 0, 64), ); assert_eq!( - packbits(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 64, 255), - ref_packbits(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 64, 64), + packbits(0xFFFF_FFFF_FFFF_FFFF, 0xFFFF_FFFF_FFFF_FFFF, 64, 255), + ref_packbits(0xFFFF_FFFF_FFFF_FFFF, 0xFFFF_FFFF_FFFF_FFFF, 64, 64), ); assert_eq!( - packbits(0xFFFFFFFFFFFFFFFF, 15, 0, 4), - ref_packbits(0xFFFFFFFFFFFFFFFF, 15, 0, 4), + packbits(0xFFFF_FFFF_FFFF_FFFF, 15, 0, 4), + ref_packbits(0xFFFF_FFFF_FFFF_FFFF, 15, 0, 4), ); } @@ -526,42 +526,42 @@ mod tests { config.num_memory_controllers = 8; config.num_sub_partition_per_memory_channel = 2; - let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140159034064896); + let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140_159_034_064_896); dbg!(&tlx_addr, &ref_tlx_addr); assert_eq!(ref_tlx_addr.sub_partition, 0); assert_eq!(tlx_addr.sub_partition, 0); - let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140159034065024); + let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140_159_034_065_024); dbg!(&tlx_addr, &ref_tlx_addr); assert_eq!(ref_tlx_addr.sub_partition, 1); assert_eq!(tlx_addr.sub_partition, 1); - let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140159034065120); + let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140_159_034_065_120); dbg!(&tlx_addr, &ref_tlx_addr); assert_eq!(ref_tlx_addr.sub_partition, 1); assert_eq!(tlx_addr.sub_partition, 1); - let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140159034065152); + let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140_159_034_065_152); dbg!(&tlx_addr, &ref_tlx_addr); assert_eq!(ref_tlx_addr.sub_partition, 2); assert_eq!(tlx_addr.sub_partition, 2); - let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140159034065472); + let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140_159_034_065_472); dbg!(&tlx_addr, &ref_tlx_addr); assert_eq!(ref_tlx_addr.sub_partition, 4); assert_eq!(tlx_addr.sub_partition, 4); - let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140159034066048); + let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140_159_034_066_048); dbg!(&tlx_addr, &ref_tlx_addr); assert_eq!(ref_tlx_addr.sub_partition, 9); assert_eq!(tlx_addr.sub_partition, 9); - let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140159034066432); + let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140_159_034_066_432); dbg!(&tlx_addr, &ref_tlx_addr); assert_eq!(ref_tlx_addr.sub_partition, 12); assert_eq!(tlx_addr.sub_partition, 12); - let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140159034066944); + let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 140_159_034_066_944); dbg!(&tlx_addr, &ref_tlx_addr); assert_eq!(ref_tlx_addr.sub_partition, 0); assert_eq!(tlx_addr.sub_partition, 0); @@ -570,7 +570,7 @@ mod tests { #[test] fn test_tlx() { let config = GPUConfig::default(); - let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 139823420539008); + let (tlx_addr, ref_tlx_addr) = compute_tlx(&config, 139_823_420_539_008); let expected = super::DecodedAddress { chip: 0, bk: 1, @@ -587,23 +587,23 @@ mod tests { fn test_mask_limit() { use playground::addrdec::mask_limit as ref_mask_limit; - let mask = 0b0000000000000000000000000000000000000000000000000000000000000000; + let 
mask = 0b0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000; diff::assert_eq!(super::mask_limit(mask), (0, 64)); diff::assert_eq!(ref_mask_limit(mask), (0, 64)); - let mask = 0b0000000000000000000000000000000000000000000000000111000010000000; + let mask = 0b0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0111_0000_1000_0000; diff::assert_eq!(super::mask_limit(mask), (7, 15)); diff::assert_eq!(ref_mask_limit(mask), (7, 15)); - let mask = 0b0000000000000000000000000000000000001111111111111000000000000000; + let mask = 0b0000_0000_0000_0000_0000_0000_0000_0000_0000_1111_1111_1111_1000_0000_0000_0000; diff::assert_eq!(super::mask_limit(mask), (15, 28)); diff::assert_eq!(ref_mask_limit(mask), (15, 28)); - let mask = 0b0000000000000000000000000000000000000000000000000000111101111111; + let mask = 0b0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_1111_0111_1111; diff::assert_eq!(super::mask_limit(mask), (0, 12)); diff::assert_eq!(ref_mask_limit(mask), (0, 12)); - let mask = 0b0000000000000000000000000000000000000000000000000000000000011111; + let mask = 0b0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0001_1111; diff::assert_eq!(super::mask_limit(mask), (0, 5)); diff::assert_eq!(ref_mask_limit(mask), (0, 5)); } diff --git a/src/ported/arbitration.rs b/src/ported/arbitration.rs index 78f95e21..310321fd 100644 --- a/src/ported/arbitration.rs +++ b/src/ported/arbitration.rs @@ -42,7 +42,7 @@ impl ArbitrationMetadata { } /// check if a subpartition still has credit - pub fn has_credits(&self, inner_sub_partition_id: usize) -> bool { + #[must_use] pub fn has_credits(&self, inner_sub_partition_id: usize) -> bool { if self.private_credit[inner_sub_partition_id] < self.private_credit_limit { return true; } @@ -89,7 +89,7 @@ impl ArbitrationMetadata { } /// return the last subpartition that borrowed credit - pub fn last_borrower(&self) -> usize { + #[must_use] pub fn last_borrower(&self) -> usize { self.last_borrower } } diff --git a/src/ported/barrier.rs b/src/ported/barrier.rs index 1f04265b..458c81c6 100644 --- a/src/ported/barrier.rs +++ b/src/ported/barrier.rs @@ -12,7 +12,7 @@ pub struct BarrierSet { } impl BarrierSet { - pub fn new( + #[must_use] pub fn new( _max_warps_per_core: usize, _max_blocks_per_core: usize, _max_barriers_per_block: usize, diff --git a/src/ported/cache.rs b/src/ported/cache.rs index 9df952d4..7c5fd604 100644 --- a/src/ported/cache.rs +++ b/src/ported/cache.rs @@ -83,7 +83,7 @@ pub struct Event { } impl Event { - pub fn new(kind: EventKind) -> Self { + #[must_use] pub fn new(kind: EventKind) -> Self { Self { kind, evicted_block: None, @@ -92,9 +92,7 @@ impl Event { } pub trait Component { - fn cycle(&mut self) { - todo!("component: cycle"); - } + fn cycle(&mut self); } pub trait Cache: Component + CacheBandwidth { @@ -152,15 +150,8 @@ pub trait Cache: Component + CacheBandwidth { } } -// not clear if we ever need this pub trait CacheBandwidth { - fn has_free_data_port(&self) -> bool { - todo!("cache: has_free_data_port"); - false - } + fn has_free_data_port(&self) -> bool; - fn has_free_fill_port(&self) -> bool { - todo!("cache: has_free_fill_port"); - false - } + fn has_free_fill_port(&self) -> bool; } diff --git a/src/ported/cache_block.rs b/src/ported/cache_block.rs index cda68453..0e0522c6 100644 --- a/src/ported/cache_block.rs +++ b/src/ported/cache_block.rs @@ -100,7 +100,7 @@ impl Default for LineCacheBlock { } impl LineCacheBlock { - pub fn new() -> Self { + #[must_use] pub fn new() -> 
Self { Self::default() } @@ -143,7 +143,7 @@ impl LineCacheBlock { self.is_readable = true; } if self.set_byte_mask_on_fill { - self.set_byte_mask(&byte_mask) + self.set_byte_mask(byte_mask) } self.fill_time = time; @@ -171,32 +171,32 @@ impl LineCacheBlock { } #[inline] - pub fn status(&self, _mask: &mem_fetch::MemAccessSectorMask) -> Status { + #[must_use] pub fn status(&self, _mask: &mem_fetch::MemAccessSectorMask) -> Status { self.status } #[inline] - pub fn is_valid(&self) -> bool { + #[must_use] pub fn is_valid(&self) -> bool { self.status == Status::VALID } #[inline] - pub fn is_modified(&self) -> bool { + #[must_use] pub fn is_modified(&self) -> bool { self.status == Status::MODIFIED } #[inline] - pub fn is_invalid(&self) -> bool { + #[must_use] pub fn is_invalid(&self) -> bool { self.status == Status::INVALID } #[inline] - pub fn is_reserved(&self) -> bool { + #[must_use] pub fn is_reserved(&self) -> bool { self.status == Status::RESERVED } #[inline] - pub fn is_readable(&self, _mask: &mem_fetch::MemAccessSectorMask) -> bool { + #[must_use] pub fn is_readable(&self, _mask: &mem_fetch::MemAccessSectorMask) -> bool { self.is_readable } @@ -206,28 +206,28 @@ impl LineCacheBlock { } #[inline] - pub fn alloc_time(&self) -> u64 { + #[must_use] pub fn alloc_time(&self) -> u64 { self.alloc_time } #[inline] - pub fn last_access_time(&self) -> u64 { + #[must_use] pub fn last_access_time(&self) -> u64 { self.last_access_time } #[inline] - pub fn modified_size(&self) -> u32 { + #[must_use] pub fn modified_size(&self) -> u32 { // cache line size mem_sub_partition::SECTOR_CHUNCK_SIZE * mem_sub_partition::SECTOR_SIZE } #[inline] - pub fn dirty_byte_mask(&self) -> mem_fetch::MemAccessByteMask { + #[must_use] pub fn dirty_byte_mask(&self) -> mem_fetch::MemAccessByteMask { self.dirty_byte_mask } #[inline] - pub fn dirty_sector_mask(&self) -> mem_fetch::MemAccessSectorMask { + #[must_use] pub fn dirty_sector_mask(&self) -> mem_fetch::MemAccessSectorMask { if self.is_modified() { !BitArray::ZERO } else { diff --git a/src/ported/cluster.rs b/src/ported/cluster.rs index 253eff19..16a05dab 100644 --- a/src/ported/cluster.rs +++ b/src/ported/cluster.rs @@ -1,6 +1,6 @@ use super::{interconn as ic, mem_fetch, MockSimulator, Packet, SIMTCore}; use crate::config::GPUConfig; -use crate::ported; +use crate::ported::{self, Kernel}; use console::style; use std::cell::RefCell; use std::collections::VecDeque; @@ -18,7 +18,6 @@ pub struct SIMTCoreCluster { pub interconn: Arc, - // pub core_sim_order: Vec, pub core_sim_order: VecDeque, pub block_issue_next_core: Mutex, pub response_fifo: VecDeque, @@ -92,7 +91,7 @@ where .lock() .unwrap() .iter() - .map(|c| c.not_completed()) + .map(ported::core::SIMTCore::not_completed) .sum() } @@ -131,7 +130,7 @@ where self.cluster_id, self.response_fifo .iter() - .map(|fetch| fetch.to_string()) + .map(std::string::ToString::to_string) .collect::>(), )) .cyan() @@ -209,7 +208,7 @@ where }; // m_stats->m_incoming_traffic_stats->record_traffic(mf, packet_size); fetch.status = mem_fetch::Status::IN_CLUSTER_TO_SHADER_QUEUE; - self.response_fifo.push_back(fetch.clone()); + self.response_fifo.push_back(fetch); // m_stats->n_mem_to_simt[m_cluster_id] += mf->get_num_flits(false); } @@ -232,19 +231,15 @@ where log::debug!("cluster {} cycle {}", self.cluster_id, self.cycle.get()); let mut cores = self.cores.lock().unwrap(); - // for core in cores.iter_mut() { - for core_id in self.core_sim_order.iter() { - // core.cycle() + for core_id in &self.core_sim_order { cores[*core_id].cycle() } - 
// if (m_config->simt_core_sim_order == 1) { - // self.core_sim_order.rotate_left(1); - let first = self.core_sim_order.pop_front().unwrap(); - self.core_sim_order.push_back(first); - // m_core_sim_order.splice(m_core_sim_order.end(), m_core_sim_order, - // m_core_sim_order.begin()); - // } + if let ported::config::SchedulingOrder::RoundRobin = self.config.simt_core_sim_order { + self.core_sim_order.rotate_left(1); + // let first = self.core_sim_order.pop_front().unwrap(); + // self.core_sim_order.push_back(first); + } } pub fn issue_block_to_core(&self, sim: &MockSimulator) -> usize { @@ -259,37 +254,18 @@ where let mut num_blocks_issued = 0; let mut block_issue_next_core = self.block_issue_next_core.lock().unwrap(); - // dbg!(&sim.select_kernel()); for core_id in 0..num_cores { - // debug_assert_eq!(i, core.id); let core_id = (core_id + *block_issue_next_core + 1) % num_cores; let core = &mut cores[core_id]; - // let mut kernel = None; - let kernel: Option> = if self.config.concurrent_kernel_sm { - unimplemented!("concurrent kernel sm"); + let kernel: Option> = if self.config.concurrent_kernel_sm { // always select latest issued kernel // kernel = sim.select_kernel() - sim.select_kernel().map(Arc::clone) + // sim.select_kernel().map(Arc::clone); + unimplemented!("concurrent kernel sm"); } else { let mut current_kernel = core.inner.current_kernel.as_ref(); - // .map(Arc::clone); - // match core.inner.current_kernel { - // Some(current) if current.no_more_blocks_to_run() && core.not_completed() == 0 => { - // // new kernel - // sim.select_kernel() - // } - // None => { - // // new kernel - // sim.select_kernel() - // } - // - // } - // let kernel = core.inner.current_kernel; - // if let Some(current_kernel) = kernel { - // } - // kernel - let should_select_new_kernel = if let Some(ref current) = current_kernel { + let should_select_new_kernel = if let Some(current) = current_kernel { // if no more blocks left, get new kernel once current block completes current.no_more_blocks_to_run() && core.not_completed() == 0 } else { @@ -310,74 +286,24 @@ where if should_select_new_kernel { current_kernel = sim.select_kernel(); - if let Some(ref k) = current_kernel { + if let Some(k) = current_kernel { core.set_kernel(Arc::clone(k)); } } current_kernel.map(Arc::clone) - - // Select current core kernel. - // If no more cta, get a new kernel once core completed warps - // match core.inner.current_kernel { - // Some(current_kernel) - // if current_kernel.no_more_blocks_to_run() && core.not_completed() == 0 => - // { - // if should_select_new_kernel { - // kernel = sim.select_kernel(); - // if let Some(k) = kernel { - // core.set_kernel(Arc::clone(k)); - // } - // } - // } - // _ => {} - // } - // Select current core kernel. 
- // If no more cta, get a new kernel once core completed warps - // if current_kernel.no_more_blocks_to_run() && core.not_completed() == 0 { - // kernel = sim.select_kernel(); - // if let Some(k) = kernel { - // core.set_kernel(Arc::clone(k)); - // } - // } }; - // log::debug!( - // "core {}-{}: {} active warps, current kernel {:?}, more blocks={:?}", - // self.cluster_id, - // core.inner.core_id, - // core.inner.num_active_warps, - // core.inner.current_kernel.as_ref().map(|k| k.name()), - // core.inner - // .current_kernel - // .as_ref() - // .map(|k| !k.no_more_blocks_to_run()) - // ); - // log::debug!( - // "core {}-{}: selected kernel {:?}", - // self.cluster_id, - // core.inner.core_id, - // kernel.as_ref().map(|k| k.name()) - // ); if let Some(kernel) = kernel { - // let core_id = 0; log::debug!( "core {}-{}: selected kernel {} more blocks={} can issue={}", self.cluster_id, core_id, kernel, !kernel.no_more_blocks_to_run(), - core.can_issue_block(&*kernel), + core.can_issue_block(&kernel), ); - // log::debug!( - // "kernel: no more blocks to run={} can issue block {}", - // kernel.no_more_blocks_to_run(), - // core.can_issue_block(&*kernel) - // ); - // log::debug!("kernel: {:#?}", &*kernel); - - if !kernel.no_more_blocks_to_run() && core.can_issue_block(&*kernel) { - // core.issue_block(Arc::clone(kernel)); + if !kernel.no_more_blocks_to_run() && core.can_issue_block(&kernel) { core.issue_block(kernel); num_blocks_issued += 1; *block_issue_next_core = core_id; diff --git a/src/ported/core.rs b/src/ported/core.rs index 8a9defdc..c2337145 100644 --- a/src/ported/core.rs +++ b/src/ported/core.rs @@ -1,8 +1,8 @@ use super::instruction::WarpInstruction; use super::scheduler::SchedulerWarp; use super::{ - address, barrier, cache, opcodes, operand_collector as opcoll, register_set, scoreboard, - simd_function_unit as fu, KernelInfo, LoadStoreUnit, + address, barrier, cache, kernel::Kernel, opcodes, operand_collector as opcoll, register_set, + scoreboard, simd_function_unit as fu, LoadStoreUnit, }; use super::{interconn as ic, l1, mem_fetch, scheduler as sched}; use crate::config::{self, GPUConfig}; @@ -41,9 +41,9 @@ pub type WarpMask = BitArr!(for WARP_PER_CTA_MAX); /// Start of the program memory space /// /// Note: should be distinct from other memory spaces. 
-pub const PROGRAM_MEM_START: usize = 0xF0000000; +pub const PROGRAM_MEM_START: usize = 0xF000_0000; -pub const PROGRAM_MEM_ALLOC: Lazy = Lazy::new(|| super::Allocation { +pub static PROGRAM_MEM_ALLOC: Lazy = Lazy::new(|| super::Allocation { name: Some("PROGRAM_MEM".to_string()), id: 0, start_addr: PROGRAM_MEM_START as super::address, @@ -53,25 +53,19 @@ pub const PROGRAM_MEM_ALLOC: Lazy = Lazy::new(|| super::Alloc #[derive(Debug)] pub struct ThreadState { pub active: bool, - // pub block_id: usize, - // pub active: bool, pub pc: usize, } #[derive(Debug, Default)] pub struct InstrFetchBuffer { valid: bool, - pc: address, - num_bytes: usize, warp_id: usize, } impl InstrFetchBuffer { - pub fn new() -> Self { + #[must_use] pub fn new() -> Self { Self { valid: false, - pc: 0, - num_bytes: 0, warp_id: 0, } } @@ -87,12 +81,12 @@ pub struct InnerSIMTCore { pub warp_instruction_unique_uid: Arc, pub stats: Arc>, pub config: Arc, - pub current_kernel: Option>, + pub current_kernel: Option>, pub last_warp_fetched: Option, pub interconn: Arc, pub load_store_unit: Arc>>>, pub active_thread_mask: BitArr!(for MAX_THREAD_PER_SM), - pub occupied_hw_thread_ids: BitArr!(for MAX_THREAD_PER_SM), + occupied_hw_thread_ids: BitArr!(for MAX_THREAD_PER_SM), pub dynamic_warp_id: usize, pub num_active_blocks: usize, pub num_active_warps: usize, @@ -107,9 +101,8 @@ pub struct InnerSIMTCore { pub allocations: Rc>, pub instr_l1_cache: Box, pub instr_fetch_buffer: InstrFetchBuffer, - pub warps: Vec, + pub warps: Vec, pub thread_state: Vec>, - // pub thread_info: Vec>, pub scoreboard: Arc>, pub operand_collector: Rc>, pub pipeline_reg: Vec>>, @@ -134,7 +127,7 @@ pub enum Packet { impl std::fmt::Display for Packet { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { - Packet::Fetch(fetch) => write!(f, "{}", fetch), + Packet::Fetch(fetch) => write!(f, "{fetch}"), } } } @@ -146,7 +139,7 @@ where // Returns numbers of addresses in translated_addrs. 
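// Worked example (illustrative numbers, not from a real config): with
// `max_concurrent_threads = 2048` and `thread_base = 0`, the 4-byte word at
// `local_addr = 8` has word index `8 / 4 = 2` and is linearized to
// `2 * 2048 * 4 + 0 + LOCAL_GENERIC_START = 16384 + LOCAL_GENERIC_START`,
// so consecutive words of one thread are strided by
// `max_concurrent_threads * 4` bytes in the generic local space.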
// // Each addr points to a 4B (32-bit) word - pub fn translate_local_memaddr( + #[must_use] pub fn translate_local_memaddr( &self, local_addr: address, thread_id: usize, @@ -206,7 +199,7 @@ where // will overflow into next thread's space debug_assert_eq!(local_addr % 4, 0); for i in 0..num_accesses { - let local_word = local_addr / 4 + (i as u64); + let local_word = local_addr / 4 + u64::from(i); let linear_address: address = local_word * max_concurrent_threads as u64 * 4 + thread_base as u64 + super::instruction::LOCAL_GENERIC_START; @@ -218,14 +211,14 @@ where let local_word = local_addr / 4; let local_word_offset = local_addr % 4; // Make sure access doesn't overflow into next 4B chunk - debug_assert_eq!((local_addr + data_size as address - 1) / 4, local_word); + debug_assert_eq!((local_addr + u64::from(data_size) - 1) / 4, local_word); let linear_address: address = local_word * max_concurrent_threads as u64 * 4 + local_word_offset + thread_base as u64 + super::instruction::LOCAL_GENERIC_START; translated_addresses.push(linear_address); } - return translated_addresses; + translated_addresses } } @@ -390,17 +383,6 @@ where if warp.done() && warp.functional_done() { warp.ibuffer_flush(); - // note: not modeling barriers for now - // self.barriers.warp_exit(pipe_reg_ref.warp_id); - } - - // let mut warp = self.warps.get_mut(warp_id).unwrap().lock().unwrap(); - if pipe_reg_ref.opcode.category == opcodes::ArchOp::BARRIER_OP { - // m_warp[warp_id]->store_info_of_last_inst_at_barrier(*pipe_reg); - // self.barriers.warp_reaches_barrier(warp.block_id, warp_id, next_inst); - } else if pipe_reg_ref.opcode.category == opcodes::ArchOp::MEMORY_BARRIER_OP { - // m_warp[warp_id]->set_membar(); - // warp.set_membar(); } log::debug!( @@ -550,22 +532,6 @@ where }) .collect(); - // SKIPPING SPECIALIZED UNITS - // for (int j = 0; j < m_config->m_specialized_unit.size(); j++) { - // m_pipeline_reg.push_back( - // register_set(m_config->m_specialized_unit[j].id_oc_spec_reg_width, - // m_config->m_specialized_unit[j].name)); - // m_config->m_specialized_unit[j].ID_OC_SPEC_ID = m_pipeline_reg.size() - 1; - // m_specilized_dispatch_reg.push_back( - // &m_pipeline_reg[m_pipeline_reg.size() - 1]); - // } - // for (int j = 0; j < m_config->m_specialized_unit.size(); j++) { - // m_pipeline_reg.push_back( - // register_set(m_config->m_specialized_unit[j].oc_ex_spec_reg_width, - // m_config->m_specialized_unit[j].name)); - // m_config->m_specialized_unit[j].OC_EX_SPEC_ID = m_pipeline_reg.size() - 1; - // } - if config.sub_core_model { // in subcore model, each scheduler should has its own // issue register, so ensure num scheduler = reg width @@ -581,20 +547,6 @@ where config.num_schedulers_per_core, pipeline_reg[PipelineStage::ID_OC_MEM as usize].size() ); - // if (m_config->gpgpu_tensor_core_avail) - // assert(m_config->gpgpu_num_sched_per_core == - // m_pipeline_reg[ID_OC_TENSOR_CORE].get_size()); - // if (m_config->gpgpu_num_dp_units > 0) - // assert(m_config->gpgpu_num_sched_per_core == - // m_pipeline_reg[ID_OC_DP].get_size()); - // if (m_config->gpgpu_num_int_units > 0) - // assert(m_config->gpgpu_num_sched_per_core == - // m_pipeline_reg[ID_OC_INT].get_size()); - // for (int j = 0; j < m_config->m_specialized_unit.size(); j++) { - // if (m_config->m_specialized_unit[j].num_units > 0) - // assert(m_config->gpgpu_num_sched_per_core == - // m_config->m_specialized_unit[j].id_oc_spec_reg_width); - // } } let fetch_interconn = Arc::new(ic::CoreMemoryInterface { @@ -609,7 +561,7 @@ where core_id, cluster_id, 
warps.clone(), - fetch_interconn.clone(), + fetch_interconn, operand_collector.clone(), scoreboard.clone(), config.clone(), @@ -634,7 +586,7 @@ where warp_instruction_unique_uid, stats, allocations, - config: config.clone(), + config, current_kernel: None, last_warp_fetched: None, active_thread_mask: BitArray::ZERO, @@ -652,10 +604,10 @@ where instr_fetch_buffer: InstrFetchBuffer::default(), interconn, load_store_unit, - warps: warps.clone(), + warps, pipeline_reg, result_busses, - scoreboard: scoreboard.clone(), + scoreboard, operand_collector, barriers, thread_state, @@ -798,150 +750,52 @@ where } fn init_schedulers(&mut self) { - // let scheduler_kind = config::SchedulerKind::LRR; let scheduler_kind = config::SchedulerKind::GTO; self.schedulers = (0..self.inner.config.num_schedulers_per_core) - .map(|sched_id| match scheduler_kind { - // config::SchedulerKind::LRR => { - // let mem_out = &self.inner.pipeline_reg[PipelineStage::ID_OC_MEM as usize]; - // Box::new(sched::LrrScheduler::new( - // // &self.inner.warps, - // sched_id, - // self.inner.cluster_id, - // self.inner.core_id, - // self.inner.warps.clone(), - // // mem_out, - // // &self.inner, - // self.inner.scoreboard.clone(), - // self.inner.stats.clone(), - // self.inner.config.clone(), - // // m_stats, this, m_scoreboard, m_simt_stack, &m_warp, - // // &m_pipeline_reg[ID_OC_SP], &m_pipeline_reg[ID_OC_DP], - // // &m_pipeline_reg[ID_OC_SFU], &m_pipeline_reg[ID_OC_INT], - // // &m_pipeline_reg[ID_OC_TENSOR_CORE], m_specilized_dispatch_reg, - // // &m_pipeline_reg[ID_OC_MEM], i - // )) as Box - // // self.schedulers.push_back(Box::new(lrr)); - // } - config::SchedulerKind::GTO => { - Box::new(sched::GTOScheduler::new( - // &self.inner.warps, + .map(|sched_id| { + let scheduler_stats = Arc::new(Mutex::new(stats::scheduler::Scheduler::default())); + match scheduler_kind { + config::SchedulerKind::GTO => Box::new(sched::gto::Scheduler::new( sched_id, self.inner.cluster_id, self.inner.core_id, self.inner.warps.clone(), - // mem_out, - // &self.inner, self.inner.scoreboard.clone(), - self.inner.stats.clone(), + scheduler_stats, self.inner.config.clone(), - // &self.inner.pipeline_reg[PipelineStage::ID_OC_MEM as usize], - // m_stats, this, m_scoreboard, m_simt_stack, &m_warp, - // &m_pipeline_reg[ID_OC_SP], &m_pipeline_reg[ID_OC_DP], - // &m_pipeline_reg[ID_OC_SFU], &m_pipeline_reg[ID_OC_INT], - // &m_pipeline_reg[ID_OC_TENSOR_CORE], m_specilized_dispatch_reg, - // &m_pipeline_reg[ID_OC_MEM], i - - // ORIGINAL PARAMS - // m_stats, - // this, - // m_scoreboard, - // m_simt_stack, - // &m_warp, - // &m_pipeline_reg[ID_OC_SP], - // &m_pipeline_reg[ID_OC_DP], - // &m_pipeline_reg[ID_OC_SFU], - // &m_pipeline_reg[ID_OC_INT], - // &m_pipeline_reg[ID_OC_TENSOR_CORE], - // m_specilized_dispatch_reg, - // &m_pipeline_reg[ID_OC_MEM], - // i, - )) as Box - // schedulers.push_back(gto); + )) + as Box, + scheduler_kind => unimplemented!("scheduler: {:?}", &scheduler_kind), } - // SchedulerKind::TwoLevelActive => { - // Box::new(sched::TwoLevelActiveScheduler::new( - // m_stats, this, m_scoreboard, m_simt_stack, &m_warp, - // &m_pipeline_reg[ID_OC_SP], &m_pipeline_reg[ID_OC_DP], - // &m_pipeline_reg[ID_OC_SFU], &m_pipeline_reg[ID_OC_INT], - // &m_pipeline_reg[ID_OC_TENSOR_CORE], m_specilized_dispatch_reg, - // &m_pipeline_reg[ID_OC_MEM], i, m_config->gpgpu_scheduler_string); - // schedulers.push_back(tla); - // }, - other => todo!("scheduler: {:?}", &other), - // SchedulerKind::RRR => { - // let rrr = RrrScheduler::new( - // m_stats, this, 
m_scoreboard, m_simt_stack, &m_warp, - // &m_pipeline_reg[ID_OC_SP], &m_pipeline_reg[ID_OC_DP], - // &m_pipeline_reg[ID_OC_SFU], &m_pipeline_reg[ID_OC_INT], - // &m_pipeline_reg[ID_OC_TENSOR_CORE], m_specilized_dispatch_reg, - // &m_pipeline_reg[ID_OC_MEM], i); - // schedulers.push_back(rrr); - // }, - // SchedulerKind::OldestFirst => { - // let oldest = OldestScheduler::new( - // m_stats, this, m_scoreboard, m_simt_stack, &m_warp, - // &m_pipeline_reg[ID_OC_SP], &m_pipeline_reg[ID_OC_DP], - // &m_pipeline_reg[ID_OC_SFU], &m_pipeline_reg[ID_OC_INT], - // &m_pipeline_reg[ID_OC_TENSOR_CORE], m_specilized_dispatch_reg, - // &m_pipeline_reg[ID_OC_MEM], i); - // schedulers.push_back(oldest); - // }, - // SchedulerKind::WarpLimiting => { - // let swl = SwlScheduler::new( - // m_stats, this, m_scoreboard, m_simt_stack, &m_warp, - // &m_pipeline_reg[ID_OC_SP], &m_pipeline_reg[ID_OC_DP], - // &m_pipeline_reg[ID_OC_SFU], &m_pipeline_reg[ID_OC_INT], - // &m_pipeline_reg[ID_OC_TENSOR_CORE], m_specilized_dispatch_reg, - // &m_pipeline_reg[ID_OC_MEM], i, m_config->gpgpu_scheduler_string); - // schedulers.push_back(swl); - // }, }) .collect(); - // } for (i, warp) in self.inner.warps.iter().enumerate() { - // distribute i's evenly though schedulers; + // distribute warps evenly though schedulers let sched_idx = i % self.inner.config.num_schedulers_per_core; let scheduler = &mut self.schedulers[sched_idx]; - scheduler.add_supervised_warp(Rc::clone(&warp)); + scheduler.add_supervised_warp(Rc::clone(warp)); } - // for scheduler in self.schedulers.iter_mut() { - // // todo!("call done_adding_supervised_warps"); - // scheduler.done_adding_supervised_warps(); - // } - // for (unsigned i = 0; i < m_config->gpgpu_num_sched_per_core; ++i) { - // schedulers[i]->done_adding_supervised_warps(); - // } } - pub fn active(&self) -> bool { + #[must_use] pub fn active(&self) -> bool { self.inner.num_active_blocks > 0 } /// return the next pc of a thread pub fn next_pc(&mut self, thread_id: usize) -> Option { - // if (tid == -1) return -1; - // PC should already be updatd to next PC at this point (was - // set in shader_decode() last time thread ran) - self.inner - .thread_state - .get(thread_id) - .map(Option::as_ref) - .flatten() - .map(|t| t.pc) + self.inner.thread_state[thread_id].as_ref().map(|t| t.pc) } fn register_thread_in_block_exited( &mut self, block_hw_id: usize, - kernel: &Option>, + kernel: &Option>, ) { let current_kernel: &mut Option<_> = - &mut self.inner.current_kernel.as_ref().map(|k| k.as_ref()); + &mut self.inner.current_kernel.as_ref().map(std::convert::AsRef::as_ref); - // see: m_cta_status debug_assert!(block_hw_id < MAX_CTA_PER_SHADER); debug_assert!(self.inner.block_status[block_hw_id] > 0); self.inner.block_status[block_hw_id] -= 1; @@ -952,22 +806,7 @@ where // m_stats->ctas_completed++; // m_gpu->inc_completed_cta(); self.inner.num_active_blocks -= 1; - // m_barriers.deallocate_barrier(cta_num); - // shader_CTA_count_unlog(m_sid, 1); - // - // SHADER_DPRINTF( - // LIVENESS, - // "GPGPU-Sim uArch: Finished CTA #%u (%lld,%lld), %u CTAs running\n", - // cta_num, m_gpu->gpu_sim_cycle, m_gpu->gpu_tot_sim_cycle, - // m_n_active_cta); - // if self.inner.num_active_blocks == 0 { - // SHADER_DPRINTF( - // LIVENESS, - // "GPGPU-Sim uArch: Empty (last released kernel %u \'%s\').\n", - // kernel->get_uid(), kernel->name().c_str()); - // fflush(stdout); - // // Shader can only be empty when no more cta are dispatched if kernel.as_ref().map(|k| k.config.id) != current_kernel.map(|k| k.config.id) { // 
debug_assert!(current_kernel.is_none() || kernel.no_more_blocks_to_run()); @@ -975,23 +814,14 @@ where *current_kernel = None; } // - // // Jin: for concurrent kernels on sm // self.release_shader_resource_1block(cta_num, kernel); // kernel->dec_running(); if let Some(kernel) = kernel { - if kernel.no_more_blocks_to_run() { - if !kernel.running() { - // SHADER_DPRINTF(LIVENESS, - // "GPGPU-Sim uArch: GPU detected kernel %u \'%s\' " - // "finished on shader %u.\n", - // kernel->get_uid(), kernel->name().c_str(), m_sid); - // - if current_kernel.map(|k| k.config.id) == Some(kernel.config.id) { - *current_kernel = None; - } - - // m_gpu->set_kernel_done(kernel); - } + if kernel.no_more_blocks_to_run() + && !kernel.running() + && current_kernel.map(|k| k.config.id) == Some(kernel.config.id) + { + *current_kernel = None; } } } @@ -1011,7 +841,7 @@ where .cloned() .unwrap_or_default() .iter() - .map(|access| access.to_string()) + .map(std::string::ToString::to_string) .collect::>(), )) .green() @@ -1024,21 +854,16 @@ where let mut warp = warp.try_borrow_mut().unwrap(); warp.has_imiss_pending = false; - let pc = warp.pc().unwrap() as u64; self.inner.instr_fetch_buffer = InstrFetchBuffer { valid: true, - pc, - num_bytes: fetch.data_size as usize, warp_id: fetch.warp_id, }; // verify that we got the instruction we were expecting. - // TODO: this does not work because the fetch.addr() is not the same anymore? - // it gets changed to the block addr on the way and not ever changed back.. - // debug_assert_eq!( - // warp.pc(), - // Some(fetch.addr() as usize - super::PROGRAM_MEM_START) - // ); + debug_assert_eq!( + warp.pc(), + Some(fetch.addr() as usize - super::PROGRAM_MEM_START) + ); self.inner.instr_fetch_buffer.valid = true; // warp.set_last_fetch(m_gpu->gpu_sim_cycle); @@ -1060,7 +885,6 @@ where let sb = self.inner.scoreboard.read().unwrap(); let pending_writes = sb.pending_writes(warp_id); - // .clone(); // if warp.functional_done() && warp.hardware_done() && warp.done_exit() { // continue; @@ -1080,7 +904,6 @@ where ); } - // log!("\n\n"); for i in 0..max_warps { let last = self.inner.last_warp_fetched.unwrap_or(0); let warp_id = (last + 1 + i) % max_warps; @@ -1153,7 +976,6 @@ where } } self.inner.num_active_warps -= 1; - debug_assert!(self.inner.num_active_warps >= 0); } let mut warp = self.inner.warps[warp_id].try_borrow_mut().unwrap(); @@ -1197,13 +1019,14 @@ where let mut num_bytes = 16; let line_size = icache_config.line_size as usize; let offset_in_block = pc & (line_size - 1); - if offset_in_block + num_bytes > line_size as usize { - num_bytes = line_size as usize - offset_in_block; + if offset_in_block + num_bytes > line_size { + num_bytes = line_size - offset_in_block; } + let inst_alloc = &*PROGRAM_MEM_ALLOC; let access = mem_fetch::MemAccess::new( mem_fetch::AccessKind::INST_ACC_R, ppc as u64, - Some(PROGRAM_MEM_ALLOC.clone()), + Some(inst_alloc.clone()), num_bytes as u32, false, // todo: is this correct? 
@@ -1214,7 +1037,7 @@ where let fetch = mem_fetch::MemFetch::new( None, access, - &*self.inner.config, + &self.inner.config, mem_fetch::READ_PACKET_SIZE.into(), warp_id, self.inner.core_id, @@ -1222,7 +1045,6 @@ where ); let status = if self.inner.config.perfect_inst_const_cache { - // shader_cache_access_log(m_sid, INSTRUCTION, 0); cache::RequestStatus::HIT } else { let mut events = Vec::new(); @@ -1240,47 +1062,28 @@ where self.inner.last_warp_fetched = Some(warp_id); if status == cache::RequestStatus::MISS { - // let warp = self.inner.warps.get_mut(warp_id).unwrap(); - // let warp = warp.lock().unwrap(); - // .as_mut() - // .unwrap(); warp.has_imiss_pending = true; // warp.set_last_fetch(m_gpu->gpu_sim_cycle); } else if status == cache::RequestStatus::HIT { self.inner.instr_fetch_buffer = InstrFetchBuffer { valid: true, - pc: pc as u64, - num_bytes, + // pc: pc as u64, + // num_bytes, warp_id, }; // m_warp[warp_id]->set_last_fetch(m_gpu->gpu_sim_cycle); - // delete mf; } else { debug_assert_eq!(status, cache::RequestStatus::RESERVATION_FAIL); - // delete mf; } break; } - // } } } } self.inner.instr_l1_cache.cycle(); } - /// shader core decode pipeline stage - /// - /// NOTE: inst fetch buffer valid after 279 cycles - /// - /// investigate: - /// - fetch buffer becomes valid when icache has access ready - /// - icache has access ready whenm mshrs has next access - /// - mshrs has next access when mshrs::current_response queue is not empty - /// - mshrs::current_response is pushed into by mshr_table::mark_ready - /// - mshr_table::mark_ready is called by baseline_cache::fill - /// - only trace_shader_core_ctx::accept_fetch_response calls baseline_cache::fill - /// - only void simt_core_cluster::icnt_cycle() calls accept_fetch_response when there is a - /// response + /// Shader core decode fn decode(&mut self) { let InstrFetchBuffer { valid, warp_id, .. 
} = self.inner.instr_fetch_buffer; @@ -1365,7 +1168,7 @@ where instr, ); - warp.ibuffer_fill(slot, instr.clone()); + warp.ibuffer_fill(slot, instr); warp.num_instr_in_pipeline += 1; } @@ -1522,8 +1325,7 @@ where .inner .result_busses .iter_mut() - .filter(|bus| !bus[instr.latency]) - .next(); + .find(|bus| !bus[instr.latency]); log::debug!( "{} {} (partition issue={}, reg id={:?}) ready for issue to fu[{:03}]={}", @@ -1545,12 +1347,9 @@ where Some(result_bus) if schedule_wb_now => { debug_assert!(instr.latency < fu::MAX_ALU_LATENCY); result_bus.set(instr.latency, true); - // fu.issue(&mut issue_inst); - // let ready_reg = ready_reg.take(); fu.issue(ready_reg.take().unwrap()); } _ if !schedule_wb_now => { - // fu.issue(&mut issue_inst); fu.issue(ready_reg.take().unwrap()); } _ => { @@ -1617,7 +1416,7 @@ where unit.invalidate(); } - pub fn ldst_unit_response_buffer_full(&self) -> bool { + #[must_use] pub fn ldst_unit_response_buffer_full(&self) -> bool { self.inner .load_store_unit .lock() @@ -1625,7 +1424,7 @@ where .response_buffer_full() } - pub fn fetch_unit_response_buffer_full(&self) -> bool { + #[must_use] pub fn fetch_unit_response_buffer_full(&self) -> bool { false } @@ -1639,15 +1438,15 @@ where self.inner.load_store_unit.lock().unwrap().fill(fetch); } - pub fn not_completed(&self) -> usize { + #[must_use] pub fn not_completed(&self) -> usize { self.inner.num_active_threads } - pub fn is_active(&self) -> bool { + #[must_use] pub fn is_active(&self) -> bool { self.inner.num_active_blocks > 0 } - pub fn set_kernel(&mut self, kernel: Arc) { + pub fn set_kernel(&mut self, kernel: Arc) { log::debug!("kernel {} bind to core {:?}", kernel, self.id()); self.inner.current_kernel = Some(kernel); } @@ -1659,16 +1458,19 @@ where ) -> Option { let mut step = 0; while step < self.inner.config.max_threads_per_core { - let hw_thread_id = step; - while hw_thread_id < step + thread_block_size { - if self.inner.occupied_hw_thread_ids[hw_thread_id] { - break; - } - } - // consecutive non-active - if hw_thread_id == step + thread_block_size { + if self.inner.occupied_hw_thread_ids[step..(step + thread_block_size)].not_any() { + // found consecutive non-active break; } + // for hw_thread_id in step..(step + thread_block_size) { + // if self.inner.occupied_hw_thread_ids[hw_thread_id] { + // break; + // } + // } + // consecutive non-active + // if hw_thread_id == step + thread_block_size { + // break; + // } step += thread_block_size; } if step >= self.inner.config.max_threads_per_core { @@ -1676,138 +1478,42 @@ where None } else { if occupy { - for hw_thread_id in step..step + thread_block_size { - self.inner.occupied_hw_thread_ids.set(hw_thread_id, true); - } + self.inner.occupied_hw_thread_ids[step..(step + thread_block_size)].fill(true); + // for hw_thread_id in step..(step + thread_block_size) { + // self.inner.occupied_hw_thread_ids.set(hw_thread_id, true); + // } } Some(step) } } - // int shader_core_ctx::find_available_hwtid(unsigned int cta_size, bool occupy) { - // unsigned int step; - // for (step = 0; step < m_config->n_thread_per_shader; step += cta_size) { - // unsigned int hw_tid; - // for (hw_tid = step; hw_tid < step + cta_size; hw_tid++) { - // if (m_occupied_hwtid.test(hw_tid)) break; - // } - // if (hw_tid == step + cta_size) // consecutive non-active - // break; - // } - // if (step >= m_config->n_thread_per_shader) // didn't find - // return -1; - // else { - // if (occupy) { - // for (unsigned hw_tid = step; hw_tid < step + cta_size; hw_tid++) - // m_occupied_hwtid.set(hw_tid); - // 
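// Sketch of the slice-based occupancy check above (uses the same bitvec
// `BitSlice` calls as this diff; the 16-bit window size is an arbitrary
// example, not a simulator constant):
fn occupancy_window_example() {
    use bitvec::prelude::*;
    let mut occupied = bitarr![0; 16];
    if occupied[4..8].not_any() {
        // all four consecutive hardware thread slots are free: reserve them
        occupied[4..8].fill(true);
    }
    assert!(occupied[4..8].all());
}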
} - // return step; - // } - // } - - pub fn occupy_resource_for_block(&mut self, kernel: &KernelInfo, _occupy: bool) -> bool { - let thread_block_size = self.inner.config.threads_per_block_padded(kernel); - if self.inner.num_occupied_threads + thread_block_size - > self.inner.config.max_threads_per_core - { - return false; - } - if self - .find_available_hw_thread_id(thread_block_size, false) - .is_none() - { - return false; - } - unimplemented!("occupy resource for block"); - return true; - } - - // bool shader_core_ctx::occupy_shader_resource_1block(kernel_info_t &k, - // bool occupy) { - // unsigned threads_per_cta = k.threads_per_cta(); - // const class function_info *kernel = k.entry(); - // unsigned int padded_cta_size = threads_per_cta; - // unsigned int warp_size = m_config->warp_size; - // if (padded_cta_size % warp_size) - // padded_cta_size = ((padded_cta_size / warp_size) + 1) * (warp_size); - // - // if (m_occupied_n_threads + padded_cta_size > m_config->n_thread_per_shader) - // return false; - // - // if (find_available_hwtid(padded_cta_size, false) == -1) return false; - // - // const struct gpgpu_ptx_sim_info *kernel_info = ptx_sim_kernel_info(kernel); - // - // if (m_occupied_shmem + kernel_info->smem > m_config->gpgpu_shmem_size) - // return false; - // - // unsigned int used_regs = padded_cta_size * ((kernel_info->regs + 3) & ~3); - // if (m_occupied_regs + used_regs > m_config->gpgpu_shader_registers) - // return false; - // - // if (m_occupied_ctas + 1 > m_config->max_cta_per_core) return false; - // - // if (occupy) { - // m_occupied_n_threads += padded_cta_size; - // m_occupied_shmem += kernel_info->smem; - // m_occupied_regs += (padded_cta_size * ((kernel_info->regs + 3) & ~3)); - // m_occupied_ctas++; - // - // SHADER_DPRINTF(LIVENESS, - // "GPGPU-Sim uArch: Occupied %u threads, %u shared mem, %u " - // "registers, %u ctas, on shader %d\n", - // m_occupied_n_threads, m_occupied_shmem, m_occupied_regs, - // m_occupied_ctas, m_sid); - // } - // - // return true; - // } - pub fn can_issue_block(&mut self, kernel: &KernelInfo) -> bool { + pub fn can_issue_block(&mut self, kernel: &Kernel) -> bool { let max_blocks = self.inner.config.max_blocks(kernel).unwrap(); if self.inner.config.concurrent_kernel_sm { - unimplemented!("concurrent kernel sm model"); if max_blocks < 1 { return false; } - self.occupy_resource_for_block(kernel, false) + // self.occupy_resource_for_block(kernel, false); + unimplemented!("concurrent kernel sm model"); } else { self.inner.num_active_blocks < max_blocks } } - /// m_not_completed - // pub fn active_warps(&self) -> usize { - // 0 - // } - - fn set_max_blocks(&mut self, kernel: &KernelInfo) -> eyre::Result<()> { + fn set_max_blocks(&mut self, kernel: &Kernel) -> eyre::Result<()> { // calculate the max cta count and cta size for local memory address mapping self.inner.max_blocks_per_shader = self.inner.config.max_blocks(kernel)?; self.inner.thread_block_size = self.inner.config.threads_per_block_padded(kernel); Ok(()) } - pub fn id(&self) -> (usize, usize) { + #[must_use] pub fn id(&self) -> (usize, usize) { (self.inner.cluster_id, self.inner.core_id) } - // pub fn init_warps_from_traces( - // &mut self, - // kernel: &KernelInfo, - // start_thread: usize, - // end_thread: usize, - // ) { - // let start_warp = start_thread / self.inner.config.warp_size; - // let end_warp = (end_thread / self.inner.config.warp_size) - // + if end_thread % self.inner.config.warp_size != 0 { - // 1 - // } else { - // 0 - // }; - pub fn init_warps_from_traces( 
&mut self, - kernel: &Arc, + kernel: &Arc, start_warp: usize, end_warp: usize, ) { @@ -1835,27 +1541,13 @@ where end_thread: usize, block_id: u64, thread_block_size: usize, - kernel: Arc, + kernel: Arc, ) { - // log::debug!( - // "core {:?}: init warps (threads {}..{}) for block {} (hw {})", - // self.id(), - // start_thread, - // end_thread, - // block_id, - // block_hw_id - // ); - log::debug!("kernel: {}", &kernel); - let start_pc = self.next_pc(start_thread); let start_warp = start_thread / self.inner.config.warp_size; let _warp_per_cta = thread_block_size / self.inner.config.warp_size; let end_warp = end_thread / self.inner.config.warp_size - + if end_thread % self.inner.config.warp_size == 0 { - 0 - } else { - 1 - }; + + usize::from(end_thread % self.inner.config.warp_size != 0); for warp_id in start_warp..end_warp { let mut num_active = 0; @@ -1897,17 +1589,11 @@ where pub fn reinit(&mut self, start_thread: usize, end_thread: usize, reset_not_completed: bool) { if reset_not_completed { + self.inner.num_active_warps = 0; self.inner.num_active_threads = 0; self.inner.active_thread_mask.fill(false); - - // Jin: for concurrent kernels on a SM - // m_occupied_n_threads = 0; - // m_occupied_shmem = 0; - // m_occupied_regs = 0; - // m_occupied_ctas = 0; - // m_occupied_hwtid.reset(); - // m_occupied_cta_to_hwtid.clear(); - self.inner.num_active_warps = 0; + self.inner.occupied_block_to_hw_thread_id.clear(); + self.inner.occupied_hw_thread_ids.fill(false); } for t in start_thread..end_thread { self.inner.thread_state[t] = None; @@ -1925,20 +1611,18 @@ where ); for w in start_warp..end_warp { - // log::debug!("reset warp = {}/{}", w + 1, self.inner.warps.len()); self.inner.warps[w].try_borrow_mut().unwrap().reset(); - // simt_stack[i]->reset(); } } - pub fn issue_block(&mut self, kernel: Arc) -> () { + pub fn issue_block(&mut self, kernel: Arc) { log::debug!("core {:?}: issue block", self.id()); if self.inner.config.concurrent_kernel_sm { - let occupied = self.occupy_resource_for_block(&*kernel, true); - assert!(occupied); + // let occupied = self.occupy_resource_for_block(&*kernel, true); + // assert!(occupied); unimplemented!("concurrent kernel sm"); } else { - self.set_max_blocks(&*kernel).unwrap(); + self.set_max_blocks(&kernel).unwrap(); } // kernel.inc_running(); @@ -1956,13 +1640,12 @@ where self.inner.block_status ); let free_block_hw_id = (0..max_blocks_per_core) - .filter(|i| self.inner.block_status[*i] == 0) - .next() + .find(|i| self.inner.block_status[*i] == 0) .unwrap(); // determine hardware threads and warps that will be used for this block let thread_block_size = kernel.threads_per_block(); - let padded_thread_block_size = self.inner.config.threads_per_block_padded(&*kernel); + let padded_thread_block_size = self.inner.config.threads_per_block_padded(&kernel); // hw warp id = hw thread id mod warp size, so we need to find a range // of hardware thread ids corresponding to an integral number of hardware diff --git a/src/ported/deadlock.rs b/src/ported/deadlock.rs new file mode 100644 index 00000000..271179ea --- /dev/null +++ b/src/ported/deadlock.rs @@ -0,0 +1,103 @@ +use super::{core, interconn as ic, mem_fetch, operand_collector, register_set}; + +#[derive(Debug, PartialEq, Eq)] +pub struct State { + pub interconn_to_l2_queue: Vec>, + pub l2_to_interconn_queue: Vec>, + pub l2_to_dram_queue: Vec>, + pub dram_to_l2_queue: Vec>, + pub dram_latency_queue: Vec>, + pub functional_unit_pipelines: Vec>, + // pub operand_collectors: Vec>, + // pub schedulers: Vec>, + // 
functional_unit_pipelines + // schedulers + // operand_collectors +} + +impl State { + #[must_use] + pub fn new(total_cores: usize, num_mem_partitions: usize, num_sub_partitions: usize) -> Self { + Self { + // per sub partition + interconn_to_l2_queue: vec![vec![]; num_sub_partitions], + l2_to_interconn_queue: vec![vec![]; num_sub_partitions], + l2_to_dram_queue: vec![vec![]; num_sub_partitions], + dram_to_l2_queue: vec![vec![]; num_sub_partitions], + // per partition + dram_latency_queue: vec![vec![]; num_mem_partitions], + // per core + functional_unit_pipelines: vec![vec![]; total_cores], + // operand_collectors: vec![None; total_cores], + // schedulers: vec![vec![]; total_cores], + } + } +} + +impl super::MockSimulator +where + I: ic::Interconnect + 'static, +{ + pub fn gather_state(&self) -> State { + let total_cores = self.config.total_cores(); + let num_partitions = self.mem_partition_units.len(); + let num_sub_partitions = self.mem_sub_partitions.len(); + + let mut state = State::new(total_cores, num_partitions, num_sub_partitions); + + for (cluster_id, cluster) in self.clusters.iter().enumerate() { + for (core_id, core) in cluster.cores.lock().unwrap().iter().enumerate() { + let global_core_id = cluster_id * self.config.num_cores_per_simt_cluster + core_id; + assert_eq!(core.inner.core_id, global_core_id); + + // this is the one we will use (unless the assertion is ever false) + let core_id = core.inner.core_id; + + // core: functional units + for (fu_id, fu) in core.functional_units.iter().enumerate() { + let _fu = fu.lock().unwrap(); + let issue_port = core.issue_ports[fu_id]; + let issue_reg: register_set::RegisterSet = core.inner.pipeline_reg + [issue_port as usize] + .borrow() + .clone(); + assert_eq!(issue_port, issue_reg.stage); + + state.functional_unit_pipelines[core_id].push(issue_reg); + } + // core: operand collector + // state.operand_collectors[core_id] = + // Some(core.inner.operand_collector.borrow().clone()); + // core: schedulers + // state.schedulers[core_id].extend(core.schedulers.iter().map(Into::into)); + } + } + for (partition_id, partition) in self.mem_partition_units.iter().enumerate() { + state.dram_latency_queue[partition_id] + .extend(partition.dram_latency_queue.clone().into_iter()); + } + for (sub_id, sub) in self.mem_sub_partitions.iter().enumerate() { + for (dest_queue, src_queue) in [ + ( + &mut state.interconn_to_l2_queue[sub_id], + &sub.borrow().interconn_to_l2_queue, + ), + ( + &mut state.l2_to_interconn_queue[sub_id], + &sub.borrow().l2_to_interconn_queue, + ), + ( + &mut state.l2_to_dram_queue[sub_id], + &sub.borrow().l2_to_dram_queue.lock().unwrap(), + ), + ( + &mut state.dram_to_l2_queue[sub_id], + &sub.borrow().dram_to_l2_queue, + ), + ] { + dest_queue.extend(src_queue.clone().into_iter()); + } + } + state + } +} diff --git a/src/ported/deprecated/core.rs b/src/ported/deprecated/core.rs new file mode 100644 index 00000000..96d42b04 --- /dev/null +++ b/src/ported/deprecated/core.rs @@ -0,0 +1,15 @@ +// pub fn occupy_resource_for_block(&mut self, kernel: &KernelInfo, _occupy: bool) -> bool { +// let thread_block_size = self.inner.config.threads_per_block_padded(kernel); +// if self.inner.num_occupied_threads + thread_block_size +// > self.inner.config.max_threads_per_core +// { +// return false; +// } +// if self +// .find_available_hw_thread_id(thread_block_size, false) +// .is_none() +// { +// return false; +// } +// unimplemented!("occupy resource for block"); +// } diff --git a/src/ported/deprecated/scheduler.rs 
b/src/ported/deprecated/scheduler.rs new file mode 100644 index 00000000..0420fdca --- /dev/null +++ b/src/ported/deprecated/scheduler.rs @@ -0,0 +1,199 @@ +#[derive(Debug)] +pub struct LrrScheduler { + inner: BaseSchedulerUnit, +} + +impl SchedulerUnit for LrrScheduler { + // impl<'a> SchedulerUnit for LrrScheduler<'a> { + fn order_warps( + &mut self, + // out: &mut VecDeque, + // warps: &mut Vec, + // last_issued_warps: &Vec, + // num_warps_to_add: usize, + ) { + self.inner.order_lrr(); + // let num_warps_to_add = self.inner.supervised_warps.len(); + // order_lrr( + // &mut self.inner.next_cycle_prioritized_warps, + // &mut self.inner.supervised_warps, + // &mut self.inner.last_supervised_issued_idx, + // // &mut self.inner.last_supervised_issued(), + // num_warps_to_add, + // ); + } + + fn add_supervised_warp(&mut self, warp: CoreWarp) { + self.inner.supervised_warps.push_back(warp); + // self.inner.add_supervised_warp_id(warp_id); + } + + fn prioritized_warps(&self) -> &VecDeque { + self.inner.prioritized_warps() + } + + // fn add_supervised_warp_id(&mut self, warp_id: usize) { + // self.inner.add_supervised_warp_id(warp_id); + // } + + // fn done_adding_supervised_warps(&mut self) { + // self.inner.last_supervised_issued_idx = self.inner.supervised_warps.len(); + // } + + // fn cycle(&mut self, core: &mut super::core::InnerSIMTCore) { + // fn cycle(&mut self, core: ()) { + fn cycle(&mut self, issuer: &mut dyn super::core::WarpIssuer) { + self.order_warps(); + self.inner.cycle(issuer); + } +} + +// impl<'a> LrrScheduler<'a> { +impl LrrScheduler { + // fn order_warps( + // &self, + // out: &mut VecDeque, + // warps: &mut Vec, + // last_issued_warps: &Vec, + // num_warps_to_add: usize, + // ) { + // todo!("scheduler unit: order warps") + // } + + // pub fn new( + // id: usize, + // // warps: &'a Vec, + // warps: Vec, + // // warps: &'a Vec>, + // // mem_out: &'a register_set::RegisterSet, + // // core: &'a super::core::InnerSIMTCore, + // scoreboard: Arc>, + // stats: Arc>, + // config: Arc, + // ) -> Self { + // // todo!("lrr scheduler: new"); + // let inner = BaseSchedulerUnit::new( + // id, // mem_out, core, + // warps, scoreboard, stats, config, + // ); + // Self { inner } + // } + + // lrr_scheduler(shader_core_stats *stats, shader_core_ctx *shader, + // Scoreboard *scoreboard, simt_stack **simt, + // std::vector *warp, register_set *sp_out, + // register_set *dp_out, register_set *sfu_out, + // register_set *int_out, register_set *tensor_core_out, + // std::vector &spec_cores_out, + // register_set *mem_out, int id) + // : scheduler_unit(stats, shader, scoreboard, simt, warp, sp_out, dp_out, + // sfu_out, int_out, tensor_core_out, spec_cores_out, + // mem_out, id) {} + + // virtual void order_warps(); +} + +fn order_rrr( + &mut self, + // out: &mut VecDeque, + // warps: &mut Vec, + // std::vector &result_list, const typename std::vector &input_list, + // const typename std::vector::const_iterator &last_issued_from_input, + // unsigned num_warps_to_add) +) { + unimplemented!("order rrr is untested"); + let num_warps_to_add = self.supervised_warps.len(); + let out = &mut self.next_cycle_prioritized_warps; + // order_lrr( + // &mut self.inner.next_cycle_prioritized_warps, + // &mut self.inner.supervised_warps, + // &mut self.inner.last_supervised_issued_idx, + // // &mut self.inner.last_supervised_issued(), + // num_warps_to_add, + // ); + + out.clear(); + + let current_turn_warp_ref = self.warps.get(self.current_turn_warp).unwrap(); + let current_turn_warp = 
current_turn_warp_ref.try_borrow().unwrap(); + // .as_ref() + // .unwrap(); + + if self.num_issued_last_cycle > 0 + || current_turn_warp.done_exit() + || current_turn_warp.waiting() + { + // std::vector::const_iterator iter = + // (last_issued_from_input == input_list.end()) ? + // input_list.begin() : last_issued_from_input + 1; + + let mut iter = self + .supervised_warps + .iter() + .skip(self.last_supervised_issued_idx + 1) + .chain(self.supervised_warps.iter()); + + for w in iter.take(num_warps_to_add) { + let warp = w.try_borrow().unwrap(); + let warp_id = warp.warp_id; + if !warp.done_exit() && !warp.waiting() { + out.push_back(w.clone()); + self.current_turn_warp = warp_id; + break; + } + } + // for (unsigned count = 0; count < num_warps_to_add; ++iter, ++count) { + // if (iter == input_list.end()) { + // iter = input_list.begin(); + // } + // unsigned warp_id = (*iter)->get_warp_id(); + // if (!(*iter)->done_exit() && !(*iter)->waiting()) { + // result_list.push_back(*iter); + // m_current_turn_warp = warp_id; + // break; + // } + // } + } else { + out.push_back(current_turn_warp_ref.clone()); + } +} + +fn order_lrr( + &mut self, + // out: &mut VecDeque, + // warps: &mut Vec, + // // last_issued_warps: &Vec, + // // last_issued_warps: impl Iterator, + // // last_issued_warps: &mut std::slice::Iter<'_, SchedulerWarp>, + // // last_issued_warps: impl Iterator, + // last_issued_warp_idx: &mut usize, + // num_warps_to_add: usize, +) { + unimplemented!("order lrr is not tested"); + let num_warps_to_add = self.supervised_warps.len(); + let out = &mut self.next_cycle_prioritized_warps; + + debug_assert!(num_warps_to_add <= self.warps.len()); + out.clear(); + // if last_issued_warps + // typename std::vector::const_iterator iter = (last_issued_from_input == input_list.end()) ? input_list.begin() + // : last_issued_from_input + 1; + // + let mut last_issued_iter = self.warps.iter().skip(self.last_supervised_issued_idx); + + let mut iter = last_issued_iter.chain(self.warps.iter()); + // .filter_map(|x| x.as_ref()); + // .filter_map(|x| x.as_ref()); + + out.extend(iter.take(num_warps_to_add).cloned()); + // for count in 0..num_warps_to_add { + // let Some(warp) = iter.next() else { + // return; + // }; + // // if (iter == input_list.end()) { + // // iter = input_list.begin(); + // // } + // out.push_back(warp.clone()); + // } + // todo!("order lrr: order warps") +} diff --git a/src/ported/set_index_function.rs b/src/ported/deprecated/set_index_function.rs similarity index 95% rename from src/ported/set_index_function.rs rename to src/ported/deprecated/set_index_function.rs index 66b6e059..a4e2219e 100644 --- a/src/ported/set_index_function.rs +++ b/src/ported/deprecated/set_index_function.rs @@ -1,10 +1,6 @@ use super::address; use color_eyre::eyre; -pub fn bitwise_hash_function(higher_bits: address, index: usize, bank_set_num: usize) -> u64 { - index as u64 ^ (higher_bits & (bank_set_num as u64 - 1)) -} - /// Set Indexing function from "Pseudo-randomly interleaved memory." /// Rau, B. R et al. /// ISCA 1991 @@ -30,6 +26,7 @@ pub fn bitwise_hash_function(higher_bits: address, index: usize, bank_set_num: u /// IPOLY hashing guarantees conflict-free for all 2^n strides which widely /// exit in GPGPU applications and also show good performance for other /// strides. 
+#[must_use] pub fn ipoly_hash_function(_higher_bits: address, _index: usize, _bank_set_num: usize) -> u64 { todo!("ipoly_hash_function"); } diff --git a/src/ported/dram.rs b/src/ported/dram.rs index bfab2c7c..b8d5d235 100644 --- a/src/ported/dram.rs +++ b/src/ported/dram.rs @@ -55,7 +55,7 @@ impl DRAM { /// DRAM access /// /// Here, we do nothing except logging statistics - /// see: memory_stats_t::memlatstat_dram_access() + /// see: `memory_stats_t::memlatstat_dram_access`() pub fn access(&mut self, fetch: &mem_fetch::MemFetch) { let dram_id = fetch.tlx_addr.chip as usize; let bank = fetch.tlx_addr.bk as usize; @@ -92,7 +92,7 @@ impl DRAM { // todo!("dram: return_queue_top"); // } // - pub fn full(&self, _is_write: bool) -> bool { + #[must_use] pub fn full(&self, _is_write: bool) -> bool { false // let write_queue_size = self.config.dram_frfcfs_write_queue_size; // let sched_queue_size = self.config.dram_frfcfs_sched_queue_size; diff --git a/src/ported/fifo.rs b/src/ported/fifo.rs index a9f28743..a09920e6 100644 --- a/src/ported/fifo.rs +++ b/src/ported/fifo.rs @@ -50,7 +50,7 @@ where .map(|max| max.to_string()) .as_deref() .unwrap_or(""), - self.inner.iter().map(|i| i.to_string()).collect::>() // .join(", ") + self.inner.iter().map(std::string::ToString::to_string).collect::>() // .join(", ") ) // f.debug_list() // .entries(self.inner.iter().map(|i| i)) // i.to_string())) @@ -59,7 +59,7 @@ where } impl FifoQueue { - pub fn iter(&self) -> std::collections::vec_deque::Iter { + #[must_use] pub fn iter(&self) -> std::collections::vec_deque::Iter { self.inner.iter() } } diff --git a/src/ported/instruction.rs b/src/ported/instruction.rs index 66f1eed8..23d90d3a 100644 --- a/src/ported/instruction.rs +++ b/src/ported/instruction.rs @@ -1,3 +1,4 @@ +use super::kernel::Kernel; use super::mem_fetch::{AccessKind, BitString, MemAccess}; use super::opcodes::{ArchOp, Op, Opcode}; use super::{address, mem_fetch, operand_collector as opcoll, scheduler as sched}; @@ -53,12 +54,6 @@ struct TransactionInfo { active_mask: sched::ThreadActiveMask, } -impl TransactionInfo { - pub fn test_bytes(&self, start_bit: usize, end_bit: usize) -> bool { - self.byte_mask[start_bit..end_bit].any() - } -} - pub const MAX_ACCESSES_PER_INSN_PER_THREAD: usize = 8; #[derive(Debug, Default, Clone, PartialEq, Eq, Hash)] @@ -96,7 +91,7 @@ fn line_size_based_tag_func(addr: address, line_size: u64) -> u64 { addr & !(line_size - 1) } -pub const GLOBAL_HEAP_START: u64 = 0xC0000000; +pub const GLOBAL_HEAP_START: u64 = 0xC000_0000; // Volta max shmem size is 96kB pub const SHARED_MEM_SIZE_MAX: u64 = 96 * (1 << 10); // Volta max local mem is 16kB @@ -113,8 +108,6 @@ pub const TOTAL_LOCAL_MEM: u64 = pub const SHARED_GENERIC_START: u64 = GLOBAL_HEAP_START - TOTAL_SHARED_MEM; pub const LOCAL_GENERIC_START: u64 = SHARED_GENERIC_START - TOTAL_LOCAL_MEM; -// const MAX_REG_OPERANDS: usize = 32; - #[derive(Clone, PartialEq, Eq, Hash)] pub struct WarpInstruction { /// Globally unique id for this warp instruction. @@ -125,7 +118,6 @@ pub struct WarpInstruction { /// The ID of the scheduler unit that issued this instruction. pub scheduler_id: Option, pub pc: usize, - // todo: keep? 
pub trace_idx: usize, pub opcode: Opcode, pub active_mask: sched::ThreadActiveMask, @@ -157,7 +149,6 @@ impl std::fmt::Debug for WarpInstruction { f.debug_struct("WarpInstruction") .field("opcode", &self.opcode) .field("warp_id", &self.warp_id) - // .field("empty", &self.empty) .field("pc", &self.pc) .field("active_mask", &self.active_mask.to_bit_string()) .field("memory_space", &self.memory_space) @@ -172,55 +163,26 @@ impl std::fmt::Display for WarpInstruction { } } -// impl Default for WarpInstruction { -// fn default() -> Self { -// let mut threads = [(); 32].map(|_| PerThreadInfo::default()); -// Self { -// uid: 0, -// warp_id: 0, -// scheduler_id: 0, -// opcode: Opcode { -// op: Op::NOP, -// category: ArchOp::NO_OP, -// }, -// pc: 0, -// threads, -// memory_space: MemorySpace::None, -// is_atomic: false, -// active_mask: BitArray::ZERO, -// cache_operator: CacheOperator::UNDEFINED, -// latency: 0, // todo -// initiation_interval: 0, // todo -// data_size: 0, -// empty: true, -// mem_access_queue: VecDeque::new(), -// outputs: [0; 8], -// in_count: 0, -// inputs: [0; 24], -// out_count: 0, -// } -// } -// } - -pub static MAX_WARP_SIZE: usize = 32; +pub const MAX_WARP_SIZE: usize = 32; fn is_number(s: &str) -> bool { !s.is_empty() && s.chars().all(char::is_numeric) } -fn get_data_width_from_opcode(opcode: &str) -> Result { - let opcode_tokens: Vec<_> = opcode - .split(".") - .map(|t| t.trim()) +fn opcode_tokens(opcode: &str) -> impl Iterator { + opcode + .split('.') + .map(str::trim) .filter(|t| !t.is_empty()) - .collect(); +} - for token in opcode_tokens { +fn get_data_width_from_opcode(opcode: &str) -> Result { + for token in opcode_tokens(opcode) { assert!(!token.is_empty()); if is_number(token) { return Ok(token.parse::()? / 8); - } else if let Some('U') = token.chars().nth(0) { + } else if let Some('U') = token.chars().next() { if is_number(&token[1..token.len()]) { // handle the U* case return Ok(token[1..token.len()].parse::()? 
/ 8); @@ -233,10 +195,7 @@ fn get_data_width_from_opcode(opcode: &str) -> Result Self { - // let mut threads = [(); config.warp_size].map(|_| PerThreadInfo::default()); - let threads = (0..config.warp_size) - .map(|_| PerThreadInfo::default()) - .collect(); + let threads = vec![PerThreadInfo::default(); config.warp_size]; Self { uid: 0, warp_id: 0, @@ -252,30 +211,21 @@ impl WarpInstruction { is_atomic: false, active_mask: BitArray::ZERO, cache_operator: CacheOperator::UNDEFINED, - latency: 1, // TODO: used to be one - initiation_interval: 1, // TODO: used to be one - issue_cycle: None, // TODO: used to be one + latency: 1, + initiation_interval: 1, + issue_cycle: None, dispatch_delay_cycles: 0, data_size: 0, instr_width: 16, - // empty: true, mem_access_queue: VecDeque::new(), outputs: [None; 8], - // in_count: 0, inputs: [None; 24], - // out_count: 0, - // src_arch_reg: [(); opcoll::MAX_REG_OPERANDS].map(|_| None), src_arch_reg: [None; opcoll::MAX_REG_OPERANDS], dest_arch_reg: [None; opcoll::MAX_REG_OPERANDS], - // dest_arch_reg: [(); opcoll::MAX_REG_OPERANDS].map(|_| None), - // for (unsigned i = 0; i < MAX_REG_OPERANDS; i++) { - // arch_reg.src[i] = -1; - // arch_reg.dst[i] = -1; - // } } } - pub fn from_trace(kernel: &super::KernelInfo, trace: trace::MemAccessTraceEntry) -> Self { + pub fn from_trace(kernel: &Kernel, trace: trace::MemAccessTraceEntry) -> Self { // fill active mask let mut active_mask = BitArray::ZERO; active_mask.store(trace.active_mask); @@ -289,12 +239,12 @@ impl WarpInstruction { let mut dest_arch_reg = [None; opcoll::MAX_REG_OPERANDS]; // get the opcode - let opcode_tokens: Vec<_> = trace.instr_opcode.split(".").collect(); + let opcode_tokens: Vec<_> = trace.instr_opcode.split('.').collect(); debug_assert!(!opcode_tokens.is_empty()); let opcode1 = opcode_tokens[0]; let Some(&opcode) = kernel.opcodes.get(opcode1) else { - panic!("undefined opcode {}", opcode1); + panic!("undefined opcode {opcode1}"); }; // fill regs information @@ -335,23 +285,23 @@ impl WarpInstruction { // handle special cases and fill memory space - let mut memory_op: Option = None; + // let mut memory_op: Option = None; let mut is_atomic = false; - let mut const_cache_operand = false; + // let mut const_cache_operand = false; let mut cache_operator = CacheOperator::UNDEFINED; // TODO: convert to none? 
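// Illustrative mapping (example opcode string, derived from the match arms
// below): a trace opcode such as "LDG.E.64" parses to `Op::LDG`, gets
// `data_size = 64 / 8 = 8` from `get_data_width_from_opcode`, and ends up
// with `MemorySpace::Global` and `CacheOperator::ALL`.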
let mut memory_space = None; match opcode.op { Op::LDC => { - memory_op = Some(MemOp::Load); + // memory_op = Some(MemOp::Load); data_size = 4; - const_cache_operand = true; + // const_cache_operand = true; memory_space = Some(MemorySpace::Constant); cache_operator = CacheOperator::ALL; } Op::LDG | Op::LDL => { assert!(data_size > 0); - memory_op = Some(MemOp::Load); + // memory_op = Some(MemOp::Load); cache_operator = CacheOperator::ALL; memory_space = if opcode.op == Op::LDL { Some(MemorySpace::Local) @@ -365,7 +315,7 @@ impl WarpInstruction { } Op::STG | Op::STL => { assert!(data_size > 0); - memory_op = Some(MemOp::Store); + // memory_op = Some(MemOp::Store); cache_operator = CacheOperator::ALL; memory_space = if opcode.op == Op::STL { Some(MemorySpace::Local) @@ -375,7 +325,7 @@ impl WarpInstruction { } Op::ATOM | Op::RED | Op::ATOMG => { assert!(data_size > 0); - memory_op = Some(MemOp::Load); + // memory_op = Some(MemOp::Load); // op = Op::LOAD; memory_space = Some(MemorySpace::Global); is_atomic = true; @@ -384,18 +334,18 @@ impl WarpInstruction { } Op::LDS => { assert!(data_size > 0); - memory_op = Some(MemOp::Load); + // memory_op = Some(MemOp::Load); memory_space = Some(MemorySpace::Shared); } Op::STS => { assert!(data_size > 0); - memory_op = Some(MemOp::Store); + // memory_op = Some(MemOp::Store); memory_space = Some(MemorySpace::Shared); } Op::ATOMS => { assert!(data_size > 0); is_atomic = true; - memory_op = Some(MemOp::Load); + // memory_op = Some(MemOp::Load); memory_space = Some(MemorySpace::Shared); } Op::LDSM => { @@ -405,11 +355,11 @@ impl WarpInstruction { Op::ST | Op::LD => { assert!(data_size > 0); is_atomic = true; - memory_op = Some(if opcode.op == Op::LD { - MemOp::Load - } else { - MemOp::Store - }); + // memory_op = Some(if opcode.op == Op::LD { + // MemOp::Load + // } else { + // MemOp::Store + // }); // resolve generic loads let trace::KernelLaunch { shared_mem_base_addr, @@ -443,7 +393,7 @@ impl WarpInstruction { Self { uid: 0, - warp_id: trace.warp_id_in_block as usize, // todo: block or sm? + warp_id: trace.warp_id_in_block as usize, scheduler_id: None, opcode, pc: trace.instr_offset as usize, @@ -474,7 +424,7 @@ impl WarpInstruction { } } - pub fn has_dispatch_delay(&self) -> bool { + #[must_use] pub fn has_dispatch_delay(&self) -> bool { self.dispatch_delay_cycles > 0 } @@ -525,21 +475,21 @@ impl WarpInstruction { // m_uid = ++(m_config->gpgpu_ctx->warp_inst_sm_next_uid); - pub fn active_thread_count(&self) -> usize { + #[must_use] pub fn active_thread_count(&self) -> usize { self.active_mask.count_ones() } - pub fn is_load(&self) -> bool { + #[must_use] pub fn is_load(&self) -> bool { let op = self.opcode.category; matches!(op, ArchOp::LOAD_OP | ArchOp::TENSOR_CORE_LOAD_OP) } - pub fn is_store(&self) -> bool { + #[must_use] pub fn is_store(&self) -> bool { let op = self.opcode.category; matches!(op, ArchOp::STORE_OP | ArchOp::TENSOR_CORE_STORE_OP) } - pub fn is_atomic(&self) -> bool { + #[must_use] pub fn is_atomic(&self) -> bool { let op = self.opcode.op; matches!( op, @@ -547,11 +497,11 @@ impl WarpInstruction { ) } - pub fn addr(&self) -> Option
<address> { + #[must_use] pub fn addr(&self) -> Option<address>
{ self.mem_access_queue.front().map(|access| access.addr) } - pub fn access_kind(&self) -> Option { + #[must_use] pub fn access_kind(&self) -> Option { let is_write = self.is_store(); match self.memory_space { Some(MemorySpace::Constant) => Some(AccessKind::CONST_ACC_R), @@ -586,7 +536,7 @@ impl WarpInstruction { let is_write = self.is_store(); // Calculate memory accesses generated by this warp - let mut cache_block_size_bytes = 0; + // let mut cache_block_size_bytes = 0; // Number of portions a warp is divided into for // shared memory bank conflict check @@ -597,9 +547,6 @@ impl WarpInstruction { Some(MemorySpace::Shared) => { let subwarp_size = config.warp_size / warp_parts; let mut total_accesses = 0; - // dbg!(&warp_parts); - // dbg!(&config.warp_size); - // dbg!(&subwarp_size); let mut banks = Vec::new(); let mut words = Vec::new(); @@ -613,17 +560,14 @@ impl WarpInstruction { if !self.active_mask[thread] { continue; } - // dbg!(&thread); - // dbg!(&self.threads[thread].mem_req_addr); let Some(addr) = self.threads[thread].mem_req_addr.first() else { continue; }; // FIXME: deferred allocation of shared memory should not accumulate // across kernel launches - // assert( addr < m_config->gpgpu_shmem_size ); let bank = config.shared_mem_bank(*addr); // line_size_based_tag_func - let word = line_size_based_tag_func(*addr, config::WORD_SIZE as u64); + let word = line_size_based_tag_func(*addr, config::WORD_SIZE); let accesses = bank_accesses.entry(bank).or_default(); *accesses.entry(word).or_default() += 1; @@ -634,7 +578,6 @@ impl WarpInstruction { // dbg!(&bank_accesses); if config.shared_memory_limited_broadcast { - panic!("shmem limited broadcast is used"); // step 2: look for and select a broadcast bank/word if one occurs let mut broadcast_detected = false; let mut broadcast_word_addr = None; @@ -659,7 +602,7 @@ impl WarpInstruction { let mut max_bank_accesses = 0; for (bank, accesses) in &bank_accesses { let mut bank_accesses = 0; - for (_addr, num_accesses) in accesses { + for num_accesses in accesses.values() { bank_accesses += num_accesses; if broadcast_detected && broadcast_bank.is_some_and(|b| b == bank) { for (addr, num_accesses) in accesses { @@ -676,19 +619,15 @@ impl WarpInstruction { } } // step 4: accumulate - total_accesses += max_bank_accesses; + // total_accesses += max_bank_accesses; + unimplemented!("shmem limited broadcast is used"); } else { - // step 2: look for the bank with the maximum number of different - // words accessed + // step 2: look for the bank with the most unique words accessed let max_bank_accesses = bank_accesses .values() - .map(|accesses| accesses.len()) + .map(std::collections::HashMap::len) .max() .unwrap_or(0); - // let mut max_bank_accesses = 0; - // for (bank, accesses) in &bank_accesses.values() { - // max_bank_accesses = max_bank_accesses.max(accesses.len()); - // } // step 3: accumulate total_accesses += max_bank_accesses; } @@ -700,24 +639,23 @@ impl WarpInstruction { debug_assert!(total_accesses > 0); debug_assert!(total_accesses <= config.warp_size); - // panic!("shared mem request"); // shared memory conflicts modeled as larger initiation interval self.dispatch_delay_cycles = total_accesses; - // TODO: shared mem does not generate mem accesses? + // shared mem does not generate mem accesses? 
None } Some(MemorySpace::Texture) => { - if let Some(l1_tex) = &config.tex_cache_l1 { - cache_block_size_bytes = l1_tex.line_size; - } + // if let Some(l1_tex) = &config.tex_cache_l1 { + // cache_block_size_bytes = l1_tex.line_size; + // } None } Some(MemorySpace::Constant) => { - if let Some(l1_const) = &config.const_cache_l1 { - cache_block_size_bytes = l1_const.line_size; - } + // if let Some(l1_const) = &config.const_cache_l1 { + // cache_block_size_bytes = l1_const.line_size; + // } None } Some(MemorySpace::Global | MemorySpace::Local) => { @@ -728,7 +666,7 @@ impl WarpInstruction { unimplemented!("atomics not supported for now"); } else { // here, we return the memory accesses - let accesses = self.memory_coalescing_arch(is_write, access_kind, &config); + let accesses = self.memory_coalescing_arch(is_write, access_kind, config); Some(accesses) } } else { @@ -738,36 +676,10 @@ impl WarpInstruction { ); } } - None => panic!("generate mem accesses but dont have mem space"), - // other => todo!("generate mem accesses[{other:?}]: not yet implemented"), + None => panic!("generate mem accesses for instruction without mem space"), } } - /// this just sets values - // pub fn issue( - // &mut self, - // mask: sched::ThreadActiveMask, - // warp_id: usize, - // cycle: u64, - // dynamic_warp_id: usize, - // scheduler_id: usize, - // ) { - // // assert_eq!(self.active_mask, mask); - // // assert_eq!(self.warp_id, warp_id); - // // assert_eq!(self.scheduler_id, scheduler_id); - // - // self.active_mask = mask; - // self.active_mask = mask; - // // self.id = ++(m_config->gpgpu_ctx->warp_inst_sm_next_uid); - // self.warp_id = warp_id; - // // self.dynamic_warp_id = dynamic_warp_id; - // // self.issue_cycle = cycle; - // // self.cycles = self.initiation_interval; - // // self.cache_hit = false; - // // self.empty = false; - // self.scheduler_id = Some(scheduler_id; - // } - fn memory_coalescing_arch( &self, is_write: bool, @@ -776,20 +688,14 @@ impl WarpInstruction { ) -> Vec { // see the CUDA manual where it discusses coalescing rules // before reading this - // let segment_size = 0; let warp_parts = config.shared_memory_warp_parts; - // let sector_segment_size = false; let coalescing_arch = config.coalescing_arch as usize; - let sector_segment_size = if coalescing_arch >= 20 && coalescing_arch < 39 { + let sector_segment_size = if (20..39).contains(&coalescing_arch) { // Fermi and Kepler, L1 is normal and L2 is sector config.global_mem_skip_l1_data_cache || self.cache_operator == CacheOperator::GLOBAL - } else if coalescing_arch >= 40 { - // Maxwell, Pascal and Volta, L1 and L2 are sectors - // all requests should be 32 bytes - true } else { - false + coalescing_arch >= 40 }; let segment_size = match self.data_size { @@ -852,10 +758,6 @@ impl WarpInstruction { // chunk does this thread access? 
let tx = subwarp_transactions.entry(block_addr).or_default(); // can only write to one segment - // it seems like in trace driven, - // a thread can write to more than one segment - // - // assert(block_address == line_size_based_tag_func(addr+data_size_coales-1,segment_size)); tx.chunk_mask.set(chunk as usize, true); tx.active_mask.set(thread_id, true); @@ -870,7 +772,7 @@ impl WarpInstruction { // it seems like in trace driven, a thread can write to more than one // segment handle this special case - let coalesc_end_addr = addr + data_size_coales as u64 - 1; + let coalesc_end_addr = addr + u64::from(data_size_coales) - 1; if block_addr != line_size_based_tag_func(coalesc_end_addr, segment_size) { let block_addr = line_size_based_tag_func(coalesc_end_addr, segment_size); let chunk = (coalesc_end_addr & 127) / 32; @@ -926,9 +828,6 @@ impl WarpInstruction { mut addr: address, segment_size: u64, ) -> MemAccess { - // dbg!(&tx); - // dbg!(&tx.chunk_mask.to_string()); - debug_assert_eq!(addr & (segment_size - 1), 0); debug_assert!(tx.chunk_mask.count_ones() >= 1); // halves (used to check if 64 byte segment can be @@ -943,23 +842,11 @@ impl WarpInstruction { // only lower 64 bytes used req_size_bytes = 64; halves |= &tx.chunk_mask[0..2]; - // if tx.chunk_mask[0] { - // halves.set(0, true); - // } - // if tx.chunk_mask[1] { - // halves.set(1, true); - // } } else if !lower_half_used && upper_half_used { // only upper 64 bytes used - addr = addr + 64; + addr += 64; req_size_bytes = 64; halves |= &tx.chunk_mask[2..4]; - // if tx.chunk_mask[2] { - // halves.set(0, true); - // } - // if tx.chunk_mask[3] { - // halves.set(1, true); - // } } else { assert!(lower_half_used && upper_half_used); } @@ -967,13 +854,9 @@ impl WarpInstruction { // need to set halves if addr % 128 == 0 { halves |= &tx.chunk_mask[0..2]; - // if (q[0]) h.set(0); - // if (q[1]) h.set(1); } else { debug_assert_eq!(addr % 128, 64); halves |= &tx.chunk_mask[2..4]; - // if (q[2]) h.set(0); - // if (q[3]) h.set(1); } } @@ -983,14 +866,14 @@ impl WarpInstruction { if lower_half_used && !upper_half_used { req_size_bytes = 32; } else if !lower_half_used && upper_half_used { - addr = addr + 32; + addr += 32; req_size_bytes = 32; } else { assert!(lower_half_used && upper_half_used); } } - let access = MemAccess::new( + MemAccess::new( access_kind, addr, None, // we cannot know the allocation start address in this context @@ -999,8 +882,7 @@ impl WarpInstruction { tx.active_mask, tx.byte_mask, tx.chunk_mask, - ); - access + ) } pub fn set_addr(&mut self, thread_id: usize, addr: address) { @@ -1008,44 +890,10 @@ impl WarpInstruction { thread.mem_req_addr[0] = addr; } - // fn set_addresses(&mut self, thread_id: usize, addrs: &[address], count: usize) { pub fn set_addresses(&mut self, thread_id: usize, addresses: Vec
) { let thread = &mut self.threads[thread_id]; for (i, addr) in addresses.into_iter().enumerate() { thread.mem_req_addr[i] = addr; } - - // let max_count = thread.mem_req_addr.len(); - // debug_assert!(count <= max_count); - // let count = count.min(max_count).min(addrs.len()); - // for i in 0..count { - // thread.mem_req_addr[i] = addrs[i]; - // } - } - - // pub fn is_active(&self, thread: usize) -> bool { - // self.active_mask[thread] - // } -} - -pub fn opcode_tokens(opcode: &str) -> Vec<&str> { - opcode - .split(".") - .map(|t| t.trim()) - .filter(|t| !t.is_empty()) - .collect() -} - -pub fn datawidth_for_opcode(opcode: &str) -> u32 { - let tokens = opcode_tokens(opcode); - for t in tokens { - if let Ok(num) = t.parse::() { - return num / 8; - } else if t.chars().nth(0) == Some('U') { - if let Ok(num) = t[1..].parse::() { - return num / 8; - } - } } - 4 // default is 4 bytes } diff --git a/src/ported/interconn.rs b/src/ported/interconn.rs index 30b493a4..6b566e27 100644 --- a/src/ported/interconn.rs +++ b/src/ported/interconn.rs @@ -29,14 +29,14 @@ pub trait Interconnect

<P> { #[derive(Debug)] pub struct ToyInterconnect<P>

{ - pub capacity: Option<usize>, + // pub capacity: Option<usize>, pub num_cores: usize, pub num_mems: usize, pub num_subnets: usize, pub num_nodes: usize, pub num_classes: usize, round_robin_turn: Vec<Vec<Mutex<usize>>>, - input_queue: Vec<Vec<Vec<Mutex<VecDeque<P>>>>>, + // input_queue: Vec<Vec<Vec<Mutex<VecDeque<P>>>>>, output_queue: Vec<Vec<Vec<Mutex<VecDeque<P>>>>>, // deviceID to icntID map // deviceID : Starts from 0 for shaders and then continues until mem nodes @@ -45,41 +45,42 @@ pub struct ToyInterconnect<P>

{ } impl<P>

ToyInterconnect<P>

{ - pub fn new(num_cores: usize, num_mems: usize, capacity: Option<usize>) -> ToyInterconnect<P>

{ + #[must_use] + pub fn new(num_cores: usize, num_mems: usize) -> ToyInterconnect<P>

{ let num_subnets = 2; let num_nodes = num_cores + num_mems; let num_classes = 1; - let mut input_queue: Vec>>>> = Vec::new(); + // let mut input_queue: Vec>>>> = Vec::new(); let mut output_queue: Vec>>>> = Vec::new(); let mut round_robin_turn: Vec>> = Vec::new(); for subnet in 0..num_subnets { - input_queue.push(Vec::new()); + // input_queue.push(Vec::new()); output_queue.push(Vec::new()); round_robin_turn.push(Vec::new()); for node in 0..num_nodes { - input_queue[subnet].push(Vec::new()); + // input_queue[subnet].push(Vec::new()); output_queue[subnet].push(Vec::new()); round_robin_turn[subnet].push(Mutex::new(0)); for _class in 0..num_classes { - input_queue[subnet][node].push(Mutex::new(VecDeque::new())); + // input_queue[subnet][node].push(Mutex::new(VecDeque::new())); output_queue[subnet][node].push(Mutex::new(VecDeque::new())); } } } Self { - capacity, + // capacity, num_cores, num_mems, num_subnets, num_nodes, num_classes, - input_queue, - output_queue, round_robin_turn, + // input_queue, + output_queue, } } } @@ -92,8 +93,8 @@ where // todo: this is not efficient, could keep track of this with a variable self.output_queue .iter() - .flat_map(|x| x) - .flat_map(|x| x) + .flatten() + .flatten() .any(|reqs: &Mutex>| !reqs.lock().unwrap().is_empty()) } @@ -101,10 +102,10 @@ where assert!(self.has_buffer(src_device, size)); let is_memory_node = self.num_subnets > 1 && dest_device >= self.num_cores; - let subnet = if is_memory_node { 1 } else { 0 }; + let subnet = usize::from(is_memory_node); log::debug!( "{}: {size} bytes from device {src_device} to {dest_device} (subnet {subnet})", - style(format!("INTERCONN PUSH {}", packet)).bold(), + style(format!("INTERCONN PUSH {packet}")).bold(), ); let mut queue = self.output_queue[subnet][dest_device][0].lock().unwrap(); @@ -113,7 +114,7 @@ where fn pop(&self, device: usize) -> Option

{ let icnt_id = device; - let subnet = if device >= self.num_cores { 1 } else { 0 }; + let subnet = usize::from(device >= self.num_cores); let mut lock = self.round_robin_turn[subnet][icnt_id].lock().unwrap(); let mut turn = *lock; @@ -137,15 +138,16 @@ where // do nothing } - fn has_buffer(&self, device: usize, _size: u32) -> bool { - let Some(capacity) = self.capacity else { - return true; - }; - - // TODO: using input queue makes no sense as we push into output directly - let subnet = if device >= self.num_cores { 1 } else { 0 }; - let queue = self.input_queue[subnet][device][0].lock().unwrap(); - queue.len() <= capacity + fn has_buffer(&self, _device: usize, _size: u32) -> bool { + true + // let Some(capacity) = self.capacity else { + // return true; + // }; + // + // // TODO: using input queue makes no sense as we push into output directly + // let subnet = usize::from(device >= self.num_cores); + // let queue = self.input_queue[subnet][device][0].lock().unwrap(); + // queue.len() <= capacity } } @@ -180,7 +182,7 @@ impl MemFetchInterface for CoreMemoryInterface { let request_size = if write { size } else { - mem_fetch::READ_PACKET_SIZE as u32 + u32::from(mem_fetch::READ_PACKET_SIZE) }; !self.interconn.has_buffer(self.cluster_id, request_size) } @@ -197,9 +199,7 @@ impl MemFetchInterface for CoreMemoryInterface { } let dest_sub_partition_id = fetch.sub_partition_id(); - let mem_dest = self - .config - .mem_id_to_device_id(dest_sub_partition_id as usize); + let mem_dest = self.config.mem_id_to_device_id(dest_sub_partition_id); log::debug!( "cluster {} icnt_inject_request_packet({}) dest sub partition id={} dest mem node={}", @@ -269,7 +269,7 @@ mod tests { let config = IntersimConfig::from_file(&config_file)?; - assert_eq!(config.get_bool("use_map"), false); + assert!(!config.get_bool("use_map")); assert_eq!(config.get_int("num_vcs"), 1); // this means vc can only ever be zero assert_eq!(config.get_int("ejection_buffer_size"), 0); assert_eq!(config.get_string("sim_type"), "gpgpusim"); diff --git a/src/ported/kernel.rs b/src/ported/kernel.rs new file mode 100644 index 00000000..ecf0797d --- /dev/null +++ b/src/ported/kernel.rs @@ -0,0 +1,212 @@ +use super::{instruction, opcodes, scheduler as sched}; +use color_eyre::{ + eyre::{self}, + Help, +}; +use std::collections::HashSet; +use std::path::Path; +use std::sync::{Mutex, RwLock}; +use std::time::Instant; +use trace_model::{KernelLaunch, MemAccessTraceEntry, Point}; + +pub fn read_trace(path: impl AsRef) -> eyre::Result> { + use serde::Deserializer; + + let reader = utils::fs::open_readable(path.as_ref())?; + let mut reader = rmp_serde::Deserializer::new(reader); + let mut trace = vec![]; + let decoder = nvbit_io::Decoder::new(|access: MemAccessTraceEntry| { + trace.push(access); + }); + reader.deserialize_seq(decoder).suggestion("maybe the traces does not match the most recent binary trace format, try re-generating the traces.")?; + Ok(trace) +} + +/// Kernel represents a kernel. +/// +/// This includes its launch configuration, +/// as well as its state of execution. 
+#[derive(Debug)] +pub struct Kernel { + pub opcodes: &'static opcodes::OpcodeMap, + pub config: KernelLaunch, + trace: Vec, + trace_pos: RwLock, + launched: Mutex, + num_cores_running: usize, +} + +impl PartialEq for Kernel { + fn eq(&self, other: &Self) -> bool { + self.id() == other.id() + } +} + +impl std::fmt::Display for Kernel { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Kernel") + .field("name", &self.name()) + .field("id", &self.id()) + .finish() + } +} + +impl Kernel { + pub fn from_trace(traces_dir: impl AsRef, config: KernelLaunch) -> Self { + let start = Instant::now(); + log::info!( + "parsing kernel for launch {:?} from {}", + &config, + &config.trace_file + ); + let trace_path = traces_dir + .as_ref() + .join(&config.trace_file) + .with_extension("msgpack"); + + let trace = read_trace(trace_path).unwrap(); + + // sanity check + assert!(trace_model::is_valid_trace(&trace)); + + // check if grid size is equal to the number of unique blocks in the trace + let all_blocks: HashSet<_> = trace.iter().map(|t| &t.block_id).collect(); + log::info!( + "parsed kernel trace for {:?}: {}/{} blocks in {:?}", + config.name, + all_blocks.len(), + config.grid.size(), + start.elapsed() + ); + assert_eq!(config.grid.size(), all_blocks.len() as u64); + + let opcodes = opcodes::get_opcode_map(&config).unwrap(); + + Self { + config, + trace, + trace_pos: RwLock::new(0), + opcodes, + launched: Mutex::new(false), + num_cores_running: 0, + } + } + + pub fn shared_memory_bytes_human_readable(&self) -> String { + human_bytes::human_bytes(f64::from(self.config.shared_mem_bytes)) + } + + pub fn set_launched(&self) { + *self.launched.lock().unwrap() = true; + } + + pub fn launched(&self) -> bool { + *self.launched.lock().unwrap() + } + + pub fn id(&self) -> u64 { + self.config.id + } + + pub fn next_threadblock_traces(&self, warps: &mut [sched::WarpRef]) { + let mut trace_pos = self.trace_pos.write().unwrap(); + + let mut instructions = 0; + let trace_size = self.trace.len(); + + if *trace_pos + 1 >= trace_size || trace_size == 0 { + // no more threadblocks + log::info!("blocks done: no more threadblock traces"); + return; + } + let next_block = &self.trace[*trace_pos + 1].block_id; + + while *trace_pos < trace_size { + let entry = &self.trace[*trace_pos]; + if entry.block_id != *next_block { + // get instructions until new block + break; + } + + let warp_id = entry.warp_id_in_block as usize; + let instr = instruction::WarpInstruction::from_trace(self, entry.clone()); + let warp = warps.get_mut(warp_id).unwrap(); + let mut warp = warp.try_borrow_mut().unwrap(); + warp.push_trace_instruction(instr); + + instructions += 1; + *trace_pos += 1; + } + + log::debug!( + "added {instructions} instructions ({} per warp) for block {next_block}", + instructions / warps.len() + ); + debug_assert!(instructions > 0); + // debug_assert!(instructions % 32 == 0); + // dbg!(warps + // .iter() + // .map(|w| w.try_borrow().unwrap().trace_instructions.len()) + // .collect::>()); + // debug_assert!( + // warps + // .iter() + // .map(|w| w.try_borrow().unwrap().trace_instructions.len()) + // .collect::>() + // .len() + // == 1, + // "all warps have the same number of instructions" + // ); + // dbg!(warps + // .iter() + // .map(|w| w.try_borrow().unwrap().trace_instructions.len()) + // .collect::>()); + + debug_assert!( + warps + .iter() + .all(|w| !w.try_borrow().unwrap().trace_instructions.is_empty()), + "all warps have at least one instruction (need at least an EXIT)" + ); + } + + 
pub fn inc_running(&mut self) { + self.num_cores_running += 1; + } + + pub fn name(&self) -> &str { + &self.config.name + } + + pub fn was_launched(&self) -> bool { + *self.launched.lock().unwrap() + } + + pub fn running(&self) -> bool { + self.num_cores_running > 0 + } + + pub fn current_block(&self) -> Option { + let traces_pos = self.trace_pos.read().unwrap(); + let trace = self.trace.get(*traces_pos)?; + Some(Point::new(trace.block_id.clone(), self.config.grid.clone())) + } + + pub fn done(&self) -> bool { + self.no_more_blocks_to_run() && !self.running() + } + + pub fn num_blocks(&self) -> usize { + let grid = &self.config.grid; + grid.x as usize * grid.y as usize * grid.z as usize + } + + pub fn threads_per_block(&self) -> usize { + let block = &self.config.block; + block.x as usize * block.y as usize * block.z as usize + } + + pub fn no_more_blocks_to_run(&self) -> bool { + self.current_block().is_none() + } +} diff --git a/src/ported/l1/base.rs b/src/ported/l1/base.rs index 02433e63..e8251491 100644 --- a/src/ported/l1/base.rs +++ b/src/ported/l1/base.rs @@ -38,18 +38,18 @@ impl BandwidthManager { } /// Use the data port based on the outcome and - /// events generated by the mem_fetch request + /// events generated by the `mem_fetch` request pub fn use_data_port( &mut self, data_size: u32, access_status: cache::RequestStatus, - events: &mut Vec, + events: &mut [cache::Event], ) { let port_width = self.config.data_port_width() as u32; match access_status { cache::RequestStatus::HIT => { let mut data_cycles = data_size / port_width; - data_cycles += if data_size % port_width > 0 { 1 } else { 0 }; + data_cycles += u32::from(data_size % port_width > 0); self.data_port_occupied_cycles += data_cycles as usize; } cache::RequestStatus::HIT_RESERVED | cache::RequestStatus::MISS => { @@ -139,7 +139,7 @@ impl PendingRequest {} /// Base cache /// -/// Implements common functions for read_only_cache and data_cache +/// Implements common functions for `read_only_cache` and `data_cache` /// Each subclass implements its own 'access' function #[derive()] pub struct Base @@ -198,12 +198,8 @@ impl Base { config: Arc, cache_config: Arc, ) -> Self { - // for now we initialize the tag array and mshr - - // m_tag_array(new tag_array(config, core_id, type_id)), - let tag_array = tag_array::TagArray::new(core_id, 0, cache_config.clone()); + let tag_array = tag_array::TagArray::new(cache_config.clone()); - // m_mshrs(config.m_mshr_entries, config.m_mshr_max_merge), debug_assert!(matches!( cache_config.mshr_kind, mshr::Kind::ASSOC | mshr::Kind::SECTOR_ASSOC @@ -226,15 +222,12 @@ impl Base { pending: HashMap::new(), miss_queue: VecDeque::new(), miss_queue_status: mem_fetch::Status::INITIALIZED, - // write_alloc_type: mem_fetch::AccessKind::L1_WR_ALLOC_R, - // write_back_type: mem_fetch::AccessKind::L1_WRBK_ACC, } } /// Checks whether this request can be handled in this cycle. /// - /// `n` equals the number of misses to be handled on - /// this cycle. + /// `n` equals the number of misses to be handled in this cycle. pub fn miss_queue_can_fit(&self, n: usize) -> bool { self.miss_queue.len() + n < self.cache_config.miss_queue_size } @@ -248,7 +241,7 @@ impl Base { /// Checks if fetch is waiting to be filled by lower memory level pub fn waiting_for_fill(&self, fetch: &mem_fetch::MemFetch) -> bool { - self.pending.contains_key(&fetch) + self.pending.contains_key(fetch) } /// Are any (accepted) accesses that had to wait for memory now ready? 
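The `use_data_port` and `miss_queue_can_fit` changes above reduce to two small invariants: a HIT occupies the data port for a ceiling-divided number of cycles, and a cycle's misses are only accepted if they all fit under the miss-queue capacity. A tiny sketch of both, using assumed free-standing signatures rather than the crate's real types:

```rust
/// Cycles a cache HIT occupies the data port: integer division of the
/// transferred bytes by the port width, plus one extra cycle for any
/// remainder (the branch-free `u32::from(..)` form used in the diff).
fn data_port_cycles(data_size: u32, port_width: u32) -> u32 {
    data_size / port_width + u32::from(data_size % port_width > 0)
}

/// A cycle's `n` new misses are only accepted if they all fit below the
/// configured miss-queue capacity (strict `<`, mirroring the original).
fn miss_queue_can_fit(queued: usize, n: usize, capacity: usize) -> bool {
    queued + n < capacity
}

fn main() {
    // a 32-byte sector over a 16-byte port takes 2 cycles, 40 bytes take 3
    assert_eq!(data_port_cycles(32, 16), 2);
    assert_eq!(data_port_cycles(40, 16), 3);

    // with capacity 8 and 6 requests queued, one more miss fits, two do not
    assert!(miss_queue_can_fit(6, 1, 8));
    assert!(!miss_queue_can_fit(6, 2, 8));
}
```

Replacing `if rem > 0 { 1 } else { 0 }` with `u32::from(rem > 0)` keeps the same ceiling-division arithmetic, just without the branch.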
@@ -355,7 +348,7 @@ impl Base { ); // change address to mshr block address - fetch.data_size = self.cache_config.atom_size() as u32; + fetch.data_size = self.cache_config.atom_size(); fetch.access.addr = mshr_addr; self.mshrs.add(mshr_addr, fetch.clone()); @@ -467,8 +460,8 @@ where other => unimplemented!("cache allocate policy {:?} is not implemented", other), } - let access_sector_mask = fetch.access_sector_mask().clone(); - let access_byte_mask = fetch.access_byte_mask().clone(); + let access_sector_mask = *fetch.access_sector_mask(); + let access_byte_mask = *fetch.access_byte_mask(); let has_atomic = self .mshrs @@ -518,9 +511,8 @@ mod tests { let cache_stats = Arc::new(Mutex::new(stats::Cache::default())); let cache_config = config.data_cache_l1.clone().unwrap(); - let stats = Arc::new(Mutex::new(stats::Stats::from_config(&*config))); - let interconn: Arc> = - Arc::new(ic::ToyInterconnect::new(0, 0, None)); + let stats = Arc::new(Mutex::new(stats::Stats::from_config(&config))); + let interconn: Arc> = Arc::new(ic::ToyInterconnect::new(0, 0)); let port = Arc::new(ic::CoreMemoryInterface { interconn, cluster_id: 0, diff --git a/src/ported/l1/data.rs b/src/ported/l1/data.rs index d7373043..e476f064 100644 --- a/src/ported/l1/data.rs +++ b/src/ported/l1/data.rs @@ -50,7 +50,7 @@ where } } - pub fn cache_config(&self) -> &Arc { + #[must_use] pub fn cache_config(&self) -> &Arc { &self.inner.cache_config } @@ -61,7 +61,7 @@ where cache_index: Option, fetch: mem_fetch::MemFetch, time: u64, - _events: &mut Vec, + _events: &mut [cache::Event], _probe_status: cache::RequestStatus, ) -> cache::RequestStatus { debug_assert_eq!(addr, fetch.addr()); @@ -119,7 +119,7 @@ where // cache_index: usize, fetch: mem_fetch::MemFetch, time: u64, - _events: &mut Vec, + _events: &mut [cache::Event], _probe_status: cache::RequestStatus, ) -> cache::RequestStatus { let super::base::Base { @@ -143,7 +143,7 @@ where tag_array.num_dirty += 1; } } - return cache::RequestStatus::HIT; + cache::RequestStatus::HIT } /// Sends write request to lower level memory (write or writeback) @@ -168,11 +168,9 @@ where &mut self, addr: address, cache_index: Option, - // cache_index: usize, fetch: mem_fetch::MemFetch, time: u64, events: &mut Vec, - // events: &[cache::Event], _probe_status: cache::RequestStatus, ) -> cache::RequestStatus { if !self.inner.miss_queue_can_fit(1) { @@ -213,25 +211,22 @@ where // (already modified lower level) if writeback && writeback_policy != config::CacheWritePolicy::WRITE_THROUGH { if let Some(evicted) = evicted { - let debug_fetch = fetch.to_string(); - let is_write = true; let writeback_access = mem_fetch::MemAccess::new( self.write_back_type, evicted.block_addr, evicted.allocation.clone(), - evicted.modified_size as u32, + evicted.modified_size, is_write, *fetch.access_warp_mask(), evicted.byte_mask, evicted.sector_mask, ); - // dbg!(&writeback_access); let mut writeback_fetch = mem_fetch::MemFetch::new( - fetch.instr, + fetch.instr.clone(), writeback_access, - &*self.inner.config, + &self.inner.config, if is_write { ported::WRITE_PACKET_SIZE } else { @@ -242,7 +237,6 @@ where 0, 0, ); - // dbg!(&writeback_fetch); // the evicted block may have wrong chip id when // advanced L2 hashing is used, so set the right chip @@ -256,7 +250,7 @@ where log::trace!( "handling READ MISS for {}: => sending writeback {}", - debug_fetch, + fetch, writeback_fetch ); @@ -266,7 +260,7 @@ where return cache::RequestStatus::MISS; } - return cache::RequestStatus::RESERVATION_FAIL; + 
cache::RequestStatus::RESERVATION_FAIL } fn write_miss_no_write_allocate( @@ -297,7 +291,6 @@ where } // on miss, generate write through - // (no write buffering -- too many threads for that) let event = cache::Event { kind: cache::EventKind::WRITE_REQUEST_SENT, evicted_block: None, @@ -331,7 +324,7 @@ where log::debug!("handling write miss for {} (block addr={}, mshr addr={}, mshr hit={} mshr avail={}, miss queue full={})", &fetch, block_addr, mshr_addr, mshr_hit, mshr_free, self.inner.miss_queue_can_fit(2)); - if !self.inner.miss_queue_can_fit(2) || (!(mshr_hit && mshr_free) && !mshr_miss_but_free) { + if !self.inner.miss_queue_can_fit(2) || !(mshr_miss_but_free || mshr_hit && mshr_free) { // check what is the exact failure reason let failure = if !self.inner.miss_queue_can_fit(2) { cache::ReservationFailure::MISS_QUEUE_FULL @@ -492,7 +485,7 @@ where cache_index: Option, fetch: mem_fetch::MemFetch, time: u64, - events: &mut Vec, + events: &mut [cache::Event], probe_status: cache::RequestStatus, ) -> cache::RequestStatus { let func = match self.inner.cache_config.write_policy { @@ -556,24 +549,19 @@ where 1, ); } + } else if probe_status == cache::RequestStatus::HIT { + access_status = self.read_hit(addr, cache_index, fetch, time, events, probe_status); + } else if probe_status != cache::RequestStatus::RESERVATION_FAIL { + access_status = self.read_miss(addr, cache_index, fetch, time, events, probe_status); } else { - if probe_status == cache::RequestStatus::HIT { - access_status = self.read_hit(addr, cache_index, fetch, time, events, probe_status); - } else if probe_status != cache::RequestStatus::RESERVATION_FAIL { - access_status = - self.read_miss(addr, cache_index, fetch, time, events, probe_status); - } else { - // the only reason for reservation fail here is LINE_ALLOC_FAIL - // (i.e all lines are reserved) - let mut stats = self.inner.stats.lock().unwrap(); - stats.inc( - *fetch.access_kind(), - cache::AccessStat::ReservationFailure( - cache::ReservationFailure::LINE_ALLOC_FAIL, - ), - 1, - ); - } + // the only reason for reservation fail here is LINE_ALLOC_FAIL + // (i.e all lines are reserved) + let mut stats = self.inner.stats.lock().unwrap(); + stats.inc( + *fetch.access_kind(), + cache::AccessStat::ReservationFailure(cache::ReservationFailure::LINE_ALLOC_FAIL), + 1, + ); } self.inner @@ -742,7 +730,7 @@ mod tests { use super::Data; use crate::config; use crate::ported::{ - self, cache::Cache, instruction, interconn as ic, mem_fetch, parse_commands, + self, cache::Cache, instruction, interconn as ic, kernel::Kernel, mem_fetch, parse_commands, }; use std::collections::VecDeque; use std::path::PathBuf; @@ -808,7 +796,6 @@ mod tests { #[test] fn test_data_l1_full_trace() { let _control_size = 0; - // let warp_id = 0; let core_id = 0; let cluster_id = 0; @@ -824,7 +811,7 @@ mod tests { cycle, interconn, stats.clone(), - config.clone(), + config, Arc::clone(&cache_config.inner), mem_fetch::AccessKind::L1_WR_ALLOC_R, mem_fetch::AccessKind::L1_WRBK_ACC, @@ -832,34 +819,22 @@ mod tests { let trace_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("test-apps/vectoradd/traces/vectoradd-100-32-trace/"); - // let command_traces_path = - // traces_dircommands.json"); dbg!(&trace_dir); let commands: Vec = parse_commands(&trace_dir.join("commands.json")).expect("parse trace commands"); dbg!(&commands); - // let mut kernels: VecDeque> = VecDeque::new(); let mut kernels: VecDeque<_> = VecDeque::new(); for cmd in commands { match cmd { Command::MemcpyHtoD { .. } => {} Command::MemAlloc { .. 
} => {} - // Command::MemcpyHtoD { - // allocation_name, - // dest_device_addr, - // num_bytes, - // } => { - // // sim.memcopy_to_gpu(*dest_device_addr, *num_bytes, allocation_name); - // } Command::KernelLaunch(launch) => { - let kernel = ported::KernelInfo::from_trace(&trace_dir, launch.clone()); - // kernels.push_back(Arc::new(kernel)); + let kernel = Kernel::from_trace(&trace_dir, launch.clone()); kernels.push_back(kernel); } } } - // dbg!(&kernels); // for kernel in &mut kernels { // let mut block_iter = kernel.next_block_iter.lock().unwrap(); @@ -1011,11 +986,11 @@ mod tests { num_registers: 8, binary_version: 61, stream_id: 0, - shared_mem_base_addr: 140663786045440, - local_mem_base_addr: 140663752491008, + shared_mem_base_addr: 140_663_786_045_440, + local_mem_base_addr: 140_663_752_491_008, nvbit_version: "1.5.5".to_string(), }; - let kernel = crate::ported::KernelInfo::from_trace(trace_dir, launch); + let kernel = Kernel::from_trace(trace_dir, launch); let trace_instr = trace_model::MemAccessTraceEntry { cuda_ctx: 0, @@ -1048,10 +1023,10 @@ mod tests { num_src_regs: 0, addrs: concat( [ - 140663086646144, - 140663086646148, - 140663086646152, - 140663086646156, + 140_663_086_646_144, + 140_663_086_646_148, + 140_663_086_646_152, + 140_663_086_646_156, ], [0; 32 - 4], ) @@ -1062,7 +1037,7 @@ mod tests { let mut instr = instruction::WarpInstruction::from_trace(&kernel, trace_instr); dbg!(&instr); let mut accesses = instr - .generate_mem_accesses(&*config) + .generate_mem_accesses(&config) .expect("generated acceseses"); assert_eq!(accesses.len(), 1); diff --git a/src/ported/ldst_unit.rs b/src/ported/ldst_unit.rs index 04c061b5..bb2739af 100644 --- a/src/ported/ldst_unit.rs +++ b/src/ported/ldst_unit.rs @@ -32,7 +32,7 @@ fn new_mem_fetch( mem_fetch::MemFetch::new( Some(instr), access, - &config, + config, control_size, warp_id, core_id, @@ -46,7 +46,7 @@ pub struct LoadStoreUnit { cluster_id: usize, next_writeback: Option, response_fifo: VecDeque, - warps: Vec, + warps: Vec, pub data_l1: Option>, config: Arc, pub stats: Arc>, @@ -91,6 +91,7 @@ enum WritebackClient { } #[derive(strum::EnumCount, strum::FromRepr, Hash, PartialEq, Eq, Clone, Copy, Debug)] +#[allow(dead_code)] #[repr(usize)] enum MemStageAccessKind { C_MEM, @@ -104,6 +105,7 @@ enum MemStageAccessKind { } #[derive(strum::EnumCount, strum::FromRepr, Hash, PartialEq, Eq, Clone, Copy, Debug)] +#[allow(dead_code)] #[repr(usize)] enum MemStageStallKind { NO_RC_FAIL = 0, @@ -125,7 +127,7 @@ where id: usize, core_id: usize, cluster_id: usize, - warps: Vec, + warps: Vec, fetch_interconn: Arc, operand_collector: Rc>, scoreboard: Arc>, @@ -157,7 +159,7 @@ where // initialize l1 data cache let cache_stats = Arc::new(Mutex::new(stats::Cache::default())); Some(Box::new(l1::Data::new( - format!("ldst-unit-{}-{}-L1-DATA-CACHE", cluster_id, core_id), + format!("ldst-unit-{cluster_id}-{core_id}-L1-DATA-CACHE"), core_id, cluster_id, Rc::clone(&cycle), @@ -193,7 +195,7 @@ where } } - pub fn response_buffer_full(&self) -> bool { + #[must_use] pub fn response_buffer_full(&self) -> bool { self.response_fifo.len() >= self.config.num_ldst_response_buffer_size } @@ -341,7 +343,7 @@ where "{}", style(format!( "ldst unit writeback: has global {:?} ({})", - &next_global.instr.as_ref().map(|i| i.to_string()), + &next_global.instr.as_ref().map(std::string::ToString::to_string), &next_global.addr() )) .magenta(), @@ -423,7 +425,7 @@ where _rc_fail: &mut MemStageStallKind, _kind: &mut MemStageAccessKind, ) -> bool { - false + true } fn texture_cycle( 
@@ -431,7 +433,7 @@ where _rc_fail: &mut MemStageStallKind, _kind: &mut MemStageAccessKind, ) -> bool { - false + true } fn memory_cycle( @@ -491,7 +493,7 @@ where } else { mem_fetch::READ_PACKET_SIZE }; - let size = access.req_size_bytes + control_size as u32; + let size = access.req_size_bytes + u32::from(control_size); if self.fetch_interconn.full( size, @@ -669,12 +671,13 @@ where } } + #[allow(dead_code)] fn process_cache_access( &mut self, _cache: (), _addr: address, instr: &mut WarpInstruction, - events: &mut Vec, + events: &mut [cache::Event], fetch: mem_fetch::MemFetch, status: cache::RequestStatus, ) -> MemStageStallKind { @@ -774,7 +777,7 @@ where ); if *still_pending > 0 { - pending.remove(&out_reg); + pending.remove(out_reg); log::trace!("l1 latency queue release registers"); self.scoreboard .write() @@ -834,13 +837,13 @@ where } } - fn pending_writes(&self, warp_id: usize, reg_id: u32) -> Option { + #[must_use] pub fn pending_writes(&self, warp_id: usize, reg_id: u32) -> Option { let pending = self.pending_writes.get(&warp_id)?; let pending = pending.get(®_id)?; Some(*pending) } - fn pending_writes_mut(&mut self, warp_id: usize, reg_id: u32) -> &mut usize { + pub fn pending_writes_mut(&mut self, warp_id: usize, reg_id: u32) -> &mut usize { let pending = self.pending_writes.entry(warp_id).or_default(); pending.entry(reg_id).or_default() } @@ -935,13 +938,13 @@ where self.pipelined_simd_unit .pipeline_reg .iter() - .map(|reg| reg.as_ref().map(|r| r.to_string())) + .map(|reg| reg.as_ref().map(std::string::ToString::to_string)) .collect::>(), self.pipelined_simd_unit.num_active_instr_in_pipeline(), self.pipelined_simd_unit.pipeline_reg.len(), self.response_fifo .iter() - .map(|t| t.to_string()) + .map(std::string::ToString::to_string) .collect::>(), ); @@ -968,9 +971,7 @@ where } } - drop(simd_unit); - - if let Some(ref fetch) = self.response_fifo.front() { + if let Some(fetch) = self.response_fifo.front() { match fetch.access_kind() { mem_fetch::AccessKind::TEXTURE_ACC_R => { todo!("ldst unit: tex access"); @@ -993,7 +994,7 @@ where if fetch.kind == mem_fetch::Kind::WRITE_ACK || (self.config.perfect_mem && fetch.is_write()) { - self.store_ack(&fetch); + self.store_ack(fetch); self.response_fifo.pop_front(); } else { // L1 cache is write evict: @@ -1051,11 +1052,10 @@ where let mut access_kind = MemStageAccessKind::C_MEM; let mut done = true; done &= self.shared_cycle(&mut stall_kind, &mut access_kind); - // done &= self.constant_cycle(&mut stall_kind, &mut access_kind); - // done &= self.texture_cycle(&mut stall_kind, &mut access_kind); + done &= self.constant_cycle(&mut stall_kind, &mut access_kind); + done &= self.texture_cycle(&mut stall_kind, &mut access_kind); done &= self.memory_cycle(&mut stall_kind, &mut access_kind); - // let mut num_stall_scheduler_mem = 0; if !done { // log stall types and return debug_assert_ne!(stall_kind, MemStageStallKind::NO_RC_FAIL); diff --git a/src/ported/mem_fetch.rs b/src/ported/mem_fetch.rs index 8fb15a9d..546abd89 100644 --- a/src/ported/mem_fetch.rs +++ b/src/ported/mem_fetch.rs @@ -21,15 +21,11 @@ pub type MemAccessSectorMask = BitArr!(for mem_sub_partition::SECTOR_CHUNCK_SIZE pub enum Kind { READ_REQUEST = 0, WRITE_REQUEST, - READ_REPLY, // send to shader + READ_REPLY, WRITE_ACK, - // Atomic, - // Const, - // Tex, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -#[allow(clippy::incorrect_ident_case)] pub enum Status { INITIALIZED, IN_L1I_MISS_QUEUE, @@ -179,7 +175,7 @@ impl std::fmt::Display for MemAccess { impl MemAccess { /// 
todo: where is this initialized - pub fn new( + #[must_use] pub fn new( kind: AccessKind, addr: address, allocation: Option, @@ -196,31 +192,22 @@ impl MemAccess { if let Some(ref alloc) = allocation { debug_assert!(alloc.start_addr <= addr); } - Self { - warp_mask, - byte_mask, - sector_mask, - req_size_bytes, - is_write, - kind, - addr, - allocation, - } + Self { addr, allocation, is_write, req_size_bytes, kind, warp_mask, byte_mask, sector_mask } } #[inline] - pub fn relative_addr(&self) -> Option { + #[must_use] pub fn relative_addr(&self) -> Option { self.allocation .as_ref() .map(|alloc| alloc.start_addr) .and_then(|start| self.addr.checked_sub(start)) } - pub fn control_size(&self) -> u32 { + #[must_use] pub fn control_size(&self) -> u32 { if self.is_write { - WRITE_PACKET_SIZE as u32 + u32::from(WRITE_PACKET_SIZE) } else { - READ_PACKET_SIZE as u32 + u32::from(READ_PACKET_SIZE) } } @@ -286,7 +273,7 @@ impl std::fmt::Display for MemFetch { if let Some(ref alloc) = self.access.allocation { write!(f, "@{}+{})", alloc.id, addr - alloc.start_addr) } else { - write!(f, "@{})", addr) + write!(f, "@{addr})") } } } @@ -403,25 +390,25 @@ impl MemFetch { .map_or(false, WarpInstruction::is_atomic) } - pub fn is_texture(&self) -> bool { + #[must_use] pub fn is_texture(&self) -> bool { self.instr .as_ref() .map_or(false, |i| i.memory_space == Some(MemorySpace::Texture)) } - pub fn is_write(&self) -> bool { + #[must_use] pub fn is_write(&self) -> bool { self.access.is_write } - pub fn addr(&self) -> address { + #[must_use] pub fn addr(&self) -> address { self.access.addr } - pub fn relative_addr(&self) -> Option

<address> { + #[must_use] pub fn relative_addr(&self) -> Option<address>
{ self.access.relative_addr() } - pub fn size(&self) -> u32 { + #[must_use] pub fn size(&self) -> u32 { self.data_size + self.control_size } @@ -429,23 +416,23 @@ impl MemFetch { // self.instr.cache_op // } - pub fn access_byte_mask(&self) -> &MemAccessByteMask { + #[must_use] pub fn access_byte_mask(&self) -> &MemAccessByteMask { &self.access.byte_mask } - pub fn access_warp_mask(&self) -> &ThreadActiveMask { + #[must_use] pub fn access_warp_mask(&self) -> &ThreadActiveMask { &self.access.warp_mask } - pub fn access_sector_mask(&self) -> &MemAccessSectorMask { + #[must_use] pub fn access_sector_mask(&self) -> &MemAccessSectorMask { &self.access.sector_mask } - pub fn sub_partition_id(&self) -> usize { + #[must_use] pub fn sub_partition_id(&self) -> usize { self.tlx_addr.sub_partition as usize } - pub fn access_kind(&self) -> &AccessKind { + #[must_use] pub fn access_kind(&self) -> &AccessKind { &self.access.kind } @@ -454,7 +441,7 @@ impl MemFetch { self.last_status_change = Some(time); } - pub fn is_reply(&self) -> bool { + #[must_use] pub fn is_reply(&self) -> bool { matches!(self.kind, Kind::READ_REPLY | Kind::WRITE_ACK) } diff --git a/src/ported/mem_partition_unit.rs b/src/ported/mem_partition_unit.rs index ac1a160b..5998bebd 100644 --- a/src/ported/mem_partition_unit.rs +++ b/src/ported/mem_partition_unit.rs @@ -1,9 +1,7 @@ use super::mem_fetch::BitString; use crate::config::GPUConfig; use crate::ported::{ - self, address, - cache::Cache, - dram, + self, address, dram, fifo::{FifoQueue, Queue}, mem_fetch, mem_sub_partition::MemorySubPartition, @@ -23,6 +21,7 @@ pub struct MemoryPartitionUnit { pub arbitration_metadata: super::arbitration::ArbitrationMetadata, config: Arc, + #[allow(dead_code)] stats: Arc>, } @@ -38,19 +37,19 @@ impl MemoryPartitionUnit { .map(|i| { let sub_id = id * num_sub_partitions + i; - let sub = Rc::new(RefCell::new(MemorySubPartition::new( + + Rc::new(RefCell::new(MemorySubPartition::new( sub_id, id, Rc::clone(&cycle), Arc::clone(&config), Arc::clone(&stats), - ))); - sub + ))) }) .collect(); let dram = dram::DRAM::new(config.clone(), stats.clone()); - let arbitration_metadata = super::arbitration::ArbitrationMetadata::new(&*config); + let arbitration_metadata = super::arbitration::ArbitrationMetadata::new(&config); Self { id, config, @@ -62,7 +61,7 @@ impl MemoryPartitionUnit { } } - pub fn busy(&self) -> bool { + #[must_use] pub fn busy(&self) -> bool { self.sub_partitions .iter() .any(|sub| sub.try_borrow().unwrap().busy()) @@ -93,7 +92,7 @@ impl MemoryPartitionUnit { } pub fn cache_cycle(&mut self, cycle: u64) { - for mem_sub in self.sub_partitions.iter_mut() { + for mem_sub in &mut self.sub_partitions { mem_sub.borrow_mut().cache_cycle(cycle); } } @@ -216,7 +215,7 @@ impl MemoryPartitionUnit { let dram_latency_queue: Vec<_> = self .dram_latency_queue .iter() - .map(|f| f.to_string()) + .map(std::string::ToString::to_string) .collect(); log::debug!( "\t dram latency queue ({:3}) = {:?}", diff --git a/src/ported/mem_sub_partition.rs b/src/ported/mem_sub_partition.rs index 152005bf..224c6e33 100644 --- a/src/ported/mem_sub_partition.rs +++ b/src/ported/mem_sub_partition.rs @@ -1,7 +1,6 @@ use crate::config::{self, GPUConfig}; use crate::ported::{ self, address, cache, - cache::Cache, fifo::{FifoQueue, Queue}, interconn as ic, l2, mem_fetch, }; @@ -19,25 +18,25 @@ pub const SECTOR_CHUNCK_SIZE: u32 = 4; /// Sector size is 32 bytes width pub const SECTOR_SIZE: u32 = 32; -pub fn was_write_sent(events: &[cache::Event]) -> bool { +#[must_use] pub fn 
was_write_sent(events: &[cache::Event]) -> bool { events .iter() .any(|event| event.kind == cache::EventKind::WRITE_REQUEST_SENT) } -pub fn was_writeback_sent(events: &[cache::Event]) -> Option<&cache::Event> { +#[must_use] pub fn was_writeback_sent(events: &[cache::Event]) -> Option<&cache::Event> { events .iter() .find(|event| event.kind == cache::EventKind::WRITE_BACK_REQUEST_SENT) } -pub fn was_read_sent(events: &[cache::Event]) -> bool { +#[must_use] pub fn was_read_sent(events: &[cache::Event]) -> bool { events .iter() .any(|event| event.kind == cache::EventKind::READ_REQUEST_SENT) } -pub fn was_writeallocate_sent(events: &[cache::Event]) -> bool { +#[must_use] pub fn was_writeallocate_sent(events: &[cache::Event]) -> bool { events .iter() .any(|event| event.kind == cache::EventKind::WRITE_ALLOCATE_SENT) @@ -271,7 +270,6 @@ where } pub fn push(&mut self, fetch: mem_fetch::MemFetch) { - // todo!("mem sub partition: push"); // m_stats->memlatstat_icnt2mem_pop(m_req); let mut requests = Vec::new(); let l2_config = self.config.data_cache_l2.as_ref().unwrap(); @@ -322,11 +320,7 @@ where } pub fn flush_l2(&mut self) -> Option { - if let Some(l2) = &mut self.l2_cache { - Some(l2.flush()) - } else { - None - } + self.l2_cache.as_mut().map(|l2| l2.flush()) } pub fn invalidate_l2(&mut self) { @@ -341,13 +335,8 @@ where let fetch = self.l2_to_interconn_queue.dequeue()?; // self.request_tracker.remove(fetch); if fetch.is_atomic() { - // fetch.do_atomic(); unimplemented!("atomic memory operation"); } - // panic!( - // "l2 to dram queue fetch: access kind = {:?}", - // fetch.access_kind(), - // ); match fetch.access_kind() { // writeback accesses not counted AccessKind::L2_WRBK_ACC | AccessKind::L1_WRBK_ACC => None, @@ -357,30 +346,19 @@ where pub fn top(&mut self) -> Option<&mem_fetch::MemFetch> { use super::AccessKind; - match self + if let Some(AccessKind::L2_WRBK_ACC | AccessKind::L1_WRBK_ACC) = self .l2_to_interconn_queue .first() - .map(|fetch| fetch.access_kind()) + .map(ported::mem_fetch::MemFetch::access_kind) { - Some(AccessKind::L2_WRBK_ACC | AccessKind::L1_WRBK_ACC) => { - self.l2_to_interconn_queue.dequeue(); - // self.request_tracker.remove(fetch); - return None; - } - _ => {} + self.l2_to_interconn_queue.dequeue(); + // self.request_tracker.remove(fetch); + return None; } self.l2_to_interconn_queue.first() } - // pub fn full(&self) -> bool { - // self.interconn_to_l2_queue.full() - // } - // - // pub fn has_available_size(&self, size: usize) -> bool { - // self.interconn_to_l2_queue.has_available_size(size) - // } - pub fn set_done(&mut self, fetch: &mem_fetch::MemFetch) { self.request_tracker.remove(fetch); } @@ -408,7 +386,7 @@ where log_line, self.rop_queue .iter() - .map(|f| f.to_string()) + .map(std::string::ToString::to_string) .collect::>(), self.interconn_to_l2_queue, self.l2_to_interconn_queue, @@ -434,10 +412,8 @@ where // todo: move config into l2 let l2_config = self.config.data_cache_l2.as_ref().unwrap(); - // if !l2_config.disabled {} if l2_cache.has_ready_accesses() && !queue_full { let mut fetch = l2_cache.next_access().unwrap(); - // panic!("fetch from l2 cache ready"); // Don't pass write allocate read request back to upper level cache if fetch.access_kind() != &AccessKind::L2_WR_ALLOC_R { @@ -445,20 +421,16 @@ where fetch.set_status(Status::IN_PARTITION_L2_TO_ICNT_QUEUE, 0); // m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); self.l2_to_interconn_queue.enqueue(fetch); - } else { - if l2_config.inner.write_allocate_policy - == 
CacheWriteAllocatePolicy::FETCH_ON_WRITE - { - let mut original_write_fetch = *fetch.original_fetch.unwrap(); - original_write_fetch.set_reply(); - original_write_fetch - .set_status(mem_fetch::Status::IN_PARTITION_L2_TO_ICNT_QUEUE, 0); - // m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); - self.l2_to_interconn_queue.enqueue(original_write_fetch); - todo!("fetch on write: l2 to icnt queue"); - } - // self.request_tracker.remove(fetch); - // delete mf; + } else if l2_config.inner.write_allocate_policy + == CacheWriteAllocatePolicy::FETCH_ON_WRITE + { + let mut original_write_fetch = *fetch.original_fetch.unwrap(); + original_write_fetch.set_reply(); + original_write_fetch + .set_status(mem_fetch::Status::IN_PARTITION_L2_TO_ICNT_QUEUE, 0); + // m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); + self.l2_to_interconn_queue.enqueue(original_write_fetch); + todo!("fetch on write: l2 to icnt queue"); } } } @@ -468,14 +440,12 @@ where // DRAM to L2 (texture) and icnt (not texture) if let Some(reply) = self.dram_to_l2_queue.first() { match self.l2_cache { - Some(ref mut l2_cache) if l2_cache.waiting_for_fill(&reply) => { + Some(ref mut l2_cache) if l2_cache.waiting_for_fill(reply) => { if l2_cache.has_free_fill_port() { let mut reply = self.dram_to_l2_queue.dequeue().unwrap(); log::debug!("filling L2 with {}", &reply); reply.set_status(mem_fetch::Status::IN_PARTITION_L2_FILL_QUEUE, 0); - // dbg!(cycle, self.memcpy_cycle_offset); l2_cache.fill(reply, time); - // l2_cache.fill(&mut reply) // reply will be gone forever at this point // m_dram_L2_queue->pop(); } else { @@ -509,16 +479,13 @@ where if !self.l2_to_dram_queue.lock().unwrap().full() { if let Some(fetch) = self.interconn_to_l2_queue.first() { if let Some(ref mut l2_cache) = self.l2_cache { - if (self.config.data_cache_l2_texture_only && fetch.is_texture()) - || !self.config.data_cache_l2_texture_only - { + if !self.config.data_cache_l2_texture_only || fetch.is_texture() { // L2 is enabled and access is for L2 let output_full = self.l2_to_interconn_queue.full(); let port_free = l2_cache.has_free_data_port(); if !output_full && port_free { let mut events = Vec::new(); - // dbg!(cycle, self.memcpy_cycle_offset); let status = l2_cache.access(fetch.addr(), fetch.clone(), &mut events, time); let write_sent = was_write_sent(&events); @@ -545,17 +512,13 @@ where ); self.l2_to_interconn_queue.enqueue(fetch); } - // m_icnt_L2_queue->pop(); } else { assert!(write_sent); - // m_icnt_L2_queue->pop(); } } else if status != cache::RequestStatus::RESERVATION_FAIL { // L2 cache accepted request let mut fetch = self.interconn_to_l2_queue.dequeue().unwrap(); let wa_policy = l2_cache.write_allocate_policy(); - // let is_fetch_on_write = l2_cache.write_allocate_policy() - // == config::CacheWriteAllocatePolicy::FETCH_ON_WRITE; let should_fetch = matches!( wa_policy, config::CacheWriteAllocatePolicy::FETCH_ON_WRITE diff --git a/src/ported/mod.rs b/src/ported/mod.rs index b098109e..547b1100 100644 --- a/src/ported/mod.rs +++ b/src/ported/mod.rs @@ -1,3 +1,10 @@ +#![allow( + clippy::too_many_arguments, + clippy::missing_panics_doc, + clippy::missing_errors_doc, + clippy::too_many_lines +)] + pub mod addrdec; pub mod arbitration; pub mod barrier; @@ -5,10 +12,12 @@ pub mod cache; pub mod cache_block; pub mod cluster; pub mod core; +pub mod deadlock; pub mod dram; pub mod fifo; pub mod instruction; pub mod interconn; +pub mod kernel; pub mod l1; pub mod l2; pub mod ldst_unit; @@ -21,7 +30,7 @@ pub mod operand_collector; pub mod register_set; pub mod scheduler; pub 
mod scoreboard; -pub mod set_index_function; +pub mod set_index; pub mod simd_function_unit; pub mod sp_unit; pub mod tag_array; @@ -29,15 +38,18 @@ pub mod tag_array; #[cfg(test)] pub mod testing; -use self::cluster::*; -use self::core::*; -use addrdec::*; -use color_eyre::Help; +use self::cluster::SIMTCoreCluster; +use self::core::{ + warp_inst_complete, Packet, PipelineStage, SIMTCore, WarpMask, MAX_THREAD_PER_SM, + PROGRAM_MEM_START, +}; +use addrdec::DecodedAddress; use fifo::Queue; use interconn as ic; -use ldst_unit::*; -use mem_fetch::*; -use sp_unit::*; +use kernel::Kernel; +use ldst_unit::LoadStoreUnit; +use mem_fetch::{AccessKind, BitString, READ_PACKET_SIZE, WRITE_PACKET_SIZE}; +use sp_unit::SPUnit; use stats::Stats; use crate::config; @@ -45,231 +57,16 @@ use bitvec::array::BitArray; use color_eyre::eyre::{self}; use console::style; use std::cell::RefCell; -use std::collections::HashSet; use std::collections::{HashMap, VecDeque}; use std::ops::Deref; use std::path::{Path, PathBuf}; use std::rc::Rc; -use std::sync::{atomic, Arc, Mutex, RwLock}; +use std::sync::{atomic, Arc, Mutex}; use std::time::Instant; -use trace_model::{Command, KernelLaunch, MemAccessTraceEntry, Point}; +use trace_model::Command; pub type address = u64; -/// KernelInfo represents a kernel. -/// -/// This includes its launch configuration, -/// as well as its state of execution. -#[derive()] -pub struct KernelInfo { - pub opcodes: &'static opcodes::OpcodeMap, - pub config: KernelLaunch, - trace: Vec, - trace_pos: RwLock, - launched: Mutex, - num_cores_running: usize, - - pub cache_config_set: bool, -} - -impl PartialEq for KernelInfo { - fn eq(&self, other: &Self) -> bool { - self.id() == other.id() - } -} - -impl std::fmt::Debug for KernelInfo { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("KernelInfo") - .field("name", &self.name()) - .field("id", &self.id()) - .field("instructions", &self.trace.len()) - .field("launched", &self.launched) - .field("grid", &self.config.grid) - .field("block", &self.config.block) - .field("stream", &self.config.stream_id) - .field( - "shared_mem", - &human_bytes::human_bytes(self.config.shared_mem_bytes as f64), - ) - .field("registers", &self.config.num_registers) - // .field("block", &self.current_block()) - // .field("thread", &self.next_block_iter.lock().unwrap().peek()) - .finish() - } -} - -impl std::fmt::Display for KernelInfo { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("KernelInfo") - .field("name", &self.name()) - .field("id", &self.id()) - .finish() - } -} - -pub fn read_trace(path: impl AsRef) -> eyre::Result> { - use serde::Deserializer; - - let reader = utils::fs::open_readable(path.as_ref())?; - let mut reader = rmp_serde::Deserializer::new(reader); - let mut trace = vec![]; - let decoder = nvbit_io::Decoder::new(|access: MemAccessTraceEntry| { - trace.push(access); - }); - reader.deserialize_seq(decoder).suggestion("maybe the traces does not match the most recent binary trace format, try re-generating the traces.")?; - Ok(trace) -} - -impl KernelInfo { - pub fn from_trace(traces_dir: impl AsRef, config: KernelLaunch) -> Self { - let start = Instant::now(); - log::info!( - "parsing kernel for launch {:?} from {}", - &config, - &config.trace_file - ); - let trace_path = traces_dir - .as_ref() - .join(&config.trace_file) - .with_extension("msgpack"); - - let trace = read_trace(&trace_path).unwrap(); - - // sanity check - assert!(trace_model::is_valid_trace(&trace)); - - // 
check if grid size is equal to the number of unique blocks in the trace - let all_blocks: HashSet<_> = trace.iter().map(|t| &t.block_id).collect(); - log::info!( - "parsed kernel trace for {:?}: {}/{} blocks in {:?}", - config.name, - all_blocks.len(), - config.grid.size(), - start.elapsed() - ); - assert_eq!(config.grid.size(), all_blocks.len() as u64); - - let opcodes = opcodes::get_opcode_map(&config).unwrap(); - - Self { - config, - trace, - trace_pos: RwLock::new(0), - opcodes, - launched: Mutex::new(false), - num_cores_running: 0, - cache_config_set: false, - } - } - - pub fn id(&self) -> u64 { - self.config.id - } - - pub fn next_threadblock_traces(&self, warps: &mut [scheduler::CoreWarp]) { - let mut trace_pos = self.trace_pos.write().unwrap(); - - let mut instructions = 0; - let trace_size = self.trace.len(); - - if *trace_pos + 1 >= trace_size || trace_size == 0 { - // no more threadblocks - log::info!("blocks done: no more threadblock traces"); - return; - } - let next_block = &self.trace[*trace_pos + 1].block_id; - - while *trace_pos < trace_size { - let entry = &self.trace[*trace_pos]; - if entry.block_id != *next_block { - // get instructions until new block - break; - } - - let warp_id = entry.warp_id_in_block as usize; - let instr = instruction::WarpInstruction::from_trace(&self, entry.clone()); - let warp = warps.get_mut(warp_id).unwrap(); - let mut warp = warp.try_borrow_mut().unwrap(); - warp.push_trace_instruction(instr); - - instructions += 1; - *trace_pos += 1; - } - - log::debug!( - "added {instructions} instructions ({} per warp) for block {next_block}", - instructions / warps.len() - ); - debug_assert!(instructions > 0); - // debug_assert!(instructions % 32 == 0); - // dbg!(warps - // .iter() - // .map(|w| w.try_borrow().unwrap().trace_instructions.len()) - // .collect::>()); - // debug_assert!( - // warps - // .iter() - // .map(|w| w.try_borrow().unwrap().trace_instructions.len()) - // .collect::>() - // .len() - // == 1, - // "all warps have the same number of instructions" - // ); - // dbg!(warps - // .iter() - // .map(|w| w.try_borrow().unwrap().trace_instructions.len()) - // .collect::>()); - - debug_assert!( - warps - .iter() - .all(|w| !w.try_borrow().unwrap().trace_instructions.is_empty()), - "all warps have at least one instruction (need at least an EXIT)" - ); - } - - pub fn inc_running(&mut self) { - self.num_cores_running += 1; - } - - pub fn name(&self) -> &str { - &self.config.name - } - - pub fn was_launched(&self) -> bool { - *self.launched.lock().unwrap() - } - - pub fn running(&self) -> bool { - self.num_cores_running > 0 - } - - pub fn current_block(&self) -> Option { - let traces_pos = self.trace_pos.read().unwrap(); - let trace = self.trace.get(*traces_pos)?; - Some(Point::new(trace.block_id.clone(), self.config.grid.clone())) - } - - pub fn done(&self) -> bool { - self.no_more_blocks_to_run() && !self.running() - } - - pub fn num_blocks(&self) -> usize { - let grid = &self.config.grid; - grid.x as usize * grid.y as usize * grid.z as usize - } - - pub fn threads_per_block(&self) -> usize { - let block = &self.config.block; - block.x as usize * block.y as usize * block.z as usize - } - - pub fn no_more_blocks_to_run(&self) -> bool { - self.current_block().is_none() - } -} - pub fn parse_commands(path: impl AsRef) -> eyre::Result> { let reader = utils::fs::open_readable(path.as_ref())?; let commands = serde_json::from_reader(reader)?; @@ -316,18 +113,19 @@ impl std::ops::Deref for Allocations { impl Allocations { pub fn insert(&mut self, range: 
std::ops::Range<address>
, name: Option) { // check for intersections - if self.0.overlaps(&range) { - panic!("overlapping memory allocation {:?}", &range); - } + assert!( + !self.0.overlaps(&range), + "overlapping memory allocation {:?}", + &range + ); let id = self.0.len() + 1; // zero is reserved for instructions let start_addr = range.start; let end_addr = Some(range.end); self.0.insert( range, Allocation { - name, - // avoid joining of allocations using the id and range id, + name, start_addr, end_addr, }, @@ -343,7 +141,7 @@ pub struct MockSimulator { mem_sub_partitions: Vec< Rc>>>, >, - running_kernels: Vec>>, + running_kernels: Vec>>, executed_kernels: Mutex>, clusters: Vec>, #[allow(dead_code)] @@ -359,19 +157,20 @@ pub struct MockSimulator { traces_dir: PathBuf, commands: Vec, command_idx: usize, - kernels: VecDeque>, + kernels: VecDeque>, kernel_window_size: usize, busy_streams: VecDeque, cycle_limit: Option, log_after_cycle: Option, // gpu_stall_icnt2sh: usize, - // partition_replies_in_parallel: usize, + partition_replies_in_parallel: usize, } #[derive(Debug, Default)] pub struct AtomicCycle(std::sync::atomic::AtomicU64); impl AtomicCycle { + #[must_use] pub fn new(cycle: u64) -> Self { Self(std::sync::atomic::AtomicU64::new(cycle)) } @@ -403,14 +202,10 @@ impl FromConfig for stats::Stats { } } -// impl MockSimulator { -// impl<'a> MockSimulator<'a> { impl MockSimulator where - // I: ic::MemFetchInterface + 'static, I: ic::Interconnect + 'static, { - // see new trace_gpgpu_sim pub fn new( interconn: Arc, config: Arc, @@ -419,7 +214,7 @@ where ) -> Self { let _start = Instant::now(); let traces_dir = traces_dir.as_ref(); - let stats = Arc::new(Mutex::new(Stats::from_config(&*config))); + let stats = Arc::new(Mutex::new(Stats::from_config(&config))); let num_mem_units = config.num_memory_controllers; let num_sub_partitions = config.num_sub_partition_per_memory_channel; @@ -481,15 +276,14 @@ where // todo: make this a hashset? 
let busy_streams: VecDeque = VecDeque::new(); - let mut kernels: VecDeque> = VecDeque::new(); + let mut kernels: VecDeque> = VecDeque::new(); kernels.reserve_exact(window_size); let cycle_limit: Option = std::env::var("CYCLES") .ok() .as_deref() .map(str::parse) - .map(Result::ok) - .flatten(); + .and_then(Result::ok); // this causes first launch to use simt cluster let last_cluster_issue = config.num_simt_clusters - 1; @@ -516,6 +310,7 @@ where busy_streams, cycle_limit, log_after_cycle: None, + partition_replies_in_parallel: 0, } } @@ -523,7 +318,7 @@ where /// /// Todo: used hack to allow selecting the kernel from the shader core, /// but we could maybe refactor - pub fn select_kernel(&self) -> Option<&Arc> { + pub fn select_kernel(&self) -> Option<&Arc> { let mut executed_kernels = self.executed_kernels.lock().unwrap(); if let Some(k) = &self.running_kernels[self.last_issued_kernel] { if !k.no_more_blocks_to_run() @@ -590,8 +385,8 @@ where }) } - pub fn launch(&mut self, kernel: Arc) -> eyre::Result<()> { - *kernel.launched.lock().unwrap() = true; + pub fn launch(&mut self, kernel: Arc) -> eyre::Result<()> { + kernel.set_launched(); let threads_per_block = kernel.threads_per_block(); let max_threads_per_block = self.config.max_threads_per_core; if threads_per_block > max_threads_per_block { @@ -630,7 +425,7 @@ where pub fn set_cycle(&self, cycle: u64) { let mut stats = self.stats.lock().unwrap(); stats.sim.cycles = cycle; - self.cycle.set(cycle) + self.cycle.set(cycle); } pub fn cycle(&mut self) { @@ -641,8 +436,6 @@ where cluster.interconn_cycle(); } - let mut partition_replies_in_parallel_per_cycle = 0; - log::debug!( "POP from {} memory sub partitions", self.mem_sub_partitions.len() @@ -678,7 +471,7 @@ where let dram_latency_queue: Vec<_> = partition .dram_latency_queue .iter() - .map(|f| f.to_string()) + .map(std::string::ToString::to_string) .collect(); log::debug!( "\t dram latency queue ({:3}) = {:?}", @@ -705,13 +498,12 @@ where // drop(fetch); self.interconn .push(device, cluster_id, packet, response_packet_size); - partition_replies_in_parallel_per_cycle += 1; + self.partition_replies_in_parallel += 1; } else { // self.gpu_stall_icnt2sh += 1; } } } - // self.partition_replies_in_parallel += partition_replies_in_parallel_per_cycle; // dram log::debug!("cycle for {} drams", self.mem_partition_units.len()); @@ -731,8 +523,8 @@ where "moving mem requests from interconn to {} mem partitions", self.mem_sub_partitions.len() ); - let mut parallel_mem_partition_reqs_per_cycle = 0; - let mut stall_dram_full = 0; + // let mut parallel_mem_partition_reqs_per_cycle = 0; + // let mut stall_dram_full = 0; for (i, mem_sub) in self.mem_sub_partitions.iter_mut().enumerate() { let mut mem_sub = mem_sub.try_borrow_mut().unwrap(); // move memory request from interconnect into memory partition @@ -755,11 +547,11 @@ where ); mem_sub.push(fetch); - parallel_mem_partition_reqs_per_cycle += 1; + // self.parallel_mem_partition_reqs += 1; } } else { log::debug!("SKIP sub partition {} ({}): DRAM full stall", i, device); - stall_dram_full += 1; + self.stats.lock().unwrap().stall_dram_full += 1; } // we borrow all of sub here, which is a problem for the cyclic reference in l2 // interface @@ -774,17 +566,17 @@ where // self.interconn_transfer(); - let mut active_sms = 0; + // let mut active_sms = 0; for cluster in &mut self.clusters { let cores_completed = cluster.not_completed() == 0; let kernels_completed = self .running_kernels .iter() - .filter_map(|k| k.as_ref()) + 
.filter_map(std::option::Option::as_ref) .all(|k| k.no_more_blocks_to_run()); if !cores_completed || !kernels_completed { cluster.cycle(); - active_sms += cluster.num_active_sms(); + // active_sms += cluster.num_active_sms(); } } @@ -795,7 +587,7 @@ where // once all of threads are completed. let mut all_threads_complete = true; if self.config.flush_l1_cache { - for cluster in self.clusters.iter_mut() { + for cluster in &mut self.clusters { if cluster.not_completed() == 0 { cluster.cache_invalidate(); } else { @@ -806,7 +598,7 @@ where if self.config.flush_l2_cache { if !self.config.flush_l1_cache { - for cluster in self.clusters.iter_mut() { + for cluster in &mut self.clusters { if cluster.not_completed() > 0 { all_threads_complete = false; break; @@ -833,7 +625,7 @@ where } } - pub fn gpu_mem_alloc(&mut self, addr: address, num_bytes: u64, name: Option) { + pub fn gpu_mem_alloc(&mut self, addr: address, num_bytes: u64, name: Option<&str>) { log::info!( "memalloc: {:<20} {:>15} ({:>5} f32) at address {addr:>20}", name.as_deref().unwrap_or(""), @@ -858,13 +650,13 @@ where self.allocations .try_borrow_mut() .unwrap() - .insert(alloc_range.clone(), name); + .insert(alloc_range, name); if self.config.fill_l2_on_memcopy { let num_sub_partitions = self.config.num_sub_partition_per_memory_channel; let mut transfered = 0; while transfered < num_bytes { - let write_addr = addr + transfered as u64; + let write_addr = addr + transfered; let tlx_addr = self.config.address_mapping().tlx(write_addr); let partition_id = tlx_addr.sub_partition / num_sub_partitions as u64; @@ -932,7 +724,7 @@ where /// Process commands /// /// Take as many commands as possible until we have collected as many kernels to fill - /// the window_size or processed every command. + /// the `window_size` or processed every command. 
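    ///
    /// Each `MemAlloc` is registered via `gpu_mem_alloc`, and each
    /// `KernelLaunch` is turned into a `Kernel` and pushed onto the kernel
    /// window, so at most `kernel_window_size` kernels are buffered at a time;
    /// `run_to_completion` calls this again on every outer iteration to keep
    /// the window topped up.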
pub fn process_commands(&mut self) { while self.kernels.len() < self.kernel_window_size && self.command_idx < self.commands.len() { @@ -947,9 +739,11 @@ where allocation_name, device_ptr, num_bytes, - }) => self.gpu_mem_alloc(*device_ptr, *num_bytes, allocation_name.clone()), + }) => { + self.gpu_mem_alloc(*device_ptr, *num_bytes, allocation_name.clone().as_deref()) + } Command::KernelLaunch(launch) => { - let kernel = KernelInfo::from_trace(&self.traces_dir, launch.clone()); + let kernel = Kernel::from_trace(&self.traces_dir, launch.clone()); self.kernels.push_back(Arc::new(kernel)); } } @@ -971,7 +765,7 @@ where /// Launch all kernels within window that are on a stream that isn't already running pub fn launch_kernels(&mut self) { log::trace!("launching kernels"); - let mut launch_queue: Vec> = Vec::new(); + let mut launch_queue: Vec> = Vec::new(); for kernel in &self.kernels { let stream_busy = self .busy_streams @@ -991,19 +785,17 @@ where .transpose() .unwrap(); if let Some(up_to_kernel) = up_to_kernel { - if kernel.config.id > up_to_kernel { - panic!("launching kernel {}", kernel); - } + assert!( + kernel.config.id <= up_to_kernel, + "launching kernel {kernel}" + ); } - self.launch(kernel); + self.launch(kernel).unwrap(); } } pub fn reached_limit(&self, cycle: u64) -> bool { - match self.cycle_limit { - Some(limit) if cycle >= limit => true, - _ => false, - } + matches!(self.cycle_limit, Some(limit) if cycle >= limit) } pub fn commands_left(&self) -> bool { @@ -1020,7 +812,7 @@ where deadlock_check: bool, ) -> eyre::Result<()> { let mut cycle: u64 = 0; - let mut last_state_change: Option<(DeadlockCheckState, u64)> = None; + let mut last_state_change: Option<(deadlock::State, u64)> = None; while (self.commands_left() || self.kernels_left()) && !self.reached_limit(cycle) { self.process_commands(); @@ -1048,10 +840,11 @@ where match self.log_after_cycle { Some(ref log_after_cycle) if cycle >= *log_after_cycle => { - println!("initializing logging after cycle {}", cycle); + use std::io::Write; + + println!("initializing logging after cycle {cycle}"); let mut log_builder = env_logger::Builder::new(); - use std::io::Write; log_builder.format(|buf, record| { writeln!( buf, @@ -1105,7 +898,7 @@ where match &mut last_state_change { Some((last_state, update_cycle)) if &state == last_state => { - panic!("deadlock after cycle {}", update_cycle); + panic!("deadlock after cycle {update_cycle}"); } Some((ref mut last_state, ref mut update_cycle)) => { // log::info!("deadlock check: updated state in cycle {}", cycle); @@ -1120,7 +913,7 @@ where } if let Some(kernel) = finished_kernel { - self.cleanup_finished_kernel(&*kernel); + self.cleanup_finished_kernel(&kernel); } log::trace!( @@ -1133,7 +926,7 @@ where Ok(()) } - // pub fn set_kernel_done(&mut self, kernel: &mut KernelInfo) { + // pub fn set_kernel_done(&mut self, kernel: &mut Kernel) { // self.finished_kernels // .borrow_mut() // .push_back(kernel.config.id); @@ -1146,16 +939,11 @@ where // self.running_kernels.remove(running_kernel_idx); // } - fn finished_kernel(&mut self) -> Option> { + fn finished_kernel(&mut self) -> Option> { // check running kernels let _active = self.active(); - let finished_kernel: Option<&mut Option>> = self - .running_kernels - .iter_mut() - // .filter_map(|k| k.as_ref()) - // .filter_map(|k| k) - // .filter(|k| { - .find(|k| { + let finished_kernel: Option<&mut Option>> = + self.running_kernels.iter_mut().find(|k| { if let Some(k) = k { // TODO: could also check here if !self.active() k.no_more_blocks_to_run() && 
!k.running() && k.was_launched() @@ -1169,32 +957,9 @@ where } else { None } - // for running_kernel in self.running_kernels.iter().filter_map(|k| k.as_ref()) { - // if running_kernel.no_more_blocks_to_run() && !running_kernel.running() { - // // SHADER_DPRINTF(LIVENESS, - // // "GPGPU-Sim uArch: GPU detected kernel %u \'%s\' " - // // "finished on shader %u.\n", - // // kernel->get_uid(), kernel->name().c_str(), m_sid); - // // - // // if current_kernel.map(|k| k.config.id) == Some(kernel.config.id) { - // // *current_kernel = None; - // // } - // - // // m_gpu->set_kernel_done(kernel); - // todo!("kernel {} done", &running_kernel); - // } - // } - - // self.finished_kernels.borrow_mut().pop_front() } - fn cleanup_finished_kernel(&mut self, kernel: &KernelInfo) { - // if !self.reached_limit() && self.active()) { - // return; - // } - // trace_kernel_info_t *k = NULL; - // let finished_kernel_idx = self.kernels.iter().position(|k| k.config.id == id).unwrap(); - // let finished_kernel = &self.kernels[finished_kernel_idx]; + fn cleanup_finished_kernel(&mut self, kernel: &Kernel) { log::debug!( "cleanup finished kernel with id={}: {}", kernel.id(), @@ -1209,152 +974,6 @@ where // m_gpgpu_sim->update_stats(); // m_gpgpu_context->print_simulation_time(); // } - - // if let Some(stream_idx) = self - // .busy_streams - // .iter() - // .position(|stream| *stream == finished_kernel.config.stream_id) - // { - // self.busy_streams.remove(stream_idx); - // } - - // self.kernels.remove(finished_kernel_idx); - - // tracer->kernel_finalizer(k->get_trace_info()); - // delete k->entry(); - // delete k; - // kernels_info.erase(kernels_info.begin() + j); - // if (!limit_reached() && active()) break; - - // for stream in self.busy_streams.iter() { - // if stream = finished_kernel.config.stream_id - // if (busy_streams.at(l) == k->get_cuda_stream_id()) { - // busy_streams.erase(busy_streams.begin() + l); - // break; - // } - // } - // tracer->kernel_finalizer(k->get_trace_info()); - // delete k->entry(); - // delete k; - // kernels_info.erase(kernels_info.begin() + j); - // if (!limit_reached() && active()) break; - - // for (unsigned j = 0; j < kernels_info.size(); j++) { - // k = kernels_info.at(j); - // if (k->get_uid() == finished_kernel_uid || limit_reached() || !active()) { - // for (unsigned l = 0; l < busy_streams.size(); l++) { - // if (busy_streams.at(l) == k->get_cuda_stream_id()) { - // busy_streams.erase(busy_streams.begin() + l); - // break; - // } - // } - // tracer->kernel_finalizer(k->get_trace_info()); - // delete k->entry(); - // delete k; - // kernels_info.erase(kernels_info.begin() + j); - // if (!limit_reached() && active()) break; - // } - // } - // // make sure kernel was found and removed - // assert(k); - // // if (!silent) m_gpgpu_sim->print_stats(); - // } - } - - fn gather_state(&self) -> DeadlockCheckState { - let total_cores = self.config.total_cores(); - let num_partitions = self.mem_partition_units.len(); - let num_sub_partitions = self.mem_sub_partitions.len(); - - let mut state = DeadlockCheckState::new(total_cores, num_partitions, num_sub_partitions); - - for (cluster_id, cluster) in self.clusters.iter().enumerate() { - for (core_id, core) in cluster.cores.lock().unwrap().iter().enumerate() { - let global_core_id = cluster_id * self.config.num_cores_per_simt_cluster + core_id; - assert_eq!(core.inner.core_id, global_core_id); - - // this is the one we will use (unless the assertion is ever false) - let core_id = core.inner.core_id; - - // core: functional units - for (fu_id, 
fu) in core.functional_units.iter().enumerate() { - let _fu = fu.lock().unwrap(); - let issue_port = core.issue_ports[fu_id]; - let issue_reg: register_set::RegisterSet = core.inner.pipeline_reg - [issue_port as usize] - .borrow() - .clone(); - assert_eq!(issue_port, issue_reg.stage); - - state.functional_unit_pipelines[core_id].push(issue_reg); - } - // core: operand collector - state.operand_collectors[core_id] = - Some(core.inner.operand_collector.borrow().clone()); - // core: schedulers - // state.schedulers[core_id].extend(core.schedulers.iter().map(Into::into)); - } - } - for (partition_id, partition) in self.mem_partition_units.iter().enumerate() { - state.dram_latency_queue[partition_id] - .extend(partition.dram_latency_queue.clone().into_iter()); - } - for (sub_id, sub) in self.mem_sub_partitions.iter().enumerate() { - for (dest_queue, src_queue) in [ - ( - &mut state.interconn_to_l2_queue[sub_id], - &sub.borrow().interconn_to_l2_queue, - ), - ( - &mut state.l2_to_interconn_queue[sub_id], - &sub.borrow().l2_to_interconn_queue, - ), - ( - &mut state.l2_to_dram_queue[sub_id], - &sub.borrow().l2_to_dram_queue.lock().unwrap(), - ), - ( - &mut state.dram_to_l2_queue[sub_id], - &sub.borrow().dram_to_l2_queue, - ), - ] { - dest_queue.extend(src_queue.clone().into_iter()); - } - } - state - } -} - -#[derive(Debug, PartialEq, Eq)] -struct DeadlockCheckState { - pub interconn_to_l2_queue: Vec>, - pub l2_to_interconn_queue: Vec>, - pub l2_to_dram_queue: Vec>, - pub dram_to_l2_queue: Vec>, - pub dram_latency_queue: Vec>, - pub functional_unit_pipelines: Vec>, - pub operand_collectors: Vec>, - // pub schedulers: Vec>, - // functional_unit_pipelines - // schedulers - // operand_collectors -} - -impl DeadlockCheckState { - pub fn new(total_cores: usize, num_mem_partitions: usize, num_sub_partitions: usize) -> Self { - Self { - // per sub partition - interconn_to_l2_queue: vec![vec![]; num_sub_partitions], - l2_to_interconn_queue: vec![vec![]; num_sub_partitions], - l2_to_dram_queue: vec![vec![]; num_sub_partitions], - dram_to_l2_queue: vec![vec![]; num_sub_partitions], - // per partition - dram_latency_queue: vec![vec![]; num_mem_partitions], - // per core - functional_unit_pipelines: vec![vec![]; total_cores], - operand_collectors: vec![None; total_cores], - // schedulers: vec![vec![]; total_cores], - } } } @@ -1379,8 +998,6 @@ pub fn accelmain( traces_dir: impl AsRef, log_after_cycle: Option, ) -> eyre::Result { - log::info!("box version {}", 0); - let traces_dir = traces_dir.as_ref(); let (traces_dir, commands_path) = if traces_dir.is_dir() { (traces_dir.to_path_buf(), traces_dir.join("commands.json")) @@ -1397,26 +1014,21 @@ pub fn accelmain( }; // debugging config - let mut config = config::GPUConfig::default(); - - config.num_simt_clusters = 20; // 20 - config.num_cores_per_simt_cluster = 4; // 1 - config.num_schedulers_per_core = 2; // 1 - - config.num_memory_controllers = 8; // 8 - config.num_sub_partition_per_memory_channel = 2; // 2 - config.fill_l2_on_memcopy = true; // true - - let config = Arc::new(config); + let config = Arc::new(config::GPUConfig { + num_simt_clusters: 20, // 20 + num_cores_per_simt_cluster: 4, // 1 + num_schedulers_per_core: 2, // 1 + num_memory_controllers: 8, // 8 + num_sub_partition_per_memory_channel: 2, // 2 + fill_l2_on_memcopy: true, // true + ..config::GPUConfig::default() + }); let interconn = Arc::new(ic::ToyInterconnect::new( config.num_simt_clusters, config.num_memory_controllers * config.num_sub_partition_per_memory_channel, - // 
config.num_simt_clusters * config.num_cores_per_simt_cluster, - // config.num_mem_units, - Some(9), // found by printf debugging gpgusim )); - let mut sim = MockSimulator::new(interconn, Arc::clone(&config), &traces_dir, &commands_path); + let mut sim = MockSimulator::new(interconn, Arc::clone(&config), &traces_dir, commands_path); sim.log_after_cycle = log_after_cycle; @@ -1424,7 +1036,8 @@ pub fn accelmain( .unwrap_or_default() .to_lowercase() == "yes"; - sim.run_to_completion(&traces_dir, deadlock_check); + + sim.run_to_completion(&traces_dir, deadlock_check)?; let stats = sim.stats(); @@ -1435,12 +1048,7 @@ pub fn accelmain( mod tests { use crate::{ config, - ported::{ - self, - fifo::{self, Queue}, - interconn as ic, testing, - testing::diff, - }, + ported::{self, fifo, interconn as ic, testing, testing::diff}, }; use color_eyre::eyre; use itertools::Itertools; @@ -1568,7 +1176,7 @@ mod tests { .map(Into::into), ); box_sim_state.dram_arbitration_per_partition[partition_id] = - testing::state::ArbitrationState { + testing::state::Arbitration { last_borrower: partition.arbitration_metadata.last_borrower, shared_credit: partition.arbitration_metadata.shared_credit, private_credit: partition.arbitration_metadata.private_credit.clone().into(), @@ -1618,7 +1226,7 @@ mod tests { play_sim_state.last_cluster_issue = play_sim.last_cluster_issue() as usize; for (core_id, core) in play_sim.cores().enumerate() { - for regs in core.functional_unit_issue_register_sets().into_iter() { + for regs in core.functional_unit_issue_register_sets() { play_sim_state.functional_unit_pipelines_per_core[core_id].push(regs.into()); } let valid_units: HashSet<_> = box_sim_state.functional_unit_pipelines_per_core[core_id] @@ -1685,7 +1293,7 @@ mod tests { partitions_added += 1; play_sim_state.dram_arbitration_per_partition[partition_id] = - testing::state::ArbitrationState { + testing::state::Arbitration { last_borrower: partition.last_borrower(), shared_credit: partition.shared_credit(), private_credit: partition.private_credit().into(), @@ -1997,6 +1605,8 @@ mod tests { // } fn run_lockstep(trace_dir: &Path, trace_provider: TraceProvider) -> eyre::Result<()> { + use accelsim::tracegen::reader::Command as AccelsimCommand; + let manifest_dir = PathBuf::from(std::env!("CARGO_MANIFEST_DIR")); let box_trace_dir = trace_dir.join("trace"); @@ -2027,7 +1637,6 @@ mod tests { let accelsim_commands = accelsim::tracegen::reader::read_commands(&accelsim_trace_dir, reader)?; - use accelsim::tracegen::reader::Command as AccelsimCommand; let commands: Vec<_> = accelsim_commands .into_iter() .map(|cmd| match cmd { @@ -2037,7 +1646,7 @@ mod tests { AccelsimCommand::KernelLaunch((mut kernel, metadata)) => { // transform kernel instruction trace let kernel_trace_path = accelsim_trace_dir.join(&kernel.trace_file); - let reader = utils::fs::open_readable(&kernel_trace_path)?; + let reader = utils::fs::open_readable(kernel_trace_path)?; let parsed_trace = accelsim::tracegen::reader::read_trace_instructions( reader, metadata.trace_version, @@ -2130,7 +1739,6 @@ mod tests { dbg!(&box_commands_path); dbg!(&accelsim_kernelslist_path); - // assert!(false); let gpgpusim_config = manifest_dir.join("accelsim/gtx1080/gpgpusim.config"); let trace_config = manifest_dir.join("accelsim/gtx1080/gpgpusim.trace.config"); let inter_config = manifest_dir.join("accelsim/gtx1080/config_fermi_islip.icnt"); @@ -2143,37 +1751,29 @@ mod tests { assert!(trace_config.is_file()); assert!(inter_config.is_file()); - // let start = std::time::Instant::now(); - // 
let box_stats = super::accelmain(&vec_add_trace_dir.join("trace"), None)?; - // debugging config - let mut box_config = config::GPUConfig::default(); - box_config.num_simt_clusters = 20; // 20 - box_config.num_cores_per_simt_cluster = 4; // 1 - box_config.num_schedulers_per_core = 2; // 2 - box_config.num_memory_controllers = 8; // 8 - box_config.num_sub_partition_per_memory_channel = 2; // 2 - box_config.fill_l2_on_memcopy = true; // true - - let box_config = Arc::new(box_config); + let box_config = Arc::new(config::GPUConfig { + num_simt_clusters: 20, // 20 + num_cores_per_simt_cluster: 4, // 1 + num_schedulers_per_core: 2, // 2 + num_memory_controllers: 8, // 8 + num_sub_partition_per_memory_channel: 2, // 2 + fill_l2_on_memcopy: true, // true + ..config::GPUConfig::default() + }); let box_interconn = Arc::new(ic::ToyInterconnect::new( box_config.num_simt_clusters, box_config.num_memory_controllers * box_config.num_sub_partition_per_memory_channel, - // config.num_simt_clusters * config.num_cores_per_simt_cluster, - // config.num_mem_units, - Some(9), // found by printf debugging gpgusim )); let mut box_sim = super::MockSimulator::new( box_interconn, - box_config.clone(), + box_config, &box_trace_dir, &box_commands_path, ); - // let box_dur = start.elapsed(); - // let start = std::time::Instant::now(); let args = vec![ "-trace", accelsim_kernelslist_path.as_os_str().to_str().unwrap(), @@ -2189,10 +1789,6 @@ mod tests { let play_config = playground::Config::default(); let mut play_sim = playground::Accelsim::new(&play_config, &args)?; - // accelsim.run_to_completion(); - // let ref_stats = accelsim.stats().clone(); - // let ref_stats = playground::run(&config, &args)?; - // let mut play_time_cycle = std::time::Duration::ZERO; let mut play_time_other = std::time::Duration::ZERO; let mut box_time_cycle = std::time::Duration::ZERO; @@ -2225,15 +1821,15 @@ mod tests { .unwrap_or(200); assert!(check_every >= 1); - let _num_schedulers = box_sim.config.num_schedulers_per_core; - let num_clusters = box_sim.config.num_simt_clusters; - let cores_per_cluster = box_sim.config.num_cores_per_simt_cluster; - assert_eq!( - box_sim.config.total_cores(), - num_clusters * cores_per_cluster - ); - let _num_partitions = box_sim.mem_partition_units.len(); - let _num_sub_partitions = box_sim.mem_sub_partitions.len(); + // let _num_schedulers = box_sim.config.num_schedulers_per_core; + // let num_clusters = box_sim.config.num_simt_clusters; + // let cores_per_cluster = box_sim.config.num_cores_per_simt_cluster; + // assert_eq!( + // box_sim.config.total_cores(), + // num_clusters * cores_per_cluster + // ); + // let _num_partitions = box_sim.mem_partition_units.len(); + // let _num_sub_partitions = box_sim.mem_sub_partitions.len(); // // let mut box_sim_state = testing::state::Simulation::new( // num_clusters, @@ -2420,7 +2016,7 @@ mod tests { // dbg!(sub_id, box_icnt_l2_queue); // } } - println!("checking for diff after cycle {}", cycle); + println!("checking for diff after cycle {cycle}"); if use_full_diff { full_diff::assert_eq!(&box_sim_state, &play_sim_state); @@ -2441,7 +2037,7 @@ mod tests { } if let Some(kernel) = box_sim.finished_kernel() { - box_sim.cleanup_finished_kernel(&*kernel); + box_sim.cleanup_finished_kernel(&kernel); } box_time_other += start.elapsed(); @@ -2743,7 +2339,7 @@ mod tests { playground_bin.display() ) }) - .with_suggestion(|| format!("make sure to build playground with `cargo build -p playground` for the {:?} target", target))?; + .with_suggestion(|| format!("make sure to build 
playground with `cargo build -p playground` for the {target:?} target"))?; let gpgpu_sim_config = sim_config.config().unwrap(); let trace_config = sim_config.trace_config().unwrap(); @@ -2799,8 +2395,6 @@ mod tests { kernelslist: &Path, sim_config: &accelsim::SimConfig, ) -> eyre::Result<()> { - use std::io::Write; - dbg!(&traces_dir); dbg!(&kernelslist); dbg!(&sim_config); @@ -2849,24 +2443,21 @@ mod tests { let filter_func = |((_name, _kernel, stat_name), _value): &((String, u16, String), f64)| -> bool { // we ignore rates and other stats that can vary per run - match stat_name.as_str() { + !matches!( + stat_name.as_str(), "gpgpu_silicon_slowdown" - | "gpgpu_simulation_rate" - | "gpgpu_simulation_time_sec" - | "gpu_ipc" - | "gpu_occupancy" - | "gpu_tot_ipc" - | "l1_inst_cache_total_miss_rate" - | "l2_bandwidth_gbps" => false, - _ => true, - } + | "gpgpu_simulation_rate" + | "gpgpu_simulation_time_sec" + | "gpu_ipc" + | "gpu_occupancy" + | "gpu_tot_ipc" + | "l1_inst_cache_total_miss_rate" + | "l2_bandwidth_gbps" + ) }; - let cmp_play_stats: accelsim::Stats = playground_stats - .clone() - .into_iter() - .filter(filter_func) - .collect(); + let cmp_play_stats: accelsim::Stats = + playground_stats.into_iter().filter(filter_func).collect(); let cmp_accel_stats: accelsim::Stats = accelsim_stats .clone() diff --git a/src/ported/mshr.rs b/src/ported/mshr.rs index 5a49b6af..d58d116a 100644 --- a/src/ported/mshr.rs +++ b/src/ported/mshr.rs @@ -24,7 +24,6 @@ pub struct MshrTable { num_entries: usize, max_merged: usize, data: Table, - pending_lines: LineTable, /// If the current response is ready /// /// it may take several cycles to process the merged requests @@ -33,25 +32,23 @@ pub struct MshrTable { } impl MshrTable { - pub fn new(num_entries: usize, max_merged: usize) -> Self { + #[must_use] pub fn new(num_entries: usize, max_merged: usize) -> Self { let data = HashMap::with_capacity(2 * num_entries); Self { num_entries, max_merged, data, - pending_lines: HashMap::new(), current_response: VecDeque::new(), - // current_response_ready: false, } } /// Checks if there is a pending request to the lower memory level already - pub fn probe(&self, block_addr: address) -> bool { + #[must_use] pub fn probe(&self, block_addr: address) -> bool { self.data.contains_key(&block_addr) } /// Checks if there is space for tracking a new memory access - pub fn full(&self, block_addr: address) -> bool { + #[must_use] pub fn full(&self, block_addr: address) -> bool { match self.data.get(&block_addr) { Some(entry) => entry.list.len() >= self.max_merged, None => self.data.len() >= self.num_entries, @@ -109,16 +106,16 @@ impl MshrTable { } /// Returns true if ready accesses exist - pub fn has_ready_accesses(&self) -> bool { + #[must_use] pub fn has_ready_accesses(&self) -> bool { !self.current_response.is_empty() } /// Returns next ready accesses - pub fn ready_accesses(&self) -> Option<&VecDeque> { + #[must_use] pub fn ready_accesses(&self) -> Option<&VecDeque> { let Some(block_addr) = self.current_response.front() else { return None; }; - let Some(entry) = self.data.get(&block_addr) else { + let Some(entry) = self.data.get(block_addr) else { return None; }; Some(&entry.list) @@ -129,7 +126,7 @@ impl MshrTable { let Some(block_addr) = self.current_response.front() else { return None; }; - let Some(entry) = self.data.get_mut(&block_addr) else { + let Some(entry) = self.data.get_mut(block_addr) else { return None; }; Some(&mut entry.list) @@ -143,7 +140,7 @@ impl MshrTable { return None; }; - let Some(entry) = 
self.data.get_mut(&block_addr) else { + let Some(entry) = self.data.get_mut(block_addr) else { return None; }; @@ -152,7 +149,7 @@ impl MshrTable { let should_remove = entry.list.is_empty(); if should_remove { - self.data.remove(&block_addr); + self.data.remove(block_addr); self.current_response.pop_front(); } fetch @@ -165,7 +162,6 @@ mod tests { use crate::config; use crate::ported::{mem_fetch, scheduler::ThreadActiveMask}; use mem_fetch::{AccessKind, MemAccess, MemFetch}; - #[test] fn test_mshr_table() { @@ -173,7 +169,7 @@ mod tests { let cache_config = config.inst_cache_l1.as_ref().unwrap(); let mut mshrs = MshrTable::new(cache_config.mshr_entries, cache_config.mshr_max_merge); - let fetch_addr = 4026531848; + let fetch_addr = 4_026_531_848; let access = MemAccess::new( AccessKind::INST_ACC_R, fetch_addr, @@ -186,11 +182,11 @@ mod tests { ); let fetch = MemFetch::new(None, access, &config, 0, 0, 0, 0); let mshr_addr = cache_config.mshr_addr(fetch_addr); - assert_eq!(mshrs.probe(mshr_addr), false); - assert_eq!(mshrs.probe(mshr_addr), false); + assert!(!mshrs.probe(mshr_addr)); + assert!(!mshrs.probe(mshr_addr)); mshrs.add(mshr_addr, fetch); - assert_eq!(mshrs.probe(mshr_addr), true); + assert!(mshrs.probe(mshr_addr)); // TODO: test against bridge here } diff --git a/src/ported/operand_collector.rs b/src/ported/operand_collector.rs index 6a9d712e..4b6bcca7 100644 --- a/src/ported/operand_collector.rs +++ b/src/ported/operand_collector.rs @@ -44,6 +44,7 @@ pub struct Operand { } impl Operand { + #[must_use] pub fn new( warp_id: Option, cu_id: usize, @@ -62,6 +63,7 @@ impl Operand { } } + #[must_use] pub fn warp_id(&self) -> Option { self.warp_id } @@ -127,6 +129,7 @@ impl CollectorUnit { } // looks ok + #[must_use] pub fn ready(&self) -> bool { if self.free { return false; @@ -302,18 +305,22 @@ impl Default for Allocation { } impl Allocation { + #[must_use] pub fn new(kind: AllocationKind, op: Option) -> Self { Self { kind, op } } + #[must_use] pub fn is_read(&self) -> bool { self.kind == AllocationKind::READ_ALLOC } + #[must_use] pub fn is_write(&self) -> bool { self.kind == AllocationKind::WRITE_ALLOC } + #[must_use] pub fn is_free(&self) -> bool { self.kind == AllocationKind::NO_ALLOC } @@ -433,14 +440,13 @@ impl Arbiter { log::trace!("request: {:?}", &Self::compat(&request[bank])); } - log::trace!("inmatch: {:?}", &Self::compat(&inmatch)); + log::trace!("inmatch: {:?}", &Self::compat(inmatch)); // wavefront allocator from booksim // loop through diagonals of request matrix - let mut output = 0; for p in 0.._square { - output = (_pri + p) % _outputs; + let mut output = (_pri + p) % _outputs; // step through the current diagonal for input in 0.._inputs { @@ -464,8 +470,8 @@ impl Arbiter { } } - log::trace!("inmatch: {:?}", &Self::compat(&inmatch)); - log::trace!("outmatch: {:?}", &Self::compat(&outmatch)); + log::trace!("inmatch: {:?}", &Self::compat(inmatch)); + log::trace!("outmatch: {:?}", &Self::compat(outmatch)); // Round-robin the priority diagonal _pri = (_pri + 1) % _outputs; @@ -500,14 +506,13 @@ impl Arbiter { } pub fn add_read_requests(&mut self, cu: &CollectorUnit) { - for src_op in &cu.src_operands { - if let Some(src_op) = src_op { - let bank = src_op.bank; - self.queue[bank].push_back(src_op.clone()); - } + for src_op in cu.src_operands.iter().flatten() { + let bank = src_op.bank; + self.queue[bank].push_back(src_op.clone()); } } + #[must_use] pub fn bank_idle(&self, bank: usize) -> bool { self.allocated_banks[bank].is_free() } @@ -539,6 +544,7 @@ pub struct 
DispatchUnit { } impl DispatchUnit { + #[must_use] pub fn new(kind: OperandCollectorUnitKind) -> Self { Self { kind, @@ -603,6 +609,7 @@ pub struct InputPort { } impl InputPort { + #[must_use] pub fn new( in_ports: PortVec, out_ports: PortVec, @@ -633,7 +640,7 @@ pub enum OperandCollectorUnitKind { pub type CuSets = HashMap>>>; // operand collector based register file unit -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone)] pub struct OperandCollectorRegisterFileUnit { pub config: Arc, @@ -711,7 +718,7 @@ impl OperandCollectorRegisterFileUnit { debug_assert!(cu.id == cu_id); } - for dispatch_unit in self.dispatch_units.iter_mut() { + for dispatch_unit in &mut self.dispatch_units { dispatch_unit.init(self.sub_core_model, self.num_warp_schedulers); } self.initialized = true; @@ -760,7 +767,7 @@ impl OperandCollectorRegisterFileUnit { } log::debug!("allocating {} reads ({:?})", read_ops.len(), &read_ops); - for (_bank, read) in &read_ops { + for read in read_ops.values() { assert!(read.collector_unit_id < self.collector_units.len()); let mut cu = self.collector_units[read.collector_unit_id].borrow_mut(); if let Some(operand) = read.operand { @@ -824,17 +831,17 @@ impl OperandCollectorRegisterFileUnit { debug_assert!(cu_upper_bound <= cu_set.len()); } - for k in cu_lower_bound..cu_upper_bound { - let mut collector_unit = cu_set[k].try_borrow_mut().unwrap(); + for collector_unit in &cu_set[cu_lower_bound..cu_upper_bound] { + let mut collector_unit = collector_unit.try_borrow_mut().unwrap(); if collector_unit.free { log::debug!( "{} cu={:?}", - style(format!("operand collector::allocate()")).green(), + style("operand collector::allocate()".to_string()).green(), collector_unit.kind ); - allocated = collector_unit.allocate(&input_port, &output_port); + allocated = collector_unit.allocate(input_port, output_port); self.arbiter.add_read_requests(&collector_unit); break; } @@ -1045,8 +1052,8 @@ mod test { let arbiter = (&opcoll.arbiter).into(); Self { ports, - dispatch_units, collector_units, + dispatch_units, arbiter, } } diff --git a/src/ported/register_set.rs b/src/ported/register_set.rs index ef6f7d59..3f4af0df 100644 --- a/src/ported/register_set.rs +++ b/src/ported/register_set.rs @@ -9,9 +9,9 @@ pub struct RegisterSet { } impl RegisterSet { - pub fn new(stage: super::PipelineStage, size: usize, id: usize) -> Self { + #[must_use] pub fn new(stage: super::PipelineStage, size: usize, id: usize) -> Self { let regs = (0..size).map(|_| None).collect(); - Self { regs, stage, id } + Self { stage, regs, id } } pub fn has_free(&self) -> bool { @@ -23,7 +23,7 @@ impl RegisterSet { } // pub fn has_free_sub_core(&self, sub_core_model: bool, reg_id: usize) -> bool { - pub fn has_free_sub_core(&self, reg_id: usize) -> bool { + #[must_use] pub fn has_free_sub_core(&self, reg_id: usize) -> bool { // in subcore model, each sched has a one specific // reg to use (based on sched id) // if !sub_core_model { @@ -66,7 +66,7 @@ impl RegisterSet { // } pub fn scheduler_id(&self, reg_id: usize) -> Option { - match self.regs.get(reg_id).map(Option::as_ref).flatten() { + match self.regs.get(reg_id).and_then(Option::as_ref) { Some(r) => { // debug_assert!(!r.empty()); r.scheduler_id @@ -94,7 +94,7 @@ impl RegisterSet { self.regs.iter().any(Option::is_some) } - pub fn get_ready(&self) -> Option<(usize, &Option)> { + #[must_use] pub fn get_ready(&self) -> Option<(usize, &Option)> { let mut ready: Option<(usize, &Option)> = None; for free in self.iter_occupied() { match (&ready, free) { @@ -193,8 +193,7 @@ impl 
RegisterSet { pub fn get_instruction_mut(&mut self) -> Option<&mut WarpInstruction> { self.get_ready_mut() .map(|(_, r)| r) - .map(Option::as_mut) - .flatten() + .and_then(Option::as_mut) } // pub fn get_ready_mut(&mut self) -> Option<&mut WarpInstruction> { @@ -212,7 +211,7 @@ impl RegisterSet { // ready // } - pub fn get_ready_sub_core(&self, reg_id: usize) -> Option<&Option> { + #[must_use] pub fn get_ready_sub_core(&self, reg_id: usize) -> Option<&Option> { debug_assert!(reg_id < self.regs.len()); self.regs.get(reg_id) } @@ -227,12 +226,12 @@ impl RegisterSet { pub fn get_instruction_sub_core(&self, reg_id: usize) -> Option<&WarpInstruction> { debug_assert!(reg_id < self.regs.len()); - self.regs.get(reg_id).map(Option::as_ref).flatten() + self.regs.get(reg_id).and_then(Option::as_ref) } pub fn get_instruction_sub_core_mut(&mut self, reg_id: usize) -> Option<&mut WarpInstruction> { debug_assert!(reg_id < self.regs.len()); - self.regs.get_mut(reg_id).map(Option::as_mut).flatten() + self.regs.get_mut(reg_id).and_then(Option::as_mut) } pub fn iter_occupied(&self) -> impl Iterator)> { @@ -249,11 +248,11 @@ impl RegisterSet { } pub fn iter_instructions(&self) -> impl Iterator { - self.regs.iter().map(Option::as_ref).filter_map(|r| r) + self.regs.iter().filter_map(Option::as_ref) } pub fn iter_instructions_mut(&mut self) -> impl Iterator { - self.regs.iter_mut().map(Option::as_mut).filter_map(|r| r) + self.regs.iter_mut().filter_map(Option::as_mut) } pub fn iter_free(&self) -> impl Iterator> { @@ -287,18 +286,15 @@ impl RegisterSet { // in subcore model, each sched has a one specific reg // to use (based on sched id) debug_assert!(reg_id < self.regs.len()); - match self.regs.get_mut(reg_id) { - Some(r) => Some((reg_id, r)), - None => None, - } + self.regs.get_mut(reg_id).map(|r| (reg_id, r)) // .and_then(Option::as_ref) .filter(|r| r.empty()) } - pub fn size(&self) -> usize { + #[must_use] pub fn size(&self) -> usize { self.regs.len() } - pub fn empty(&self) -> bool { + #[must_use] pub fn empty(&self) -> bool { todo!("RegisterSet::empty") } @@ -335,8 +331,7 @@ impl RegisterSet { let ready: Option = self .get_ready_mut() .map(|(_, r)| r) - .map(Option::take) - .flatten(); + .and_then(Option::take); // let msg = format!( // "register set moving out from ready={:?} to {:?}", // ready.as_ref().map(ToString::to_string), @@ -354,8 +349,7 @@ impl RegisterSet { ) { let ready: Option = self .get_ready_sub_core_mut(reg_id) - .map(Option::take) - .flatten(); + .and_then(Option::take); // let msg = format!( // "register set moving out to sub core from ready={:?} to {:?}", // ready.as_ref().map(ToString::to_string), @@ -370,7 +364,7 @@ impl std::fmt::Display for RegisterSet { let instructions = self .regs .iter() - .map(|inst| inst.as_ref().map(|i| i.to_string())); + .map(|inst| inst.as_ref().map(std::string::ToString::to_string)); f.debug_list().entries(instructions).finish() } } diff --git a/src/ported/scheduler.rs b/src/ported/scheduler.rs deleted file mode 100644 index 6b83b4b5..00000000 --- a/src/ported/scheduler.rs +++ /dev/null @@ -1,1273 +0,0 @@ -use std::cell::RefCell; -use std::collections::VecDeque; -use std::rc::Rc; -use std::sync::{Arc, Mutex, RwLock}; - -use super::core::PipelineStage; -use super::{instruction::WarpInstruction, opcodes, scoreboard}; -use crate::config::GPUConfig; -use bitvec::{array::BitArray, BitArr}; -use console::style; - -pub type ThreadActiveMask = BitArr!(for 32, in u32); - -pub type CoreWarp = Rc>; - -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] 
-#[allow(dead_code)] -enum ExecUnitKind { - NONE = 0, - SP = 1, - SFU = 2, - MEM = 3, - DP = 4, - INT = 5, - TENSOR = 6, - SPECIALIZED = 7, -} - -#[derive(Debug)] -pub struct SchedulerWarp { - pub block_id: u64, - pub dynamic_warp_id: usize, - pub warp_id: usize, - pub kernel: Option>, - - pub trace_pc: usize, - pub active_mask: ThreadActiveMask, - pub trace_instructions: VecDeque, - - // state - pub done_exit: bool, - pub num_instr_in_pipeline: usize, - pub num_outstanding_stores: usize, - pub num_outstanding_atomics: usize, - pub has_imiss_pending: bool, - pub instr_buffer: Vec>, - pub next: usize, -} - -impl PartialEq for SchedulerWarp { - fn eq(&self, other: &Self) -> bool { - self.kernel == other.kernel - && self.block_id == other.block_id - && self.warp_id == other.warp_id - && self.dynamic_warp_id == other.dynamic_warp_id - } -} - -const IBUFFER_SIZE: usize = 2; - -impl Default for SchedulerWarp { - fn default() -> Self { - let instr_buffer = vec![None; IBUFFER_SIZE]; - Self { - block_id: 0, - dynamic_warp_id: u32::MAX as usize, - warp_id: u32::MAX as usize, - kernel: None, - trace_pc: 0, - trace_instructions: VecDeque::new(), - active_mask: BitArray::ZERO, - done_exit: false, - num_instr_in_pipeline: 0, - num_outstanding_stores: 0, - num_outstanding_atomics: 0, - has_imiss_pending: false, - instr_buffer, - next: 0, - } - } -} - -impl SchedulerWarp { - pub fn init( - &mut self, - _start_pc: Option, - block_id: u64, - warp_id: usize, - dynamic_warp_id: usize, - active_mask: ThreadActiveMask, - kernel: Arc, - ) { - self.block_id = block_id; - self.warp_id = warp_id; - self.dynamic_warp_id = dynamic_warp_id; - self.done_exit = false; - self.kernel = Some(kernel); - self.active_mask = active_mask; - } - - pub fn reset(&mut self) { - debug_assert_eq!(self.num_outstanding_stores, 0); - debug_assert_eq!(self.num_instr_in_pipeline, 0); - self.has_imiss_pending = false; - self.warp_id = u32::MAX as usize; - self.dynamic_warp_id = u32::MAX as usize; - - self.active_mask.fill(false); - self.done_exit = true; - self.next = 0; - } - - pub fn current_instr(&self) -> Option<&WarpInstruction> { - self.trace_instructions.get(self.trace_pc) - } - - pub fn push_trace_instruction(&mut self, instr: WarpInstruction) { - self.trace_instructions.push_back(instr); - } - - pub fn next_trace_inst(&mut self) -> Option<&WarpInstruction> { - let trace_instr = self.trace_instructions.get(self.trace_pc)?; - self.trace_pc += 1; - Some(trace_instr) - } - - pub fn instruction_count(&self) -> usize { - self.trace_instructions.len() - } - - pub fn pc(&self) -> Option { - debug_assert!(self.trace_pc <= self.instruction_count()); - self.trace_instructions - .get(self.trace_pc) - .map(|instr| instr.pc) - } - - pub fn done(&self) -> bool { - self.trace_pc == self.instruction_count() - } - - pub fn clear(&mut self) { - self.trace_pc = 0; - self.trace_instructions.clear(); - } - - pub fn ibuffer_fill(&mut self, slot: usize, instr: WarpInstruction) { - debug_assert!(slot < self.instr_buffer.len()); - self.instr_buffer[slot] = Some(instr); - self.next = 0; - } - - pub fn ibuffer_size(&self) -> usize { - self.instr_buffer.iter().filter(|x| x.is_some()).count() - } - - pub fn ibuffer_empty(&self) -> bool { - self.instr_buffer.iter().all(Option::is_none) - } - - pub fn ibuffer_flush(&mut self) { - for i in self.instr_buffer.iter_mut() { - if i.is_some() { - self.num_instr_in_pipeline -= 1; - } - *i = None; - } - } - - pub fn ibuffer_peek(&self) -> Option<&WarpInstruction> { - self.instr_buffer[self.next].as_ref() - } - - pub fn 
ibuffer_take(&mut self) -> Option { - self.instr_buffer[self.next].take() - } - - pub fn ibuffer_step(&mut self) { - self.next = (self.next + 1) % IBUFFER_SIZE; - } - - pub fn done_exit(&self) -> bool { - self.done_exit - } - - pub fn hardware_done(&self) -> bool { - self.functional_done() && self.stores_done() && self.num_instr_in_pipeline == 0 - } - - pub fn has_instr_in_pipeline(&self) -> bool { - self.num_instr_in_pipeline > 0 - } - - pub fn stores_done(&self) -> bool { - self.num_outstanding_stores == 0 - } - - pub fn num_completed(&self) -> usize { - self.active_mask.count_zeros() - } - - pub fn set_thread_completed(&mut self, thread_id: usize) { - self.active_mask.set(thread_id, false); - } - - pub fn functional_done(&self) -> bool { - self.active_mask.not_any() - } - - pub fn waiting(&self) -> bool { - if self.functional_done() { - // waiting to be initialized with a kernel - true - // } else if core.warp_waiting_at_barrier(self.warp_id) { - // // waiting for other warps in block to reach barrier - // true - // } else if core.warp_waiting_at_mem_barrier(self.warp_id) { - // // waiting for memory barrier - // true - } else if self.num_outstanding_atomics > 0 { - // waiting for atomic operation to complete at memory: - // this stall is not required for accurate timing model, - // but rather we stall here since if a call/return - // instruction occurs in the meantime the functional - // execution of the atomic when it hits DRAM can cause - // the wrong register to be read. - true - } else { - false - } - } - - pub fn dynamic_warp_id(&self) -> usize { - self.dynamic_warp_id - } -} - -fn sort_warps_by_oldest_dynamic_id(lhs: &CoreWarp, rhs: &CoreWarp) -> std::cmp::Ordering { - let lhs = lhs.try_borrow().unwrap(); - let rhs = rhs.try_borrow().unwrap(); - if lhs.done_exit() || lhs.waiting() { - std::cmp::Ordering::Greater - } else if rhs.done_exit() || rhs.waiting() { - std::cmp::Ordering::Less - } else { - lhs.dynamic_warp_id().cmp(&rhs.dynamic_warp_id()) - } -} - -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -pub enum Ordering { - // The item that issued last is prioritized first then the - // sorted result of the priority_function - GREEDY_THEN_PRIORITY_FUNC = 0, - // No greedy scheduling based on last to issue. - // - // Only the priority function determines priority - PRIORITY_FUNC_ONLY, - // NUM_ORDERING, -} - -#[derive(Debug)] -pub struct BaseSchedulerUnit { - id: usize, - cluster_id: usize, - core_id: usize, - /// This is the prioritized warp list that is looped over each cycle to - /// determine which warp gets to issue. - next_cycle_prioritized_warps: VecDeque, - // Supervised warps keeps all warps this scheduler can arbitrate between. - // - // This is useful in systems where there is more than one warp scheduler. - // In a single scheduler system, this is simply all the warps - // assigned to this core. 
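-    //
-    // The GTO scheduler below rebuilds next_cycle_prioritized_warps from this
-    // list every cycle: the warp at last_supervised_issued_idx is placed first
-    // (greedy), followed by the remaining supervised warps sorted oldest
-    // dynamic warp id first.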
- supervised_warps: VecDeque, - /// This is the iterator pointer to the last supervised warp issued - last_supervised_issued_idx: usize, - - warps: Vec, - num_issued_last_cycle: usize, - current_turn_warp: usize, - - scoreboard: Arc>, - config: Arc, - stats: Arc>, -} - -impl BaseSchedulerUnit { - pub fn new( - id: usize, - cluster_id: usize, - core_id: usize, - warps: Vec, - scoreboard: Arc>, - stats: Arc>, - config: Arc, - ) -> Self { - let supervised_warps = VecDeque::new(); - Self { - id, - cluster_id, - core_id, - next_cycle_prioritized_warps: VecDeque::new(), - supervised_warps, - last_supervised_issued_idx: 0, - warps, - num_issued_last_cycle: 0, - current_turn_warp: 0, - scoreboard, - config, - stats, - } - } - - fn prioritized_warps(&self) -> &VecDeque { - &self.next_cycle_prioritized_warps - } - - fn cycle(&mut self, issuer: &mut dyn super::core::WarpIssuer) { - log::debug!("{}: cycle", style("base scheduler").yellow()); - - // there was one warp with a valid instruction to issue - // (didn't require flush due to control hazard) - let mut valid_inst = false; - // of the valid instructions, there was one not waiting for pending register writes - let mut ready_inst = false; - // of these we issued one - let mut issued_inst = false; - - // dbg!(&self.next_cycle_prioritized_warps.len()); - // dbg!(&self.supervised_warps.len()); - // dbg!(&self.last_supervised_issued_idx); - // - // dbg!(&self - // .warps - // .iter() - // .map(|w| w.lock().unwrap().instruction_count()) - // .sum::()); - // dbg!(&self - // .supervised_warps - // .iter() - // .map(|w| w.lock().unwrap().instruction_count()) - // .sum::()); - // - // dbg!(&self - // .next_cycle_prioritized_warps - // .iter() - // .map(|w| w.lock().unwrap().instruction_count()) - // .sum::()); - - // log::debug!( - // "supervised warps: {:#?}", - // self.supervised_warps - // .iter() - // .map(|w| w.lock().unwrap().instruction_count()) - // .filter(|&c| c > 0) - // .collect::>() - // ); - // log::debug!( - // "next_cycle_prioritized_warps: {:#?}", - // self.next_cycle_prioritized_warps - // .iter() - // .map(|w| w.lock().unwrap().instruction_count()) - // .filter(|&c| c > 0) - // .collect::>() - // ); - - // log::debug!("next cycle prio warp"); - for next_warp_rc in &self.next_cycle_prioritized_warps { - // don't consider warps that are not yet valid - let next_warp = next_warp_rc.try_borrow().unwrap(); - let (warp_id, dyn_warp_id) = (next_warp.warp_id, next_warp.dynamic_warp_id); - // log::debug!("locked next warp = {}", warp_id); - - if next_warp.done_exit() { - continue; - } - let inst_count = next_warp.instruction_count(); - if inst_count == 0 { - log::debug!("next warp: {:#?}", &next_warp); - } - assert!(inst_count > 0); - if inst_count > 1 { - log::debug!( - "core[{}][{}] scheduler[{}]: \n\t => testing (warp_id={}, dynamic_warp_id={}, trace_pc={}, pc={:?}, ibuffer={:?}, {} instructions)", - self.cluster_id, - self.core_id, - self.id, - warp_id, dyn_warp_id, - next_warp.trace_pc, - next_warp.pc(), - next_warp.instr_buffer.iter().filter_map(Option::as_ref).map(|i| i.pc).collect::>(), inst_count, - ); - } - let mut checked = 0; - let mut issued = 0; - - let mut prev_issued_exec_unit = ExecUnitKind::NONE; - let max_issue = self.config.max_instruction_issue_per_warp; - // In tis mode, we only allow dual issue to diff execution - // units (as in Maxwell and Pascal) - let diff_exec_units = self.config.dual_issue_diff_exec_units; - - if inst_count > 1 { - if next_warp.ibuffer_empty() { - log::debug!( - "warp (warp_id={}, dynamic_warp_id={}) 
fails as ibuffer_empty", - warp_id, - dyn_warp_id - ); - } - - if next_warp.waiting() { - log::debug!( - "warp (warp_id={}, dynamic_warp_id={}) is waiting for completion", - warp_id, - dyn_warp_id - ); - } - } - - let warp = self.warps.get(warp_id).unwrap(); - - // todo: what is the difference? why dont we just use next_warp? - debug_assert!(Rc::ptr_eq(warp, next_warp_rc)); - drop(next_warp); - - // log::debug!("locking warp = {}", warp_id); - let mut warp = warp.try_borrow_mut().unwrap(); - // log::debug!("locked warp {}", warp_id); - // .as_mut() - // .as_ref() - // .unwrap(); - while !warp.waiting() - && !warp.ibuffer_empty() - && checked < max_issue - && checked <= issued - && issued < max_issue - { - // let valid = warp.ibuffer_next_valid(); - let mut warp_inst_issued = false; - - if let Some(instr) = warp.ibuffer_peek() { - // let (pc, rpc) = get_pdom_stack_top_info(warp_id, instr); - log::debug!( - "Warp (warp_id={}, dynamic_warp_id={}) instruction buffer[{}] has valid instruction {}", - warp_id, dyn_warp_id, warp.next, instr, - ); - - // In trace-driven mode, we assume no control hazard, meaning - // that `pc == rpc == instr.pc` - // if pc != instr.pc { - // log::debug!( - // "Warp (warp_id {}, dynamic_warp_id {}) control hazard instruction flush", - // warp_id, dyn_warp_id); - // // control hazard - // warp.set_next_pc(pc); - // warp.ibuffer_flush(); - // } else { - valid_inst = true; - if !self - .scoreboard - .read() - .unwrap() - .has_collision(warp_id, instr) - { - log::debug!( - "Warp (warp_id={}, dynamic_warp_id={}) {}", - warp_id, - dyn_warp_id, - style("passes scoreboard").yellow(), - ); - ready_inst = true; - - // let active_mask = core.active_mask(warp_id, instr); - - debug_assert!(warp.has_instr_in_pipeline()); - - use opcodes::ArchOp; - match instr.opcode.category { - ArchOp::LOAD_OP - | ArchOp::STORE_OP - | ArchOp::MEMORY_BARRIER_OP - | ArchOp::TENSOR_CORE_LOAD_OP - | ArchOp::TENSOR_CORE_STORE_OP => { - // if warp.warp_id == 3 { - // super::debug_break(format!( - // "scheduled mem instr for warp id 3: {}", - // instr - // )); - // } - let mem_stage = PipelineStage::ID_OC_MEM; - - let free_register = issuer.has_free_register(mem_stage, self.id); - - if free_register - && (!diff_exec_units - || prev_issued_exec_unit != ExecUnitKind::MEM) - { - let instr = warp.ibuffer_take().unwrap(); - debug_assert_eq!(warp_id, warp.warp_id); - issuer.issue_warp(mem_stage, &mut warp, instr, self.id); - // .issue_warp(mem_stage, &mut warp, instr, warp_id, self.id); - issued += 1; - issued_inst = true; - warp_inst_issued = true; - prev_issued_exec_unit = ExecUnitKind::MEM; - } else { - log::debug!("issue failed: no free mem port register"); - } - } - // ArchOp::EXIT_OPS => {} - op => { - if op != ArchOp::TENSOR_CORE_OP - && op != ArchOp::SFU_OP - && op != ArchOp::DP_OP - && (op as usize) < opcodes::SPEC_UNIT_START_ID - { - let mut execute_on_sp = false; - let mut execute_on_int = false; - - let sp_pipe_avail = self.config.num_sp_units > 0 - && issuer - .has_free_register(PipelineStage::ID_OC_SP, self.id); - let int_pipe_avail = self.config.num_int_units > 0 - && issuer - .has_free_register(PipelineStage::ID_OC_INT, self.id); - - // if INT unit pipline exist, then execute ALU and INT - // operations on INT unit and SP-FPU on SP unit (like in Volta) - // if INT unit pipline does not exist, then execute all ALU, INT - // and SP operations on SP unit (as in Fermi, Pascal GPUs) - if int_pipe_avail - && op != ArchOp::SP_OP - && !(diff_exec_units - && prev_issued_exec_unit == ExecUnitKind::INT) 
- { - execute_on_int = true; - } else if sp_pipe_avail - && (self.config.num_int_units == 0 - || (self.config.num_int_units > 0 - && op == ArchOp::SP_OP)) - && !(diff_exec_units - && prev_issued_exec_unit == ExecUnitKind::SP) - { - execute_on_sp = true; - } - - log::debug!( - "execute on INT={} execute on SP={}", - execute_on_int, - execute_on_sp - ); - - let issue_target = if execute_on_sp { - Some((PipelineStage::ID_OC_SP, ExecUnitKind::SP)) - } else if execute_on_int { - Some((PipelineStage::ID_OC_INT, ExecUnitKind::INT)) - } else { - None - }; - - if let Some((stage, unit)) = issue_target { - let instr = warp.ibuffer_take().unwrap(); - debug_assert_eq!(warp.warp_id, warp_id); - issuer.issue_warp(stage, &mut warp, instr, self.id); - // .issue_warp(stage, &mut warp, instr, warp_id, self.id); - issued += 1; - issued_inst = true; - warp_inst_issued = true; - prev_issued_exec_unit = unit; - } - } - // else if ((m_shader->m_config->gpgpu_num_dp_units > 0) && - // (pI->op == DP_OP) && - // !(diff_exec_units && previous_issued_inst_exec_type == - // exec_unit_type_t::DP)) { - // } else if (((m_shader->m_config->gpgpu_num_dp_units == 0 && - // pI->op == DP_OP) || - // (pI->op == SFU_OP) || (pI->op == ALU_SFU_OP)) && - // !(diff_exec_units && previous_issued_inst_exec_type == - // exec_unit_type_t::SFU)) { - // } else if ((pI->op == TENSOR_CORE_OP) && - // !(diff_exec_units && previous_issued_inst_exec_type == - // exec_unit_type_t::TENSOR)) { - // } else if ((pI->op >= SPEC_UNIT_START_ID) && - // !(diff_exec_units && - // previous_issued_inst_exec_type == - // exec_unit_type_t::SPECIALIZED)) { - // } - } // op => unimplemented!("op {:?} not implemented", op), - } - } else { - log::debug!( - "Warp (warp_id={}, dynamic_warp_id={}) {}", - warp_id, - dyn_warp_id, - style("fails scoreboard").yellow(), - ); - } - // } - } - // else if (valid) { - // // this case can happen after a return instruction in diverged warp - // SCHED_DPRINTF( - // "Warp (warp_id %u, dynamic_warp_id %u) return from diverged warp " - // "flush\n", - // (*iter)->get_warp_id(), (*iter)->get_dynamic_warp_id()); - // warp(warp_id).set_next_pc(pc); - // warp(warp_id).ibuffer_flush(); - // } - if warp_inst_issued { - log::debug!( - "Warp (warp_id={}, dynamic_warp_id={}) issued {} instructions", - warp_id, - dyn_warp_id, - issued - ); - // m_stats->event_warp_issued(m_shader->get_sid(), warp_id, num_issued, warp(warp_id).get_dynamic_warp_id()); - warp.ibuffer_step(); - } - checked += 1; - } - // drop(next_warp); - drop(warp); - if issued > 0 { - // This might be a bit inefficient, but we need to maintain - // two ordered list for proper scheduler execution. - // We could remove the need for this loop by associating a - // supervised_is index with each entry in the - // m_next_cycle_prioritized_warps vector. 
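-            // A sketch of that alternative (illustrative only, not implemented
-            // here): store the supervised index alongside each prioritized
-            // warp, e.g.
-            //
-            //     next_cycle_prioritized_warps: VecDeque<(usize, CoreWarp)>
-            //
-            // push `(sup_idx, Rc::clone(warp))` while ordering, and after
-            // issuing set `last_supervised_issued_idx` from that stored index
-            // instead of scanning supervised_warps for a matching entry.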
- // For now, just run through until you find the right warp_id - for (sup_idx, supervised) in self.supervised_warps.iter().enumerate() { - // if *next_warp == *supervised.lock().unwrap().warp_id { - // log::debug!("locking supervised[{}]", sup_idx); - // if dynamicwarp_id == supervised.try_borrow().unwrap().warp_id { - // if warp.borrow() == supervised.borrow() { - if *next_warp_rc.try_borrow().unwrap() == *supervised.try_borrow().unwrap() { - // test - self.last_supervised_issued_idx = sup_idx; - } - } - self.num_issued_last_cycle = issued; - if issued == 1 { - // m_stats->single_issue_nums[m_id]++; - } else if issued > 1 { - // m_stats->dual_issue_nums[m_id]++; - } - break; - } - } - - // issue stall statistics: - if !valid_inst { - // idle or control hazard - // m_stats.shader_cycle_distro[0]++; - } else if !ready_inst { - // waiting for RAW hazards (possibly due to memory) - // m_stats.shader_cycle_distro[1]++; - } else if !issued_inst { - // pipeline stalled - // m_stats.shader_cycle_distro[2]++; - } - - // todo!("base scheduler unit: cycle"); - } -} - -pub trait SchedulerUnit { - fn cycle(&mut self, _core: &mut dyn super::core::WarpIssuer) { - // fn cycle(&mut self, core: ()) { - // fn cycle(&mut self) { - todo!("scheduler unit: cycle"); - } - - // fn done_adding_supervised_warps(&mut self) { - // todo!("scheduler unit: done_adding_supervised_warps"); - // } - - fn add_supervised_warp(&mut self, _warp: CoreWarp) { - todo!("scheduler unit: add supervised warp id"); - } - - fn prioritized_warps(&self) -> &VecDeque; - - // self.scheduler - // self.inner.supervised_warps - - // fn add_supervised_warp_id(&mut self, warp_id: usize) { - // todo!("scheduler unit: add supervised warp id"); - // } - - /// Order warps based on scheduling policy. - /// - /// Derived classes can override this function to populate - /// m_supervised_warps with their scheduling policies - fn order_warps( - &mut self, - // out: &mut VecDeque, - // warps: &mut Vec, - // last_issued_warps: &Vec, - // num_warps_to_add: usize, - ) { - todo!("scheduler unit: order warps") - } -} - -#[derive(Debug)] -pub struct LrrScheduler { - inner: BaseSchedulerUnit, -} - -pub fn all_different(values: &[Rc>]) -> bool { - for (vi, v) in values.iter().enumerate() { - for (vii, vv) in values.iter().enumerate() { - let should_be_equal = vi == vii; - let are_equal = Rc::ptr_eq(v, vv); - if should_be_equal && !are_equal { - return false; - } - if !should_be_equal && are_equal { - return false; - } - } - } - true -} - -// pub struct LrrScheduler<'a> { -// inner: BaseSchedulerUnit<'a>, -// } - -// impl<'a> BaseSchedulerUnit<'a> { -impl BaseSchedulerUnit { - fn order_by_priority(&mut self, ordering: Ordering, priority_func: F) - where - F: FnMut(&CoreWarp, &CoreWarp) -> std::cmp::Ordering, - { - // todo!("base scheduler unit: order by priority"); - let num_warps_to_add = self.supervised_warps.len(); - let out = &mut self.next_cycle_prioritized_warps; - - debug_assert!(num_warps_to_add <= self.warps.len()); - out.clear(); - - debug_assert!(all_different(&self.supervised_warps.make_contiguous())); - - // let mut last_issued_iter = self.warps.iter().skip(self.last_supervised_issued_idx); - let mut last_issued_iter = self - .supervised_warps - .iter() - .skip(self.last_supervised_issued_idx); - debug_assert!(all_different(&self.warps)); - - // TODO: maybe we actually should make a copy of the supervised warps to not actually - // reorder those for stability - - let mut supervised_warps_sorted: Vec<_> = - 
self.supervised_warps.clone().into_iter().collect(); - supervised_warps_sorted.sort_by(priority_func); - - debug_assert!(all_different(&supervised_warps_sorted)); - - // dbg!(&supervised_warps_sorted.len()); - // dbg!(&supervised_warps_sorted - // .iter() - // .map(|w| w.borrow().dynamic_warp_id) - // .collect::>()); - - // self.supervised_warps - // .make_contiguous() - // .sort_by(priority_func); - - match ordering { - Ordering::GREEDY_THEN_PRIORITY_FUNC => { - let greedy_value = last_issued_iter.next(); - if let Some(greedy) = greedy_value { - out.push_back(Rc::clone(greedy)); - } - - log::debug!( - "added greedy warp (last supervised issued idx={}): {:?}", - self.last_supervised_issued_idx, - &greedy_value.map(|w| w.borrow().dynamic_warp_id) - ); - - // dbg!(&greedy_value); - - // self.supervised_warps - // .make_contiguous() - // .sort_by(priority_func); - - // self.supervised_warpsself.supervised_warps.any( .iter() - - out.extend( - supervised_warps_sorted - .into_iter() - .take(num_warps_to_add) - .filter(|warp| { - if let Some(greedy) = greedy_value { - // log::debug!( - // "greedy@{:?} warp@{:?}", - // Rc::as_ptr(greedy), - // Rc::as_ptr(warp) - // ); - let already_added = Rc::ptr_eq(greedy, warp); - !already_added - } else { - true - } - }), - // .map(Rc::clone), - ); - } - Ordering::PRIORITY_FUNC_ONLY => { - // self.supervised_warps - // .make_contiguous() - // .sort_by(priority_func); - out.extend(supervised_warps_sorted.into_iter().take(num_warps_to_add)); - } - } - // dbg!(num_warps_to_add, out.len()); - assert_eq!( - num_warps_to_add, - out.len(), - "either too few supervised warps or greedy warp not in supervised warps" - ); - } - - fn order_rrr( - &mut self, - // out: &mut VecDeque, - // warps: &mut Vec, - // std::vector &result_list, const typename std::vector &input_list, - // const typename std::vector::const_iterator &last_issued_from_input, - // unsigned num_warps_to_add) - ) { - unimplemented!("order rrr is untested"); - let num_warps_to_add = self.supervised_warps.len(); - let out = &mut self.next_cycle_prioritized_warps; - // order_lrr( - // &mut self.inner.next_cycle_prioritized_warps, - // &mut self.inner.supervised_warps, - // &mut self.inner.last_supervised_issued_idx, - // // &mut self.inner.last_supervised_issued(), - // num_warps_to_add, - // ); - - out.clear(); - - let current_turn_warp_ref = self.warps.get(self.current_turn_warp).unwrap(); - let current_turn_warp = current_turn_warp_ref.try_borrow().unwrap(); - // .as_ref() - // .unwrap(); - - if self.num_issued_last_cycle > 0 - || current_turn_warp.done_exit() - || current_turn_warp.waiting() - { - // std::vector::const_iterator iter = - // (last_issued_from_input == input_list.end()) ? 
- // input_list.begin() : last_issued_from_input + 1; - - let mut iter = self - .supervised_warps - .iter() - .skip(self.last_supervised_issued_idx + 1) - .chain(self.supervised_warps.iter()); - - for w in iter.take(num_warps_to_add) { - let warp = w.try_borrow().unwrap(); - let warp_id = warp.warp_id; - if !warp.done_exit() && !warp.waiting() { - out.push_back(w.clone()); - self.current_turn_warp = warp_id; - break; - } - } - // for (unsigned count = 0; count < num_warps_to_add; ++iter, ++count) { - // if (iter == input_list.end()) { - // iter = input_list.begin(); - // } - // unsigned warp_id = (*iter)->get_warp_id(); - // if (!(*iter)->done_exit() && !(*iter)->waiting()) { - // result_list.push_back(*iter); - // m_current_turn_warp = warp_id; - // break; - // } - // } - } else { - out.push_back(current_turn_warp_ref.clone()); - } - } - - fn order_lrr( - &mut self, - // out: &mut VecDeque, - // warps: &mut Vec, - // // last_issued_warps: &Vec, - // // last_issued_warps: impl Iterator, - // // last_issued_warps: &mut std::slice::Iter<'_, SchedulerWarp>, - // // last_issued_warps: impl Iterator, - // last_issued_warp_idx: &mut usize, - // num_warps_to_add: usize, - ) { - unimplemented!("order lrr is not tested"); - let num_warps_to_add = self.supervised_warps.len(); - let out = &mut self.next_cycle_prioritized_warps; - - debug_assert!(num_warps_to_add <= self.warps.len()); - out.clear(); - // if last_issued_warps - // typename std::vector::const_iterator iter = (last_issued_from_input == input_list.end()) ? input_list.begin() - // : last_issued_from_input + 1; - // - let mut last_issued_iter = self.warps.iter().skip(self.last_supervised_issued_idx); - - let mut iter = last_issued_iter.chain(self.warps.iter()); - // .filter_map(|x| x.as_ref()); - // .filter_map(|x| x.as_ref()); - - out.extend(iter.take(num_warps_to_add).cloned()); - // for count in 0..num_warps_to_add { - // let Some(warp) = iter.next() else { - // return; - // }; - // // if (iter == input_list.end()) { - // // iter = input_list.begin(); - // // } - // out.push_back(warp.clone()); - // } - // todo!("order lrr: order warps") - } -} - -impl SchedulerUnit for LrrScheduler { - // impl<'a> SchedulerUnit for LrrScheduler<'a> { - fn order_warps( - &mut self, - // out: &mut VecDeque, - // warps: &mut Vec, - // last_issued_warps: &Vec, - // num_warps_to_add: usize, - ) { - self.inner.order_lrr(); - // let num_warps_to_add = self.inner.supervised_warps.len(); - // order_lrr( - // &mut self.inner.next_cycle_prioritized_warps, - // &mut self.inner.supervised_warps, - // &mut self.inner.last_supervised_issued_idx, - // // &mut self.inner.last_supervised_issued(), - // num_warps_to_add, - // ); - } - - fn add_supervised_warp(&mut self, warp: CoreWarp) { - self.inner.supervised_warps.push_back(warp); - // self.inner.add_supervised_warp_id(warp_id); - } - - fn prioritized_warps(&self) -> &VecDeque { - self.inner.prioritized_warps() - } - - // fn add_supervised_warp_id(&mut self, warp_id: usize) { - // self.inner.add_supervised_warp_id(warp_id); - // } - - // fn done_adding_supervised_warps(&mut self) { - // self.inner.last_supervised_issued_idx = self.inner.supervised_warps.len(); - // } - - // fn cycle(&mut self, core: &mut super::core::InnerSIMTCore) { - // fn cycle(&mut self, core: ()) { - fn cycle(&mut self, issuer: &mut dyn super::core::WarpIssuer) { - self.order_warps(); - self.inner.cycle(issuer); - } -} - -// impl<'a> LrrScheduler<'a> { -impl LrrScheduler { - // fn order_warps( - // &self, - // out: &mut VecDeque, - // warps: &mut 
Vec, - // last_issued_warps: &Vec, - // num_warps_to_add: usize, - // ) { - // todo!("scheduler unit: order warps") - // } - - // pub fn new( - // id: usize, - // // warps: &'a Vec, - // warps: Vec, - // // warps: &'a Vec>, - // // mem_out: &'a register_set::RegisterSet, - // // core: &'a super::core::InnerSIMTCore, - // scoreboard: Arc>, - // stats: Arc>, - // config: Arc, - // ) -> Self { - // // todo!("lrr scheduler: new"); - // let inner = BaseSchedulerUnit::new( - // id, // mem_out, core, - // warps, scoreboard, stats, config, - // ); - // Self { inner } - // } - - // lrr_scheduler(shader_core_stats *stats, shader_core_ctx *shader, - // Scoreboard *scoreboard, simt_stack **simt, - // std::vector *warp, register_set *sp_out, - // register_set *dp_out, register_set *sfu_out, - // register_set *int_out, register_set *tensor_core_out, - // std::vector &spec_cores_out, - // register_set *mem_out, int id) - // : scheduler_unit(stats, shader, scoreboard, simt, warp, sp_out, dp_out, - // sfu_out, int_out, tensor_core_out, spec_cores_out, - // mem_out, id) {} - - // virtual void order_warps(); -} - -#[derive(Debug)] -pub struct GTOScheduler { - inner: BaseSchedulerUnit, -} - -impl GTOScheduler { - pub fn new( - id: usize, - cluster_id: usize, - core_id: usize, - warps: Vec, - scoreboard: Arc>, - stats: Arc>, - config: Arc, - ) -> Self { - let inner = BaseSchedulerUnit::new( - id, // mem_out, core, - cluster_id, core_id, warps, scoreboard, stats, config, - ); - Self { inner } - } -} - -impl GTOScheduler { - fn debug_warp_ids(&self) -> Vec { - self.inner - .next_cycle_prioritized_warps - .iter() - .map(|w| w.borrow().warp_id) - .collect() - } - - fn debug_dynamic_warp_ids(&self) -> Vec { - self.inner - .next_cycle_prioritized_warps - .iter() - .map(|w| w.borrow().dynamic_warp_id()) - .collect() - } -} - -impl SchedulerUnit for GTOScheduler { - fn order_warps(&mut self) { - // order_by_priority( - // m_next_cycle_prioritized_warps, - // m_supervised_warps, - // m_last_supervised_issued, - // m_supervised_warps.size(), - // ORDERING_GREEDY_THEN_PRIORITY_FUNC, - // scheduler_unit::sort_warps_by_oldest_dynamic_id, - // ); - //x - - // let before = self.inner.next_cycle_prioritized_warps.len(); - self.inner.order_by_priority( - Ordering::GREEDY_THEN_PRIORITY_FUNC, - sort_warps_by_oldest_dynamic_id, - ); - // let after = self.inner.next_cycle_prioritized_warps.len(); - // assert_eq!(before, after); - } - - fn add_supervised_warp(&mut self, warp: CoreWarp) { - self.inner.supervised_warps.push_back(warp); - } - - fn prioritized_warps(&self) -> &VecDeque { - self.inner.prioritized_warps() - } - - // fn done_adding_supervised_warps(&mut self) { - // // self.inner.last_supervised_issued_idx = self.inner.supervised_warps.len(); - // self.inner.last_supervised_issued_idx = 0; - // } - - // fn cycle(&mut self, core: ()) { - fn cycle(&mut self, issuer: &mut dyn super::core::WarpIssuer) { - log::debug!( - "gto scheduler[{}]: BEFORE: prioritized warp ids: {:?}", - self.inner.id, - self.debug_warp_ids() - ); - log::debug!( - "gto scheduler[{}]: BEFORE: prioritized dynamic warp ids: {:?}", - self.inner.id, - self.debug_dynamic_warp_ids() - ); - - self.order_warps(); - - log::debug!( - "gto scheduler[{}]: AFTER: prioritized warp ids: {:?}", - self.inner.id, - self.debug_warp_ids() - ); - log::debug!( - "gto scheduler[{}]: AFTER: prioritized dynamic warp ids: {:?}", - self.inner.id, - self.debug_dynamic_warp_ids() - ); - - self.inner.cycle(issuer); - } -} - -impl GTOScheduler { - pub fn order_warps( - &self, - out: 
&mut VecDeque, - warps: &mut Vec, - _last_issued_warps: &Vec, - num_warps_to_add: usize, - ) { - // let mut next_cycle_prioritized_warps = Vec::new(); - // - // let mut supervised_warps = Vec::new(); // input - // let mut last_issued_from_input = Vec::new(); // last issued - // let num_warps_to_add = supervised_warps.len(); - debug_assert!(num_warps_to_add <= warps.len()); - - // scheduler_unit::sort_warps_by_oldest_dynamic_id - - // ORDERING_GREEDY_THEN_PRIORITY_FUNC - out.clear(); - // let greedy_value = last_issued_warps.first(); - // if let Some(greedy_value) = greedy_value { - // out.push_back(greedy_value.clone()); - // } - // - // warps.sort_by(sort_warps_by_oldest_dynamic_id); - // out.extend( - // warps - // .iter() - // .take_while(|w| match greedy_value { - // None => true, - // Some(val) => *w != val, - // }) - // .take(num_warps_to_add) - // .cloned(), - // ); - - // typename std::vector::iterator iter = temp.begin(); - // for (unsigned count = 0; count < num_warps_to_add; ++count, ++iter) { - // if (*iter != greedy_value) { - // result_list.push_back(*iter); - // } - // } - - // result_list.clear(); - // typename std::vector temp = input_list; - // - // if (ORDERING_GREEDY_THEN_PRIORITY_FUNC == ordering) { - // T greedy_value = *last_issued_from_input; - // result_list.push_back(greedy_value); - // - // std::sort(temp.begin(), temp.end(), priority_func); - // typename std::vector::iterator iter = temp.begin(); - // for (unsigned count = 0; count < num_warps_to_add; ++count, ++iter) { - // if (*iter != greedy_value) { - // result_list.push_back(*iter); - // } - // } - // } else if (ORDERED_PRIORITY_FUNC_ONLY == ordering) { - // std::sort(temp.begin(), temp.end(), priority_func); - // typename std::vector::iterator iter = temp.begin(); - // for (unsigned count = 0; count < num_warps_to_add; ++count, ++iter) { - // result_list.push_back(*iter); - // } - // } else { - // fprintf(stderr, "Unknown ordering - %d\n", ordering); - // abort(); - // } - - // order by priority - // (m_next_cycle_prioritized_warps, m_supervised_warps, - // m_last_supervised_issued, m_supervised_warps.size(), - // ORDERING_GREEDY_THEN_PRIORITY_FUNC, - // scheduler_unit::sort_warps_by_oldest_dynamic_id); - } -} - -#[cfg(test)] -mod tests { - use crate::ported::testing; - - use std::ptr; - - #[ignore = "todo"] - #[test] - fn test_shd_warp() { - use playground::types::trace_shd_warp::new_trace_shd_warp; - let core = ptr::null_mut(); - let warp_size = 32; - let mut warp = unsafe { new_trace_shd_warp(core, warp_size) }; - warp.pin_mut().reset(); - dbg!(&warp.get_n_completed()); - dbg!(&warp.hardware_done()); - dbg!(&warp.functional_done()); - assert!(false); - } - - #[test] - fn test_skip_iterator_indexing() { - let issued_warp_id = 3; - let supervised_warp_ids = vec![1, 2, 3, 4, 5]; - let mut last_supervised_idx = 0; - - for (idx, id) in supervised_warp_ids.iter().enumerate() { - if *id == issued_warp_id { - last_supervised_idx = idx; - } - } - assert_eq!( - supervised_warp_ids.iter().skip(last_supervised_idx).next(), - Some(&issued_warp_id) - ); - } - - impl From<&Box> for testing::state::Scheduler { - fn from(scheduler: &Box) -> Self { - // let prioritized_warps = ; - let prioritized_warp_ids: Vec<_> = scheduler - .prioritized_warps() - .iter() - .map(|warp| (warp.borrow().warp_id, warp.borrow().dynamic_warp_id())) - .collect(); - // let prioritized_warp_ids: Vec<_> = prioritized_warps - // .clone() - // .map(|warp| warp.borrow().warp_id) - // .collect(); - // let prioritized_dynamic_warp_ids: Vec<_> = 
prioritized_warps - // .clone() - // .map(|warp| warp.borrow().dynamic_warp_id()) - // .collect(); - // - // assert_eq!( - // prioritized_warp_ids.len(), - // prioritized_dynamic_warp_ids.len() - // ); - - Self { - prioritized_warp_ids, - // prioritized_warp_ids - // prioritized_dynamic_warp_ids, - } - } - } -} diff --git a/src/ported/scheduler/gto.rs b/src/ported/scheduler/gto.rs new file mode 100644 index 00000000..b3623d90 --- /dev/null +++ b/src/ported/scheduler/gto.rs @@ -0,0 +1,89 @@ +use super::{BaseSchedulerUnit, SchedulerUnit, WarpRef}; +use crate::config::GPUConfig; +use crate::ported::scoreboard::Scoreboard; +use std::collections::VecDeque; +use std::sync::{Arc, Mutex, RwLock}; + +#[derive(Debug)] +pub struct Scheduler { + inner: BaseSchedulerUnit, +} + +impl Scheduler { + pub fn new( + id: usize, + cluster_id: usize, + core_id: usize, + warps: Vec, + scoreboard: Arc>, + stats: Arc>, + config: Arc, + ) -> Self { + let inner = + BaseSchedulerUnit::new(id, cluster_id, core_id, warps, scoreboard, stats, config); + Self { inner } + } +} + +impl Scheduler { + fn debug_warp_ids(&self) -> Vec { + self.inner + .next_cycle_prioritized_warps + .iter() + .map(|w| w.borrow().warp_id) + .collect() + } + + fn debug_dynamic_warp_ids(&self) -> Vec { + self.inner + .next_cycle_prioritized_warps + .iter() + .map(|w| w.borrow().dynamic_warp_id()) + .collect() + } +} + +impl SchedulerUnit for Scheduler { + fn order_warps(&mut self) { + self.inner.order_by_priority( + super::ordering::Ordering::GREEDY_THEN_PRIORITY_FUNC, + super::ordering::sort_warps_by_oldest_dynamic_id, + ); + } + + fn add_supervised_warp(&mut self, warp: WarpRef) { + self.inner.supervised_warps.push_back(warp); + } + + fn prioritized_warps(&self) -> &VecDeque { + self.inner.prioritized_warps() + } + + fn cycle(&mut self, issuer: &mut dyn crate::ported::core::WarpIssuer) { + log::debug!( + "gto scheduler[{}]: BEFORE: prioritized warp ids: {:?}", + self.inner.id, + self.debug_warp_ids() + ); + log::debug!( + "gto scheduler[{}]: BEFORE: prioritized dynamic warp ids: {:?}", + self.inner.id, + self.debug_dynamic_warp_ids() + ); + + self.order_warps(); + + log::debug!( + "gto scheduler[{}]: AFTER: prioritized warp ids: {:?}", + self.inner.id, + self.debug_warp_ids() + ); + log::debug!( + "gto scheduler[{}]: AFTER: prioritized dynamic warp ids: {:?}", + self.inner.id, + self.debug_dynamic_warp_ids() + ); + + self.inner.cycle(issuer); + } +} diff --git a/src/ported/scheduler/mod.rs b/src/ported/scheduler/mod.rs new file mode 100644 index 00000000..faec15ea --- /dev/null +++ b/src/ported/scheduler/mod.rs @@ -0,0 +1,396 @@ +pub mod gto; +pub mod ordering; +pub mod warp; + +use super::core::PipelineStage; +use super::{opcodes, scoreboard}; +use crate::config::GPUConfig; +use console::style; +use std::cell::RefCell; +use std::collections::VecDeque; +use std::rc::Rc; +use std::sync::{Arc, Mutex, RwLock}; + +pub use warp::{SchedulerWarp, ThreadActiveMask}; + +pub type WarpRef = Rc>; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +enum ExecUnitKind { + NONE = 0, + SP = 1, + #[allow(dead_code)] + SFU = 2, + MEM = 3, + #[allow(dead_code)] + DP = 4, + INT = 5, + #[allow(dead_code)] + TENSOR = 6, + #[allow(dead_code)] + SPECIALIZED = 7, +} + +pub trait SchedulerUnit { + fn cycle(&mut self, _core: &mut dyn super::core::WarpIssuer); + + fn add_supervised_warp(&mut self, warp: WarpRef); + + fn prioritized_warps(&self) -> &VecDeque; + + /// Order warps based on scheduling policy. 
+ fn order_warps(&mut self); +} + +#[derive(Debug)] +pub struct BaseSchedulerUnit { + id: usize, + cluster_id: usize, + core_id: usize, + + /// This is the prioritized warp list that is looped over each cycle to + /// determine which warp gets to issue. + next_cycle_prioritized_warps: VecDeque, + + // The supervised warps list keeps all warps this scheduler can arbitrate between. + // + // This is useful in systems where there is more than one warp scheduler. + // In a single scheduler system, this is simply all the warps + // assigned to this core. + supervised_warps: VecDeque, + warps: Vec, + + /// This is the iterator pointer to the last supervised warp issued + last_supervised_issued_idx: usize, + num_issued_last_cycle: usize, + + scoreboard: Arc>, + + config: Arc, + stats: Arc>, +} + +impl BaseSchedulerUnit { + pub fn new( + id: usize, + cluster_id: usize, + core_id: usize, + warps: Vec, + scoreboard: Arc>, + stats: Arc>, + config: Arc, + ) -> Self { + let supervised_warps = VecDeque::new(); + Self { + id, + cluster_id, + core_id, + next_cycle_prioritized_warps: VecDeque::new(), + supervised_warps, + last_supervised_issued_idx: 0, + warps, + num_issued_last_cycle: 0, + stats, + scoreboard, + config, + } + } + + fn prioritized_warps(&self) -> &VecDeque { + &self.next_cycle_prioritized_warps + } + + fn cycle(&mut self, issuer: &mut dyn super::core::WarpIssuer) { + log::debug!("{}: cycle", style("base scheduler").yellow()); + + let mut valid_inst = false; + let mut ready_inst = false; + let mut issued_inst = false; + + for next_warp_rc in &self.next_cycle_prioritized_warps { + // don't consider warps that are not yet valid + let next_warp = next_warp_rc.try_borrow().unwrap(); + let (warp_id, dyn_warp_id) = (next_warp.warp_id, next_warp.dynamic_warp_id); + + if next_warp.done_exit() { + continue; + } + let inst_count = next_warp.instruction_count(); + if inst_count == 0 { + log::debug!("next warp: {:#?}", &next_warp); + } + assert!(inst_count > 0); + if inst_count > 1 { + log::debug!( + "core[{}][{}] scheduler[{}]: \n\t => testing (warp_id={}, dynamic_warp_id={}, trace_pc={}, pc={:?}, ibuffer={:?}, {} instructions)", + self.cluster_id, + self.core_id, + self.id, + warp_id, dyn_warp_id, + next_warp.trace_pc, + next_warp.pc(), + next_warp.instr_buffer.iter().filter_map(Option::as_ref).map(|i| i.pc).collect::>(), inst_count, + ); + } + let mut checked = 0; + let mut issued = 0; + + let mut prev_issued_exec_unit = ExecUnitKind::NONE; + let max_issue = self.config.max_instruction_issue_per_warp; + // In this mode, we only allow dual issue to different execution + // units (as in Maxwell and Pascal) + let diff_exec_units = self.config.dual_issue_diff_exec_units; + + if inst_count > 1 { + if next_warp.ibuffer_empty() { + log::debug!( + "warp (warp_id={}, dynamic_warp_id={}) fails as ibuffer_empty", + warp_id, + dyn_warp_id + ); + } + + if next_warp.waiting() { + log::debug!( + "warp (warp_id={}, dynamic_warp_id={}) is waiting for completion", + warp_id, + dyn_warp_id + ); + } + } + + let warp = self.warps.get(warp_id).unwrap(); + + // todo: what is the difference? why don't we just use next_warp? 
+ debug_assert!(Rc::ptr_eq(warp, next_warp_rc)); + drop(next_warp); + + let mut warp = warp.try_borrow_mut().unwrap(); + while !warp.waiting() + && !warp.ibuffer_empty() + && checked < max_issue + && checked <= issued + && issued < max_issue + { + let mut warp_inst_issued = false; + + if let Some(instr) = warp.ibuffer_peek() { + log::debug!( + "Warp (warp_id={}, dynamic_warp_id={}) instruction buffer[{}] has valid instruction {}", + warp_id, dyn_warp_id, warp.next, instr, + ); + + valid_inst = true; + if !self + .scoreboard + .read() + .unwrap() + .has_collision(warp_id, instr) + { + log::debug!( + "Warp (warp_id={}, dynamic_warp_id={}) {}", + warp_id, + dyn_warp_id, + style("passes scoreboard").yellow(), + ); + ready_inst = true; + + debug_assert!(warp.has_instr_in_pipeline()); + + use opcodes::ArchOp; + match instr.opcode.category { + ArchOp::LOAD_OP + | ArchOp::STORE_OP + | ArchOp::MEMORY_BARRIER_OP + | ArchOp::TENSOR_CORE_LOAD_OP + | ArchOp::TENSOR_CORE_STORE_OP => { + let mem_stage = PipelineStage::ID_OC_MEM; + + let free_register = issuer.has_free_register(mem_stage, self.id); + + if free_register + && (!diff_exec_units + || prev_issued_exec_unit != ExecUnitKind::MEM) + { + let instr = warp.ibuffer_take().unwrap(); + debug_assert_eq!(warp_id, warp.warp_id); + issuer.issue_warp(mem_stage, &mut warp, instr, self.id); + issued += 1; + issued_inst = true; + warp_inst_issued = true; + prev_issued_exec_unit = ExecUnitKind::MEM; + } else { + log::debug!("issue failed: no free mem port register"); + } + } + op => { + if op != ArchOp::TENSOR_CORE_OP + && op != ArchOp::SFU_OP + && op != ArchOp::DP_OP + && (op as usize) < opcodes::SPEC_UNIT_START_ID + { + let mut execute_on_sp = false; + let mut execute_on_int = false; + + let sp_pipe_avail = self.config.num_sp_units > 0 + && issuer + .has_free_register(PipelineStage::ID_OC_SP, self.id); + let int_pipe_avail = self.config.num_int_units > 0 + && issuer + .has_free_register(PipelineStage::ID_OC_INT, self.id); + + // if INT unit pipline exist, then execute ALU and INT + // operations on INT unit and SP-FPU on SP unit (like in Volta) + // if INT unit pipline does not exist, then execute all ALU, INT + // and SP operations on SP unit (as in Fermi, Pascal GPUs) + if int_pipe_avail + && op != ArchOp::SP_OP + && !(diff_exec_units + && prev_issued_exec_unit == ExecUnitKind::INT) + { + execute_on_int = true; + } else if sp_pipe_avail + && (self.config.num_int_units == 0 + || (self.config.num_int_units > 0 + && op == ArchOp::SP_OP)) + && !(diff_exec_units + && prev_issued_exec_unit == ExecUnitKind::SP) + { + execute_on_sp = true; + } + + log::debug!( + "execute on INT={} execute on SP={}", + execute_on_int, + execute_on_sp + ); + + let issue_target = if execute_on_sp { + Some((PipelineStage::ID_OC_SP, ExecUnitKind::SP)) + } else if execute_on_int { + Some((PipelineStage::ID_OC_INT, ExecUnitKind::INT)) + } else { + None + }; + + if let Some((stage, unit)) = issue_target { + let instr = warp.ibuffer_take().unwrap(); + debug_assert_eq!(warp.warp_id, warp_id); + issuer.issue_warp(stage, &mut warp, instr, self.id); + // .issue_warp(stage, &mut warp, instr, warp_id, self.id); + issued += 1; + issued_inst = true; + warp_inst_issued = true; + prev_issued_exec_unit = unit; + } + } + } // op => unimplemented!("op {:?} not implemented", op), + } + } else { + log::debug!( + "Warp (warp_id={}, dynamic_warp_id={}) {}", + warp_id, + dyn_warp_id, + style("fails scoreboard").yellow(), + ); + } + } + if warp_inst_issued { + log::debug!( + "Warp (warp_id={}, 
dynamic_warp_id={}) issued {} instructions", + warp_id, + dyn_warp_id, + issued + ); + warp.ibuffer_step(); + } + checked += 1; + } + drop(warp); + if issued > 0 { + // This might be a bit inefficient, but we need to maintain + // two ordered list for proper scheduler execution. + // We could remove the need for this loop by associating a + // supervised_is index with each entry in the + // m_next_cycle_prioritized_warps vector. + // For now, just run through until you find the right warp_id + for (sup_idx, supervised) in self.supervised_warps.iter().enumerate() { + if *next_warp_rc.try_borrow().unwrap() == *supervised.try_borrow().unwrap() { + self.last_supervised_issued_idx = sup_idx; + } + } + self.num_issued_last_cycle = issued; + let mut stats = self.stats.lock().unwrap(); + if issued == 1 { + stats.num_single_issue += 1; + } else { + stats.num_dual_issue += 1; + } + break; + } + } + + // issue stall statistics + let mut stats = self.stats.lock().unwrap(); + if !valid_inst { + // idle or control hazard + stats.issue_raw_hazard_stall += 1; + } else if !ready_inst { + // waiting for RAW hazards (possibly due to memory) + stats.issue_control_hazard_stall += 1; + } else if !issued_inst { + // pipeline stalled + stats.issue_pipeline_stall += 1; + } + } +} + +#[cfg(test)] +mod tests { + use crate::ported::testing; + use std::ptr; + + #[ignore = "todo"] + #[test] + fn test_shd_warp() { + use playground::types::trace_shd_warp::new_trace_shd_warp; + let core = ptr::null_mut(); + let warp_size = 32; + let mut warp = unsafe { new_trace_shd_warp(core, warp_size) }; + warp.pin_mut().reset(); + dbg!(&warp.get_n_completed()); + dbg!(&warp.hardware_done()); + dbg!(&warp.functional_done()); + assert!(false); + } + + #[test] + fn test_skip_iterator_indexing() { + let issued_warp_id = 3; + let supervised_warp_ids = vec![1, 2, 3, 4, 5]; + let mut last_supervised_idx = 0; + + for (idx, id) in supervised_warp_ids.iter().enumerate() { + if *id == issued_warp_id { + last_supervised_idx = idx; + } + } + assert_eq!( + supervised_warp_ids.iter().nth(last_supervised_idx), + Some(&issued_warp_id) + ); + } + + impl From<&Box> for testing::state::Scheduler { + fn from(scheduler: &Box) -> Self { + let prioritized_warp_ids: Vec<_> = scheduler + .prioritized_warps() + .iter() + .map(|warp| (warp.borrow().warp_id, warp.borrow().dynamic_warp_id())) + .collect(); + Self { + prioritized_warp_ids, + } + } + } +} diff --git a/src/ported/scheduler/ordering.rs b/src/ported/scheduler/ordering.rs new file mode 100644 index 00000000..e7b28785 --- /dev/null +++ b/src/ported/scheduler/ordering.rs @@ -0,0 +1,109 @@ +use super::{BaseSchedulerUnit, WarpRef}; + +use std::cell::RefCell; +use std::rc::Rc; + +pub fn all_different(values: &[Rc>]) -> bool { + for (vi, v) in values.iter().enumerate() { + for (vii, vv) in values.iter().enumerate() { + let should_be_equal = vi == vii; + let are_equal = Rc::ptr_eq(v, vv); + if should_be_equal && !are_equal { + return false; + } + if !should_be_equal && are_equal { + return false; + } + } + } + true +} + +pub fn sort_warps_by_oldest_dynamic_id(lhs: &WarpRef, rhs: &WarpRef) -> std::cmp::Ordering { + let lhs = lhs.try_borrow().unwrap(); + let rhs = rhs.try_borrow().unwrap(); + if lhs.done_exit() || lhs.waiting() { + std::cmp::Ordering::Greater + } else if rhs.done_exit() || rhs.waiting() { + std::cmp::Ordering::Less + } else { + lhs.dynamic_warp_id().cmp(&rhs.dynamic_warp_id()) + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum Ordering { + // The item that issued last is 
prioritized first then the + // sorted result of the priority_function + GREEDY_THEN_PRIORITY_FUNC = 0, + // No greedy scheduling based on last to issue. + // + // Only the priority function determines priority + PRIORITY_FUNC_ONLY, + // NUM_ORDERING, +} + +impl BaseSchedulerUnit { + pub fn order_by_priority(&mut self, ordering: Ordering, priority_func: F) + where + F: FnMut(&WarpRef, &WarpRef) -> std::cmp::Ordering, + { + let num_warps_to_add = self.supervised_warps.len(); + let out = &mut self.next_cycle_prioritized_warps; + + debug_assert!(num_warps_to_add <= self.warps.len()); + out.clear(); + + debug_assert!(all_different(self.supervised_warps.make_contiguous())); + + let mut last_issued_iter = self + .supervised_warps + .iter() + .skip(self.last_supervised_issued_idx); + debug_assert!(all_different(&self.warps)); + + // sort a copy of the supervised warps reorder those for stability + let mut supervised_warps_sorted: Vec<_> = + self.supervised_warps.clone().into_iter().collect(); + supervised_warps_sorted.sort_by(priority_func); + + debug_assert!(all_different(&supervised_warps_sorted)); + + match ordering { + Ordering::GREEDY_THEN_PRIORITY_FUNC => { + let greedy_value = last_issued_iter.next(); + if let Some(greedy) = greedy_value { + out.push_back(Rc::clone(greedy)); + } + + log::debug!( + "added greedy warp (last supervised issued idx={}): {:?}", + self.last_supervised_issued_idx, + &greedy_value.map(|w| w.borrow().dynamic_warp_id) + ); + + out.extend( + supervised_warps_sorted + .into_iter() + .take(num_warps_to_add) + .filter(|warp| { + if let Some(greedy) = greedy_value { + let already_added = Rc::ptr_eq(greedy, warp); + !already_added + } else { + true + } + }), + ); + } + Ordering::PRIORITY_FUNC_ONLY => { + out.extend(supervised_warps_sorted.into_iter().take(num_warps_to_add)); + } + } + assert_eq!( + num_warps_to_add, + out.len(), + "either too few supervised warps or greedy warp not in supervised warps" + ); + } +} diff --git a/src/ported/scheduler/warp.rs b/src/ported/scheduler/warp.rs new file mode 100644 index 00000000..a9fd2bea --- /dev/null +++ b/src/ported/scheduler/warp.rs @@ -0,0 +1,207 @@ +use crate::ported::{instruction::WarpInstruction, kernel::Kernel}; +use bitvec::{array::BitArray, BitArr}; +use std::collections::VecDeque; +use std::sync::Arc; + +pub type ThreadActiveMask = BitArr!(for 32, in u32); + +#[derive(Debug)] +pub struct SchedulerWarp { + pub block_id: u64, + pub dynamic_warp_id: usize, + pub warp_id: usize, + pub kernel: Option>, + + pub trace_pc: usize, + pub active_mask: ThreadActiveMask, + pub trace_instructions: VecDeque, + + // state + pub done_exit: bool, + pub num_instr_in_pipeline: usize, + pub num_outstanding_stores: usize, + pub num_outstanding_atomics: usize, + pub has_imiss_pending: bool, + pub instr_buffer: Vec>, + pub next: usize, +} + +impl PartialEq for SchedulerWarp { + fn eq(&self, other: &Self) -> bool { + self.kernel == other.kernel + && self.block_id == other.block_id + && self.warp_id == other.warp_id + && self.dynamic_warp_id == other.dynamic_warp_id + } +} + +const IBUFFER_SIZE: usize = 2; + +impl Default for SchedulerWarp { + fn default() -> Self { + let instr_buffer = vec![None; IBUFFER_SIZE]; + Self { + block_id: 0, + dynamic_warp_id: u32::MAX as usize, + warp_id: u32::MAX as usize, + kernel: None, + trace_pc: 0, + trace_instructions: VecDeque::new(), + active_mask: BitArray::ZERO, + done_exit: false, + num_instr_in_pipeline: 0, + num_outstanding_stores: 0, + num_outstanding_atomics: 0, + has_imiss_pending: false, + 
instr_buffer, + next: 0, + } + } +} + +impl SchedulerWarp { + pub fn init( + &mut self, + _start_pc: Option, + block_id: u64, + warp_id: usize, + dynamic_warp_id: usize, + active_mask: ThreadActiveMask, + kernel: Arc, + ) { + self.block_id = block_id; + self.warp_id = warp_id; + self.dynamic_warp_id = dynamic_warp_id; + self.done_exit = false; + self.kernel = Some(kernel); + self.active_mask = active_mask; + } + + pub fn reset(&mut self) { + debug_assert_eq!(self.num_outstanding_stores, 0); + debug_assert_eq!(self.num_instr_in_pipeline, 0); + self.has_imiss_pending = false; + self.warp_id = u32::MAX as usize; + self.dynamic_warp_id = u32::MAX as usize; + + self.active_mask.fill(false); + self.done_exit = true; + self.next = 0; + } + + #[must_use] pub fn current_instr(&self) -> Option<&WarpInstruction> { + self.trace_instructions.get(self.trace_pc) + } + + pub fn push_trace_instruction(&mut self, instr: WarpInstruction) { + self.trace_instructions.push_back(instr); + } + + pub fn next_trace_inst(&mut self) -> Option<&WarpInstruction> { + let trace_instr = self.trace_instructions.get(self.trace_pc)?; + self.trace_pc += 1; + Some(trace_instr) + } + + #[must_use] pub fn instruction_count(&self) -> usize { + self.trace_instructions.len() + } + + #[must_use] pub fn pc(&self) -> Option { + debug_assert!(self.trace_pc <= self.instruction_count()); + self.trace_instructions + .get(self.trace_pc) + .map(|instr| instr.pc) + } + + #[must_use] pub fn done(&self) -> bool { + self.trace_pc == self.instruction_count() + } + + pub fn clear(&mut self) { + self.trace_pc = 0; + self.trace_instructions.clear(); + } + + pub fn ibuffer_fill(&mut self, slot: usize, instr: WarpInstruction) { + debug_assert!(slot < self.instr_buffer.len()); + self.instr_buffer[slot] = Some(instr); + self.next = 0; + } + + #[must_use] pub fn ibuffer_size(&self) -> usize { + self.instr_buffer.iter().filter(|x| x.is_some()).count() + } + + pub fn ibuffer_empty(&self) -> bool { + self.instr_buffer.iter().all(Option::is_none) + } + + pub fn ibuffer_flush(&mut self) { + for i in &mut self.instr_buffer { + if i.is_some() { + self.num_instr_in_pipeline -= 1; + } + *i = None; + } + } + + #[must_use] pub fn ibuffer_peek(&self) -> Option<&WarpInstruction> { + self.instr_buffer[self.next].as_ref() + } + + pub fn ibuffer_take(&mut self) -> Option { + self.instr_buffer[self.next].take() + } + + pub fn ibuffer_step(&mut self) { + self.next = (self.next + 1) % IBUFFER_SIZE; + } + + #[must_use] pub fn done_exit(&self) -> bool { + self.done_exit + } + + #[must_use] pub fn hardware_done(&self) -> bool { + self.functional_done() && self.stores_done() && self.num_instr_in_pipeline == 0 + } + + #[must_use] pub fn has_instr_in_pipeline(&self) -> bool { + self.num_instr_in_pipeline > 0 + } + + #[must_use] pub fn stores_done(&self) -> bool { + self.num_outstanding_stores == 0 + } + + #[must_use] pub fn num_completed(&self) -> usize { + self.active_mask.count_zeros() + } + + pub fn set_thread_completed(&mut self, thread_id: usize) { + self.active_mask.set(thread_id, false); + } + + #[must_use] pub fn functional_done(&self) -> bool { + self.active_mask.not_any() + } + + #[must_use] pub fn waiting(&self) -> bool { + if self.functional_done() { + // waiting to be initialized with a kernel + true + // } else if core.warp_waiting_at_barrier(self.warp_id) { + // // waiting for other warps in block to reach barrier + // true + // } else if core.warp_waiting_at_mem_barrier(self.warp_id) { + // // waiting for memory barrier + // true + } else { + 
self.num_outstanding_atomics > 0 + } + } + + #[must_use] pub fn dynamic_warp_id(&self) -> usize { + self.dynamic_warp_id + } +} diff --git a/src/ported/scoreboard.rs b/src/ported/scoreboard.rs index 467138fc..232ed6a5 100644 --- a/src/ported/scoreboard.rs +++ b/src/ported/scoreboard.rs @@ -1,4 +1,4 @@ -use super::instruction::{MemorySpace, WarpInstruction}; +use super::instruction::WarpInstruction; use std::collections::HashSet; /// Scoreboard implementation @@ -6,25 +6,19 @@ use std::collections::HashSet; /// This should however not be needed in trace driven mode.. #[derive(Debug, Default)] pub struct Scoreboard { - core_id: usize, - cluster_id: usize, - max_warps: usize, + pub core_id: usize, + pub cluster_id: usize, pub register_table: Vec>, - /// Register that depend on a long operation (global, local or tex memory) - long_op_registers: Vec>, } impl Scoreboard { - pub fn new(core_id: usize, cluster_id: usize, max_warps: usize) -> Self { + #[must_use] pub fn new(core_id: usize, cluster_id: usize, max_warps: usize) -> Self { let register_table: Vec<_> = (0..max_warps).map(|_| HashSet::new()).collect(); - let long_op_registers = register_table.clone(); Self { core_id, cluster_id, - max_warps, register_table, - long_op_registers, } } @@ -33,7 +27,7 @@ impl Scoreboard { /// # Returns /// true if WAW or RAW hazard (no WAR since in-order issue) /// - pub fn has_collision(&self, warp_id: usize, instr: &WarpInstruction) -> bool { + #[must_use] pub fn has_collision(&self, warp_id: usize, instr: &WarpInstruction) -> bool { use itertools::Itertools; // Get list of all input and output registers @@ -49,17 +43,6 @@ impl Scoreboard { instr_registers.iter().sorted().collect::>(), ); - // ar1 = 0; - // ar2 = 0; - - // predicate register number - // if instr.pred > 0 - // inst_regs.insert(inst->pred); - // if (inst->ar1 > 0) - // inst_regs.insert(inst->ar1); - // if (inst->ar2 > 0) - // inst_regs.insert(inst->ar2); - // get the intersection of reserved registers and instruction registers let Some(reserved) = self.register_table.get(warp_id) else { return false; @@ -69,12 +52,11 @@ impl Scoreboard { warp_id, reserved.iter().sorted().collect::>(), ); - let mut intersection = instr_registers.intersection(&reserved); + let mut intersection = instr_registers.intersection(reserved); intersection.next().is_some() - // todo!("scoreboard: check collision"); } - pub fn pending_writes(&self, warp_id: usize) -> &HashSet { + #[must_use] pub fn pending_writes(&self, warp_id: usize) -> &HashSet { &self.register_table[warp_id] } @@ -92,16 +74,13 @@ impl Scoreboard { pub fn release_registers(&mut self, instr: &WarpInstruction) { for &out_reg in instr.outputs() { self.release_register(instr.warp_id, out_reg); - self.long_op_registers[instr.warp_id].remove(&out_reg); } } pub fn reserve_register(&mut self, warp_id: usize, reg_num: u32) { let warp_registers = &mut self.register_table[warp_id]; - if warp_registers.contains(®_num) { - panic!("trying to reserve an already reserved register (core_id={}, warp_id={}, reg_num={})", + assert!(!warp_registers.contains(®_num), "trying to reserve an already reserved register (core_id={}, warp_id={}, reg_num={})", self.core_id, warp_id, reg_num); - } log::trace!( "scoreboard: warp {} reserves register: {}", warp_id, @@ -114,21 +93,5 @@ impl Scoreboard { for &out_reg in instr.outputs() { self.reserve_register(instr.warp_id, out_reg); } - - // Keep track of long operations - if instr.is_load() - && matches!( - instr.memory_space, - Some(MemorySpace::Global | MemorySpace::Local | 
MemorySpace::Texture) - ) - { - // inst->space.get_type() == local_space || - // inst->space.get_type() == param_space_kernel || - // inst->space.get_type() == param_space_local || - // inst->space.get_type() == param_space_unclassified || - for &out_reg in instr.outputs() { - self.long_op_registers[instr.warp_id].insert(out_reg); - } - } } } diff --git a/src/ported/set_index/mod.rs b/src/ported/set_index/mod.rs new file mode 100644 index 00000000..e38d4b91 --- /dev/null +++ b/src/ported/set_index/mod.rs @@ -0,0 +1,157 @@ +use super::address; + +pub trait SetIndexFunction: std::fmt::Debug { + /// Compute set index using + fn compute_set_index( + &self, + addr: address, + num_sets: usize, + line_size_log2: u32, + num_sets_log2: u32, + ) -> u64; +} + +pub mod fermi { + // Set Indexing function from + // "A Detailed GPU Cache Model Based on Reuse + // Distance Theory" Cedric Nugteren et al. HPCA 2014 + #[derive(Default, Debug, PartialEq, Eq, Hash)] + pub struct SetIndex {} + impl super::SetIndexFunction for SetIndex { + fn compute_set_index( + &self, + addr: super::address, + num_sets: usize, + line_size_log2: u32, + num_sets_log2: u32, + ) -> u64 { + // check for incorrect number of sets + assert!( + matches!(num_sets, 32 | 64), + "bad cache config: num sets should be 32 or 64 for fermi set index function (got {num_sets})", + ); + + // lower xor value is bits 7-11 + let lower_xor = (addr >> line_size_log2) & 0x1F; + + // upper xor value is bits 13, 14, 15, 17, and 19 + let mut upper_xor = (addr & 0xE000) >> 13; // Bits 13, 14, 15 + upper_xor |= (addr & 0x20000) >> 14; // Bit 17 + upper_xor |= (addr & 0x80000) >> 15; // Bit 19 + + let mut set_idx = lower_xor ^ upper_xor; + + // 48KB cache prepends the set_index with bit 12 + if num_sets == 64 { + set_idx |= (addr & 0x1000) >> 7; + } + assert!(set_idx < num_sets as u64, "set index out of bounds"); + set_idx + } + } +} + +pub mod bitwise_xor { + #[must_use] + pub fn bitwise_hash_function( + higher_bits: super::address, + index: usize, + bank_set_num: usize, + ) -> u64 { + index as u64 ^ (higher_bits & (bank_set_num as u64 - 1)) + } + + #[derive(Default, Debug, PartialEq, Eq, Hash)] + pub struct SetIndex {} + impl super::SetIndexFunction for SetIndex { + fn compute_set_index( + &self, + addr: super::address, + num_sets: usize, + line_size_log2: u32, + num_sets_log2: u32, + ) -> u64 { + let bits = line_size_log2 + num_sets_log2; + let higher_bits = addr >> bits; + let mut index = (addr >> line_size_log2) as usize; + index &= num_sets - 1; + let set_idx = bitwise_hash_function(higher_bits, index, num_sets); + assert!(set_idx < num_sets as u64, "set index out of bounds"); + set_idx + } + } +} + +pub mod ipoly { + /// Set Indexing function from "Pseudo-randomly interleaved memory." + /// Rau, B. R et al. + /// ISCA 1991 + /// http://citeseerx.ist.psu.edu/viewdoc/download;jsessionid=348DEA37A3E440473B3C075EAABC63B6?doi=10.1.1.12.7149&rep=rep1&type=pdf + /// + /// equations are corresponding to IPOLY(37) and are adopted from: + /// "Sacat: streaming-aware conflict-avoiding thrashing-resistant gpgpu + /// cache management scheme." Khairy et al. IEEE TPDS 2017. 
+ /// + /// equations for 16 banks are corresponding to IPOLY(5) + /// equations for 32 banks are corresponding to IPOLY(37) + /// equations for 64 banks are corresponding to IPOLY(67) + /// To see all the IPOLY equations for all the degrees, see + /// http://wireless-systems.ece.gatech.edu/6604/handouts/Peterson's%20Table.pdf + /// + /// We generate these equations using GF(2) arithmetic: + /// http://www.ee.unb.ca/cgi-bin/tervo/calc.pl?num=&den=&f=d&e=1&m=1 + /// + /// We go through all the strides 128 (10000000), 256 (100000000),... and + /// do modular arithmetic in GF(2) Then, we create the H-matrix and group + /// each bit together, for more info read the ISCA 1991 paper + /// + /// IPOLY hashing guarantees conflict-free for all 2^n strides which widely + /// exit in GPGPU applications and also show good performance for other + /// strides. + #[must_use] + pub fn ipoly_hash_function( + _higher_bits: super::address, + _index: usize, + _bank_set_num: usize, + ) -> u64 { + todo!("ipoly_hash_function"); + } + + #[derive(Default, Debug, PartialEq, Eq, Hash)] + pub struct SetIndex {} + impl super::SetIndexFunction for SetIndex { + fn compute_set_index( + &self, + addr: super::address, + num_sets: usize, + line_size_log2: u32, + num_sets_log2: u32, + ) -> u64 { + let bits = line_size_log2 + num_sets_log2; + let higher_bits = addr >> bits; + let mut index = (addr >> line_size_log2) as usize; + index &= num_sets - 1; + let set_idx = ipoly_hash_function(higher_bits, index, num_sets); + assert!(set_idx < num_sets as u64, "set index out of bounds"); + set_idx + } + } +} + +pub mod linear { + #[derive(Default, Debug, PartialEq, Eq, Hash)] + pub struct SetIndex {} + impl super::SetIndexFunction for SetIndex { + fn compute_set_index( + &self, + addr: super::address, + num_sets: usize, + line_size_log2: u32, + num_sets_log2: u32, + ) -> u64 { + let set_idx = (addr >> line_size_log2) & (num_sets as u64 - 1); + assert!(set_idx < num_sets as u64, "set index out of bounds"); + set_idx + } + } +} diff --git a/src/ported/simd_function_unit.rs b/src/ported/simd_function_unit.rs index b6df759b..13791bd7 100644 --- a/src/ported/simd_function_unit.rs +++ b/src/ported/simd_function_unit.rs @@ -93,16 +93,9 @@ impl PipelinedSimdUnitImpl { impl SimdFunctionUnit for PipelinedSimdUnitImpl { fn active_lanes_in_pipeline(&self) -> usize { let mut active_lanes: sched::ThreadActiveMask = BitArray::ZERO; - // if self.config. 
- for stage in &self.pipeline_reg { - if let Some(stage) = stage { - active_lanes |= stage.active_mask; - } + for stage in self.pipeline_reg.iter().flatten() { + active_lanes |= stage.active_mask; } - // for (unsigned stage = 0; (stage + 1) < m_pipeline_depth; stage++) { - // if (!m_pipeline_reg[stage]->empty()) - // active_lanes |= m_pipeline_reg[stage]->get_active_mask(); - // } active_lanes.count_ones() } @@ -122,7 +115,7 @@ impl SimdFunctionUnit for PipelinedSimdUnitImpl { self.cycle.get(), self.pipeline_reg .iter() - .map(|reg| reg.as_ref().map(|r| r.to_string())) + .map(|reg| reg.as_ref().map(std::string::ToString::to_string)) .collect::>(), self.num_active_instr_in_pipeline(), self.pipeline_reg.len(), diff --git a/src/ported/sp_unit.rs b/src/ported/sp_unit.rs index 8736103e..8a90dda0 100644 --- a/src/ported/sp_unit.rs +++ b/src/ported/sp_unit.rs @@ -1,6 +1,5 @@ use super::{ - instruction::WarpInstruction, opcodes, register_set::RegisterSet, - simd_function_unit as fu, + instruction::WarpInstruction, opcodes, register_set::RegisterSet, simd_function_unit as fu, }; use crate::config::GPUConfig; use std::cell::RefCell; @@ -8,15 +7,11 @@ use std::rc::Rc; use std::sync::{Arc, Mutex}; #[derive()] -// pub struct SPUnit { pub struct SPUnit { - // core_id: usize, - // cluster_id: usize, config: Arc, pipelined_simd_unit: fu::PipelinedSimdUnitImpl, } -// impl SPUnit { impl SPUnit { pub fn new( id: usize, @@ -56,23 +51,19 @@ impl std::fmt::Debug for SPUnit { } } -// impl fu::SimdFunctionUnit for SPUnit -impl fu::SimdFunctionUnit for SPUnit -// where -// I: ic::Interconnect, -{ +impl fu::SimdFunctionUnit for SPUnit { fn can_issue(&self, instr: &WarpInstruction) -> bool { + use opcodes::ArchOp; match instr.opcode.category { - opcodes::ArchOp::SFU_OP => false, - opcodes::ArchOp::LOAD_OP => false, - opcodes::ArchOp::TENSOR_CORE_LOAD_OP => false, - opcodes::ArchOp::STORE_OP => false, - opcodes::ArchOp::TENSOR_CORE_STORE_OP => false, - opcodes::ArchOp::MEMORY_BARRIER_OP => false, - opcodes::ArchOp::DP_OP => false, + ArchOp::SFU_OP + | ArchOp::LOAD_OP + | ArchOp::TENSOR_CORE_LOAD_OP + | ArchOp::STORE_OP + | ArchOp::TENSOR_CORE_STORE_OP + | ArchOp::MEMORY_BARRIER_OP + | ArchOp::DP_OP => false, _ => self.pipelined_simd_unit.can_issue(instr), } - // todo!("load store unit: can issue"); } fn pipeline(&self) -> &Vec> { diff --git a/src/ported/tag_array.rs b/src/ported/tag_array.rs index eb2bb26f..d6107004 100644 --- a/src/ported/tag_array.rs +++ b/src/ported/tag_array.rs @@ -28,13 +28,6 @@ pub struct TagArray { /// nbanks x nset x assoc lines in total pub lines: Vec, phantom: std::marker::PhantomData, - access: usize, - miss: usize, - pending_hit: usize, - res_fail: usize, - sector_miss: usize, - core_id: usize, - type_id: usize, is_used: bool, num_access: usize, num_miss: usize, @@ -48,32 +41,15 @@ pub struct TagArray { impl TagArray { #[must_use] - pub fn new(core_id: usize, type_id: usize, config: Arc) -> Self { + pub fn new(config: Arc) -> Self { let num_cache_lines = config.max_num_lines(); let lines = (0..num_cache_lines) .map(|_| cache_block::LineCacheBlock::new()) .collect(); - // if (config.m_cache_type == NORMAL) { - // for (unsigned i = 0; i < cache_lines_num; ++i) - // m_lines[i] = new line_cache_block(); - // } else if (config.m_cache_type == SECTOR) { - // for (unsigned i = 0; i < cache_lines_num; ++i) - // m_lines[i] = new sector_cache_block(); - // } else - // assert(0); - // - // init(core_id, type_id); Self { lines, phantom: std::marker::PhantomData, - access: 0, - miss: 0, - pending_hit: 
0, - res_fail: 0, - sector_miss: 0, - core_id, - type_id, is_used: false, num_access: 0, num_miss: 0, @@ -100,7 +76,6 @@ impl TagArray { let mut writeback = false; let mut evicted = None; - // shader_cache_access_log(m_core_id, m_type_id, 0); let (index, status) = self.probe(addr, fetch, fetch.is_write(), false); match status { cache::RequestStatus::HIT | cache::RequestStatus::HIT_RESERVED => { @@ -156,10 +131,8 @@ impl TagArray { } } cache::RequestStatus::SECTOR_MISS => { - unimplemented!("no sector miss"); debug_assert!(self.config.kind == config::CacheKind::Sector); self.num_sector_miss += 1; - // shader_cache_access_log(m_core_id, m_type_id, 1); if self.config.allocate_policy == config::CacheAllocatePolicy::ON_MISS { let index = index.expect("hit has idx"); let line = &mut self.lines[index]; @@ -169,12 +142,13 @@ impl TagArray { self.num_dirty -= 1; } } + unimplemented!("sector miss"); } cache::RequestStatus::RESERVATION_FAIL => { self.num_reservation_fail += 1; } - status => { - panic!("tag_array access: unknown cache request status {status:?}"); + status @ cache::RequestStatus::MSHR_HIT => { + panic!("tag_array access: status {status:?} should never be returned"); } } AccessStatus { @@ -189,10 +163,10 @@ impl TagArray { /// /// # Returns /// A tuple with the cache index `Option` and cache request status. + #[must_use] pub fn probe( &self, block_addr: address, - // cache_idx: Option, fetch: &mem_fetch::MemFetch, is_write: bool, is_probe: bool, @@ -202,19 +176,17 @@ impl TagArray { fetch.access_sector_mask(), is_write, is_probe, - fetch.to_string(), + Some(fetch), ) } pub fn probe_masked( &self, block_addr: address, - // cache_idx: Option, mask: &mem_fetch::MemAccessSectorMask, is_write: bool, _is_probe: bool, - fetch: String, - // fetch: &mem_fetch::MemFetch, + fetch: Option<&mem_fetch::MemFetch>, ) -> (Option, cache::RequestStatus) { let set_index = self.config.set_index(block_addr) as usize; let tag = self.config.tag(block_addr); @@ -231,8 +203,8 @@ impl TagArray { let dirty_line_percent = (dirty_line_percent * 100f64) as usize; log::trace!( - "tag_array::probe({}) set_idx = {}, tag = {}, assoc = {} dirty lines = {}%", - fetch, + "tag_array::probe({:?}) set_idx = {}, tag = {}, assoc = {} dirty lines = {}%", + fetch.map(ToString::to_string), set_index, tag, self.config.associativity, @@ -244,15 +216,15 @@ impl TagArray { let idx = set_index * self.config.associativity + way; let line = &self.lines[idx]; log::trace!( - "tag_array::probe({}) => checking cache index {} (tag={}, status={:?}, last_access={})", - fetch, + "tag_array::probe({:?}) => checking cache index {} (tag={}, status={:?}, last_access={})", + fetch.map(ToString::to_string), idx, line.tag, - line.status(&mask), + line.status(mask), line.last_access_time() ); if line.tag == tag { - match line.status(&mask) { + match line.status(mask) { cache_block::Status::RESERVED => { return (Some(idx), cache::RequestStatus::HIT_RESERVED); } @@ -260,11 +232,17 @@ impl TagArray { return (Some(idx), cache::RequestStatus::HIT); } cache_block::Status::MODIFIED => { - if (!is_write && line.is_readable(mask)) || is_write { - return (Some(idx), cache::RequestStatus::HIT); + let status = if is_write || line.is_readable(mask) { + cache::RequestStatus::HIT } else { - return (Some(idx), cache::RequestStatus::SECTOR_MISS); - } + cache::RequestStatus::SECTOR_MISS + }; + // let status = match is_write { + // true => cache::RequestStatus::HIT, + // false if line.is_readable(mask) => cache::RequestStatus::HIT, + // _ => 
cache::RequestStatus::SECTOR_MISS, + // }; + return (Some(idx), status); } cache_block::Status::INVALID if line.is_valid() => { return (Some(idx), cache::RequestStatus::SECTOR_MISS); @@ -292,18 +270,17 @@ impl TagArray { } } else if self.config.replacement_policy == config::CacheReplacementPolicy::FIFO + && line.alloc_time() < valid_time { - if line.alloc_time() < valid_time { - valid_time = line.alloc_time(); - valid_line = Some(idx); - } + valid_time = line.alloc_time(); + valid_line = Some(idx); } } } } } - log::trace!("tag_array::probe({}) => all reserved={} invalid_line={:?} valid_line={:?} ({:?} policy)", fetch, all_reserved, invalid_line, valid_line, self.config.replacement_policy); + log::trace!("tag_array::probe({:?}) => all reserved={} invalid_line={:?} valid_line={:?} ({:?} policy)", fetch.map(ToString::to_string), all_reserved, invalid_line, valid_line, self.config.replacement_policy); if all_reserved { debug_assert_eq!( @@ -323,16 +300,6 @@ impl TagArray { panic!("found neither a valid nor invalid cache line"); } }; - // let cache_idx = if invalid_line.is_some() { - // invalid_line - // } else if valid_line.is_some() { - // valid_line - // } else { - // // if an unreserved block exists, - // // it is either invalid or replaceable - // panic!("found neither a valid nor invalid cache line"); - // }; - (Some(cache_idx), cache::RequestStatus::MISS) } @@ -363,17 +330,12 @@ impl TagArray { time: u64, ) { let is_probe = false; - let (cache_index, probe_status) = self.probe_masked( - addr, - §or_mask, - is_write, - is_probe, - "".to_string(), - ); + let (cache_index, probe_status) = + self.probe_masked(addr, §or_mask, is_write, is_probe, None); log::trace!( "tag_array::fill(cache={}, tag={}, addr={}) (on fill) status={:?}", - cache_index.map(|i| i as i64).unwrap_or(-1), + cache_index.map_or(-1, |i| i as i64), self.config.tag(addr), addr, probe_status, @@ -424,7 +386,7 @@ impl TagArray { log::trace!( "tag_array::fill(cache={}, tag={}, addr={}) (on fill) status={:?}", - cache_index.map(|i| i as i64).unwrap_or(-1), + cache_index.map_or(-1, |i| i as i64), self.config.tag(fetch.addr()), fetch.addr(), probe_status, @@ -473,6 +435,7 @@ impl TagArray { todo!("invalidate tag array"); } + #[must_use] pub fn size(&self) -> usize { self.config.max_num_lines() } @@ -481,12 +444,12 @@ impl TagArray { &mut self.lines[idx] } + #[must_use] pub fn get_block(&self, idx: usize) -> &cache_block::LineCacheBlock { &self.lines[idx] } pub fn add_pending_line(&mut self, fetch: &mem_fetch::MemFetch) { - // log::debug!("tag_array::add_pending_line({})", fetch.addr()); let addr = self.config.block_addr(fetch.addr()); let instr = fetch.instr.as_ref().unwrap(); if self.pending_lines.contains_key(&addr) { @@ -495,42 +458,9 @@ impl TagArray { } pub fn remove_pending_line(&mut self, fetch: &mem_fetch::MemFetch) { - // log::debug!("tag_array::remove_pending_line({})", fetch.addr()); let addr = self.config.block_addr(fetch.addr()); self.pending_lines.remove(&addr); } - - // pub fn from_block( - // config: GenericCacheConfig, - // core_id: usize, - // type_id: usize, - // block: CacheBlock, - // ) -> Self { - // Self { - // // config, - // lines: Vec::new(), - // } - // } - - // pub fn from_config(config: GenericCacheConfig, core_id: usize, type_id: usize) -> Self { - // config.max_lines; - // let lines = - // Self { - // // config, - // lines: Vec::new(), - // } - // // unsigned cache_lines_num = config.get_max_num_lines(); - // // m_lines = new cache_block_t *[cache_lines_num]; - // // if (config.m_cache_type == 
NORMAL) { - // // for (unsigned i = 0; i < cache_lines_num; ++i) - // // m_lines[i] = new line_cache_block(); - // // } else if (config.m_cache_type == SECTOR) { - // // for (unsigned i = 0; i < cache_lines_num; ++i) - // // m_lines[i] = new sector_cache_block(); - // // } else - // // assert(0); - // } - // todo: update config (GenericCacheConfig) } #[cfg(test)] @@ -543,7 +473,7 @@ mod tests { #[test] fn test_tag_array() { let config = GPUConfig::default().data_cache_l1.unwrap(); - let _tag_array: TagArray = TagArray::new(0, 0, Arc::clone(&config.inner)); + let _tag_array: TagArray = TagArray::new(Arc::clone(&config.inner)); assert!(false); } } diff --git a/src/ported/testing/state.rs b/src/ported/testing/state.rs index a2b1a256..2e712274 100644 --- a/src/ported/testing/state.rs +++ b/src/ported/testing/state.rs @@ -31,7 +31,7 @@ impl From for ported::mem_fetch::AccessKind { mem_access_type::L1_WR_ALLOC_R => AccessKind::L1_WR_ALLOC_R, mem_access_type::L2_WR_ALLOC_R => AccessKind::L2_WR_ALLOC_R, other @ mem_access_type::NUM_MEM_ACCESS_TYPE => { - panic!("bad mem access kind: {:?}", other) + panic!("bad mem access kind: {other:?}") } } } @@ -100,8 +100,8 @@ impl From for CacheBlock { } } -impl<'a> From<&'a playground::cache::cache_block_t> for CacheBlock { - fn from(block: &'a playground::cache::cache_block_t) -> Self { +impl From<&playground::cache::cache_block_t> for CacheBlock { + fn from(block: &playground::cache::cache_block_t) -> Self { let status = if block.is_valid_line() { CacheBlockStatus::VALID } else if block.is_invalid_line() { @@ -162,12 +162,17 @@ pub struct RegisterSet { } impl RegisterSet { + #[must_use] pub fn is_empty(&self) -> bool { self.num_instructions_in_pipeline() == 0 } + #[must_use] pub fn num_instructions_in_pipeline(&self) -> usize { - self.pipeline.iter().filter_map(|x| x.as_ref()).count() + self.pipeline + .iter() + .filter_map(std::option::Option::as_ref) + .count() } } @@ -182,10 +187,7 @@ impl From for RegisterSet { let pipeline = reg .regs .into_iter() - .map(|instr| match instr { - Some(instr) => Some(instr.into()), - None => None, - }) + .map(|instr| instr.map(std::convert::Into::into)) .collect(); Self { name: format!("{:?}", ®.stage), @@ -253,7 +255,7 @@ pub struct DispatchUnit { pub kind: OperandCollectorUnitKind, } -impl<'a> From<&playground::operand_collector::dispatch_unit_t> for DispatchUnit { +impl From<&playground::operand_collector::dispatch_unit_t> for DispatchUnit { fn from(unit: &playground::operand_collector::dispatch_unit_t) -> Self { Self { last_cu: unit.get_last_cu() as usize, @@ -320,8 +322,8 @@ impl<'a> From> for CollectorUnit { } } -impl<'a> From<&'a playground::operand_collector::arbiter_t> for Arbiter { - fn from(arbiter: &'a playground::operand_collector::arbiter_t) -> Self { +impl From<&playground::operand_collector::arbiter_t> for Arbiter { + fn from(arbiter: &playground::operand_collector::arbiter_t) -> Self { Self { last_cu: arbiter.get_last_cu() as usize, } @@ -411,7 +413,7 @@ impl std::fmt::Debug for MemFetch { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "{:?}({:?}", self.kind, self.access_kind)?; if let Some((alloc_id, rel_addr)) = self.relative_addr { - write!(f, "@{}+{}", alloc_id, rel_addr)?; + write!(f, "@{alloc_id}+{rel_addr}")?; } write!(f, ")") } @@ -465,7 +467,7 @@ impl From<&playground::core::pending_register_writes> for PendingRegisterWrites } #[derive(Debug, Clone, PartialEq, Eq, Serialize)] -pub struct ArbitrationState { +pub struct Arbitration { pub last_borrower: usize, pub 
shared_credit: usize, pub private_credit: Box<[usize]>, @@ -482,7 +484,7 @@ pub struct Simulation { pub l2_cache_per_sub: Box<[Option]>, // per partition pub dram_latency_queue_per_partition: Box<[Vec]>, - pub dram_arbitration_per_partition: Box<[ArbitrationState]>, + pub dram_arbitration_per_partition: Box<[Arbitration]>, // per cluster pub core_sim_order_per_cluster: Box<[Box<[usize]>]>, // per core @@ -493,6 +495,7 @@ pub struct Simulation { } impl Simulation { + #[must_use] pub fn new( num_clusters: usize, cores_per_cluster: usize, @@ -514,7 +517,7 @@ impl Simulation { // per partition dram_latency_queue_per_partition: vec![vec![]; num_mem_partitions].into_boxed_slice(), dram_arbitration_per_partition: vec![ - ArbitrationState { + Arbitration { last_borrower: 0, shared_credit: 0, private_credit: vec![0; num_sub_partitions].into_boxed_slice(), diff --git a/stats/src/lib.rs b/stats/src/lib.rs index 60c1e96d..59c074f6 100644 --- a/stats/src/lib.rs +++ b/stats/src/lib.rs @@ -4,6 +4,7 @@ pub mod cache; pub mod dram; pub mod instructions; pub mod mem; +pub mod scheduler; pub mod sim; pub use cache::{Cache, PerCache}; @@ -55,6 +56,8 @@ pub struct Stats { pub l1t_stats: PerCache, pub l1d_stats: PerCache, pub l2d_stats: PerCache, + // where should those go? + pub stall_dram_full: u64, } impl Stats { @@ -70,6 +73,7 @@ impl Stats { l1t_stats: PerCache::default(), l1d_stats: PerCache::default(), l2d_stats: PerCache::default(), + stall_dram_full: 0, } } } diff --git a/stats/src/scheduler.rs b/stats/src/scheduler.rs new file mode 100644 index 00000000..49efcd87 --- /dev/null +++ b/stats/src/scheduler.rs @@ -0,0 +1,10 @@ +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct Scheduler { + pub num_single_issue: u64, + pub num_dual_issue: u64, + pub issue_raw_hazard_stall: u64, + pub issue_control_hazard_stall: u64, + pub issue_pipeline_stall: u64, +}
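
The greedy-then-oldest policy in `src/ported/scheduler/ordering.rs` (`Ordering::GREEDY_THEN_PRIORITY_FUNC` together with `sort_warps_by_oldest_dynamic_id`) puts the last-issued warp first and appends the remaining supervised warps sorted by oldest (smallest) dynamic warp id, with finished or waiting warps pushed to the back. The following is a standalone sketch of that ordering, not part of the patch: `Warp` is a made-up stand-in for the real `SchedulerWarp`/`WarpRef` types and only models the fields the ordering needs.

```rust
// Standalone sketch of greedy-then-oldest warp ordering.
#[derive(Clone, Debug, PartialEq, Eq)]
struct Warp {
    dynamic_warp_id: usize,
    done_exit: bool,
    waiting: bool,
}

/// Greedy-then-priority: the last-issued warp goes first, the rest follow
/// sorted by oldest dynamic warp id, finished or waiting warps go last.
fn order_gto(supervised: &[Warp], last_issued_idx: usize) -> Vec<Warp> {
    let greedy = supervised[last_issued_idx].clone();
    let mut out = vec![greedy.clone()];

    let mut rest = supervised.to_vec();
    rest.sort_by(|lhs, rhs| {
        if lhs.done_exit || lhs.waiting {
            std::cmp::Ordering::Greater
        } else if rhs.done_exit || rhs.waiting {
            std::cmp::Ordering::Less
        } else {
            lhs.dynamic_warp_id.cmp(&rhs.dynamic_warp_id)
        }
    });
    // skip the greedy warp so it is not added twice
    out.extend(rest.into_iter().filter(|warp| *warp != greedy));
    out
}

fn main() {
    let warps: Vec<Warp> = (0..4)
        .map(|id| Warp { dynamic_warp_id: id, done_exit: false, waiting: false })
        .collect();
    // warp 2 issued last, so it stays first; the rest follow oldest-first
    let ids: Vec<usize> = order_gto(&warps, 2).iter().map(|w| w.dynamic_warp_id).collect();
    assert_eq!(ids, vec![2, 0, 1, 3]);
    println!("{ids:?}");
}
```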
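In `BaseSchedulerUnit::cycle`, the `dual_issue_diff_exec_units` config only constrains the second instruction of a dual issue: it must target a different execution unit than the first. The sketch below isolates that one rule under heavy simplification; the real loop also checks the scoreboard, port availability, and the instruction buffer, and the unit names here only loosely mirror `ExecUnitKind`.

```rust
// Reduced sketch of the dual-issue constraint; everything but the
// unit-kind rule is stripped away.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ExecUnitKind {
    None,
    Sp,
    Int,
}

/// Issue up to `max_issue` instructions; with `diff_exec_units` enabled,
/// a second instruction may only go to a different unit than the first.
fn issue_units(
    candidates: &[ExecUnitKind],
    max_issue: usize,
    diff_exec_units: bool,
) -> Vec<ExecUnitKind> {
    let mut issued = Vec::new();
    let mut prev = ExecUnitKind::None;
    for &unit in candidates {
        if issued.len() >= max_issue {
            break;
        }
        if diff_exec_units && unit == prev {
            // the next instruction would reuse the same unit: stop issuing
            break;
        }
        issued.push(unit);
        prev = unit;
    }
    issued
}

fn main() {
    use ExecUnitKind::{Int, Sp};
    // two SP instructions back to back: only one issues while the rule is on
    assert_eq!(issue_units(&[Sp, Sp], 2, true), vec![Sp]);
    // SP followed by INT targets different units: both issue
    assert_eq!(issue_units(&[Sp, Int], 2, true), vec![Sp, Int]);
    // rule disabled: dual issue to the same unit is allowed
    assert_eq!(issue_units(&[Sp, Sp], 2, false), vec![Sp, Sp]);
    println!("dual issue sketch ok");
}
```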
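The per-warp instruction buffer in `src/ported/scheduler/warp.rs` is a fixed two-slot buffer (`IBUFFER_SIZE`) that the issue loop peeks, takes from, and steps through round-robin; filling a slot resets the read pointer. A stripped-down sketch of just that behaviour, with instructions reduced to a made-up pc value:

```rust
// Stripped-down two-slot instruction buffer; instructions are reduced to a pc.
struct IBuffer {
    slots: Vec<Option<u64>>,
    next: usize,
}

impl IBuffer {
    fn new(size: usize) -> Self {
        Self { slots: vec![None; size], next: 0 }
    }

    /// Filling a slot resets the read pointer, mirroring `ibuffer_fill`.
    fn fill(&mut self, slot: usize, pc: u64) {
        self.slots[slot] = Some(pc);
        self.next = 0;
    }

    fn peek(&self) -> Option<u64> {
        self.slots[self.next]
    }

    fn take(&mut self) -> Option<u64> {
        self.slots[self.next].take()
    }

    /// Advance round-robin to the other slot after an issue attempt.
    fn step(&mut self) {
        self.next = (self.next + 1) % self.slots.len();
    }

    fn is_empty(&self) -> bool {
        self.slots.iter().all(Option::is_none)
    }
}

fn main() {
    let mut buf = IBuffer::new(2);
    buf.fill(0, 0x100);
    buf.fill(1, 0x108);

    assert_eq!(buf.peek(), Some(0x100));
    assert_eq!(buf.take(), Some(0x100));
    buf.step();
    assert_eq!(buf.take(), Some(0x108));
    buf.step();
    assert!(buf.is_empty());
    println!("ibuffer sketch ok");
}
```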
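The slimmed-down `Scoreboard` in `src/ported/scoreboard.rs` reduces hazard detection to set operations: issued instructions reserve their destination registers per warp, and `has_collision` reports a RAW/WAW hazard whenever an instruction's registers intersect the warp's reserved set. A self-contained sketch of the same idea, using plain `u32` register numbers instead of `WarpInstruction` (the register values in `main` are made up):

```rust
use std::collections::HashSet;

/// Minimal per-warp scoreboard: the registers reserved by in-flight instructions.
struct Scoreboard {
    register_table: Vec<HashSet<u32>>,
}

impl Scoreboard {
    fn new(max_warps: usize) -> Self {
        Self {
            register_table: (0..max_warps).map(|_| HashSet::new()).collect(),
        }
    }

    /// RAW/WAW hazard check: any overlap between an instruction's registers
    /// and the warp's reserved registers is a collision.
    fn has_collision(&self, warp_id: usize, instr_registers: &HashSet<u32>) -> bool {
        self.register_table[warp_id]
            .intersection(instr_registers)
            .next()
            .is_some()
    }

    /// Reserve the destination registers of an issued instruction.
    fn reserve(&mut self, warp_id: usize, out_regs: &[u32]) {
        self.register_table[warp_id].extend(out_regs.iter().copied());
    }

    /// Release a destination register once its writeback completes.
    fn release(&mut self, warp_id: usize, reg: u32) {
        self.register_table[warp_id].remove(&reg);
    }
}

fn main() {
    let mut scoreboard = Scoreboard::new(2);
    scoreboard.reserve(0, &[5]);

    // r5 is still pending for warp 0, so an instruction touching r5 stalls ...
    assert!(scoreboard.has_collision(0, &HashSet::from([5, 7])));
    // ... while another warp or independent registers issue freely.
    assert!(!scoreboard.has_collision(1, &HashSet::from([5])));
    assert!(!scoreboard.has_collision(0, &HashSet::from([6])));

    scoreboard.release(0, 5);
    assert!(!scoreboard.has_collision(0, &HashSet::from([5])));
    println!("scoreboard sketch ok");
}
```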
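`src/ported/set_index/mod.rs` hides several set-index hashes behind one trait. The linear and bitwise-XOR variants are simple enough to restate end to end; the sketch below recomputes both for a single address. The address and the 64-set, 128-byte-line shape are made up for illustration, and the helper names are not the ones in the patch.

```rust
type Address = u64;

/// Linear mapping: the set is just the address bits above the line offset.
fn linear_set_index(addr: Address, num_sets: usize, line_size_log2: u32) -> u64 {
    (addr >> line_size_log2) & (num_sets as u64 - 1)
}

/// Bitwise-XOR hash: fold the bits above the (offset + index) field back onto
/// the index so that power-of-two strides spread across sets.
fn bitwise_xor_set_index(
    addr: Address,
    num_sets: usize,
    line_size_log2: u32,
    num_sets_log2: u32,
) -> u64 {
    let higher_bits = addr >> (line_size_log2 + num_sets_log2);
    let index = (addr >> line_size_log2) & (num_sets as u64 - 1);
    index ^ (higher_bits & (num_sets as u64 - 1))
}

fn main() {
    // illustrative cache shape: 64 sets of 128-byte lines
    let (num_sets, line_size_log2, num_sets_log2) = (64usize, 7u32, 6u32);
    let addr: Address = 0x0012_3480;

    let linear = linear_set_index(addr, num_sets, line_size_log2);
    let xored = bitwise_xor_set_index(addr, num_sets, line_size_log2, num_sets_log2);
    assert!(linear < num_sets as u64 && xored < num_sets as u64);
    println!("linear set = {linear}, xor set = {xored}");
}
```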