From 11292cdf83caeda572f7f86a6cc873e164999219 Mon Sep 17 00:00:00 2001 From: romnnn Date: Tue, 22 Aug 2023 22:33:19 +0200 Subject: [PATCH] non deterministic implementation: simpler implemenation using rayon scopes --- .gitignore | 5 +- Cargo.lock | 58 +- Cargo.toml | 10 +- WIP.md | 2 +- benches/vectoradd.rs | 48 +- src/cache/l2.rs | 2 +- src/cache/mod.rs | 2 +- src/cluster.rs | 20 +- src/core.rs | 972 ++++++++++++--------------- src/interconn.rs | 2 +- src/lib.rs | 139 ++-- src/mem_partition_unit.rs | 10 +- src/mem_sub_partition.rs | 8 +- src/operand_collector.rs | 60 +- src/parallel/deterministic.rs | 2 +- src/parallel/nondeterministic.rs | 693 +++++++++++++++---- src/scheduler/gto.rs | 2 +- src/scheduler/mod.rs | 9 +- src/sync.rs | 30 +- src/warp.rs | 2 +- test-apps/test-apps-materialized.yml | 2 +- 21 files changed, 1232 insertions(+), 846 deletions(-) diff --git a/.gitignore b/.gitignore index 4ae54313..f2c0ac4c 100644 --- a/.gitignore +++ b/.gitignore @@ -15,7 +15,10 @@ Pipfile.lock cuda_*.run # code coverage files +/coverage **/*.profraw + +# perf traces **/perf.data* +**/bench.trace.json **/flamegraph.svg -/coverage diff --git a/Cargo.lock b/Cargo.lock index 6d33d9e3..70181624 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -528,7 +528,7 @@ dependencies = [ "nvbit-io", "nvbit-model", "once_cell", - "parking_lot 0.11.2", + "parking_lot 0.12.1", "paste", "phf", "playground", @@ -546,6 +546,9 @@ dependencies = [ "thiserror", "tokio", "trace-model", + "tracing", + "tracing-chrome", + "tracing-subscriber", "utils", "validate", ] @@ -2175,6 +2178,16 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "num-integer" version = "0.1.45" @@ -2362,6 +2375,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + [[package]] name = "owo-colors" version = "3.5.0" @@ -3787,9 +3806,32 @@ checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" dependencies = [ "cfg-if", "pin-project-lite", + "tracing-attributes", "tracing-core", ] +[[package]] +name = "tracing-attributes" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.28", +] + +[[package]] +name = "tracing-chrome" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "496b3cd5447f7ff527bbbf19b071ad542a000adf297d4127078b4dfdb931f41a" +dependencies = [ + "serde_json", + "tracing-core", + "tracing-subscriber", +] + [[package]] name = "tracing-core" version = "0.1.31" @@ -3810,15 +3852,29 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "tracing-log" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ddad33d2d10b1ed7eb9d1f518a5674713876e97e5bb9b7345a7984fbb4f922" +dependencies = [ + "lazy_static", + "log", + "tracing-core", +] + [[package]] name = "tracing-subscriber" version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" dependencies = [ + "nu-ansi-term", "sharded-slab", + "smallvec", "thread_local", "tracing-core", + "tracing-log", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index f53eb9b2..e3d12619 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,9 +35,9 @@ debug-assertions = false # lto = true # warning: debug assertions negatively impact the performance of accelsim and play debug-assertions = false -codegen-units = 10 +# codegen-units = 10 opt-level = 3 -debug = false +debug = true [package] name = "casimu" @@ -82,11 +82,17 @@ strum = { version = "0", features = ["derive"] } phf = { version = "0.11.1", features = ["macros"] } rangemap = "1" +# synchronization flume = "0" crossbeam = "0" num_cpus = "1" parking_lot = "0" +# tracing +tracing = "0" +tracing-subscriber = "0" +tracing-chrome = "0" + similar-asserts = "1" # log4rs = "0" diff --git a/WIP.md b/WIP.md index 2432390c..0f0cfd0a 100644 --- a/WIP.md +++ b/WIP.md @@ -17,7 +17,6 @@ - refactor interconn to couple has buffer and push using a single explicit lock - refactor to get rid of global config but use per component configs - use traits for common components - - try using native threads and barriers for core simulation - record mem fetch latency - add a few more stats - plot statistics @@ -32,6 +31,7 @@ - asynchronously push into file (unordered) + - DONE: try using native threads and barriers for core simulation - DONE: pipelined simd function unit should not implement simd function unit - DONE: get rid of global cycle mutex - DONE: lint diff --git a/benches/vectoradd.rs b/benches/vectoradd.rs index 3c06f1b6..261da7c7 100644 --- a/benches/vectoradd.rs +++ b/benches/vectoradd.rs @@ -118,28 +118,47 @@ fn main() -> eyre::Result<()> { #[allow(unused_imports)] use std::io::Write; use std::time::Instant; + use tracing_chrome::ChromeLayerBuilder; + use tracing_subscriber::{prelude::*, registry::Registry}; - env_logger::init(); - // let mut log_builder = env_logger::Builder::new(); - // log_builder.format(|buf, record| writeln!(buf, "{}", record.args())); + let profile = std::env::var("TRACE").unwrap_or_default().to_lowercase() == "yes"; - let (bench_name, input_num) = ("transpose", 0); // takes 34 sec (accel same) + let mut generate_trace = if profile { + // tracing_subscriber::fmt::init(); + let (chrome_layer, guard) = ChromeLayerBuilder::new().file("bench.trace.json").build(); + tracing_subscriber::registry().with(chrome_layer).init(); + Some(guard) + } else { + // env_logger::init(); + // let mut log_builder = env_logger::Builder::new(); + // log_builder.format(|buf, record| writeln!(buf, "{}", record.args())); + None + }; + + // let (bench_name, input_num) = ("transpose", 0); // takes 34 sec (accel same) // let (bench_name, input_num) = ("simple_matrixmul", 26); // takes 22 sec let (bench_name, input_num) = ("matrixmul", 3); // takes 54 sec (accel 76) + + // let (bench_name, input_num) = ("vectorAdd", 0); println!("running {bench_name}@{input_num}"); let runtime = tokio::runtime::Builder::new_multi_thread() .enable_all() .build()?; - let mut start = Instant::now(); + let start = Instant::now(); let stats = run_box(black_box(get_bench_config(bench_name, input_num)?))?; dbg!(&stats.sim); let box_dur = start.elapsed(); println!("box took:\t\t{box_dur:?}"); + drop(generate_trace.take()); + if profile { + return Ok(()); + } + let timings = casimu::TIMINGS.lock(); println!("sorted by NAME"); for (name, dur) in timings.iter().sorted_by_key(|(name, _dur)| name.clone()) { @@ -158,21 
+177,18 @@ fn main() -> eyre::Result<()> { dur.total().as_secs_f64(), ); } - - if let Some(serial_cycle) = timings.get("SERIAL CYCLE") { - println!( - "=> serial only execution time: {:?}", - serial_cycle.mean() * u32::try_from(stats.sim.cycles).unwrap() - ); - } println!(); - start = Instant::now(); + let start = Instant::now(); run_playground(&black_box(get_bench_config(bench_name, input_num)?))?; let play_dur = start.elapsed(); println!("play took:\t\t{play_dur:?}"); + println!( + "speedup is :\t\t{:.2}", + play_dur.as_secs_f64() / box_dur.as_secs_f64() + ); - start = Instant::now(); + let start = Instant::now(); runtime.block_on(async { run_accelsim(black_box(get_bench_config(bench_name, input_num)?)).await?; Ok::<(), eyre::Report>(()) @@ -180,9 +196,5 @@ fn main() -> eyre::Result<()> { let accel_dur = start.elapsed(); println!("accel took:\t\t{accel_dur:?}"); - println!( - "speedup is :\t\t{:.2}", - play_dur.as_secs_f64() / box_dur.as_secs_f64() - ); Ok(()) } diff --git a/src/cache/l2.rs b/src/cache/l2.rs index 72d70fbb..76d82071 100644 --- a/src/cache/l2.rs +++ b/src/cache/l2.rs @@ -1,6 +1,6 @@ +use crate::sync::{Arc, Mutex}; use crate::{address, config, interconn as ic, mem_fetch}; use std::collections::VecDeque; -use crate::sync::{Arc, Mutex}; /// Generic data cache. #[derive(Debug)] diff --git a/src/cache/mod.rs b/src/cache/mod.rs index aaa23192..c0e42d47 100644 --- a/src/cache/mod.rs +++ b/src/cache/mod.rs @@ -13,8 +13,8 @@ pub use readonly::ReadOnly; use super::{address, mem_fetch}; use crate::config; -use std::collections::VecDeque; use crate::sync::{Arc, Mutex}; +use std::collections::VecDeque; #[derive(Debug, strum::EnumIter, Clone, Copy, Hash, PartialEq, Eq)] pub enum RequestStatus { diff --git a/src/cluster.rs b/src/cluster.rs index fa363cd4..ef0c92a7 100644 --- a/src/cluster.rs +++ b/src/cluster.rs @@ -1,5 +1,6 @@ use super::{config, interconn as ic, kernel::Kernel, mem_fetch, Core, MockSimulator, Packet}; use console::style; +use crossbeam::utils::CachePadded; use std::collections::VecDeque; @@ -8,7 +9,7 @@ use crate::sync::{atomic, Arc, Mutex, RwLock}; #[derive(Debug)] pub struct Cluster { pub cluster_id: usize, - pub warp_instruction_unique_uid: Arc, + pub warp_instruction_unique_uid: Arc>, pub cores: Vec>>>, pub config: Arc, pub stats: Arc>, @@ -26,7 +27,7 @@ where { pub fn new( cluster_id: usize, - warp_instruction_unique_uid: &Arc, + warp_instruction_unique_uid: &Arc>, allocations: &super::allocation::Ref, interconn: &Arc, stats: &Arc>, @@ -75,7 +76,7 @@ where pub fn num_active_sms(&self) -> usize { self.cores .iter() - .filter(|core| core.try_read().active()) + .filter(|core| core.try_read().is_active()) .count() } @@ -86,6 +87,7 @@ where .sum() } + #[tracing::instrument] pub fn interconn_cycle(&mut self, cycle: u64) { use mem_fetch::AccessKind; @@ -106,6 +108,7 @@ where if let Some(fetch) = self.response_fifo.front() { let core_id = self.config.global_core_id_to_core_id(fetch.core_id); + // we should not fully lock a core as we completely block a full core cycle let core = self.cores[core_id].read(); match *fetch.access_kind() { @@ -198,7 +201,8 @@ where // } // } - pub fn issue_block_to_core(&self, sim: &MockSimulator) -> usize { + #[tracing::instrument(name = "cluster_issue_block_to_core")] + pub fn issue_block_to_core(&self, sim: &MockSimulator, cycle: u64) -> usize { let num_cores = self.cores.len(); log::debug!( @@ -212,10 +216,7 @@ where for core_id in 0..num_cores { let core_id = (core_id + *block_issue_next_core + 1) % num_cores; - // let core = &mut 
cores[core_id]; - // THIS KILLS THE PERFORMANCE - let core = self.cores[core_id].try_read(); - // let core = self.cores[core_id].read(); + let core = self.cores[core_id].read(); // let kernel: Option> = if self.config.concurrent_kernel_sm { // // always select latest issued kernel @@ -270,9 +271,8 @@ where let can_issue = !kernel.no_more_blocks_to_run() && core.can_issue_block(&kernel); drop(core); if can_issue { - // dbg!("core issue"); let mut core = self.cores[core_id].write(); - core.issue_block(&kernel); + core.issue_block(&kernel, cycle); num_blocks_issued += 1; *block_issue_next_core = core_id; break; diff --git a/src/core.rs b/src/core.rs index 103d4744..7e972c3e 100644 --- a/src/core.rs +++ b/src/core.rs @@ -16,6 +16,7 @@ use crate::sync::{Mutex, RwLock}; use bitvec::{array::BitArray, BitArr}; use color_eyre::eyre; use console::style; +use crossbeam::utils::CachePadded; use fu::SimdFunctionUnit; use itertools::Itertools; use once_cell::sync::Lazy; @@ -92,97 +93,6 @@ impl std::fmt::Display for Packet { } } -impl Core -where - I: ic::Interconnect + Send + 'static, -{ - // Returns numbers of addresses in translated_addrs. - // - // Each addr points to a 4B (32-bit) word - #[must_use] - pub fn translate_local_memaddr( - &self, - local_addr: address, - thread_id: usize, - num_cores: usize, - data_size: u32, - ) -> Vec
<address>
{ - // During functional execution, each thread sees its own memory space for - // local memory, but these need to be mapped to a shared address space for - // timing simulation. We do that mapping here. - - let (thread_base, max_concurrent_threads) = if self.config.local_mem_map { - // Dnew = D*N + T%nTpC + nTpC*C - // N = nTpC*nCpS*nS (max concurent threads) - // C = nS*K + S (hw cta number per gpu) - // K = T/nTpC (hw cta number per core) - // D = data index - // T = thread - // nTpC = number of threads per CTA - // nCpS = number of CTA per shader - // - // for a given local memory address threads in a CTA map to - // contiguous addresses, then distribute across memory space by CTAs - // from successive shader cores first, then by successive CTA in same - // shader core - let kernel_padded_threads_per_cta = self.thread_block_size; - let kernel_max_cta_per_shader = self.max_blocks_per_shader; - - let temp = self.core_id + num_cores * (thread_id / kernel_padded_threads_per_cta); - let rest = thread_id % kernel_padded_threads_per_cta; - let thread_base = 4 * (kernel_padded_threads_per_cta * temp + rest); - let max_concurrent_threads = - kernel_padded_threads_per_cta * kernel_max_cta_per_shader * num_cores; - (thread_base, max_concurrent_threads) - } else { - // legacy mapping that maps the same address in the local memory - // space of all threads to a single contiguous address region - let thread_base = 4 * (self.config.max_threads_per_core * self.core_id + thread_id); - let max_concurrent_threads = num_cores * self.config.max_threads_per_core; - (thread_base, max_concurrent_threads) - }; - debug_assert!(thread_base < 4 /*word size*/ * max_concurrent_threads); - - // If requested datasize > 4B, split into multiple 4B accesses - // otherwise do one sub-4 byte memory access - let mut translated_addresses = vec![]; - - if data_size >= 4 { - // >4B access, split into 4B chunks - debug_assert_eq!(data_size % 4, 0); // Must be a multiple of 4B - let num_accesses = data_size / 4; - // max 32B - debug_assert!( - num_accesses <= super::instruction::MAX_ACCESSES_PER_INSN_PER_THREAD as u32 - ); - // Address must be 4B aligned - required if - // accessing 4B per request, otherwise access - // will overflow into next thread's space - debug_assert_eq!(local_addr % 4, 0); - for i in 0..num_accesses { - let local_word = local_addr / 4 + u64::from(i); - let linear_address: address = local_word * max_concurrent_threads as u64 * 4 - + thread_base as u64 - + super::instruction::LOCAL_GENERIC_START; - translated_addresses.push(linear_address); - } - } else { - // Sub-4B access, do only one access - debug_assert!(data_size > 0); - let local_word = local_addr / 4; - let local_word_offset = local_addr % 4; - // Make sure access doesn't overflow into next 4B chunk - debug_assert_eq!((local_addr + u64::from(data_size) - 1) / 4, local_word); - let linear_address: address = local_word * max_concurrent_threads as u64 * 4 - + local_word_offset - + thread_base as u64 - + super::instruction::LOCAL_GENERIC_START; - translated_addresses.push(linear_address); - } - translated_addresses - } -} - pub trait WarpIssuer { fn issue_warp( &self, @@ -205,7 +115,7 @@ impl WarpIssuer for Core where I: ic::Interconnect + Send + 'static, { - fn has_free_register(&self, stage: PipelineStage, register_id: usize) -> bool { + fn has_free_register(&self, stage: PipelineStage, _register_id: usize) -> bool { // locking here blocks when we run schedulers in parallel let pipeline_stage = self.pipeline_reg[stage as usize].try_lock(); @@ -217,6 
+127,7 @@ where } } + #[tracing::instrument(name = "core_issue_warp")] fn issue_warp( &self, stage: PipelineStage, @@ -353,17 +264,13 @@ where if warp.done() && warp.functional_done() { warp.ibuffer_flush(); - self.barriers - .write() - // .unwrap() - .warp_exited(pipe_reg_ref.warp_id); + self.barriers.write().warp_exited(pipe_reg_ref.warp_id); } if pipe_reg_ref.opcode.category == opcodes::ArchOp::BARRIER_OP { // m_warp[warp_id]->store_info_of_last_inst_at_barrier(*pipe_reg); self.barriers .write() - // .unwrap() .warp_reached_barrier(warp.block_id, &pipe_reg_ref); } else if pipe_reg_ref.opcode.category == opcodes::ArchOp::MEMORY_BARRIER_OP { warp.waiting_for_memory_barrier = true; @@ -380,27 +287,21 @@ where pipe_reg_ref ); - self.scoreboard - .write() - // .unwrap() - .reserve_all(&pipe_reg_ref); + self.scoreboard.write().reserve_all(&pipe_reg_ref); *pipe_reg = Some(pipe_reg_ref); - log::debug!( - "post issue register set of {:?} pipeline: {}", - stage, - pipeline_stage - ); + // log::debug!( + // "post issue register set of {:?} pipeline: {}", + // stage, + // pipeline_stage + // ); Ok(()) } #[must_use] fn warp_waiting_at_barrier(&self, warp_id: usize) -> bool { - self.barriers - .try_read() - // .unwrap() - .is_waiting_at_barrier(warp_id) + self.barriers.try_read().is_waiting_at_barrier(warp_id) } #[must_use] @@ -411,7 +312,6 @@ where let has_pending_writes = !self .scoreboard .read() - // .unwrap() .pending_writes(warp.warp_id) .is_empty(); @@ -452,13 +352,18 @@ pub enum PipelineStage { OC_EX_TENSOR_CORE = 12, } +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub enum FetchResponseTarget { + LoadStoreUnit, + ICache, +} + /// SIMT Core. #[derive()] pub struct Core { pub core_id: usize, pub cluster_id: usize, - // pub cycle: super::Cycle, - pub warp_instruction_unique_uid: Arc, + pub warp_instruction_unique_uid: Arc>, pub stats: Arc>, pub config: Arc, pub current_kernel: Mutex>>, @@ -481,7 +386,7 @@ pub struct Core { pub allocations: super::allocation::Ref, pub instr_l1_cache: Box, - pub please_fill: Mutex>, + pub please_fill: Mutex>, pub instr_fetch_buffer: InstrFetchBuffer, pub warps: Vec, pub thread_state: Vec>, @@ -507,6 +412,7 @@ impl std::fmt::Debug for Core { } } +// PUBLIC impl Core where I: ic::Interconnect + Send + 'static, @@ -515,7 +421,7 @@ where core_id: usize, cluster_id: usize, allocations: super::allocation::Ref, - warp_instruction_unique_uid: Arc, + warp_instruction_unique_uid: Arc>, interconn: Arc, stats: Arc>, config: Arc, @@ -633,7 +539,7 @@ where let scheduler_stats = Arc::new(Mutex::new(stats::scheduler::Scheduler::default())); let scheduler: Arc> = match scheduler_kind { config::SchedulerKind::GTO => { - Arc::new(Mutex::new(scheduler::gto::Scheduler::new( + let gto = scheduler::gto::Scheduler::new( sched_id, cluster_id, core_id, @@ -641,7 +547,8 @@ where scoreboard.clone(), scheduler_stats, config.clone(), - ))) + ); + Arc::new(Mutex::new(gto)) } scheduler_kind => unimplemented!("scheduler: {:?}", &scheduler_kind), }; @@ -653,7 +560,7 @@ where // distribute warps evenly though schedulers let sched_idx = i % config.num_schedulers_per_core; let scheduler = &mut schedulers[sched_idx]; - scheduler.lock().add_supervised_warp(Arc::clone(warp)); + scheduler.try_lock().add_supervised_warp(Arc::clone(warp)); } let mut functional_units: Vec>> = Vec::new(); @@ -768,6 +675,338 @@ where } } + // Returns numbers of addresses in translated_addrs. 
+ // + // Each addr points to a 4B (32-bit) word + #[must_use] + pub fn translate_local_memaddr( + &self, + local_addr: address, + thread_id: usize, + num_cores: usize, + data_size: u32, + ) -> Vec
<address>
{ + // During functional execution, each thread sees its own memory space for + // local memory, but these need to be mapped to a shared address space for + // timing simulation. We do that mapping here. + + let (thread_base, max_concurrent_threads) = if self.config.local_mem_map { + // Dnew = D*N + T%nTpC + nTpC*C + // N = nTpC*nCpS*nS (max concurent threads) + // C = nS*K + S (hw cta number per gpu) + // K = T/nTpC (hw cta number per core) + // D = data index + // T = thread + // nTpC = number of threads per CTA + // nCpS = number of CTA per shader + // + // for a given local memory address threads in a CTA map to + // contiguous addresses, then distribute across memory space by CTAs + // from successive shader cores first, then by successive CTA in same + // shader core + let kernel_padded_threads_per_cta = self.thread_block_size; + let kernel_max_cta_per_shader = self.max_blocks_per_shader; + + let temp = self.core_id + num_cores * (thread_id / kernel_padded_threads_per_cta); + let rest = thread_id % kernel_padded_threads_per_cta; + let thread_base = 4 * (kernel_padded_threads_per_cta * temp + rest); + let max_concurrent_threads = + kernel_padded_threads_per_cta * kernel_max_cta_per_shader * num_cores; + (thread_base, max_concurrent_threads) + } else { + // legacy mapping that maps the same address in the local memory + // space of all threads to a single contiguous address region + let thread_base = 4 * (self.config.max_threads_per_core * self.core_id + thread_id); + let max_concurrent_threads = num_cores * self.config.max_threads_per_core; + (thread_base, max_concurrent_threads) + }; + debug_assert!(thread_base < 4 /*word size*/ * max_concurrent_threads); + + // If requested datasize > 4B, split into multiple 4B accesses + // otherwise do one sub-4 byte memory access + let mut translated_addresses = vec![]; + + if data_size >= 4 { + // >4B access, split into 4B chunks + debug_assert_eq!(data_size % 4, 0); // Must be a multiple of 4B + let num_accesses = data_size / 4; + // max 32B + debug_assert!( + num_accesses <= super::instruction::MAX_ACCESSES_PER_INSN_PER_THREAD as u32 + ); + // Address must be 4B aligned - required if + // accessing 4B per request, otherwise access + // will overflow into next thread's space + debug_assert_eq!(local_addr % 4, 0); + for i in 0..num_accesses { + let local_word = local_addr / 4 + u64::from(i); + let linear_address: address = local_word * max_concurrent_threads as u64 * 4 + + thread_base as u64 + + super::instruction::LOCAL_GENERIC_START; + translated_addresses.push(linear_address); + } + } else { + // Sub-4B access, do only one access + debug_assert!(data_size > 0); + let local_word = local_addr / 4; + let local_word_offset = local_addr % 4; + // Make sure access doesn't overflow into next 4B chunk + debug_assert_eq!((local_addr + u64::from(data_size) - 1) / 4, local_word); + let linear_address: address = local_word * max_concurrent_threads as u64 * 4 + + local_word_offset + + thread_base as u64 + + super::instruction::LOCAL_GENERIC_START; + translated_addresses.push(linear_address); + } + translated_addresses + } + + #[inline] + pub fn cache_flush(&mut self) { + let mut unit = self.load_store_unit.try_lock(); + unit.flush(); + } + + #[inline] + pub fn cache_invalidate(&mut self) { + let mut unit = self.load_store_unit.try_lock(); + unit.invalidate(); + } + + #[must_use] + #[inline] + pub fn ldst_unit_response_buffer_full(&self) -> bool { + self.load_store_unit.try_lock().response_buffer_full() + } + + #[must_use] + #[inline] + pub fn 
fetch_unit_response_buffer_full(&self) -> bool { + false + } + + #[inline] + pub fn accept_fetch_response(&self, mut fetch: mem_fetch::MemFetch, time: u64) { + fetch.status = mem_fetch::Status::IN_SHADER_FETCHED; + self.please_fill + .lock() + .push((FetchResponseTarget::ICache, fetch, time)); + } + + #[inline] + pub fn accept_ldst_unit_response(&self, fetch: mem_fetch::MemFetch, time: u64) { + self.please_fill + .lock() + .push((FetchResponseTarget::LoadStoreUnit, fetch, time)); + } + + #[must_use] + #[inline] + pub fn not_completed(&self) -> usize { + self.num_active_threads + } + + #[must_use] + #[inline] + pub fn is_active(&self) -> bool { + self.num_active_blocks > 0 + } + + #[must_use] + #[inline] + pub fn num_active_blocks(&self) -> usize { + self.num_active_blocks + } + + #[inline] + pub fn can_issue_block(&self, kernel: &Kernel) -> bool { + let max_blocks = self.config.max_blocks(kernel).unwrap(); + if self.config.concurrent_kernel_sm { + if max_blocks < 1 { + return false; + } + // self.occupy_resource_for_block(kernel, false); + unimplemented!("concurrent kernel sm model"); + } else { + self.num_active_blocks < max_blocks + } + } + + #[must_use] + #[inline] + pub fn id(&self) -> (usize, usize) { + (self.cluster_id, self.core_id) + } + + #[tracing::instrument(name = "core_reinit")] + #[inline] + pub fn reinit(&mut self, start_thread: usize, end_thread: usize, reset_not_completed: bool) { + if reset_not_completed { + self.num_active_warps = 0; + self.num_active_threads = 0; + self.active_thread_mask.fill(false); + self.occupied_block_to_hw_thread_id.clear(); + self.occupied_hw_thread_ids.fill(false); + } + for t in start_thread..end_thread { + self.thread_state[t] = None; + } + let warp_size = self.config.warp_size; + + let start_warp = start_thread / warp_size; + let end_warp = end_thread / warp_size; + log::debug!( + "reset warps {}..{} (threads {}..{})", + start_warp, + end_warp, + start_thread, + end_thread + ); + + for w in start_warp..end_warp { + self.warps[w].try_lock().reset(); + } + } + + #[tracing::instrument(name = "core_issue_block")] + pub fn issue_block(&mut self, kernel: &Arc, cycle: u64) { + log::debug!("core {:?}: issue block", self.id()); + if self.config.concurrent_kernel_sm { + // let occupied = self.occupy_resource_for_block(&*kernel, true); + // assert!(occupied); + unimplemented!("concurrent kernel sm"); + } else { + // calculate the max cta count and cta size for local memory address mapping + self.max_blocks_per_shader = self.config.max_blocks(kernel).unwrap(); + self.thread_block_size = self.config.threads_per_block_padded(kernel); + } + + // kernel.inc_running(); + + // find a free block context + let max_blocks_per_core = if self.config.concurrent_kernel_sm { + unimplemented!("concurrent kernel sm"); + // self.config.max_concurrent_blocks_per_core + } else { + self.max_blocks_per_shader + }; + log::debug!( + "core {:?}: free block status: {:?}", + self.id(), + self.block_status + ); + + debug_assert_eq!( + self.num_active_blocks, + self.block_status + .iter() + .filter(|&num_threads_in_block| *num_threads_in_block > 0) + .count() + ); + let Some(free_block_hw_id) = self.block_status[0..max_blocks_per_core] + .iter().position(|num_threads_in_block| *num_threads_in_block == 0) else { + return; + }; + + // determine hardware threads and warps that will be used for this block + let thread_block_size = kernel.threads_per_block(); + let padded_thread_block_size = self.config.threads_per_block_padded(kernel); + + // hw warp id = hw thread id mod warp size, so 
we need to find a range + // of hardware thread ids corresponding to an integral number of hardware + // thread ids + let (start_thread, end_thread) = if self.config.concurrent_kernel_sm { + let start_thread = self + .find_available_hw_thread_id(padded_thread_block_size, true) + .unwrap(); + let end_thread = start_thread + thread_block_size; + + assert!(!self + .occupied_block_to_hw_thread_id + .contains_key(&free_block_hw_id)); + self.occupied_block_to_hw_thread_id + .insert(free_block_hw_id, start_thread); + (start_thread, end_thread) + } else { + let start_thread = free_block_hw_id * padded_thread_block_size; + let end_thread = start_thread + thread_block_size; + (start_thread, end_thread) + }; + + // reset state of the selected hardware thread and warp contexts + self.reinit(start_thread, end_thread, false); + + // initalize scalar threads and determine which hardware warps they are + // allocated to bind functional simulation state of threads to hardware + // resources (simulation) + let mut warps: WarpMask = BitArray::ZERO; + let block = kernel.current_block().expect("kernel has current block"); + log::debug!( + "core {:?}: issue block {} from kernel {}", + self.id(), + block, + kernel, + ); + let block_id = block.id(); + + let mut num_threads_in_block = 0; + for i in start_thread..end_thread { + self.thread_state[i] = Some(ThreadState { + // block_id: free_block_hw_id, + active: true, + pc: 0, // todo + }); + let warp_id = i / self.config.warp_size; + + // TODO: removed this but is that fine? + if !kernel.no_more_blocks_to_run() { + // if !kernel.more_threads_in_block() { + // kernel.next_thread_iterlock().next(); + // } + // + // // we just incremented the thread id so this is not the same + // if !kernel.more_threads_in_block() { + // kernel.next_block_iterlock().next(); + // *kernel.next_thread_iterlock() = + // kernel.config.block.into_iter().peekable(); + // } + num_threads_in_block += 1; + } + + warps.set(warp_id, true); + } + + self.block_status[free_block_hw_id] = num_threads_in_block; + log::debug!( + "num threads in block {}={} (hw {}) = {}", + block, + block_id, + free_block_hw_id, + num_threads_in_block + ); + + self.barriers + .write() + .allocate_barrier(free_block_hw_id as u64, warps); + + self.init_warps(free_block_hw_id, start_thread, end_thread, block_id, kernel); + self.num_active_blocks += 1; + } + + // Return the next pc of a thread + // #[must_use] + // #[inline] + // pub fn next_pc(&mut self, thread_id: usize) -> Option { + // self.thread_state[thread_id].as_ref().map(|t| t.pc) + // } +} + +// PRIVATE +impl Core +where + I: ic::Interconnect + Send + 'static, +{ + #[inline] fn init_operand_collector( operand_collector: &mut opcoll::RegisterFileUnit, config: &config::GPU, @@ -921,180 +1160,13 @@ where operand_collector.init(config.num_reg_banks); } - // fn init_operand_collectors(&mut self) { - // let mut operand_collector = self.operand_collectorlock(); - // - // // let mut operand_collector = self.operand_collector.try_borrow_mut().unwrap(); - // - // // configure generic collectors - // operand_collector.add_cu_set( - // opcoll::OperandCollectorUnitKind::GEN_CUS, - // self.config.operand_collector_num_units_gen, - // self.config.operand_collector_num_out_ports_gen, - // ); - // - // for _i in 0..self.config.operand_collector_num_in_ports_gen { - // let mut in_ports = opcoll::PortVec::new(); - // let mut out_ports = opcoll::PortVec::new(); - // let mut cu_sets: Vec = Vec::new(); - // - // in_ports.push(self.pipeline_reg[PipelineStage::ID_OC_SP as 
usize].clone()); - // // in_ports.push_back(&m_pipeline_reg[ID_OC_SFU]); - // // in_ports.push(&self.pipeline_reg[ID_OC_MEM]); - // out_ports.push(self.pipeline_reg[PipelineStage::OC_EX_SP as usize].clone()); - // // out_ports.push_back(&m_pipeline_reg[OC_EX_SFU]); - // // out_ports.push(&self.pipeline_reg[OC_EX_MEM]); - // // if (m_config->gpgpu_tensor_core_avail) { - // // in_ports.push_back(&m_pipeline_reg[ID_OC_TENSOR_CORE]); - // // out_ports.push_back(&m_pipeline_reg[OC_EX_TENSOR_CORE]); - // // } - // // if (m_config->gpgpu_num_dp_units > 0) { - // // in_ports.push_back(&m_pipeline_reg[ID_OC_DP]); - // // out_ports.push_back(&m_pipeline_reg[OC_EX_DP]); - // // } - // // if (m_config->gpgpu_num_int_units > 0) { - // // in_ports.push_back(&m_pipeline_reg[ID_OC_INT]); - // // out_ports.push_back(&m_pipeline_reg[OC_EX_INT]); - // // } - // // if (m_config->m_specialized_unit.size() > 0) { - // // for (unsigned j = 0; j < m_config->m_specialized_unit.size(); ++j) { - // // in_ports.push_back( - // // &m_pipeline_reg[m_config->m_specialized_unit[j].ID_OC_SPEC_ID]); - // // out_ports.push_back( - // // &m_pipeline_reg[m_config->m_specialized_unit[j].OC_EX_SPEC_ID]); - // // } - // // } - // // cu_sets.push_back((unsigned)GEN_CUS); - // // m_operand_collector.add_port(in_ports, out_ports, cu_sets); - // // in_ports.clear(), out_ports.clear(), cu_sets.clear(); - // cu_sets.push(opcoll::OperandCollectorUnitKind::GEN_CUS); - // operand_collector.add_port(in_ports, out_ports, cu_sets); - // // in_ports.clear(); - // // out_ports.clear(); - // // cu_sets.clear(); - // } - // - // // let enable_specialized_operand_collector = true; - // if self.config.enable_specialized_operand_collector { - // // only added two - // operand_collector.add_cu_set( - // opcoll::OperandCollectorUnitKind::SP_CUS, - // self.config.operand_collector_num_units_sp, - // self.config.operand_collector_num_out_ports_sp, - // ); - // operand_collector.add_cu_set( - // opcoll::OperandCollectorUnitKind::MEM_CUS, - // self.config.operand_collector_num_units_mem, - // self.config.operand_collector_num_out_ports_mem, - // ); - // - // for _i in 0..self.config.operand_collector_num_in_ports_sp { - // let mut in_ports = opcoll::PortVec::new(); - // let mut out_ports = opcoll::PortVec::new(); - // let mut cu_sets: Vec = Vec::new(); - // - // in_ports.push(self.pipeline_reg[PipelineStage::ID_OC_SP as usize].clone()); - // out_ports.push(self.pipeline_reg[PipelineStage::OC_EX_SP as usize].clone()); - // cu_sets.push(opcoll::OperandCollectorUnitKind::SP_CUS); - // cu_sets.push(opcoll::OperandCollectorUnitKind::GEN_CUS); - // operand_collector.add_port(in_ports, out_ports, cu_sets); - // } - // - // for _i in 0..self.config.operand_collector_num_in_ports_mem { - // let mut in_ports = opcoll::PortVec::new(); - // let mut out_ports = opcoll::PortVec::new(); - // let mut cu_sets: Vec = Vec::new(); - // - // in_ports.push(self.pipeline_reg[PipelineStage::ID_OC_MEM as usize].clone()); - // out_ports.push(self.pipeline_reg[PipelineStage::OC_EX_MEM as usize].clone()); - // cu_sets.push(opcoll::OperandCollectorUnitKind::MEM_CUS); - // cu_sets.push(opcoll::OperandCollectorUnitKind::GEN_CUS); - // operand_collector.add_port(in_ports, out_ports, cu_sets); - // } - // } - // - // // this must be called after we add the collector unit sets! 
- // operand_collector.init(self.config.num_reg_banks); - // } - // - // fn init_functional_units(&mut self) { - // // single precision units - // for u in 0..self.config.num_sp_units { - // self.functional_units - // .push(Arc::new(Mutex::new(super::SPUnit::new( - // u, // id - // Arc::clone(&self.pipeline_reg[PipelineStage::EX_WB as usize]), - // Arc::clone(&self.config), - // Arc::clone(&self.stats), - // self.cycle.clone(), - // u, // issue reg id - // )))); - // self.dispatch_ports.push(PipelineStage::ID_OC_SP); - // self.issue_ports.push(PipelineStage::OC_EX_SP); - // } - // - // // load store unit - // self.functional_units.push(self.load_store_unit.clone()); // Arc::clone needs type hints - // self.dispatch_ports.push(PipelineStage::OC_EX_MEM); - // self.issue_ports.push(PipelineStage::OC_EX_MEM); - // - // debug_assert_eq!(self.functional_units.len(), self.issue_ports.len()); - // debug_assert_eq!(self.functional_units.len(), self.dispatch_ports.len()); - // } - // - // fn init_schedulers(&mut self) { - // let scheduler_kind = config::SchedulerKind::GTO; - // - // self.schedulers = (0..self.config.num_schedulers_per_core) - // .map(|sched_id| { - // let scheduler_stats = Arc::new(Mutex::new(stats::scheduler::Scheduler::default())); - // match scheduler_kind { - // config::SchedulerKind::GTO => { - // Arc::new(Mutex::new(scheduler::gto::Scheduler::new( - // sched_id, - // self.cluster_id, - // self.core_id, - // self.warps.clone(), - // self.scoreboard.clone(), - // scheduler_stats, - // self.config.clone(), - // ))) as Arc> - // } - // scheduler_kind => unimplemented!("scheduler: {:?}", &scheduler_kind), - // } - // }) - // .collect(); - // - // for (i, warp) in self.warps.iter().enumerate() { - // // distribute warps evenly though schedulers - // let sched_idx = i % self.config.num_schedulers_per_core; - // let scheduler = &mut self.schedulers[sched_idx]; - // scheduler - // .lock() - // .unwrap() - // .add_supervised_warp(Arc::clone(warp)); - // } - // } - - #[must_use] - pub fn active(&self) -> bool { - self.num_active_blocks > 0 - } - - /// return the next pc of a thread - pub fn next_pc(&mut self, thread_id: usize) -> Option { - self.thread_state[thread_id].as_ref().map(|t| t.pc) - } - + #[inline] fn register_thread_in_block_exited( &mut self, block_hw_id: usize, kernel: &Option>, ) { let current_kernel: &mut Option<_> = &mut *self.current_kernel.try_lock(); - // let current_kernel: &mut Option<_> = - // current_kernel.as_ref().map(std::convert::AsRef::as_ref); - debug_assert!(block_hw_id < MAX_CTA_PER_SHADER); debug_assert!(self.block_status[block_hw_id] > 0); self.block_status[block_hw_id] -= 1; @@ -1102,10 +1174,7 @@ where // this is the last thread that exited if self.block_status[block_hw_id] == 0 { // deallocate barriers for this block - self.barriers - .write() - // .unwrap() - .deallocate_barrier(block_hw_id as u64); + self.barriers.write().deallocate_barrier(block_hw_id as u64); // increment the number of completed blocks self.num_active_blocks -= 1; @@ -1132,6 +1201,8 @@ where } } + #[inline] + #[tracing::instrument] fn fetch(&mut self, cycle: u64) { log::debug!( "{}", @@ -1181,37 +1252,37 @@ where // next 1-2 instructions from instruction cache let max_warps = self.config.max_warps_per_core(); - if false { - for warp_id in 0..max_warps { - // let warp = self.warps[warp_id].try_borrow().unwrap(); - let warp = self.warps[warp_id].try_lock(); - if warp.instruction_count() == 0 { - // consider empty - continue; - } - debug_assert_eq!(warp.warp_id, warp_id); - - let sb 
= self.scoreboard.try_read(); - let pending_writes = sb.pending_writes(warp_id); - - // if warp.functional_done() && warp.hardware_done() && warp.done_exit() { - // continue; - // } - log::debug!( - "checking warp_id = {} dyn warp id = {} (instruction count={}, trace pc={} hardware_done={}, functional_done={}, instr in pipe={}, stores={}, done_exit={}, pending writes={:?})", - &warp_id, - warp.dynamic_warp_id(), - warp.instruction_count(), - warp.trace_pc, - warp.hardware_done(), - warp.functional_done(), - warp.num_instr_in_pipeline, - warp.num_outstanding_stores, - warp.done_exit(), - pending_writes.iter().sorted().collect::>() - ); - } - } + // if false { + // for warp_id in 0..max_warps { + // // let warp = self.warps[warp_id].try_borrow().unwrap(); + // let warp = self.warps[warp_id].try_lock(); + // if warp.instruction_count() == 0 { + // // consider empty + // continue; + // } + // debug_assert_eq!(warp.warp_id, warp_id); + // + // let sb = self.scoreboard.try_read(); + // let pending_writes = sb.pending_writes(warp_id); + // + // // if warp.functional_done() && warp.hardware_done() && warp.done_exit() { + // // continue; + // // } + // log::debug!( + // "checking warp_id = {} dyn warp id = {} (instruction count={}, trace pc={} hardware_done={}, functional_done={}, instr in pipe={}, stores={}, done_exit={}, pending writes={:?})", + // &warp_id, + // warp.dynamic_warp_id(), + // warp.instruction_count(), + // warp.trace_pc, + // warp.hardware_done(), + // warp.functional_done(), + // warp.num_instr_in_pipeline, + // warp.num_outstanding_stores, + // warp.done_exit(), + // pending_writes.iter().sorted().collect::>() + // ); + // } + // } for i in 0..max_warps { let last = self.last_warp_fetched.unwrap_or(0); @@ -1229,12 +1300,8 @@ where let kernel = warp.kernel.as_ref().map(Arc::clone); - let has_pending_writes = !self - .scoreboard - .read() - // .unwrap() - .pending_writes(warp_id) - .is_empty(); + let has_pending_writes = + !self.scoreboard.read().pending_writes(warp_id).is_empty(); let did_maybe_exit = warp.hardware_done() && !has_pending_writes && !warp.done_exit(); @@ -1380,6 +1447,8 @@ where } /// Shader core decode + #[tracing::instrument] + // #[inline] fn decode(&mut self, cycle: u64) { let InstrFetchBuffer { valid, warp_id, .. 
} = self.instr_fetch_buffer; @@ -1446,8 +1515,9 @@ where self.instr_fetch_buffer.valid = false; } - fn decode_instruction(&mut self, warp_id: usize, instr: WarpInstruction, slot: usize) { - let warp = self.warps.get_mut(warp_id).unwrap(); + #[inline] + fn decode_instruction(&self, warp_id: usize, instr: WarpInstruction, slot: usize) { + let warp = self.warps.get(warp_id).unwrap(); let mut warp = warp.try_lock(); log::debug!( @@ -1461,6 +1531,8 @@ where warp.num_instr_in_pipeline += 1; } + #[tracing::instrument] + // #[inline] fn issue(&mut self, cycle: u64) { // fair round robin issue between schedulers let num_schedulers = self.schedulers.len(); @@ -1473,6 +1545,8 @@ where self.scheduler_issue_priority = (self.scheduler_issue_priority + 1) % num_schedulers; } + #[tracing::instrument] + // #[inline] fn writeback(&mut self, cycle: u64) { // from the functional units let mut exec_writeback_pipeline = @@ -1535,6 +1609,8 @@ where } } + #[tracing::instrument] + // #[inline] fn execute(&mut self, cycle: u64) { let core_id = self.id(); log::debug!( @@ -1649,52 +1725,14 @@ where } } - pub fn cache_flush(&mut self) { - let mut unit = self.load_store_unit.try_lock(); - unit.flush(); - } - - pub fn cache_invalidate(&mut self) { - let mut unit = self.load_store_unit.try_lock(); - unit.invalidate(); - } - - #[must_use] - pub fn ldst_unit_response_buffer_full(&self) -> bool { - self.load_store_unit.try_lock().response_buffer_full() - } - - #[must_use] - pub fn fetch_unit_response_buffer_full(&self) -> bool { - false - } - - pub fn accept_fetch_response(&self, mut fetch: mem_fetch::MemFetch, time: u64) { - fetch.status = mem_fetch::Status::IN_SHADER_FETCHED; - // self.instr_l1_cache.fill(fetch, time); - self.please_fill.lock().push((fetch, time)); - } - - pub fn accept_ldst_unit_response(&self, fetch: mem_fetch::MemFetch, time: u64) { - self.load_store_unit.try_lock().fill(fetch); - } - - #[must_use] - pub fn not_completed(&self) -> usize { - self.num_active_threads - } - - #[must_use] - pub fn is_active(&self) -> bool { - self.num_active_blocks > 0 - } - // pub fn set_kernel(&self, kernel: Arc) { // log::debug!("kernel {} bind to core {:?}", kernel, self.id()); - // *self.current_kernellock() = Some(kernel); + // *self.current_kernel.lock() = Some(kernel); // } - pub fn find_available_hw_thread_id( + #[must_use] + #[inline] + fn find_available_hw_thread_id( &mut self, thread_block_size: usize, occupy: bool, @@ -1730,37 +1768,12 @@ where } } + #[tracing::instrument(name = "core_init_warps_from_traces")] #[inline] - pub fn can_issue_block(&self, kernel: &Kernel) -> bool { - let max_blocks = self.config.max_blocks(kernel).unwrap(); - if self.config.concurrent_kernel_sm { - if max_blocks < 1 { - return false; - } - // self.occupy_resource_for_block(kernel, false); - unimplemented!("concurrent kernel sm model"); - } else { - self.num_active_blocks < max_blocks - } - } - - #[must_use] - #[inline] - pub fn id(&self) -> (usize, usize) { - (self.cluster_id, self.core_id) - } - - #[inline] - pub fn init_warps_from_traces( - &mut self, - kernel: &Arc, - start_warp: usize, - end_warp: usize, - ) { + fn init_warps_from_traces(&mut self, kernel: &Arc, start_warp: usize, end_warp: usize) { debug_assert!(!self.warps.is_empty()); let selected_warps = &mut self.warps[start_warp..end_warp]; for warp in selected_warps.iter_mut() { - // let mut warp = warp.try_borrow_mut().unwrap(); let mut warp = warp.try_lock(); warp.trace_instructions.clear(); warp.kernel = Some(Arc::clone(kernel)); @@ -1775,8 +1788,9 @@ where ); } + 
#[tracing::instrument(name = "core_init_warps")] #[inline] - pub fn init_warps( + fn init_warps( &mut self, block_hw_id: usize, start_thread: usize, @@ -1825,164 +1839,13 @@ where ); self.init_warps_from_traces(kernel, start_warp, end_warp); } - - #[inline] - pub fn reinit(&mut self, start_thread: usize, end_thread: usize, reset_not_completed: bool) { - if reset_not_completed { - self.num_active_warps = 0; - self.num_active_threads = 0; - self.active_thread_mask.fill(false); - self.occupied_block_to_hw_thread_id.clear(); - self.occupied_hw_thread_ids.fill(false); - } - for t in start_thread..end_thread { - self.thread_state[t] = None; - } - let warp_size = self.config.warp_size; - - let start_warp = start_thread / warp_size; - let end_warp = end_thread / warp_size; - log::debug!( - "reset warps {}..{} (threads {}..{})", - start_warp, - end_warp, - start_thread, - end_thread - ); - - for w in start_warp..end_warp { - self.warps[w].try_lock().reset(); - } - } - - // fn set_max_blocks(&mut self, kernel: &Kernel) -> eyre::Result<()> { - // // calculate the max cta count and cta size for local memory address mapping - // self.max_blocks_per_shader = self.config.max_blocks(kernel)?; - // self.thread_block_size = self.config.threads_per_block_padded(kernel); - // Ok(()) - // } - - // pub fn maybe_issue_block(&mut self, kernel: &Arc) { - pub fn issue_block(&mut self, kernel: &Arc) { - log::debug!("core {:?}: issue block", self.id()); - if self.config.concurrent_kernel_sm { - // let occupied = self.occupy_resource_for_block(&*kernel, true); - // assert!(occupied); - unimplemented!("concurrent kernel sm"); - } else { - // calculate the max cta count and cta size for local memory address mapping - self.max_blocks_per_shader = self.config.max_blocks(kernel).unwrap(); - self.thread_block_size = self.config.threads_per_block_padded(kernel); - } - - // kernel.inc_running(); - - // find a free CTA context - let max_blocks_per_core = if self.config.concurrent_kernel_sm { - unimplemented!("concurrent kernel sm"); - // self.config.max_concurrent_blocks_per_core - } else { - self.max_blocks_per_shader - }; - log::debug!( - "core {:?}: free block status: {:?}", - self.id(), - self.block_status - ); - let free_block_hw_id = (0..max_blocks_per_core) - .find(|i| self.block_status[*i] == 0) - .unwrap(); - - // determine hardware threads and warps that will be used for this block - let thread_block_size = kernel.threads_per_block(); - let padded_thread_block_size = self.config.threads_per_block_padded(kernel); - - // hw warp id = hw thread id mod warp size, so we need to find a range - // of hardware thread ids corresponding to an integral number of hardware - // thread ids - let (start_thread, end_thread) = if self.config.concurrent_kernel_sm { - let start_thread = self - .find_available_hw_thread_id(padded_thread_block_size, true) - .unwrap(); - let end_thread = start_thread + thread_block_size; - - assert!(!self - .occupied_block_to_hw_thread_id - .contains_key(&free_block_hw_id)); - self.occupied_block_to_hw_thread_id - .insert(free_block_hw_id, start_thread); - (start_thread, end_thread) - } else { - let start_thread = free_block_hw_id * padded_thread_block_size; - let end_thread = start_thread + thread_block_size; - (start_thread, end_thread) - }; - - // reset state of the selected hardware thread and warp contexts - self.reinit(start_thread, end_thread, false); - - // initalize scalar threads and determine which hardware warps they are - // allocated to bind functional simulation state of threads to hardware 
- // resources (simulation) - let mut warps: WarpMask = BitArray::ZERO; - let block = kernel.current_block().expect("kernel has current block"); - log::debug!( - "core {:?}: issue block {} from kernel {}", - self.id(), - block, - kernel, - ); - let block_id = block.id(); - - let mut num_threads_in_block = 0; - for i in start_thread..end_thread { - self.thread_state[i] = Some(ThreadState { - // block_id: free_block_hw_id, - active: true, - pc: 0, // todo - }); - let warp_id = i / self.config.warp_size; - - // TODO: removed this but is that fine? - if !kernel.no_more_blocks_to_run() { - // if !kernel.more_threads_in_block() { - // kernel.next_thread_iterlock().next(); - // } - // - // // we just incremented the thread id so this is not the same - // if !kernel.more_threads_in_block() { - // kernel.next_block_iterlock().next(); - // *kernel.next_thread_iterlock() = - // kernel.config.block.into_iter().peekable(); - // } - num_threads_in_block += 1; - } - - warps.set(warp_id, true); - } - - self.block_status[free_block_hw_id] = num_threads_in_block; - log::debug!( - "num threads in block {}={} (hw {}) = {}", - block, - block_id, - free_block_hw_id, - num_threads_in_block - ); - - self.barriers - .write() - .allocate_barrier(free_block_hw_id as u64, warps); - - self.init_warps(free_block_hw_id, start_thread, end_thread, block_id, kernel); - self.num_active_blocks += 1; - } } impl crate::engine::cycle::Component for Core where I: ic::Interconnect + Send + 'static, { + #[tracing::instrument(name = "core_cycle")] fn cycle(&mut self, cycle: u64) { // log::debug!( // "{} \tactive={}, not completed={}", @@ -1996,8 +1859,11 @@ where // self.not_completed(), // ); - for (fetch, time) in self.please_fill.lock().drain(..) { - self.instr_l1_cache.fill(fetch, time); + for (target, fetch, time) in self.please_fill.lock().drain(..) { + match target { + FetchResponseTarget::LoadStoreUnit => self.load_store_unit.try_lock().fill(fetch), + FetchResponseTarget::ICache => self.instr_l1_cache.fill(fetch, time), + } } if !self.is_active() && self.not_completed() == 0 { diff --git a/src/interconn.rs b/src/interconn.rs index af42d997..c3e09fcb 100644 --- a/src/interconn.rs +++ b/src/interconn.rs @@ -7,7 +7,7 @@ use std::collections::VecDeque; /// /// Functions are not mutable because the interface should /// implement locking internally -pub trait Interconnect

<P>: Send + Sync + 'static {
+pub trait Interconnect<P>
: std::fmt::Debug + Send + Sync + 'static { fn busy(&self) -> bool; fn push(&self, _src: usize, _dest: usize, _packet: P, _size: u32); diff --git a/src/lib.rs b/src/lib.rs index 18191340..9d262534 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -71,6 +71,7 @@ use crate::sync::{atomic, Arc, Mutex, RwLock}; use bitvec::array::BitArray; use color_eyre::eyre::{self}; use console::style; +use crossbeam::utils::CachePadded; use rayon::prelude::*; use std::collections::{HashMap, VecDeque}; use std::path::{Path, PathBuf}; @@ -117,13 +118,17 @@ pub static TIMINGS: Lazy>> = #[macro_export] macro_rules! timeit { ($name:expr, $call:expr) => {{ - // let start = std::time::Instant::now(); - let res = $call; - // let dur = start.elapsed(); - // let mut timings = $crate::TIMINGSlock(); - // timings.entry($name).or_default().add(dur); - // drop(timings); - res + if true { + $call + } else { + let start = std::time::Instant::now(); + let res = $call; + let dur = start.elapsed(); + let mut timings = $crate::TIMINGS.lock(); + timings.entry($name).or_default().add(dur); + drop(timings); + res + } }}; ($call:expr) => {{ $crate::timeit!(stringify!($call), $call) @@ -141,7 +146,7 @@ pub struct MockSimulator { // clusters: Vec>, clusters: Vec>>>, #[allow(dead_code)] - warp_instruction_unique_uid: Arc, + warp_instruction_unique_uid: Arc>, interconn: Arc, parallel_simulation: bool, @@ -162,6 +167,12 @@ pub struct MockSimulator { partition_replies_in_parallel: usize, } +impl std::fmt::Debug for MockSimulator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MockSimulator").finish() + } +} + pub trait FromConfig { fn from_config(config: &config::GPU) -> Self; } @@ -204,21 +215,14 @@ where let mut mem_sub_partitions = Vec::new(); for partition in &mem_partition_units { - mem_sub_partitions.extend( - partition - .try_read() - // .unwrap() - .sub_partitions - .iter() - .cloned(), - ); + mem_sub_partitions.extend(partition.try_read().sub_partitions.iter().cloned()); } let max_concurrent_kernels = config.max_concurrent_kernels; let running_kernels = Arc::new(RwLock::new(vec![None; max_concurrent_kernels])); let allocations = Arc::new(RwLock::new(Allocations::default())); - let warp_instruction_unique_uid = Arc::new(atomic::AtomicU64::new(0)); + let warp_instruction_unique_uid = Arc::new(CachePadded::new(atomic::AtomicU64::new(0))); let clusters: Vec<_> = (0..config.num_simt_clusters) .map(|i| { let cluster = Cluster::new( @@ -381,21 +385,22 @@ where log::error!( "CTA size (x*y*z) = {threads_per_block}, max supported = {max_threads_per_block}" ); - return Err(eyre::eyre!("kernel block size is too large")); + eyre::bail!("kernel block size is too large"); } let mut running_kernels = self.running_kernels.try_write(); - for running in running_kernels.iter_mut() { - if running.is_none() || running.as_ref().map_or(false, |k| k.done()) { - *running = Some(kernel); - break; - } - } + let free_slot = running_kernels + .iter_mut() + .find(|slot| slot.is_none() || slot.as_ref().map_or(false, |k| k.done())) + .ok_or(eyre::eyre!("no free slot for kernel"))?; + *free_slot = Some(kernel); Ok(()) } - fn issue_block_to_core(&self) { + #[tracing::instrument] + #[inline] + fn issue_block_to_core(&self, cycle: u64) { log::debug!("===> issue block to core"); - let mut last_cluster_issue = self.last_cluster_issue.try_lock(); // .unwrap(); + let mut last_cluster_issue = self.last_cluster_issue.try_lock(); let last_issued = *last_cluster_issue; let num_clusters = self.config.num_simt_clusters; for cluster_idx 
in 0..num_clusters { @@ -404,7 +409,9 @@ where // self.clusters[cluster_idx].try_read().cluster_id // ); let idx = (cluster_idx + last_issued + 1) % num_clusters; - let num_blocks_issued = self.clusters[idx].try_read().issue_block_to_core(self); + let cluster = self.clusters[idx].read(); + // dbg!((idx, cluster.num_active_sms())); + let num_blocks_issued = cluster.issue_block_to_core(self, cycle); log::trace!("cluster[{}] issued {} blocks", idx, num_blocks_issued); if num_blocks_issued > 0 { @@ -423,6 +430,7 @@ where } #[allow(clippy::overly_complex_bool_expr)] + #[tracing::instrument(name = "cycle")] pub fn cycle(&mut self, cycle: u64) { let start_total = Instant::now(); // int clock_mask = next_clock_domain(); @@ -430,23 +438,16 @@ where // shader core loading (pop from ICNT into core) let start = Instant::now(); if self.parallel_simulation { - self.clusters.par_iter().for_each(|cluster| { - cluster - .try_write() - // .unwrap() - .interconn_cycle(cycle) - }); + self.clusters + .par_iter() + .for_each(|cluster| cluster.try_write().interconn_cycle(cycle)); } else { for cluster in &self.clusters { - cluster - .try_write() - // .unwrap() - .interconn_cycle(cycle); + cluster.try_write().interconn_cycle(cycle); } } TIMINGS .lock() - // .unwrap() .entry("cycle::interconn") .or_default() .add(start.elapsed()); @@ -467,7 +468,7 @@ where .par_iter() .enumerate() .for_each(|(i, mem_sub)| { - let mut mem_sub = mem_sub.try_lock(); // .unwrap(); + let mut mem_sub = mem_sub.try_lock(); if let Some(fetch) = mem_sub.top() { let response_packet_size = if fetch.is_write() { fetch.control_size() @@ -502,7 +503,7 @@ where // } } else { for (i, mem_sub) in self.mem_sub_partitions.iter().enumerate() { - let mut mem_sub = mem_sub.try_lock(); // .unwrap(); + let mut mem_sub = mem_sub.try_lock(); { log::debug!("checking sub partition[{i}]:"); log::debug!( @@ -515,7 +516,7 @@ where mem_sub.l2_to_interconn_queue.len(), mem_sub.l2_to_interconn_queue ); - let l2_to_dram_queue = mem_sub.l2_to_dram_queue.try_lock(); // .unwrap(); + let l2_to_dram_queue = mem_sub.l2_to_dram_queue.try_lock(); log::debug!( "\t l2 to dram queue ({:<3}) = {}", l2_to_dram_queue.len(), @@ -566,7 +567,6 @@ where } TIMINGS .lock() - // .unwrap() .entry("cycle::subs") .or_default() .add(start.elapsed()); @@ -596,7 +596,6 @@ where } TIMINGS .lock() - // .unwrap() .entry("cycle::dram") .or_default() .add(start.elapsed()); @@ -613,7 +612,7 @@ where .par_iter() .enumerate() .for_each(|(i, mem_sub)| { - let mut mem_sub = mem_sub.try_lock(); // .unwrap(); + let mut mem_sub = mem_sub.try_lock(); let device = self.config.mem_id_to_device_id(i); // same as full with parameter overload @@ -651,15 +650,14 @@ where // let mut parallel_mem_partition_reqs_per_cycle = 0; // let mut stall_dram_full = 0; for (i, mem_sub) in self.mem_sub_partitions.iter_mut().enumerate() { - // let mut mem_sub = mem_sub.try_borrow_mut().unwrap(); - let mut mem_sub = mem_sub.try_lock(); // .unwrap(); - // move memory request from interconnect into memory partition - // (if not backed up) - // - // Note:This needs to be called in DRAM clock domain if there - // is no L2 cache in the system In the worst case, we may need - // to push SECTOR_CHUNCK_SIZE requests, so ensure you have enough - // buffer for them + let mut mem_sub = mem_sub.try_lock(); + // move memory request from interconnect into memory partition + // (if not backed up) + // + // Note:This needs to be called in DRAM clock domain if there + // is no L2 cache in the system In the worst case, we may need + // to push 
SECTOR_CHUNCK_SIZE requests, so ensure you have enough + // buffer for them let device = self.config.mem_id_to_device_id(i); // same as full with parameter overload @@ -758,7 +756,6 @@ where crate::timeit!(core.write().cycle(cycle)); // TIMINGS // .lock() - // .unwrap() // .entry("core_cycle") // .or_default() // .add(start.elapsed()); @@ -842,11 +839,11 @@ where // continue; // } - let cluster = cluster.try_read(); // .unwrap(); - let mut core_sim_order = cluster.core_sim_order.try_lock(); // .unwrap(); + let cluster = cluster.try_read(); + let mut core_sim_order = cluster.core_sim_order.try_lock(); for core_id in core_sim_order.iter() { - let core = cluster.cores[*core_id].try_read(); // .unwrap(); - let mut port = core.interconn_port.try_lock(); // .unwrap(); + let core = cluster.cores[*core_id].try_read(); + let mut port = core.interconn_port.try_lock(); for (dest, fetch, size) in port.drain(..) { self.interconn .push(core.cluster_id, dest, Packet::Fetch(fetch), size); @@ -897,7 +894,6 @@ where TIMINGS .lock() - // .unwrap() .entry("cycle::core") .or_default() .add(start.elapsed()); @@ -918,7 +914,8 @@ where // } // } // } else { - crate::timeit!(self.issue_block_to_core()); + + crate::timeit!(self.issue_block_to_core(cycle)); // self.decrement_kernel_latency(); // } @@ -952,7 +949,6 @@ where // log::debug!("flushed L2 caches..."); // if l2_config.inner.total_lines() > 0 { // for (i, mem_sub) in self.mem_sub_partitions.iter_mut().enumerate() { - // // let mut mem_sub = mem_sub.try_borrow_mut().unwrap(); // let mut mem_sub = mem_sub.try_lock(); // let num_dirty_lines_flushed = mem_sub.flush_l2(); // log::debug!( @@ -968,7 +964,6 @@ where TIMINGS .lock() - // .unwrap() .entry("cycle::total") .or_default() .add(start_total.elapsed()); @@ -984,7 +979,6 @@ where // let alloc_range = addr..(addr + num_bytes); // self.allocations // .try_borrow_mut() - // .unwrap() // .insert(alloc_range.clone(), name); } @@ -1045,27 +1039,27 @@ where } pub fn stats(&self) -> Stats { - let mut stats: Stats = self.stats.lock().clone(); // .unwrap().clone(); + let mut stats: Stats = self.stats.lock().clone(); for cluster in &self.clusters { - let cluster = cluster.try_read(); // .unwrap(); + let cluster = cluster.try_read(); for core in &cluster.cores { - let core = core.try_read(); // .unwrap(); + let core = core.try_read(); let core_id = core.core_id; - stats.l1i_stats[core_id] = core.instr_l1_cache.stats().try_lock().clone(); // .unwrap().clone(); - let ldst_unit = &core.load_store_unit.try_lock(); // .unwrap(); + stats.l1i_stats[core_id] = core.instr_l1_cache.stats().try_lock().clone(); + let ldst_unit = &core.load_store_unit.try_lock(); let data_l1 = ldst_unit.data_l1.as_ref().unwrap(); - stats.l1d_stats[core_id] = data_l1.stats().try_lock().clone(); // .unwrap().clone(); + stats.l1d_stats[core_id] = data_l1.stats().try_lock().clone(); stats.l1c_stats[core_id] = stats::Cache::default(); stats.l1t_stats[core_id] = stats::Cache::default(); } } for sub in &self.mem_sub_partitions { - let sub = sub.try_lock(); // .unwrap(); + let sub = sub.try_lock(); let l2_cache = sub.l2_cache.as_ref().unwrap(); - stats.l2d_stats[sub.id] = l2_cache.stats().try_lock().clone(); // .unwrap().clone(); + stats.l2d_stats[sub.id] = l2_cache.stats().try_lock().clone(); } stats } @@ -1165,6 +1159,7 @@ where !self.kernels.is_empty() } + #[tracing::instrument] pub fn run_to_completion(&mut self) -> eyre::Result<()> { let mut cycle: u64 = 0; let mut last_state_change: Option<(deadlock::State, u64)> = None; @@ -1286,7 +1281,7 @@ where fn 
finished_kernel(&mut self) -> Option> { // check running kernels // let _active = self.active(); - let mut running_kernels = self.running_kernels.try_write().clone(); // .unwrap(); + let mut running_kernels = self.running_kernels.try_write().clone(); let finished_kernel: Option<&mut Option>> = running_kernels.iter_mut().find(|k| { if let Some(k) = k { diff --git a/src/mem_partition_unit.rs b/src/mem_partition_unit.rs index 39a66bb9..b66eceed 100644 --- a/src/mem_partition_unit.rs +++ b/src/mem_partition_unit.rs @@ -9,7 +9,6 @@ use crate::sync::{Arc, Mutex}; use console::style; use std::collections::VecDeque; -#[derive()] pub struct MemoryPartitionUnit { id: usize, dram: dram::DRAM, @@ -22,6 +21,12 @@ pub struct MemoryPartitionUnit { stats: Arc>, } +impl std::fmt::Debug for MemoryPartitionUnit { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MemoryPartitionUnit").finish() + } +} + impl MemoryPartitionUnit { pub fn new(id: usize, config: Arc, stats: Arc>) -> Self { let num_sub_partitions = config.num_sub_partition_per_memory_channel; @@ -105,6 +110,7 @@ impl MemoryPartitionUnit { sub.set_done(fetch); } + #[tracing::instrument] pub fn simple_dram_cycle(&mut self) { log::debug!("{} ...", style("simple dram cycle").red()); // pop completed memory request from dram and push it to dram-to-L2 queue @@ -137,7 +143,6 @@ impl MemoryPartitionUnit { let dest_global_spid = returned_fetch.sub_partition_id(); let dest_spid = self.global_sub_partition_id_to_local_id(dest_global_spid); - // let mut sub = self.sub_partitions[dest_spid].borrow_mut(); let mut sub = self.sub_partitions[dest_spid].try_lock(); debug_assert_eq!(sub.id, dest_global_spid); @@ -175,7 +180,6 @@ impl MemoryPartitionUnit { let last_issued_partition = self.arbitration_metadata.last_borrower(); for sub_id in 0..self.sub_partitions.len() { let spid = (sub_id + last_issued_partition + 1) % self.sub_partitions.len(); - // let sub = self.sub_partitions[spid].borrow_mut(); let sub = self.sub_partitions[spid].try_lock(); let sub_partition_contention = sub.dram_to_l2_queue.full(); diff --git a/src/mem_sub_partition.rs b/src/mem_sub_partition.rs index c0566a7f..a05504f9 100644 --- a/src/mem_sub_partition.rs +++ b/src/mem_sub_partition.rs @@ -17,7 +17,6 @@ pub const SECTOR_CHUNCK_SIZE: u32 = 4; /// Sector size is 32 bytes width pub const SECTOR_SIZE: u32 = 32; -#[derive()] pub struct MemorySubPartition> { pub id: usize, pub partition_id: usize, @@ -46,6 +45,12 @@ pub struct MemorySubPartition> { memcpy_cycle_offset: u64, } +impl std::fmt::Debug for MemorySubPartition { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MemorySubPartition").finish() + } +} + const NO_FETCHES: VecDeque = VecDeque::new(); impl MemorySubPartition @@ -334,6 +339,7 @@ where todo!("mem sub partition: dram l2 queue full"); } + #[tracing::instrument] pub fn cache_cycle(&mut self, cycle: u64) { use config::CacheWriteAllocatePolicy; use mem_fetch::{AccessKind, Status}; diff --git a/src/operand_collector.rs b/src/operand_collector.rs index 2fe47cb6..23e34129 100644 --- a/src/operand_collector.rs +++ b/src/operand_collector.rs @@ -418,8 +418,8 @@ impl Arbiter { pub fn allocate_reads(&mut self) -> HashMap { // log::trace!("queue: {:?}", &self.queue); - #[cfg(feature = "stats")] - let start = std::time::Instant::now(); + // #[cfg(feature = "stats")] + // let start = std::time::Instant::now(); let num_inputs = self.num_banks; let num_outputs = self.num_collectors; @@ -475,22 +475,22 @@ impl Arbiter { 
// log::trace!("request: {:?}", &Self::compat(&request[bank])); } - #[cfg(feature = "stats")] - { - crate::TIMINGS - .lock() - .entry("allocate_reads_prepare") - .or_default() - .add(start.elapsed()); - } + // #[cfg(feature = "stats")] + // { + // crate::TIMINGS + // .lock() + // .entry("allocate_reads_prepare") + // .or_default() + // .add(start.elapsed()); + // } // log::trace!("inmatch: {:?}", &Self::compat(inmatch)); // wavefront allocator from booksim // loop through diagonals of request matrix - #[cfg(feature = "stats")] - let start = std::time::Instant::now(); + // #[cfg(feature = "stats")] + // let start = std::time::Instant::now(); for p in 0..square { let mut output = (pri + p) % num_outputs; @@ -547,18 +547,18 @@ impl Arbiter { } } - #[cfg(feature = "stats")] - { - crate::TIMINGS - .lock() - .entry("allocate_reads_search_diagonal") - .or_default() - .add(start.elapsed()); - } + // #[cfg(feature = "stats")] + // { + // crate::TIMINGS + // .lock() + // .entry("allocate_reads_search_diagonal") + // .or_default() + // .add(start.elapsed()); + // } // allocated - #[cfg(feature = "stats")] - let start = std::time::Instant::now(); + // #[cfg(feature = "stats")] + // let start = std::time::Instant::now(); log::debug!( "arbiter allocated {} reads ({:?})", @@ -581,14 +581,14 @@ impl Arbiter { self.allocate_bank_for_read(bank, read.clone()); read_ops.insert(bank, read); } - #[cfg(feature = "stats")] - { - crate::TIMINGS - .lock() - .entry("allocate_reads_register_banks") - .or_default() - .add(start.elapsed()); - } + // #[cfg(feature = "stats")] + // { + // crate::TIMINGS + // .lock() + // .entry("allocate_reads_register_banks") + // .or_default() + // .add(start.elapsed()); + // } read_ops } diff --git a/src/parallel/deterministic.rs b/src/parallel/deterministic.rs index b8eafe17..9f59d747 100644 --- a/src/parallel/deterministic.rs +++ b/src/parallel/deterministic.rs @@ -198,7 +198,7 @@ where } } - self.issue_block_to_core(); + self.issue_block_to_core(cycle); // let mut all_threads_complete = true; // if self.config.flush_l1_cache { diff --git a/src/parallel/nondeterministic.rs b/src/parallel/nondeterministic.rs index 00186eba..75c554ca 100644 --- a/src/parallel/nondeterministic.rs +++ b/src/parallel/nondeterministic.rs @@ -1,24 +1,409 @@ #![allow(warnings)] +use crate::ic::ToyInterconnect; +use crate::sync::{Arc, Mutex, RwLock}; use crate::{ config, core, engine::cycle::Component, ic, mem_fetch, mem_sub_partition, MockSimulator, TIMINGS, }; use color_eyre::eyre; use rayon::prelude::*; -use std::sync::Arc; +use std::collections::VecDeque; use std::time::Instant; +#[tracing::instrument] +fn new_serial_cycle( + cycle: u64, + stats: Arc>, + mem_sub_partitions: Vec>>, + mem_partition_units: Vec>, + interconn: Arc, + clusters: Vec>>>, + config: &config::GPU, +) where + Q: crate::fifo::Queue + 'static, + I: ic::Interconnect, + T: std::fmt::Debug, +{ + for cluster in &clusters { + cluster.write().interconn_cycle(cycle); + } + + for (i, mem_sub) in mem_sub_partitions.iter().enumerate() { + let mut mem_sub = mem_sub.try_lock(); + if let Some(fetch) = mem_sub.top() { + let response_packet_size = if fetch.is_write() { + fetch.control_size() + } else { + fetch.size() + }; + let device = config.mem_id_to_device_id(i); + if interconn.has_buffer(device, response_packet_size) { + let mut fetch = mem_sub.pop().unwrap(); + let cluster_id = fetch.cluster_id; + fetch.set_status(mem_fetch::Status::IN_ICNT_TO_SHADER, 0); + let packet = core::Packet::Fetch(fetch); + // fetch.set_return_timestamp(gpu_sim_cycle 
+ gpu_tot_sim_cycle); + // , gpu_sim_cycle + gpu_tot_sim_cycle); + // drop(fetch); + interconn.push(device, cluster_id, packet, response_packet_size); + // self.partition_replies_in_parallel += 1; + } else { + // self.gpu_stall_icnt2sh += 1; + } + } + } + + for (_i, unit) in mem_partition_units.iter().enumerate() { + unit.try_write().simple_dram_cycle(); + } + + for (i, mem_sub) in mem_sub_partitions.iter().enumerate() { + let mut mem_sub = mem_sub.try_lock(); + // move memory request from interconnect into memory partition + // (if not backed up) + // + // Note:This needs to be called in DRAM clock domain if there + // is no L2 cache in the system In the worst case, we may need + // to push SECTOR_CHUNCK_SIZE requests, so ensure you have enough + // buffer for them + let device = config.mem_id_to_device_id(i); + + // same as full with parameter overload + if mem_sub.interconn_to_l2_can_fit(mem_sub_partition::SECTOR_CHUNCK_SIZE as usize) { + if let Some(core::Packet::Fetch(fetch)) = interconn.pop(device) { + log::debug!( + "got new fetch {} for mem sub partition {} ({})", + fetch, + i, + device + ); + + mem_sub.push(fetch, cycle); + // self.parallel_mem_partition_reqs += 1; + } + } else { + log::debug!("SKIP sub partition {} ({}): DRAM full stall", i, device); + #[cfg(feature = "stats")] + { + stats.lock().stall_dram_full += 1; + } + } + // we borrow all of sub here, which is a problem for the cyclic reference in l2 + // interface + mem_sub.cache_cycle(cycle); + } +} + impl MockSimulator where I: ic::Interconnect + 'static, { + #[tracing::instrument] pub fn run_to_completion_parallel_nondeterministic( &mut self, mut run_ahead: usize, ) -> eyre::Result<()> { run_ahead = run_ahead.max(1); + let num_threads: usize = std::env::var("NUM_THREADS") + .ok() + .as_deref() + .map(str::parse) + .transpose()? 
+ .unwrap_or_else(num_cpus::get_physical); + rayon::ThreadPoolBuilder::new() + .num_threads(num_threads) + .build_global() + .unwrap(); + println!("nondeterministic [{run_ahead} run ahead] using RAYON"); + println!("\t => launching {num_threads} worker threads"); + + let num_clusters = self.clusters.len(); + let cores: Vec<( + Arc>>, + Arc>>, + usize, + )> = self + .clusters + .iter() + .flat_map(|cluster| { + cluster + .try_read() + .cores + .iter() + .enumerate() + .map(|(core_id, core)| (Arc::clone(&cluster), Arc::clone(&core), core_id)) + .collect::>() + }) + .collect(); + + let sim_orders: Vec> = self + .clusters + .iter() + .map(|cluster| Arc::clone(&cluster.try_read().core_sim_order)) + .collect(); + let interconn_ports: Vec>> = self + .clusters + .iter() + .map(|cluster| { + cluster + .try_read() + .cores + .iter() + .map(|core| Arc::clone(&core.try_read().interconn_port)) + .collect() + }) + .collect(); + + let cores = Arc::new(cores); + let sim_orders = Arc::new(sim_orders); + let interconn_ports = Arc::new(interconn_ports); + + let use_round_robin = + self.config.simt_core_sim_order == config::SchedulingOrder::RoundRobin; + + let mut cycle: u64 = 0; + + rayon::scope_fifo(|s| { + while (self.commands_left() || self.kernels_left()) && !self.reached_limit(cycle) { + self.process_commands(cycle); + self.launch_kernels(cycle); + + let mut finished_kernel = None; + loop { + if self.reached_limit(cycle) || !self.active() { + break; + } + + let span = tracing::span!(tracing::Level::INFO, "wave", cycle, run_ahead); + let enter = span.enter(); + + rayon::scope(|wave| { + for i in 0..run_ahead { + // run cores in any order + // rayon::scope(|core_scope| { + for (cluster, core, core_id) in cores.iter() { + // core_scope.spawn(move |_| { + wave.spawn(move |_| { + if *core_id == 0 { + cluster.write().interconn_cycle(cycle); + } + + core.write().cycle(cycle); + }); + } + // }); + + // let sim_orders = sim_orders.clone(); + // let interconn_ports = interconn_ports.clone(); + // let interconn = self.interconn.clone(); + // s.spawn_fifo(move |_| { + for cluster_id in 0..num_clusters { + let mut core_sim_order = sim_orders[cluster_id].try_lock(); + for core_id in core_sim_order.iter() { + let mut port = interconn_ports[cluster_id][*core_id].lock(); + for (dest, fetch, size) in port.drain(..) 
{ + self.interconn.push( + cluster_id, + dest, + core::Packet::Fetch(fetch), + size, + ); + } + } + + if use_round_robin { + core_sim_order.rotate_left(1); + } + } + + // after cores complete, run serial cycle + self.serial_cycle(cycle + i as u64); + // new_serial_cycle( + // cycle + i as u64, + // stats: Arc>, + // mem_sub_partitions: + // Vec>>, + // mem_partition_units: + // Vec>, + // interconn: Arc, + // clusters: Vec>>>, + // config: &config::GPU, + // ); + // + // // locks are uncontended now + // self.issue_block_to_core(cycle); + // }); + } + }); + + self.issue_block_to_core(cycle); + + drop(enter); + + cycle += run_ahead as u64; + // cycle += 1; + self.set_cycle(cycle); + + if !self.active() { + finished_kernel = self.finished_kernel(); + if finished_kernel.is_some() { + break; + } + } + } + + if let Some(kernel) = finished_kernel { + self.cleanup_finished_kernel(&kernel); + } + + log::trace!( + "commands left={} kernels left={}", + self.commands_left(), + self.kernels_left() + ); + } + }); + + // let mut cycle: u64 = 0; + // while (self.commands_left() || self.kernels_left()) && !self.reached_limit(cycle) { + // self.process_commands(cycle); + // self.launch_kernels(cycle); + // + // let mut finished_kernel = None; + // loop { + // if self.reached_limit(cycle) || !self.active() { + // break; + // } + // + // let span = tracing::span!(tracing::Level::INFO, "wave", cycle, run_ahead); + // let enter = span.enter(); + // + // // for i in 0..run_ahead { + // let i = 0; + // // TODO: make this in place + // // rayon::in_place_scope_fifo(|s| { + // rayon::scope_fifo(|s| { + // // run cores in any order + // rayon::scope(|core_scope| { + // for core in cores.iter() { + // core_scope.spawn(move |_| { + // core.write().cycle(cycle); + // }); + // } + // }); + // + // // let sim_orders = sim_orders.clone(); + // // let interconn_ports = interconn_ports.clone(); + // // s.spawn_fifo(move |_| { + // for cluster_id in 0..num_clusters { + // let mut core_sim_order = sim_orders[cluster_id].try_lock(); + // for core_id in core_sim_order.iter() { + // let mut port = interconn_ports[cluster_id][*core_id].try_lock(); + // for (dest, fetch, size) in port.drain(..) 
{ + // // self.interconn.push( + // // cluster_id, + // // dest, + // // core::Packet::Fetch(fetch), + // // size, + // // ); + // } + // } + // + // if use_round_robin { + // core_sim_order.rotate_left(1); + // } + // } + // // after cores complete, run serial cycle + // self.serial_cycle(cycle + i as u64); + // // }) + // + // // s.spawn_fifo(|s| { + // // // task s.1 + // // s.spawn_fifo(|s| { + // // // task s.1.1 + // // rayon::scope_fifo(|t| { + // // t.spawn_fifo(|_| ()); // task t.1 + // // t.spawn_fifo(|_| ()); // task t.2 + // // }); + // // }); + // // }); + // // s.spawn_fifo(|s| { // task s.2 + // // }); + // // point mid + // }); + // + // // locks are uncontended now + // self.issue_block_to_core(cycle); + // drop(enter); + // + // cycle += run_ahead as u64; + // self.set_cycle(cycle); + // + // if !self.active() { + // finished_kernel = self.finished_kernel(); + // if finished_kernel.is_some() { + // break; + // } + // } + // } + // + // if let Some(kernel) = finished_kernel { + // self.cleanup_finished_kernel(&kernel); + // } + // + // log::trace!( + // "commands left={} kernels left={}", + // self.commands_left(), + // self.kernels_left() + // ); + // } + + // let clustersx: Vec<_> = self.clusters; + + // let run_core = |(core, core_sim_order): ( + // Arc>>, + // Arc>>, + // )| { + // let run_core = |core: Arc>>| { + // // c.write().cycle(cycle); + // // for c in core { + // // c.write().cycle(cycle); + // // } + // // + // // let mut core_sim_order = core_sim_order.try_lock(); + // // for core_id in core_sim_order.iter() { + // // // let (_core, interconn_port) = &cores[*core_id]; + // // // let mut port = interconn_port.try_lock(); + // // // for (dest, fetch, size) in port.drain(..) { + // // // interconn.push( + // // // *cluster_id, + // // // dest, + // // // core::Packet::Fetch(fetch), + // // // size, + // // // ); + // // // } + // // } + // // + // // if use_round_robin { + // // core_sim_order.rotate_left(1); + // // } + // }; + + log::info!("exit after {cycle} cycles"); + dbg!(&cycle); + + Ok(()) + } + + #[tracing::instrument] + pub fn run_to_completion_parallel_nondeterministic_old( + &mut self, + mut run_ahead: usize, + ) -> eyre::Result<()> { + run_ahead = run_ahead.max(1); + let num_threads: usize = std::env::var("NUM_THREADS") .ok() .as_deref() @@ -30,7 +415,6 @@ where // prefer less cores let cores_per_thread = cores_per_thread.ceil() as usize; // todo: tune this - // let cores: Vec<_> = self.clusters.iter().cloned().collect(); let core_chunks: Vec)>> = self .clusters .chunks(cores_per_thread) @@ -63,21 +447,8 @@ where let core_done: Vec<_> = vec![crossbeam::channel::bounded(1); num_chunks]; - // use std::sync::Semaphore; - // use parking_lot::Condvar; - - // let core_reached: Vec<_> = vec![Semaphore::new(num_chunks); run_ahead]; + let lockstep = true; - // let core_ = Arc::new((Mutex::new(false), Condvar::new())); - // let start_core: Vec<_> = Semaphore::new(num_chunks); - // - // let core_done: Vec<_> = Semaphore::new(num_chunks); - - let lockstep = false; - - // let (start_serial_tx, start_serial_rx) = crossbeam::channel::bounded(1); - // let (serial_done_tx, serial_done_rx) = crossbeam::channel::bounded(1); - // let use_round_robin = self.config.simt_core_sim_order == config::SchedulingOrder::RoundRobin; @@ -106,12 +477,13 @@ where // .iter() // .filter_map(std::option::Option::as_ref) // .all(|k| k.no_more_blocks_to_run()); + tracing::info!("cycle {cycle} + run ahead {i}"); for (core_sim_order, cluster_id, cores) in &clusters { // let mut cluster = 
cluster.read(); // let cores_completed = cluster.not_completed() == 0; // let cluster_done = cores_completed && kernels_completed; - let start = Instant::now(); + // let start = Instant::now(); let cluster_done = false; if !cluster_done { for (core, _) in cores { @@ -140,14 +512,14 @@ where core_sim_order.rotate_left(1); } - #[cfg(feature = "stats")] - { - TIMINGS - .lock() - .entry("parallel::cluster") - .or_default() - .add(start.elapsed()); - } + // #[cfg(feature = "stats")] + // { + // TIMINGS + // .lock() + // .entry("parallel::cluster") + // .or_default() + // .add(start.elapsed()); + // } // issue new blocks // issue_block_to_core @@ -161,7 +533,7 @@ where if lockstep { core_reached_tx[i].send(()).unwrap(); } - std::thread::yield_now(); + // std::thread::yield_now(); } core_done_tx.send(()).unwrap(); @@ -171,76 +543,7 @@ where assert_eq!(core_worker_handles.len(), num_chunks); - // // crossbeam::thread::scope(|s| { s.spawn(move |s| loop { - // std::thread::spawn(move || loop { - // let Ok(cycle) = start_serial_rx.recv() else { - // // println!("cluster {} exited", cluster.try_read().cluster_id); - // break; - // }; - // - // for i in 0..run_ahead { - // // wait until all cores are ready for this - // println!("waiting for cores to reach barrier {i}"); - // for _ in 0..num_cores { - // // let _ = core_reached_rx[i].recv().unwrap(); - // let _ = core_reached[i].1.recv().unwrap(); - // } - // println!("all cores reached reached barrier {i}"); - // log::info!("======== cycle {cycle} ========"); - // log::info!(""); - // - // // collect the core packets pushed to the interconn - // for cluster in &self.clusters { - // let mut cluster = cluster.write(); - // for core_id in &cluster.core_sim_order { - // let core = cluster.cores[*core_id].read(); - // let mut port = core.interconn_portlock(); - // for (dest, fetch, size) in port.drain(..) 
{ - // self.interconn.push( - // core.cluster_id, - // dest, - // core::Packet::Fetch(fetch), - // size, - // ); - // } - // } - // - // if let config::SchedulingOrder::RoundRobin = self.config.simt_core_sim_order { - // cluster.core_sim_order.rotate_left(1); - // } - // } - // - // if self.reached_limit(cycle) || !self.active() { - // break; - // } - // - // self.serial_cycle(cycle + i as u64); - // serial_done_tx.send(()).unwrap(); - // } - // }); - // // }); - let mut cycle: u64 = 0; - // loop { - // // start serial thread - // start_serial_tx.send(cycle).unwrap(); - // - // // start all cores - // for core_idx in 0..num_cores { - // start_core[core_idx].0.send(cycle).unwrap(); - // } - // - // // wait for all cores to finish - // for core_idx in 0..num_cores { - // core_done[core_idx].1.recv().unwrap(); - // } - // - // // wait for serial thread - // serial_done_rx.recv().unwrap(); - // - // cycle += run_ahead as u64; - // } - while (self.commands_left() || self.kernels_left()) && !self.reached_limit(cycle) { self.process_commands(cycle); self.launch_kernels(cycle); @@ -251,12 +554,17 @@ where break; } + let span = tracing::span!(tracing::Level::INFO, "wave", cycle, run_ahead); + let enter = span.enter(); + // start all cores + tracing::warn!("WAVE START"); for core_idx in 0..num_chunks { start_core[core_idx].0.send(cycle).unwrap(); } for i in 0..run_ahead { + tracing::info!("cycle {cycle} + run ahead {i}"); if lockstep { // wait until all cores are ready for this // println!("waiting for cores to reach barrier {i}"); @@ -272,6 +580,18 @@ where // could enforce round robin here crate::timeit!("SERIAL CYCLE", self.serial_cycle(cycle + i as u64)); + + // issue new blocks + // let start = Instant::now(); + // self.issue_block_to_core(); + // #[cfg(feature = "stats")] + // { + // TIMINGS + // .lock() + // .entry("serial::issue_block_to_core") + // .or_default() + // .add(start.elapsed()); + // } } // wait for all cores to finish @@ -279,17 +599,10 @@ where core_done[core_idx].1.recv().unwrap(); } - // issue new blocks - let start = Instant::now(); - self.issue_block_to_core(); - #[cfg(feature = "stats")] - { - TIMINGS - .lock() - .entry("serial::issue_block_to_core") - .or_default() - .add(start.elapsed()); - } + // locks are uncontended now + // self.serial_issue_block_to_core(cycle); + crate::timeit!("SERIAL ISSUE", self.issue_block_to_core(cycle)); + drop(enter); cycle += run_ahead as u64; self.set_cycle(cycle); @@ -320,6 +633,96 @@ where Ok(()) } + #[tracing::instrument] + fn serial_issue_block_to_core(&mut self, cycle: u64) { + // let num_cores = self.cores.len(); + // + // log::debug!( + // "cluster {}: issue block to core for {} cores", + // self.cluster_id, + // num_cores + // ); + // let mut num_blocks_issued = 0; + // + // let mut block_issue_next_core = self.block_issue_next_core.try_lock(); + // + // for core_id in 0..num_cores { + // let core_id = (core_id + *block_issue_next_core + 1) % num_cores; + // // let core = &mut cores[core_id]; + // // THIS KILLS THE PERFORMANCE + // let core = self.cores[core_id].read(); + // + // // let kernel: Option> = if self.config.concurrent_kernel_sm { + // // // always select latest issued kernel + // // // kernel = sim.select_kernel() + // // // sim.select_kernel().map(Arc::clone); + // // unimplemented!("concurrent kernel sm"); + // // } else { + // let mut current_kernel = core.current_kernel.try_lock().clone(); + // let should_select_new_kernel = if let Some(ref current) = current_kernel { + // // if no more blocks left, get new kernel 
once current block completes + // current.no_more_blocks_to_run() && core.not_completed() == 0 + // } else { + // // core was not assigned a kernel yet + // true + // }; + // + // // if let Some(ref current) = current_kernel { + // // log::debug!( + // // "core {}-{}: current kernel {}, more blocks={}, completed={}", + // // self.cluster_id, + // // core_id, + // // current, + // // !current.no_more_blocks_to_run(), + // // core.not_completed() == 0, + // // ); + // // } + // + // // dbg!(&should_select_new_kernel); + // if should_select_new_kernel { + // current_kernel = crate::timeit!(self.select_kernel()); + // // current_kernel = sim.select_kernel(); + // // if let Some(ref k) = current_kernel { + // // log::debug!("kernel {} bind to core {:?}", kernel, self.id()); + // // // core.set_kernel(Arc::clone(k)); + // // } + // } + // + // // current_kernel + // // }; + // + // // if let Some(kernel) = kernel { + // if let Some(kernel) = current_kernel { + // log::debug!( + // "core {}-{}: selected kernel {} more blocks={} can issue={}", + // self.cluster_id, + // core_id, + // kernel, + // !kernel.no_more_blocks_to_run(), + // core.can_issue_block(&kernel), + // ); + // + // let can_issue = !kernel.no_more_blocks_to_run() && core.can_issue_block(&kernel); + // drop(core); + // if can_issue { + // let mut core = self.cores[core_id].write(); + // core.issue_block(&kernel); + // num_blocks_issued += 1; + // *block_issue_next_core = core_id; + // break; + // } + // } else { + // log::debug!( + // "core {}-{}: selected kernel NULL", + // self.cluster_id, + // core.core_id, + // ); + // } + // } + // num_blocks_issued + } + + #[tracing::instrument] fn serial_cycle(&mut self, cycle: u64) { // if false { // let start = Instant::now(); @@ -331,20 +734,20 @@ where // .add(start.elapsed()); // } - let start = Instant::now(); - for cluster in &self.clusters { - cluster.write().interconn_cycle(cycle); - } - #[cfg(feature = "stats")] - { - TIMINGS - .lock() - .entry("serial::interconn_cycle") - .or_default() - .add(start.elapsed()); - } + // let start = Instant::now(); + // for cluster in &self.clusters { + // cluster.write().interconn_cycle(cycle); + // } + // #[cfg(feature = "stats")] + // { + // TIMINGS + // .lock() + // .entry("serial::interconn_cycle") + // .or_default() + // .add(start.elapsed()); + // } - let start = Instant::now(); + // let start = Instant::now(); for (i, mem_sub) in self.mem_sub_partitions.iter().enumerate() { let mut mem_sub = mem_sub.try_lock(); if let Some(fetch) = mem_sub.top() { @@ -370,29 +773,29 @@ where } } } - #[cfg(feature = "stats")] - { - TIMINGS - .lock() - .entry("serial::subs") - .or_default() - .add(start.elapsed()); - } + // #[cfg(feature = "stats")] + // { + // TIMINGS + // .lock() + // .entry("serial::subs") + // .or_default() + // .add(start.elapsed()); + // } - let start = Instant::now(); + // let start = Instant::now(); for (_i, unit) in self.mem_partition_units.iter().enumerate() { unit.try_write().simple_dram_cycle(); } - #[cfg(feature = "stats")] - { - TIMINGS - .lock() - .entry("serial::dram") - .or_default() - .add(start.elapsed()); - } + // #[cfg(feature = "stats")] + // { + // TIMINGS + // .lock() + // .entry("serial::dram") + // .or_default() + // .add(start.elapsed()); + // } - let start = Instant::now(); + // let start = Instant::now(); for (i, mem_sub) in self.mem_sub_partitions.iter().enumerate() { // let mut mem_sub = mem_sub.try_borrow_mut().unwrap(); let mut mem_sub = mem_sub.try_lock(); @@ -429,14 +832,14 @@ where // interface 
mem_sub.cache_cycle(cycle); } - #[cfg(feature = "stats")] - { - TIMINGS - .lock() - .entry("serial::l2") - .or_default() - .add(start.elapsed()); - } + // #[cfg(feature = "stats")] + // { + // TIMINGS + // .lock() + // .entry("serial::l2") + // .or_default() + // .add(start.elapsed()); + // } // let mut all_threads_complete = true; // if self.config.flush_l1_cache { diff --git a/src/scheduler/gto.rs b/src/scheduler/gto.rs index 7a8ee7cb..29fa5daf 100644 --- a/src/scheduler/gto.rs +++ b/src/scheduler/gto.rs @@ -1,6 +1,6 @@ +use crate::sync::{Arc, Mutex, RwLock}; use crate::{config, core::WarpIssuer, scoreboard::Scoreboard, warp}; use std::collections::VecDeque; -use crate::sync::{Arc, Mutex, RwLock}; #[derive(Debug)] pub struct Scheduler { diff --git a/src/scheduler/mod.rs b/src/scheduler/mod.rs index 0fa2aae4..157d07ca 100644 --- a/src/scheduler/mod.rs +++ b/src/scheduler/mod.rs @@ -39,6 +39,12 @@ pub trait Scheduler: Send + Sync + std::fmt::Debug + 'static { fn order_warps(&mut self, core: &dyn WarpIssuer); } +impl std::fmt::Debug for &dyn WarpIssuer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("WarpIssuer").finish() + } +} + #[derive(Debug)] pub struct Base { id: usize, @@ -96,6 +102,7 @@ impl Base { &self.next_cycle_prioritized_warps } + #[tracing::instrument(name = "scheduler_issue")] #[must_use] #[inline] fn issue( @@ -210,7 +217,7 @@ impl Base { ); valid_inst = true; - if self.scoreboard.read().has_collision(warp_id, instr) { + if self.scoreboard.try_read().has_collision(warp_id, instr) { log::debug!( "Warp (warp_id={}, dynamic_warp_id={}) {}", warp_id, diff --git a/src/sync.rs b/src/sync.rs index 27673d98..b90e8b02 100644 --- a/src/sync.rs +++ b/src/sync.rs @@ -3,6 +3,7 @@ pub use std::sync::Arc; #[cfg(feature = "parking_lot")] pub mod parking_lot { + /// A mutex #[repr(transparent)] #[derive(Debug, Default)] @@ -30,6 +31,33 @@ pub mod parking_lot { } } + /// A fair mutex + #[repr(transparent)] + #[derive(Debug, Default)] + pub struct FairMutex(parking_lot::FairMutex); + + impl FairMutex { + #[must_use] + #[inline] + pub fn new(value: T) -> Self { + Self(parking_lot::FairMutex::new(value)) + } + } + + impl FairMutex { + #[must_use] + #[inline] + pub fn lock(&self) -> parking_lot::FairMutexGuard { + self.0.lock() + } + + #[must_use] + #[inline] + pub fn try_lock(&self) -> parking_lot::FairMutexGuard { + self.0.try_lock().unwrap() + } + } + /// A read-write lock #[repr(transparent)] #[derive(Debug, Default)] @@ -136,7 +164,7 @@ pub mod std { } #[cfg(feature = "parking_lot")] -pub use self::parking_lot::{Mutex, RwLock}; +pub use self::parking_lot::{FairMutex as Mutex, RwLock}; #[cfg(not(feature = "parking_lot"))] pub use std::{Mutex, RwLock}; diff --git a/src/warp.rs b/src/warp.rs index bcc04c74..64afc372 100644 --- a/src/warp.rs +++ b/src/warp.rs @@ -1,7 +1,7 @@ +use crate::sync::{Arc, Mutex}; use crate::{instruction::WarpInstruction, kernel::Kernel}; use bitvec::{array::BitArray, BitArr}; use std::collections::VecDeque; -use crate::sync::{Arc, Mutex}; /// Warp size. /// diff --git a/test-apps/test-apps-materialized.yml b/test-apps/test-apps-materialized.yml index cf93dfd7..45616ad6 100755 --- a/test-apps/test-apps-materialized.yml +++ b/test-apps/test-apps-materialized.yml @@ -2,7 +2,7 @@ ## ## AUTO GENERATED! 
DO NOT EDIT ## -## this configuration was materialized from /home/roman/dev/box/test-apps/test-apps.yml on 21/08/2023 23:27:46 +## this configuration was materialized from /home/roman/dev/box/test-apps/test-apps.yml on 22/08/2023 04:42:09 ## config:
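
The nondeterministic run loop introduced in src/parallel/nondeterministic.rs boils down to one pattern: for each wave of `run_ahead` cycles, a rayon scope spawns one task per core, the scope joins, and only then does the main thread run the serial memory-side cycle and issue new blocks, so the serial state is touched while all locks are uncontended. A minimal sketch of that pattern follows; `Core`, `serial_cycle` and `RUN_AHEAD` are hypothetical stand-ins rather than the simulator's real types, and only the rayon and parking_lot crates (both already dependencies of this repository) are assumed.

// Minimal sketch of the per-wave rayon-scope pattern (hypothetical types).
use parking_lot::RwLock;
use std::sync::Arc;

#[derive(Debug, Default)]
struct Core {
    id: usize,
    cycles_run: u64,
}

impl Core {
    fn cycle(&mut self, cycle: u64) {
        // per-core work for this cycle (fetch/decode/issue/execute in the real simulator)
        self.cycles_run = cycle + 1;
        let _ = self.id;
    }
}

fn serial_cycle(cycle: u64) {
    // memory sub-partitions, DRAM and the interconnect would be clocked here,
    // strictly on the main thread, after the core tasks have joined
    let _ = cycle;
}

const RUN_AHEAD: u64 = 4; // cycles the cores may run before re-synchronizing

fn main() {
    let cores: Vec<Arc<RwLock<Core>>> = (0..8)
        .map(|id| Arc::new(RwLock::new(Core { id, cycles_run: 0 })))
        .collect();

    let mut cycle: u64 = 0;
    for _wave in 0..10 {
        for i in 0..RUN_AHEAD {
            // run all cores for one cycle in any order; rayon::scope blocks
            // until every spawned task has finished
            rayon::scope(|wave| {
                for core in &cores {
                    let core = Arc::clone(core);
                    wave.spawn(move |_| core.write().cycle(cycle + i));
                }
            });

            // cores are idle now, so the serial portion runs uncontended
            serial_cycle(cycle + i);
        }
        cycle += RUN_AHEAD;
    }

    assert!(cores.iter().all(|c| c.read().cycles_run == cycle));
}

The design choice this illustrates: nondeterminism is confined to the order in which cores advance within a wave, while everything that must stay ordered (interconnect drain, DRAM, L2, block issue) happens serially between waves, which is why the patch can drop the `.unwrap()`-style lock handling and rely on uncontended `try_lock`/`try_read` calls in the serial section.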