Skip to content

Commit

Permalink
non deterministic implementation: simpler implementation using rayon s…
Browse files Browse the repository at this point in the history
…copes
  • Loading branch information
romnn committed Aug 22, 2023
1 parent 64e90dc commit 11292cd
Show file tree
Hide file tree
Showing 21 changed files with 1,232 additions and 846 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ Pipfile.lock
cuda_*.run

# code coverage files
/coverage
**/*.profraw

# perf traces
**/perf.data*
**/bench.trace.json
**/flamegraph.svg
/coverage
58 changes: 57 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 8 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ debug-assertions = false
# lto = true
# warning: debug assertions negatively impact the performance of accelsim and play
debug-assertions = false
codegen-units = 10
# codegen-units = 10
opt-level = 3
debug = false
debug = true

[package]
name = "casimu"
Expand Down Expand Up @@ -82,11 +82,17 @@ strum = { version = "0", features = ["derive"] }
phf = { version = "0.11.1", features = ["macros"] }
rangemap = "1"

# synchronization
flume = "0"
crossbeam = "0"
num_cpus = "1"
parking_lot = "0"

# tracing
tracing = "0"
tracing-subscriber = "0"
tracing-chrome = "0"

similar-asserts = "1"

# log4rs = "0"
Expand Down
2 changes: 1 addition & 1 deletion WIP.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
- refactor interconn to couple `has_buffer` and `push` using a single explicit lock
- refactor to get rid of global config but use per component configs
- use traits for common components
- try using native threads and barriers for core simulation
- record mem fetch latency
- add a few more stats
- plot statistics
Expand All @@ -32,6 +31,7 @@

- asynchronously push into file (unordered)

- DONE: try using native threads and barriers for core simulation
- DONE: pipelined simd function unit should not implement simd function unit
- DONE: get rid of global cycle mutex
- DONE: lint
Expand Down
48 changes: 30 additions & 18 deletions benches/vectoradd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,28 +118,47 @@ fn main() -> eyre::Result<()> {
#[allow(unused_imports)]
use std::io::Write;
use std::time::Instant;
use tracing_chrome::ChromeLayerBuilder;
use tracing_subscriber::{prelude::*, registry::Registry};

env_logger::init();
// let mut log_builder = env_logger::Builder::new();
// log_builder.format(|buf, record| writeln!(buf, "{}", record.args()));
let profile = std::env::var("TRACE").unwrap_or_default().to_lowercase() == "yes";

let (bench_name, input_num) = ("transpose", 0); // takes 34 sec (accel same)
let mut generate_trace = if profile {
// tracing_subscriber::fmt::init();
let (chrome_layer, guard) = ChromeLayerBuilder::new().file("bench.trace.json").build();
tracing_subscriber::registry().with(chrome_layer).init();
Some(guard)
} else {
// env_logger::init();
// let mut log_builder = env_logger::Builder::new();
// log_builder.format(|buf, record| writeln!(buf, "{}", record.args()));
None
};

// let (bench_name, input_num) = ("transpose", 0); // takes 34 sec (accel same)

// let (bench_name, input_num) = ("simple_matrixmul", 26); // takes 22 sec

let (bench_name, input_num) = ("matrixmul", 3); // takes 54 sec (accel 76)

// let (bench_name, input_num) = ("vectorAdd", 0);
println!("running {bench_name}@{input_num}");

let runtime = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()?;

let mut start = Instant::now();
let start = Instant::now();
let stats = run_box(black_box(get_bench_config(bench_name, input_num)?))?;
dbg!(&stats.sim);
let box_dur = start.elapsed();
println!("box took:\t\t{box_dur:?}");

drop(generate_trace.take());
if profile {
return Ok(());
}

let timings = casimu::TIMINGS.lock();
println!("sorted by NAME");
for (name, dur) in timings.iter().sorted_by_key(|(name, _dur)| name.clone()) {
Expand All @@ -158,31 +177,24 @@ fn main() -> eyre::Result<()> {
dur.total().as_secs_f64(),
);
}

if let Some(serial_cycle) = timings.get("SERIAL CYCLE") {
println!(
"=> serial only execution time: {:?}",
serial_cycle.mean() * u32::try_from(stats.sim.cycles).unwrap()
);
}
println!();

start = Instant::now();
let start = Instant::now();
run_playground(&black_box(get_bench_config(bench_name, input_num)?))?;
let play_dur = start.elapsed();
println!("play took:\t\t{play_dur:?}");
println!(
"speedup is :\t\t{:.2}",
play_dur.as_secs_f64() / box_dur.as_secs_f64()
);

start = Instant::now();
let start = Instant::now();
runtime.block_on(async {
run_accelsim(black_box(get_bench_config(bench_name, input_num)?)).await?;
Ok::<(), eyre::Report>(())
})?;
let accel_dur = start.elapsed();
println!("accel took:\t\t{accel_dur:?}");

println!(
"speedup is :\t\t{:.2}",
play_dur.as_secs_f64() / box_dur.as_secs_f64()
);
Ok(())
}
2 changes: 1 addition & 1 deletion src/cache/l2.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use crate::sync::{Arc, Mutex};
use crate::{address, config, interconn as ic, mem_fetch};
use std::collections::VecDeque;
use crate::sync::{Arc, Mutex};

/// Generic data cache.
#[derive(Debug)]
Expand Down
2 changes: 1 addition & 1 deletion src/cache/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ pub use readonly::ReadOnly;

use super::{address, mem_fetch};
use crate::config;
use std::collections::VecDeque;
use crate::sync::{Arc, Mutex};
use std::collections::VecDeque;

#[derive(Debug, strum::EnumIter, Clone, Copy, Hash, PartialEq, Eq)]
pub enum RequestStatus {
Expand Down
20 changes: 10 additions & 10 deletions src/cluster.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use super::{config, interconn as ic, kernel::Kernel, mem_fetch, Core, MockSimulator, Packet};
use console::style;
use crossbeam::utils::CachePadded;

use std::collections::VecDeque;

Expand All @@ -8,7 +9,7 @@ use crate::sync::{atomic, Arc, Mutex, RwLock};
#[derive(Debug)]
pub struct Cluster<I> {
pub cluster_id: usize,
pub warp_instruction_unique_uid: Arc<atomic::AtomicU64>,
pub warp_instruction_unique_uid: Arc<CachePadded<atomic::AtomicU64>>,
pub cores: Vec<Arc<RwLock<Core<I>>>>,
pub config: Arc<config::GPU>,
pub stats: Arc<Mutex<stats::Stats>>,
Expand All @@ -26,7 +27,7 @@ where
{
pub fn new(
cluster_id: usize,
warp_instruction_unique_uid: &Arc<atomic::AtomicU64>,
warp_instruction_unique_uid: &Arc<CachePadded<atomic::AtomicU64>>,
allocations: &super::allocation::Ref,
interconn: &Arc<I>,
stats: &Arc<Mutex<stats::Stats>>,
Expand Down Expand Up @@ -75,7 +76,7 @@ where
pub fn num_active_sms(&self) -> usize {
self.cores
.iter()
.filter(|core| core.try_read().active())
.filter(|core| core.try_read().is_active())
.count()
}

Expand All @@ -86,6 +87,7 @@ where
.sum()
}

#[tracing::instrument]
pub fn interconn_cycle(&mut self, cycle: u64) {
use mem_fetch::AccessKind;

Expand All @@ -106,6 +108,7 @@ where
if let Some(fetch) = self.response_fifo.front() {
let core_id = self.config.global_core_id_to_core_id(fetch.core_id);

// we should not fully lock a core as we completely block a full core cycle
let core = self.cores[core_id].read();

match *fetch.access_kind() {
Expand Down Expand Up @@ -198,7 +201,8 @@ where
// }
// }

pub fn issue_block_to_core(&self, sim: &MockSimulator<I>) -> usize {
#[tracing::instrument(name = "cluster_issue_block_to_core")]
pub fn issue_block_to_core(&self, sim: &MockSimulator<I>, cycle: u64) -> usize {
let num_cores = self.cores.len();

log::debug!(
Expand All @@ -212,10 +216,7 @@ where

for core_id in 0..num_cores {
let core_id = (core_id + *block_issue_next_core + 1) % num_cores;
// let core = &mut cores[core_id];
// THIS KILLS THE PERFORMANCE
let core = self.cores[core_id].try_read();
// let core = self.cores[core_id].read();
let core = self.cores[core_id].read();

// let kernel: Option<Arc<Kernel>> = if self.config.concurrent_kernel_sm {
// // always select latest issued kernel
Expand Down Expand Up @@ -270,9 +271,8 @@ where
let can_issue = !kernel.no_more_blocks_to_run() && core.can_issue_block(&kernel);
drop(core);
if can_issue {
// dbg!("core issue");
let mut core = self.cores[core_id].write();
core.issue_block(&kernel);
core.issue_block(&kernel, cycle);
num_blocks_issued += 1;
*block_issue_next_core = core_id;
break;
Expand Down
Loading

0 comments on commit 11292cd

Please sign in to comment.