exec: trace generation
romnn committed Aug 17, 2023
1 parent 88ca14e commit bf04f0e
Showing 24 changed files with 776 additions and 442 deletions.
3 changes: 2 additions & 1 deletion WIP.md
@@ -7,11 +7,12 @@

- today:

- add rop delay queue
- execution driven frontend
- record mem fetch latency
- add a few more stats
- refactor events
- lint
- DONE: add rop delay queue

- tomorrow:

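Note on the "rop delay queue" item above (now marked DONE): it refers to a fixed-latency queue in front of the L2/ROP stage. The following is a minimal, self-contained sketch of how such a delay queue can be modeled; the type and method names are illustrative assumptions, not the simulator's actual implementation.

use std::collections::VecDeque;

/// Sketch of a fixed-latency delay queue: a packet pushed at cycle `c`
/// becomes visible to the consumer no earlier than cycle `c + latency`.
struct DelayQueue<T> {
    latency: u64,
    queue: VecDeque<(u64, T)>, // (ready_cycle, payload)
}

impl<T> DelayQueue<T> {
    fn new(latency: u64) -> Self {
        Self { latency, queue: VecDeque::new() }
    }

    /// Enqueue a packet at the current cycle.
    fn push(&mut self, cycle: u64, value: T) {
        self.queue.push_back((cycle + self.latency, value));
    }

    /// Dequeue the oldest packet once its delay has elapsed.
    fn pop(&mut self, cycle: u64) -> Option<T> {
        match self.queue.front() {
            Some((ready, _)) if *ready <= cycle => self.queue.pop_front().map(|(_, v)| v),
            _ => None,
        }
    }
}

fn main() {
    // hypothetical latency value, cf. the l2_rop_latency config option further down
    let mut rop_queue = DelayQueue::new(85);
    rop_queue.push(0, "mem fetch");
    assert!(rop_queue.pop(10).is_none()); // still in flight
    assert_eq!(rop_queue.pop(85), Some("mem fetch"));
}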
18 changes: 9 additions & 9 deletions benches/vectoradd.rs
@@ -28,13 +28,13 @@ pub fn run_box(mut bench_config: BenchmarkConfig) -> eyre::Result<stats::Stats>
std::env::var("PARALLEL").unwrap_or_default().to_lowercase() == "yes";
println!("parallel: {}", bench_config.simulate.parallel);
let stats = validate::simulate::simulate_bench_config(&bench_config)?;
let cycles = stats.sim.cycles;
let _cycles = stats.sim.cycles;
// fast parallel: cycle loop time: 558485 ns
// serial: cycle loop time: 2814591 ns (speedup 5x)
// have 80 cores and 16 threads
//
// parallel: dram cycle time: 229004 ns
println!("");
println!();
let timings = casimu::TIMINGS.lock().unwrap();
let total = timings["total_cycle"].mean();
for (name, dur) in [
@@ -48,7 +48,7 @@ pub fn run_box(mut bench_config: BenchmarkConfig) -> eyre::Result<stats::Stats>
let ms = dur.as_secs_f64() * 1000.0;
println!("{name} time: {ms:.5} ms ({percent:>2.2}%)");
}
println!("");
println!();
Ok(stats)
}

@@ -124,28 +124,28 @@ fn main() -> eyre::Result<()> {
let stats = run_box(black_box(get_bench_config("transpose", 0)?))?;
dbg!(stats.sim);
let box_dur = start.elapsed();
println!("box took:\t\t{:?}", box_dur);
println!("box took:\t\t{box_dur:?}");

let timings = casimu::TIMINGS.lock().unwrap();
let mut timings: Vec<_> = timings.iter().map(|(name, dur)| (name, dur.mean())).collect();
timings.sort_by_key(|(name, dur)| *dur);
timings.sort_by_key(|(_name, dur)| *dur);
for (name, dur) in timings {
println!("{name:>30}: {:>6.5} ms", dur.as_secs_f64() * 1000.0);
}
println!("");
println!();

start = Instant::now();
let _ = run_playground(&black_box(get_bench_config("transpose", 0)?))?;
run_playground(&black_box(get_bench_config("transpose", 0)?))?;
let play_dur = start.elapsed();
println!("play took:\t\t{:?}", play_dur);
println!("play took:\t\t{play_dur:?}");

start = Instant::now();
runtime.block_on(async {
run_accelsim(black_box(get_bench_config("transpose", 0)?)).await?;
Ok::<(), eyre::Report>(())
})?;
let accel_dur = start.elapsed();
println!("accel took:\t\t{:?}", accel_dur);
println!("accel took:\t\t{accel_dur:?}");

println!(
"speedup is :\t\t{:.2}",
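For context on the timing report in run_box above: the hunk that computes `percent` is collapsed, but given that each entry in `casimu::TIMINGS` carries a mean duration and the loop prints it against `total_cycle`, a plausible reading is the helper below. This is a hedged reconstruction; the helper name and the phase name are hypothetical, not taken from the benchmark.

use std::time::Duration;

// Hypothetical helper mirroring the benchmark's reporting loop: print a
// phase's mean duration in milliseconds and as a share of the total cycle time.
fn report_phase(name: &str, dur: Duration, total: Duration) {
    let percent = (dur.as_secs_f64() / total.as_secs_f64()) * 100.0;
    let ms = dur.as_secs_f64() * 1000.0;
    println!("{name} time: {ms:.5} ms ({percent:>2.2}%)");
}

fn main() {
    // illustrative numbers only
    report_phase(
        "cycle::core",
        Duration::from_nanos(300_000),
        Duration::from_nanos(558_485),
    );
}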
213 changes: 119 additions & 94 deletions examples/vectoradd.rs
@@ -1,116 +1,141 @@
#![allow(clippy::cast_precision_loss)]
#![allow(clippy::cast_possible_truncation)]
#![allow(clippy::cast_sign_loss)]
#![allow(
clippy::cast_precision_loss,
clippy::cast_possible_truncation,
clippy::cast_sign_loss
)]

use casimu::exec;
use color_eyre::eyre;
// use num_traits::{Float, NumCast, Zero};

// #[derive(Debug)]
// struct VecAdd<'s, 'a, T> {
// d_a: &'a mut casimu::DevicePtr<'s, 'a, Vec<T>>,
// d_b: &'a mut casimu::DevicePtr<'s, 'a, Vec<T>>,
// d_c: &'a mut casimu::DevicePtr<'s, 'a, Vec<T>>,
// n: usize,
// }
//
// impl<'s, 'a, T> casimu::Kernel for VecAdd<'s, 'a, T>
// where
// T: Float + std::fmt::Debug,
// {
// type Error = std::convert::Infallible;
//
// fn run(&mut self, idx: &casimu::ThreadIndex) -> Result<(), Self::Error> {
// // Get our global thread ID
// // int id = blockIdx.x * blockDim.x + threadIdx.x;
// let id: usize = (idx.block_idx.x * idx.block_dim.x + idx.thread_idx.x) as usize;
//
// // Make sure we do not go out of bounds
// // if (id < n) c[id] = a[id] + b[id];
// // let test2: &(dyn std::ops::IndexMut<usize, Output = T>) = self.d_a;
// if id < self.n {
// self.d_c[id] = self.d_a[id] + self.d_b[id];
// }
// Ok(())
// }
// }
//
// // Number of threads in each thread block
// const BLOCK_SIZE: u32 = 1024;
//
// fn vectoradd<T>(n: usize) -> eyre::Result<()>
// where
// T: Float + Zero + NumCast + std::iter::Sum + std::fmt::Display + std::fmt::Debug,
// {
// // create host vectors
// let mut a: Vec<T> = vec![T::zero(); n];
// let mut b: Vec<T> = vec![T::zero(); n];
// let mut c: Vec<T> = vec![T::zero(); n];
//
// // initialize vectors
// for i in 0..n {
// let angle = T::from(i).unwrap();
// a[i] = angle.sin() * angle.sin();
// b[i] = angle.cos() * angle.cos();
// c[i] = T::zero();
// }
//
// let sim = casimu::Simulation::new();
//
// // allocate memory for each vector on simulated GPU device
// let a_size = a.len() * std::mem::size_of::<T>();
// let b_size = b.len() * std::mem::size_of::<T>();
// let c_size = c.len() * std::mem::size_of::<T>();
// let mut d_a = sim.allocate(&mut a, a_size as u64);
// let mut d_b = sim.allocate(&mut b, b_size as u64);
// let mut d_c = sim.allocate(&mut c, c_size as u64);
//
// // number of thread blocks in grid
// let grid_size = (n as f64 / <f64 as From<_>>::from(BLOCK_SIZE)).ceil() as u32;
//
// let kernel: VecAdd<T> = VecAdd {
// d_a: &mut d_a,
// d_b: &mut d_b,
// d_c: &mut d_c,
// n,
// };
// sim.launch_kernel(grid_size, BLOCK_SIZE, kernel)?;
//
// // sum up vector c and print result divided by n.
// //     this should equal 1 within error
// let total_sum: T = c.into_iter().sum();
// println!(
// "Final sum = {total_sum}; sum/n = {} (should be ~1)\n",
// total_sum / T::from(n).unwrap()
// );
//
// dbg!(&sim.stats.lock().unwrap());
// Ok(())
// }
use num_traits::{Float, NumCast, Zero};

#[derive(Debug)]
struct VecAdd<'s, 'a, T> {
d_a: &'a mut exec::DevicePtr<'s, 'a, Vec<T>>,
d_b: &'a mut exec::DevicePtr<'s, 'a, Vec<T>>,
d_c: &'a mut exec::DevicePtr<'s, 'a, Vec<T>>,
n: usize,
}

impl<'s, 'a, T> exec::Kernel for VecAdd<'s, 'a, T>
where
T: Float + std::fmt::Debug,
{
type Error = std::convert::Infallible;

fn run(&mut self, idx: &exec::ThreadIndex) -> Result<(), Self::Error> {
// Get our global thread ID
// int id = blockIdx.x * blockDim.x + threadIdx.x;
let id: usize = (idx.block_idx.x * idx.block_dim.x + idx.thread_idx.x) as usize;

// Make sure we do not go out of bounds
// if (id < n) c[id] = a[id] + b[id];
// let test2: &(dyn std::ops::IndexMut<usize, Output = T>) = self.d_a;
if id < self.n {
self.d_c[id] = self.d_a[id] + self.d_b[id];
}
Ok(())
}
}

// Number of threads in each thread block
const BLOCK_SIZE: u32 = 1024;

fn vectoradd<T>(n: usize) -> eyre::Result<()>
where
T: Float + Zero + NumCast + std::iter::Sum + std::fmt::Display + std::fmt::Debug,
{
let start = std::time::Instant::now();

// create host vectors
let mut a: Vec<T> = vec![T::zero(); n];
let mut b: Vec<T> = vec![T::zero(); n];
let mut c: Vec<T> = vec![T::zero(); n];

// initialize vectors
for i in 0..n {
let angle = T::from(i).unwrap();
a[i] = angle.sin() * angle.sin();
b[i] = angle.cos() * angle.cos();
c[i] = T::zero();
}

let sim = exec::Simulation::new();

// allocate memory for each vector on simulated GPU device
let a_size = a.len() * std::mem::size_of::<T>();
let b_size = b.len() * std::mem::size_of::<T>();
let c_size = c.len() * std::mem::size_of::<T>();
let mut d_a = sim.allocate(&mut a, a_size as u64, exec::MemorySpace::Global);
let mut d_b = sim.allocate(&mut b, b_size as u64, exec::MemorySpace::Global);
let mut d_c = sim.allocate(&mut c, c_size as u64, exec::MemorySpace::Global);

// number of thread blocks in grid
let grid_size = (n as f64 / <f64 as From<_>>::from(BLOCK_SIZE)).ceil() as u32;

let kernel: VecAdd<T> = VecAdd {
d_a: &mut d_a,
d_b: &mut d_b,
d_c: &mut d_c,
n,
};
sim.launch_kernel(grid_size, BLOCK_SIZE, kernel)?;
let stats = sim.run_to_completion()?;

// sum up vector c and print result divided by n.
// this should equal 1 within error
let total_sum: T = c.into_iter().sum();
println!(
"Final sum = {total_sum}; sum/n = {:.2} (should be ~1)\n",
total_sum / T::from(n).unwrap()
);

// dbg!(&stats.sim);
// dbg!(box_stats.l1i_stats.reduce())
eprintln!("STATS:\n");
eprintln!("DRAM: total reads: {}", &stats.dram.total_reads());
eprintln!("DRAM: total writes: {}", &stats.dram.total_writes());
eprintln!("SIM: {:#?}", &stats.sim);
eprintln!("INSTRUCTIONS: {:#?}", &stats.instructions);
eprintln!("ACCESSES: {:#?}", &stats.accesses);
eprintln!("L1I: {:#?}", &stats.l1i_stats.reduce());
eprintln!("L1D: {:#?}", &stats.l1d_stats.reduce());
eprintln!("L2D: {:#?}", &stats.l2d_stats.reduce());
eprintln!("completed in {:?}", start.elapsed());
Ok(())
}

fn main() -> eyre::Result<()> {
// vectoradd::<f32>(100)?;
env_logger::init();
vectoradd::<f32>(100)?;
Ok(())
}

#[cfg(test)]
mod tests {
use color_eyre::eyre;
use std::path::PathBuf;
use trace_model as model;

#[test]
pub fn test_read_trace() -> eyre::Result<()> {
pub fn trace_instructions() -> eyre::Result<()> {
let traces_dir = PathBuf::from(file!())
.parent()
.unwrap()
.join("../test-apps/vectoradd/traces/vectoradd-100-32-trace");
.join("../results/vectorAdd/vectorAdd-dtype-32-length-100/trace");
dbg!(&traces_dir);
let rmp_trace_file_path = traces_dir.join("trace.msgpack");
let rmp_trace_file_path = traces_dir.join("kernel-0.msgpack");
dbg!(&rmp_trace_file_path);

let sim = casimu::Simulation::new();
sim.read_trace(rmp_trace_file_path)?;

dbg!(&sim.stats.lock().unwrap());
// let sim = casimu::Simulation::new();
// sim.read_trace(rmp_trace_file_path)?;
// dbg!(&sim.stats.lock().unwrap());
let mut reader = utils::fs::open_readable(rmp_trace_file_path)?;
let full_trace: model::MemAccessTrace = rmp_serde::from_read(&mut reader)?;
let warp_traces = full_trace.to_warp_traces();
dbg!(&warp_traces[&(model::Dim::ZERO, 0)]
.iter()
.map(|entry| (&entry.instr_opcode, &entry.active_mask))
.collect::<Vec<_>>());

assert!(false);
Ok(())
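The `trace_instructions` test above currently ends in `assert!(false)` so the decoded warp trace gets printed. As a sketch of what a real assertion over that per-warp instruction stream could look like, reusing the same `trace_model` types, see the function below; the expected opcode prefixes ("LDG", "STG") and the assumption that `instr_opcode` is a plain String are illustrative guesses, not verified against the tracer's output.

use trace_model as model;

// Hypothetical follow-up check on a decoded trace: inspect the instruction
// stream of warp 0 of block (0, 0, 0).
fn check_first_warp(full_trace: &model::MemAccessTrace) {
    let warp_traces = full_trace.to_warp_traces();
    let first_warp = &warp_traces[&(model::Dim::ZERO, 0)];

    // collect opcodes in issue order (assumes instr_opcode is a String)
    let opcodes: Vec<&str> = first_warp
        .iter()
        .map(|entry| entry.instr_opcode.as_str())
        .collect();

    // a vectoradd kernel should load both operands and store the sum
    assert!(opcodes.iter().any(|op| op.starts_with("LDG")));
    assert!(opcodes.iter().any(|op| op.starts_with("STG")));
}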
2 changes: 1 addition & 1 deletion playground/sys/src/ref/gpgpu_sim_config.hpp
@@ -116,7 +116,7 @@ class gpgpu_sim_config : public power_config,
m_memory_config.m_perf_sim_memcpy = true; // true

// gpgpu_l2_rop_latency was 120
m_memory_config.rop_latency = 0;
// m_memory_config.rop_latency = 0;
// dram_latency latency was 100
m_memory_config.dram_latency = 0;
// cannot create the l1 latency queue otherwise (to be removed i guess)
4 changes: 2 additions & 2 deletions src/cache/data.rs
@@ -831,7 +831,7 @@ mod tests {
Command::MemcpyHtoD { .. } => {}
Command::MemAlloc { .. } => {}
Command::KernelLaunch(launch) => {
let kernel = Kernel::from_trace(&trace_dir, launch.clone());
let kernel = Kernel::from_trace(launch.clone(), &trace_dir);
kernels.push_back(kernel);
}
}
@@ -991,7 +991,7 @@ mod tests {
local_mem_base_addr: 140_663_752_491_008,
nvbit_version: "1.5.5".to_string(),
};
let kernel = Kernel::from_trace(trace_dir, launch);
let kernel = Kernel::from_trace(launch, trace_dir);

let trace_instr = trace_model::MemAccessTraceEntry {
cuda_ctx: 0,
6 changes: 3 additions & 3 deletions src/config/accelsim/mod.rs
@@ -196,7 +196,7 @@ impl Config {
let args =
extract_arguments(config.as_ref()).map(|(key, value)| format!("--{key}={value}"));
let args: Vec<String> = ["test".to_string()].into_iter().chain(args).collect();
let config = Self::try_parse_from(&args)?;
let config = Self::try_parse_from(args)?;
Ok(config)
}
}
@@ -215,7 +215,7 @@ mod tests {
let config = std::fs::read_to_string(config_path)?;
diff::assert_eq!(super::Config::parse("")?, super::Config::default());
diff::assert_eq!(
super::Config::parse(&config)?,
super::Config::parse(config)?,
super::Config {
shader_core: super::core::CoreConfig {
specialized_unit_1: "1,4,4,4,4,BRA".to_string(),
@@ -254,7 +254,7 @@ mod tests {
super::Config::default()
);

let mut have = super::Config::parse(&config)?;
let mut have = super::Config::parse(config)?;
let mut want = super::Config {
functional: super::functional::FunctionalConfig {
m_ptx_force_max_capability: 60,
2 changes: 1 addition & 1 deletion src/config/accelsim/sim.rs
@@ -220,7 +220,7 @@ impl Default for SimConfig {
g_visualizer_filename: None,
g_visualizer_zlevel: 6,
stack_size_limit: 1024,
heap_size_limit: 8388608,
heap_size_limit: 8_388_608,
runtime_sync_depth_limit: 2,
runtime_pending_launch_count_limit: 2048,
trace_enabled: false.into(),
5 changes: 4 additions & 1 deletion src/config/mod.rs
@@ -319,6 +319,8 @@ pub struct GPUConfig {
pub log_after_cycle: Option<u64>,
/// Parallel simulation
pub parallel: bool,
/// Deadlock check
pub deadlock_check: bool,

pub linear_to_raw_adress_translation:
std::sync::OnceLock<addrdec::LinearToRawAddressTranslation>,
@@ -527,7 +529,7 @@ pub struct GPUConfig {
/// nbk=16:CCD=2:RRD=6:RCD=12:RAS=28:RP=12:RC=40: CL=12:WL=4:CDLR=5:WR=12:nbkgrp=1:CCDL=0:RTPL=0
pub dram_timing_options: dram::TimingOptions,
/// ROP queue latency (default 85)
pub l2_rop_latency: usize, // 120
pub l2_rop_latency: u64, // 120
/// DRAM latency (default 30)
pub dram_latency: usize, // 100
/// dual_bus_interface (default = 0)
@@ -941,6 +943,7 @@ impl Default for GPUConfig {
Self {
log_after_cycle: None,
parallel: false,
deadlock_check: false,
linear_to_raw_adress_translation: std::sync::OnceLock::new(),
occupancy_sm_number: 60,
max_threads_per_core: 2048,