exec: trace generation
romnn committed Aug 17, 2023
1 parent 88ca14e commit bf04f0e
Showing 24 changed files with 776 additions and 442 deletions.
3 changes: 2 additions & 1 deletion WIP.md
@@ -7,11 +7,12 @@

- today:

- add rop delay queue
- execution driven frontend
- record mem fetch latency
- add a few more stats
- refactor events
- lint
- DONE: add rop delay queue

- tomorrow:

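Note on the "rop delay queue" item above (now marked DONE): it refers to a fixed-latency queue in front of the L2/ROP stage. The following is a minimal, self-contained sketch of how such a delay queue can be modeled; the type and method names are illustrative assumptions, not the simulator's actual implementation.

use std::collections::VecDeque;

/// Sketch of a fixed-latency delay queue: a packet pushed at cycle `c`
/// becomes visible to the consumer no earlier than cycle `c + latency`.
struct DelayQueue<T> {
    latency: u64,
    queue: VecDeque<(u64, T)>, // (ready_cycle, payload)
}

impl<T> DelayQueue<T> {
    fn new(latency: u64) -> Self {
        Self { latency, queue: VecDeque::new() }
    }

    /// Enqueue a packet at the current cycle.
    fn push(&mut self, cycle: u64, value: T) {
        self.queue.push_back((cycle + self.latency, value));
    }

    /// Dequeue the oldest packet once its delay has elapsed.
    fn pop(&mut self, cycle: u64) -> Option<T> {
        match self.queue.front() {
            Some((ready, _)) if *ready <= cycle => self.queue.pop_front().map(|(_, v)| v),
            _ => None,
        }
    }
}

fn main() {
    // hypothetical latency value, cf. the l2_rop_latency config option further down
    let mut rop_queue = DelayQueue::new(85);
    rop_queue.push(0, "mem fetch");
    assert!(rop_queue.pop(10).is_none()); // still in flight
    assert_eq!(rop_queue.pop(85), Some("mem fetch"));
}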
18 changes: 9 additions & 9 deletions benches/vectoradd.rs
@@ -28,13 +28,13 @@ pub fn run_box(mut bench_config: BenchmarkConfig) -> eyre::Result<stats::Stats>
std::env::var("PARALLEL").unwrap_or_default().to_lowercase() == "yes";
println!("parallel: {}", bench_config.simulate.parallel);
let stats = validate::simulate::simulate_bench_config(&bench_config)?;
let cycles = stats.sim.cycles;
let _cycles = stats.sim.cycles;
// fast parallel: cycle loop time: 558485 ns
// serial: cycle loop time: 2814591 ns (speedup 5x)
// have 80 cores and 16 threads
//
// parallel: dram cycle time: 229004 ns
println!("");
println!();
let timings = casimu::TIMINGS.lock().unwrap();
let total = timings["total_cycle"].mean();
for (name, dur) in [
@@ -48,7 +48,7 @@ pub fn run_box(mut bench_config: BenchmarkConfig) -> eyre::Result<stats::Stats>
let ms = dur.as_secs_f64() * 1000.0;
println!("{name} time: {ms:.5} ms ({percent:>2.2}%)");
}
println!("");
println!();
Ok(stats)
}

@@ -124,28 +124,28 @@ fn main() -> eyre::Result<()> {
let stats = run_box(black_box(get_bench_config("transpose", 0)?))?;
dbg!(stats.sim);
let box_dur = start.elapsed();
println!("box took:\t\t{:?}", box_dur);
println!("box took:\t\t{box_dur:?}");

let timings = casimu::TIMINGS.lock().unwrap();
let mut timings: Vec<_> = timings.iter().map(|(name, dur)| (name, dur.mean())).collect();
timings.sort_by_key(|(name, dur)| *dur);
timings.sort_by_key(|(_name, dur)| *dur);
for (name, dur) in timings {
println!("{name:>30}: {:>6.5} ms", dur.as_secs_f64() * 1000.0);
}
println!("");
println!();

start = Instant::now();
let _ = run_playground(&black_box(get_bench_config("transpose", 0)?))?;
run_playground(&black_box(get_bench_config("transpose", 0)?))?;
let play_dur = start.elapsed();
println!("play took:\t\t{:?}", play_dur);
println!("play took:\t\t{play_dur:?}");

start = Instant::now();
runtime.block_on(async {
run_accelsim(black_box(get_bench_config("transpose", 0)?)).await?;
Ok::<(), eyre::Report>(())
})?;
let accel_dur = start.elapsed();
println!("accel took:\t\t{:?}", accel_dur);
println!("accel took:\t\t{accel_dur:?}");

println!(
"speedup is :\t\t{:.2}",
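For context on the timing report in run_box above: the hunk that computes `percent` is collapsed, but given that each entry in `casimu::TIMINGS` carries a mean duration and the loop prints it against `total_cycle`, a plausible reading is the helper below. This is a hedged reconstruction; the helper name and the phase name are hypothetical, not taken from the benchmark.

use std::time::Duration;

// Hypothetical helper mirroring the benchmark's reporting loop: print a
// phase's mean duration in milliseconds and as a share of the total cycle time.
fn report_phase(name: &str, dur: Duration, total: Duration) {
    let percent = (dur.as_secs_f64() / total.as_secs_f64()) * 100.0;
    let ms = dur.as_secs_f64() * 1000.0;
    println!("{name} time: {ms:.5} ms ({percent:>2.2}%)");
}

fn main() {
    // illustrative numbers only
    report_phase(
        "cycle::core",
        Duration::from_nanos(300_000),
        Duration::from_nanos(558_485),
    );
}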
213 changes: 119 additions & 94 deletions examples/vectoradd.rs
@@ -1,116 +1,141 @@
#![allow(clippy::cast_precision_loss)]
#![allow(clippy::cast_possible_truncation)]
#![allow(clippy::cast_sign_loss)]
#![allow(
clippy::cast_precision_loss,
clippy::cast_possible_truncation,
clippy::cast_sign_loss
)]

use casimu::exec;
use color_eyre::eyre;
// use num_traits::{Float, NumCast, Zero};

// #[derive(Debug)]
// struct VecAdd<'s, 'a, T> {
// d_a: &'a mut casimu::DevicePtr<'s, 'a, Vec<T>>,
// d_b: &'a mut casimu::DevicePtr<'s, 'a, Vec<T>>,
// d_c: &'a mut casimu::DevicePtr<'s, 'a, Vec<T>>,
// n: usize,
// }
//
// impl<'s, 'a, T> casimu::Kernel for VecAdd<'s, 'a, T>
// where
// T: Float + std::fmt::Debug,
// {
// type Error = std::convert::Infallible;
//
// fn run(&mut self, idx: &casimu::ThreadIndex) -> Result<(), Self::Error> {
// // Get our global thread ID
// // int id = blockIdx.x * blockDim.x + threadIdx.x;
// let id: usize = (idx.block_idx.x * idx.block_dim.x + idx.thread_idx.x) as usize;
//
// // Make sure we do not go out of bounds
// // if (id < n) c[id] = a[id] + b[id];
// // let test2: &(dyn std::ops::IndexMut<usize, Output = T>) = self.d_a;
// if id < self.n {
// self.d_c[id] = self.d_a[id] + self.d_b[id];
// }
// Ok(())
// }
// }
//
// // Number of threads in each thread block
// const BLOCK_SIZE: u32 = 1024;
//
// fn vectoradd<T>(n: usize) -> eyre::Result<()>
// where
// T: Float + Zero + NumCast + std::iter::Sum + std::fmt::Display + std::fmt::Debug,
// {
// // create host vectors
// let mut a: Vec<T> = vec![T::zero(); n];
// let mut b: Vec<T> = vec![T::zero(); n];
// let mut c: Vec<T> = vec![T::zero(); n];
//
// // initialize vectors
// for i in 0..n {
// let angle = T::from(i).unwrap();
// a[i] = angle.sin() * angle.sin();
// b[i] = angle.cos() * angle.cos();
// c[i] = T::zero();
// }
//
// let sim = casimu::Simulation::new();
//
// // allocate memory for each vector on simulated GPU device
// let a_size = a.len() * std::mem::size_of::<T>();
// let b_size = b.len() * std::mem::size_of::<T>();
// let c_size = c.len() * std::mem::size_of::<T>();
// let mut d_a = sim.allocate(&mut a, a_size as u64);
// let mut d_b = sim.allocate(&mut b, b_size as u64);
// let mut d_c = sim.allocate(&mut c, c_size as u64);
//
// // number of thread blocks in grid
// let grid_size = (n as f64 / <f64 as From<_>>::from(BLOCK_SIZE)).ceil() as u32;
//
// let kernel: VecAdd<T> = VecAdd {
// d_a: &mut d_a,
// d_b: &mut d_b,
// d_c: &mut d_c,
// n,
// };
// sim.launch_kernel(grid_size, BLOCK_SIZE, kernel)?;
//
// // sum up vector c and print result divided by n.
// //     this should equal 1 within error
// let total_sum: T = c.into_iter().sum();
// println!(
// "Final sum = {total_sum}; sum/n = {} (should be ~1)\n",
// total_sum / T::from(n).unwrap()
// );
//
// dbg!(&sim.stats.lock().unwrap());
// Ok(())
// }
use num_traits::{Float, NumCast, Zero};

#[derive(Debug)]
struct VecAdd<'s, 'a, T> {
d_a: &'a mut exec::DevicePtr<'s, 'a, Vec<T>>,
d_b: &'a mut exec::DevicePtr<'s, 'a, Vec<T>>,
d_c: &'a mut exec::DevicePtr<'s, 'a, Vec<T>>,
n: usize,
}

impl<'s, 'a, T> exec::Kernel for VecAdd<'s, 'a, T>
where
T: Float + std::fmt::Debug,
{
type Error = std::convert::Infallible;

fn run(&mut self, idx: &exec::ThreadIndex) -> Result<(), Self::Error> {
// Get our global thread ID
// int id = blockIdx.x * blockDim.x + threadIdx.x;
let id: usize = (idx.block_idx.x * idx.block_dim.x + idx.thread_idx.x) as usize;

// Make sure we do not go out of bounds
// if (id < n) c[id] = a[id] + b[id];
// let test2: &(dyn std::ops::IndexMut<usize, Output = T>) = self.d_a;
if id < self.n {
self.d_c[id] = self.d_a[id] + self.d_b[id];
}
Ok(())
}
}

// Number of threads in each thread block
const BLOCK_SIZE: u32 = 1024;

fn vectoradd<T>(n: usize) -> eyre::Result<()>
where
T: Float + Zero + NumCast + std::iter::Sum + std::fmt::Display + std::fmt::Debug,
{
let start = std::time::Instant::now();

// create host vectors
let mut a: Vec<T> = vec![T::zero(); n];
let mut b: Vec<T> = vec![T::zero(); n];
let mut c: Vec<T> = vec![T::zero(); n];

// initialize vectors
for i in 0..n {
let angle = T::from(i).unwrap();
a[i] = angle.sin() * angle.sin();
b[i] = angle.cos() * angle.cos();
c[i] = T::zero();
}

let sim = exec::Simulation::new();

// allocate memory for each vector on simulated GPU device
let a_size = a.len() * std::mem::size_of::<T>();
let b_size = b.len() * std::mem::size_of::<T>();
let c_size = c.len() * std::mem::size_of::<T>();
let mut d_a = sim.allocate(&mut a, a_size as u64, exec::MemorySpace::Global);
let mut d_b = sim.allocate(&mut b, b_size as u64, exec::MemorySpace::Global);
let mut d_c = sim.allocate(&mut c, c_size as u64, exec::MemorySpace::Global);

// number of thread blocks in grid
let grid_size = (n as f64 / <f64 as From<_>>::from(BLOCK_SIZE)).ceil() as u32;

let kernel: VecAdd<T> = VecAdd {
d_a: &mut d_a,
d_b: &mut d_b,
d_c: &mut d_c,
n,
};
sim.launch_kernel(grid_size, BLOCK_SIZE, kernel)?;
let stats = sim.run_to_completion()?;

// sum up vector c and print result divided by n.
// this should equal 1 within error
let total_sum: T = c.into_iter().sum();
println!(
"Final sum = {total_sum}; sum/n = {:.2} (should be ~1)\n",
total_sum / T::from(n).unwrap()
);

// dbg!(&stats.sim);
// dbg!(box_stats.l1i_stats.reduce())
eprintln!("STATS:\n");
eprintln!("DRAM: total reads: {}", &stats.dram.total_reads());
eprintln!("DRAM: total writes: {}", &stats.dram.total_writes());
eprintln!("SIM: {:#?}", &stats.sim);
eprintln!("INSTRUCTIONS: {:#?}", &stats.instructions);
eprintln!("ACCESSES: {:#?}", &stats.accesses);
eprintln!("L1I: {:#?}", &stats.l1i_stats.reduce());
eprintln!("L1D: {:#?}", &stats.l1d_stats.reduce());
eprintln!("L2D: {:#?}", &stats.l2d_stats.reduce());
eprintln!("completed in {:?}", start.elapsed());
Ok(())
}

fn main() -> eyre::Result<()> {
// vectoradd::<f32>(100)?;
env_logger::init();
vectoradd::<f32>(100)?;
Ok(())
}

#[cfg(test)]
mod tests {
use color_eyre::eyre;
use std::path::PathBuf;
use trace_model as model;

#[test]
pub fn test_read_trace() -> eyre::Result<()> {
pub fn trace_instructions() -> eyre::Result<()> {
let traces_dir = PathBuf::from(file!())
.parent()
.unwrap()
.join("../test-apps/vectoradd/traces/vectoradd-100-32-trace");
.join("../results/vectorAdd/vectorAdd-dtype-32-length-100/trace");
dbg!(&traces_dir);
let rmp_trace_file_path = traces_dir.join("trace.msgpack");
let rmp_trace_file_path = traces_dir.join("kernel-0.msgpack");
dbg!(&rmp_trace_file_path);

let sim = casimu::Simulation::new();
sim.read_trace(rmp_trace_file_path)?;

dbg!(&sim.stats.lock().unwrap());
// let sim = casimu::Simulation::new();
// sim.read_trace(rmp_trace_file_path)?;
// dbg!(&sim.stats.lock().unwrap());
let mut reader = utils::fs::open_readable(rmp_trace_file_path)?;
let full_trace: model::MemAccessTrace = rmp_serde::from_read(&mut reader)?;
let warp_traces = full_trace.to_warp_traces();
dbg!(&warp_traces[&(model::Dim::ZERO, 0)]
.iter()
.map(|entry| (&entry.instr_opcode, &entry.active_mask))
.collect::<Vec<_>>());

assert!(false);
Ok(())
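The `trace_instructions` test above currently ends in `assert!(false)` so the decoded warp trace gets printed. As a sketch of what a real assertion over that per-warp instruction stream could look like, reusing the same `trace_model` types, see the function below; the expected opcode prefixes ("LDG", "STG") and the assumption that `instr_opcode` is a plain String are illustrative guesses, not verified against the tracer's output.

use trace_model as model;

// Hypothetical follow-up check on a decoded trace: inspect the instruction
// stream of warp 0 of block (0, 0, 0).
fn check_first_warp(full_trace: &model::MemAccessTrace) {
    let warp_traces = full_trace.to_warp_traces();
    let first_warp = &warp_traces[&(model::Dim::ZERO, 0)];

    // collect opcodes in issue order (assumes instr_opcode is a String)
    let opcodes: Vec<&str> = first_warp
        .iter()
        .map(|entry| entry.instr_opcode.as_str())
        .collect();

    // a vectoradd kernel should load both operands and store the sum
    assert!(opcodes.iter().any(|op| op.starts_with("LDG")));
    assert!(opcodes.iter().any(|op| op.starts_with("STG")));
}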
2 changes: 1 addition & 1 deletion playground/sys/src/ref/gpgpu_sim_config.hpp
@@ -116,7 +116,7 @@ class gpgpu_sim_config : public power_config,
m_memory_config.m_perf_sim_memcpy = true; // true

// gpgpu_l2_rop_latency was 120
m_memory_config.rop_latency = 0;
// m_memory_config.rop_latency = 0;
// dram_latency latency was 100
m_memory_config.dram_latency = 0;
// cannot create the l1 latency queue otherwise (to be removed i guess)
4 changes: 2 additions & 2 deletions src/cache/data.rs
@@ -831,7 +831,7 @@ mod tests {
Command::MemcpyHtoD { .. } => {}
Command::MemAlloc { .. } => {}
Command::KernelLaunch(launch) => {
let kernel = Kernel::from_trace(&trace_dir, launch.clone());
let kernel = Kernel::from_trace(launch.clone(), &trace_dir);
kernels.push_back(kernel);
}
}
@@ -991,7 +991,7 @@ mod tests {
local_mem_base_addr: 140_663_752_491_008,
nvbit_version: "1.5.5".to_string(),
};
let kernel = Kernel::from_trace(trace_dir, launch);
let kernel = Kernel::from_trace(launch, trace_dir);

let trace_instr = trace_model::MemAccessTraceEntry {
cuda_ctx: 0,
6 changes: 3 additions & 3 deletions src/config/accelsim/mod.rs
@@ -196,7 +196,7 @@ impl Config {
let args =
extract_arguments(config.as_ref()).map(|(key, value)| format!("--{key}={value}"));
let args: Vec<String> = ["test".to_string()].into_iter().chain(args).collect();
let config = Self::try_parse_from(&args)?;
let config = Self::try_parse_from(args)?;
Ok(config)
}
}
@@ -215,7 +215,7 @@ mod tests {
let config = std::fs::read_to_string(config_path)?;
diff::assert_eq!(super::Config::parse("")?, super::Config::default());
diff::assert_eq!(
super::Config::parse(&config)?,
super::Config::parse(config)?,
super::Config {
shader_core: super::core::CoreConfig {
specialized_unit_1: "1,4,4,4,4,BRA".to_string(),
@@ -254,7 +254,7 @@ mod tests {
super::Config::default()
);

let mut have = super::Config::parse(&config)?;
let mut have = super::Config::parse(config)?;
let mut want = super::Config {
functional: super::functional::FunctionalConfig {
m_ptx_force_max_capability: 60,
2 changes: 1 addition & 1 deletion src/config/accelsim/sim.rs
@@ -220,7 +220,7 @@ impl Default for SimConfig {
g_visualizer_filename: None,
g_visualizer_zlevel: 6,
stack_size_limit: 1024,
heap_size_limit: 8388608,
heap_size_limit: 8_388_608,
runtime_sync_depth_limit: 2,
runtime_pending_launch_count_limit: 2048,
trace_enabled: false.into(),
5 changes: 4 additions & 1 deletion src/config/mod.rs
@@ -319,6 +319,8 @@ pub struct GPUConfig {
pub log_after_cycle: Option<u64>,
/// Parallel simulation
pub parallel: bool,
/// Deadlock check
pub deadlock_check: bool,

pub linear_to_raw_adress_translation:
std::sync::OnceLock<addrdec::LinearToRawAddressTranslation>,
@@ -527,7 +529,7 @@ pub struct GPUConfig {
/// nbk=16:CCD=2:RRD=6:RCD=12:RAS=28:RP=12:RC=40: CL=12:WL=4:CDLR=5:WR=12:nbkgrp=1:CCDL=0:RTPL=0
pub dram_timing_options: dram::TimingOptions,
/// ROP queue latency (default 85)
pub l2_rop_latency: usize, // 120
pub l2_rop_latency: u64, // 120
/// DRAM latency (default 30)
pub dram_latency: usize, // 100
/// dual_bus_interface (default = 0)
@@ -941,6 +943,7 @@ impl Default for GPUConfig {
Self {
log_after_cycle: None,
parallel: false,
deadlock_check: false,
linear_to_raw_adress_translation: std::sync::OnceLock::new(),
occupancy_sm_number: 60,
max_threads_per_core: 2048,