Skip to content

Commit

Permalink
non deterministic implementation: simpler implementation using rayon s…
Browse files Browse the repository at this point in the history
…copes
  • Loading branch information
romnn committed Aug 22, 2023
1 parent 64e90dc commit 11292cd
Show file tree
Hide file tree
Showing 21 changed files with 1,232 additions and 846 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ Pipfile.lock
cuda_*.run

# code coverage files
/coverage
**/*.profraw

# perf traces
**/perf.data*
**/bench.trace.json
**/flamegraph.svg
/coverage
58 changes: 57 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 8 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ debug-assertions = false
# lto = true
# warning: debug assertions negatively impact the performance of accelsim and play
debug-assertions = false
codegen-units = 10
# codegen-units = 10
opt-level = 3
debug = false
debug = true

[package]
name = "casimu"
Expand Down Expand Up @@ -82,11 +82,17 @@ strum = { version = "0", features = ["derive"] }
phf = { version = "0.11.1", features = ["macros"] }
rangemap = "1"

# synchronization
flume = "0"
crossbeam = "0"
num_cpus = "1"
parking_lot = "0"

# tracing
tracing = "0"
tracing-subscriber = "0"
tracing-chrome = "0"

similar-asserts = "1"

# log4rs = "0"
Expand Down
2 changes: 1 addition & 1 deletion WIP.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
- refactor interconn to couple `has_buffer` and `push` using a single explicit lock
- refactor to get rid of global config but use per component configs
- use traits for common components
- try using native threads and barriers for core simulation
- record mem fetch latency
- add a few more stats
- plot statistics
Expand All @@ -32,6 +31,7 @@

- asynchronously push into file (unordered)

- DONE: try using native threads and barriers for core simulation
- DONE: pipelined simd function unit should not implement simd function unit
- DONE: get rid of global cycle mutex
- DONE: lint
Expand Down
48 changes: 30 additions & 18 deletions benches/vectoradd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,28 +118,47 @@ fn main() -> eyre::Result<()> {
#[allow(unused_imports)]
use std::io::Write;
use std::time::Instant;
use tracing_chrome::ChromeLayerBuilder;
use tracing_subscriber::{prelude::*, registry::Registry};

env_logger::init();
// let mut log_builder = env_logger::Builder::new();
// log_builder.format(|buf, record| writeln!(buf, "{}", record.args()));
let profile = std::env::var("TRACE").unwrap_or_default().to_lowercase() == "yes";

let (bench_name, input_num) = ("transpose", 0); // takes 34 sec (accel same)
let mut generate_trace = if profile {
// tracing_subscriber::fmt::init();
let (chrome_layer, guard) = ChromeLayerBuilder::new().file("bench.trace.json").build();
tracing_subscriber::registry().with(chrome_layer).init();
Some(guard)
} else {
// env_logger::init();
// let mut log_builder = env_logger::Builder::new();
// log_builder.format(|buf, record| writeln!(buf, "{}", record.args()));
None
};

// let (bench_name, input_num) = ("transpose", 0); // takes 34 sec (accel same)

// let (bench_name, input_num) = ("simple_matrixmul", 26); // takes 22 sec

let (bench_name, input_num) = ("matrixmul", 3); // takes 54 sec (accel 76)

// let (bench_name, input_num) = ("vectorAdd", 0);
println!("running {bench_name}@{input_num}");

let runtime = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()?;

let mut start = Instant::now();
let start = Instant::now();
let stats = run_box(black_box(get_bench_config(bench_name, input_num)?))?;
dbg!(&stats.sim);
let box_dur = start.elapsed();
println!("box took:\t\t{box_dur:?}");

drop(generate_trace.take());
if profile {
return Ok(());
}

let timings = casimu::TIMINGS.lock();
println!("sorted by NAME");
for (name, dur) in timings.iter().sorted_by_key(|(name, _dur)| name.clone()) {
Expand All @@ -158,31 +177,24 @@ fn main() -> eyre::Result<()> {
dur.total().as_secs_f64(),
);
}

if let Some(serial_cycle) = timings.get("SERIAL CYCLE") {
println!(
"=> serial only execution time: {:?}",
serial_cycle.mean() * u32::try_from(stats.sim.cycles).unwrap()
);
}
println!();

start = Instant::now();
let start = Instant::now();
run_playground(&black_box(get_bench_config(bench_name, input_num)?))?;
let play_dur = start.elapsed();
println!("play took:\t\t{play_dur:?}");
println!(
"speedup is :\t\t{:.2}",
play_dur.as_secs_f64() / box_dur.as_secs_f64()
);

start = Instant::now();
let start = Instant::now();
runtime.block_on(async {
run_accelsim(black_box(get_bench_config(bench_name, input_num)?)).await?;
Ok::<(), eyre::Report>(())
})?;
let accel_dur = start.elapsed();
println!("accel took:\t\t{accel_dur:?}");

println!(
"speedup is :\t\t{:.2}",
play_dur.as_secs_f64() / box_dur.as_secs_f64()
);
Ok(())
}
2 changes: 1 addition & 1 deletion src/cache/l2.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use crate::sync::{Arc, Mutex};
use crate::{address, config, interconn as ic, mem_fetch};
use std::collections::VecDeque;
use crate::sync::{Arc, Mutex};

/// Generic data cache.
#[derive(Debug)]
Expand Down
2 changes: 1 addition & 1 deletion src/cache/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ pub use readonly::ReadOnly;

use super::{address, mem_fetch};
use crate::config;
use std::collections::VecDeque;
use crate::sync::{Arc, Mutex};
use std::collections::VecDeque;

#[derive(Debug, strum::EnumIter, Clone, Copy, Hash, PartialEq, Eq)]
pub enum RequestStatus {
Expand Down
20 changes: 10 additions & 10 deletions src/cluster.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use super::{config, interconn as ic, kernel::Kernel, mem_fetch, Core, MockSimulator, Packet};
use console::style;
use crossbeam::utils::CachePadded;

use std::collections::VecDeque;

Expand All @@ -8,7 +9,7 @@ use crate::sync::{atomic, Arc, Mutex, RwLock};
#[derive(Debug)]
pub struct Cluster<I> {
pub cluster_id: usize,
pub warp_instruction_unique_uid: Arc<atomic::AtomicU64>,
pub warp_instruction_unique_uid: Arc<CachePadded<atomic::AtomicU64>>,
pub cores: Vec<Arc<RwLock<Core<I>>>>,
pub config: Arc<config::GPU>,
pub stats: Arc<Mutex<stats::Stats>>,
Expand All @@ -26,7 +27,7 @@ where
{
pub fn new(
cluster_id: usize,
warp_instruction_unique_uid: &Arc<atomic::AtomicU64>,
warp_instruction_unique_uid: &Arc<CachePadded<atomic::AtomicU64>>,
allocations: &super::allocation::Ref,
interconn: &Arc<I>,
stats: &Arc<Mutex<stats::Stats>>,
Expand Down Expand Up @@ -75,7 +76,7 @@ where
pub fn num_active_sms(&self) -> usize {
self.cores
.iter()
.filter(|core| core.try_read().active())
.filter(|core| core.try_read().is_active())
.count()
}

Expand All @@ -86,6 +87,7 @@ where
.sum()
}

#[tracing::instrument]
pub fn interconn_cycle(&mut self, cycle: u64) {
use mem_fetch::AccessKind;

Expand All @@ -106,6 +108,7 @@ where
if let Some(fetch) = self.response_fifo.front() {
let core_id = self.config.global_core_id_to_core_id(fetch.core_id);

// we should not fully lock a core as we completely block a full core cycle
let core = self.cores[core_id].read();

match *fetch.access_kind() {
Expand Down Expand Up @@ -198,7 +201,8 @@ where
// }
// }

pub fn issue_block_to_core(&self, sim: &MockSimulator<I>) -> usize {
#[tracing::instrument(name = "cluster_issue_block_to_core")]
pub fn issue_block_to_core(&self, sim: &MockSimulator<I>, cycle: u64) -> usize {
let num_cores = self.cores.len();

log::debug!(
Expand All @@ -212,10 +216,7 @@ where

for core_id in 0..num_cores {
let core_id = (core_id + *block_issue_next_core + 1) % num_cores;
// let core = &mut cores[core_id];
// THIS KILLS THE PERFORMANCE
let core = self.cores[core_id].try_read();
// let core = self.cores[core_id].read();
let core = self.cores[core_id].read();

// let kernel: Option<Arc<Kernel>> = if self.config.concurrent_kernel_sm {
// // always select latest issued kernel
Expand Down Expand Up @@ -270,9 +271,8 @@ where
let can_issue = !kernel.no_more_blocks_to_run() && core.can_issue_block(&kernel);
drop(core);
if can_issue {
// dbg!("core issue");
let mut core = self.cores[core_id].write();
core.issue_block(&kernel);
core.issue_block(&kernel, cycle);
num_blocks_issued += 1;
*block_issue_next_core = core_id;
break;
Expand Down
Loading

0 comments on commit 11292cd

Please sign in to comment.