implement populating l2 on cuda mem copies
romnn committed Aug 1, 2023
1 parent 5546daf commit ef76ee9
Showing 21 changed files with 358 additions and 161 deletions.
1 change: 1 addition & 0 deletions WIP.md
@@ -9,6 +9,7 @@
- generate plots and correlation stuff etc
- add flag for playground to run in accelsim compatibility mode

- DONE: support multiple kernel launches
- DONE: fix tracing of multiple kernels
- DONE: add transpose benchmarks
- DONE: most likely need to be modified to allow selecting an implementation)
5 changes: 3 additions & 2 deletions playground/sys/src/ref/data_cache.cc
@@ -586,8 +586,9 @@ enum cache_request_status data_cache::access(new_addr_type addr, mem_fetch *mf,
new_addr_type block_addr = m_config.block_addr(addr);
unsigned cache_index = (unsigned)-1;

logger->debug("data_cache::access({}, write = {}, size = {}, block = {})",
mem_fetch_ptr(mf), wr, mf->get_data_size(), block_addr);
logger->debug(
"data_cache::access({}, write = {}, size = {}, block = {}, time = {})",
mem_fetch_ptr(mf), wr, mf->get_data_size(), block_addr, time);
enum cache_request_status probe_status =
m_tag_array->probe(block_addr, cache_index, mf, mf->is_write(), true);

1 change: 0 additions & 1 deletion playground/sys/src/ref/ldst_unit.cc
@@ -191,7 +191,6 @@ mem_stage_stall_type ldst_unit::process_memory_access_queue_l1cache(
}

void ldst_unit::L1_latency_queue_cycle() {
// throw std::runtime_error("no l1 latency queue");
for (unsigned j = 0; j < m_config->m_L1D_config.l1_banks; j++) {
if ((l1_latency_queue[j][0]) != NULL) {
mem_fetch *mf_next = l1_latency_queue[j][0];
5 changes: 5 additions & 0 deletions playground/sys/src/ref/memory_partition_unit.cc
@@ -36,6 +36,11 @@ void memory_partition_unit::handle_memcpy_to_gpu(
unsigned p = global_sub_partition_id_to_local_id(global_subpart_id);
std::string mystring = mask.to_string<char, std::string::traits_type,
std::string::allocator_type>();
logger->trace(
"copy engine request received for address={}, local_subpart={}, "
"global_subpart={}, sector_mask={}",
addr, p, global_subpart_id, mystring.c_str());

MEMPART_DPRINTF(
"Copy Engine Request Received For Address=%zx, local_subpart=%u, "
"global_subpart=%u, sector_mask=%s \n",
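For context: the point of routing each copied chunk to its owning memory sub-partition is to pre-populate the L2, so that later loads to the copied region hit in cache instead of going to DRAM. Below is a minimal Rust sketch of that idea, built around the `force_tag_access` hook this commit adds to the `Cache` trait in src/ported/cache.rs; the `L2Cache`/`SubPartition` types and their wiring are purely illustrative, not the project's actual API.

```rust
use std::collections::HashMap;

/// Hypothetical sector mask: one bit per 32-byte sector of a 128-byte line.
type SectorMask = [bool; 4];

/// Illustrative stand-in for an L2 cache that can be force-populated.
#[derive(Default)]
struct L2Cache {
    /// block address -> per-sector "valid" bits
    lines: HashMap<u64, SectorMask>,
}

impl L2Cache {
    /// Same idea as the `force_tag_access(addr, time, mask)` hook added to the
    /// `Cache` trait: mark the tag/sectors for `addr` as present without
    /// issuing a real memory access.
    fn force_tag_access(&mut self, addr: u64, _time: u64, mask: SectorMask) {
        let block_addr = addr & !127; // 128-byte line granularity (assumed)
        let line = self.lines.entry(block_addr).or_default();
        for (sector, copied) in mask.iter().enumerate() {
            if *copied {
                line[sector] = true;
            }
        }
    }
}

/// Illustrative sub-partition handler for one 32-byte memcpy chunk.
/// (The reference C++ handler also receives the global sub-partition id.)
struct SubPartition {
    l2: L2Cache,
}

impl SubPartition {
    fn handle_memcpy_to_gpu(&mut self, addr: u64, time: u64, mask: SectorMask) {
        // Pre-populate the L2 tag array for the copied chunk.
        self.l2.force_tag_access(addr, time, mask);
    }
}

fn main() {
    let mut sub = SubPartition { l2: L2Cache::default() };
    let mut mask = [false; 4];
    mask[2] = true; // e.g. a chunk at offset 64 within its 128-byte line
    sub.handle_memcpy_to_gpu(0x140, 0, mask);
    assert!(sub.l2.lines[&0x100][2]);
}
```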
4 changes: 2 additions & 2 deletions playground/sys/src/ref/read_only_cache.cc
@@ -14,9 +14,9 @@ enum cache_request_status read_only_cache::access(

logger->debug(
"{}::read_only_cache::access({}, write = {}, data size = {}, control "
"size = {}, block = {})",
"size = {}, block = {}, time = {})",
name(), addr, mf->is_write(), mf->get_data_size(), mf->get_ctrl_size(),
block_addr);
block_addr, time);

unsigned cache_index = (unsigned)-1;
enum cache_request_status status =
8 changes: 4 additions & 4 deletions playground/sys/src/ref/tag_array.cc
@@ -234,8 +234,8 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time,
// mf->get_alloc_start_addr(), mem_fetch_ptr(mf));
m_dirty--;
}
logger->trace("tag_array::allocate(cache={}, tag={})", idx,
m_config.tag(addr));
logger->trace("tag_array::allocate(cache={}, tag={}, time={})", idx,
m_config.tag(addr), time);
m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr),
time, mf->get_access_sector_mask());
}
@@ -291,8 +291,8 @@ void tag_array::fill(new_addr_type addr, unsigned time,
// assert(status==MISS||status==SECTOR_MISS); // MSHR should have prevented
// redundant memory request
if (status == MISS) {
logger->trace("tag_array::allocate(cache={}, tag={})", idx,
m_config.tag(addr));
logger->trace("tag_array::allocate(cache={}, tag={}, time={})", idx,
m_config.tag(addr), time);
m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr), time,
mask);
} else if (status == SECTOR_MISS) {
37 changes: 23 additions & 14 deletions playground/sys/src/ref/trace_gpgpu_sim.cc
@@ -86,29 +86,38 @@ void trace_gpgpu_sim::init() {
}

void trace_gpgpu_sim::perf_memcpy_to_gpu(size_t dst_start_addr, size_t count) {
logger->info("memcopy: <unnamed> {:>15} ({:>5} f32) to address {:>20}", count,
count / 4, dst_start_addr);

unsigned id = m_allocations.size() + 1; // zero is reserved for instructions
m_allocations.insert(Allocation(id, dst_start_addr, dst_start_addr + count));

if (m_memory_config->m_perf_sim_memcpy) {
// assert(0 && "no sim memcopy");
// if(!m_config.trace_driven_mode) //in trace-driven mode, CUDA runtime
// can start nre data structure at any position assert (dst_start_addr %
// 32
//== 0);

// TODO: add this back in
// for (unsigned counter = 0; counter < count; counter += 32) {
// const unsigned wr_addr = dst_start_addr + counter;
// addrdec_t raw_addr;
// mem_access_sector_mask_t mask;
// mask.set(wr_addr % 128 / 32);
// m_memory_config->m_address_mapping.addrdec_tlx(wr_addr, &raw_addr);
// const unsigned partition_id =
// raw_addr.sub_partition /
// m_memory_config->m_n_sub_partition_per_memory_channel;
// m_memory_partition_unit[partition_id]->handle_memcpy_to_gpu(
// wr_addr, raw_addr.sub_partition, mask);
// }
for (size_t counter = 0; counter < count; counter += 32) {
const size_t wr_addr = dst_start_addr + counter;
addrdec_t raw_addr;

m_memory_config->m_address_mapping.addrdec_tlx(wr_addr, &raw_addr);
const unsigned partition_id =
raw_addr.sub_partition /
m_memory_config->m_n_sub_partition_per_memory_channel;

mem_access_sector_mask_t mask;
mask.set(wr_addr % 128 / 32);

logger->trace(
"memcopy to gpu: copy 32 byte chunk starting at {} to sub partition "
"unit {} of partition unit {} (mask {})",
wr_addr, raw_addr.sub_partition, partition_id, mask_to_string(mask));

m_memory_partition_unit[partition_id]->handle_memcpy_to_gpu(
wr_addr, raw_addr.sub_partition, mask);
}
}
}
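The loop above replaces the previously commented-out reference code: the copy is walked in 32-byte steps, each chunk address is decoded to a raw address and partition, and the sector mask is derived from the chunk's position within its 128-byte line (`wr_addr % 128 / 32`). Here is a small, self-contained Rust sketch of just the chunking and sector arithmetic; the constants and helper names (`LINE_SIZE`, `SECTOR_SIZE`, `memcpy_chunks`) are assumptions for illustration, and the address decoding that picks the partition unit is left out because it depends on the configured mapping.

```rust
const SECTOR_SIZE: u64 = 32; // bytes per sector (assumed; matches `% 128 / 32` above)
const LINE_SIZE: u64 = 128; // bytes per cache line (assumed)

/// Index of the 32-byte sector that `addr` falls into within its 128-byte line.
fn sector_index(addr: u64) -> usize {
    ((addr % LINE_SIZE) / SECTOR_SIZE) as usize
}

/// Walk a copy of `count` bytes starting at `dst_start_addr` in 32-byte chunks,
/// yielding each chunk's address and the sector it maps to.
fn memcpy_chunks(dst_start_addr: u64, count: u64) -> impl Iterator<Item = (u64, usize)> {
    (0..count).step_by(SECTOR_SIZE as usize).map(move |offset| {
        let wr_addr = dst_start_addr + offset;
        (wr_addr, sector_index(wr_addr))
    })
}

fn main() {
    // A 100-byte copy starting at 0x140 is split into four chunks that land in
    // sectors 2, 3, 0 and 1 of their respective lines.
    for (addr, sector) in memcpy_chunks(0x140, 100) {
        println!("chunk at {addr:#x} -> sector {sector}");
    }
}
```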

31 changes: 27 additions & 4 deletions src/ported/addrdec.rs
@@ -409,10 +409,10 @@ mod tests {
use color_eyre::eyre;

macro_rules! diff_assert_all_eq (
($a:expr, $b:expr) => {
($a:expr, $b:expr $(,)?) => {
::pretty_assertions_sorted::assert_eq!($a, $b);
};
($a:expr, $b:expr, $c:expr) => {
($a:expr, $b:expr, $c:expr $(,)?) => {
::pretty_assertions_sorted::assert_eq!($a, $b);
::pretty_assertions_sorted::assert_eq!($b, $c);
};
@@ -423,10 +423,10 @@
);

macro_rules! assert_all_eq (
($a:expr, $b:expr) => {
($a:expr, $b:expr $(,)?) => {
assert_eq!($a, $b);
};
($a:expr, $b:expr, $c:expr) => {
($a:expr, $b:expr, $c:expr $(,)?) => {
assert_eq!($a, $b);
assert_eq!($b, $c);
};
@@ -482,6 +482,29 @@
);
}

#[test]
fn test_tlx_sub_partition() {
use playground::addrdec::AddressTranslation;
let config = GPUConfig::default();
let mapping = config.address_mapping();
let ref_mapping = AddressTranslation::new(
config.num_mem_units as u32,
config.num_sub_partition_per_memory_channel as u32,
);
let addr = 140159034066112;

let tlx_addr = mapping.tlx(addr);
let ref_tlx_addr = ref_mapping.tlx(addr);

diff_assert_all_eq!(
super::DecodedAddress::from(ref_tlx_addr).sub_partition,
tlx_addr.sub_partition,
1,
);
dbg!(tlx_addr);
assert!(false);
}

#[test]
fn test_tlx() {
use playground::addrdec::AddressTranslation;
7 changes: 6 additions & 1 deletion src/ported/cache.rs
@@ -109,6 +109,7 @@ pub trait Cache: Component + CacheBandwidth {
addr: address,
fetch: mem_fetch::MemFetch,
events: &mut Vec<Event>,
time: u64,
) -> RequestStatus {
todo!("cache: access");
}
@@ -121,7 +122,7 @@
todo!("cache: next access");
}

fn fill(&mut self, fetch: mem_fetch::MemFetch) {
fn fill(&mut self, fetch: mem_fetch::MemFetch, time: u64) {
todo!("cache: fill");
}

Expand All @@ -133,6 +134,10 @@ pub trait Cache: Component + CacheBandwidth {
todo!("cache: invalidate");
}

fn force_tag_access(&mut self, addr: address, time: u64, mask: mem_fetch::MemAccessSectorMask) {
todo!("cache: invalidate");
}

fn waiting_for_fill(&self, fetch: &mem_fetch::MemFetch) -> bool {
todo!("cache: waiting for fill");
}
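Note that the trait methods now take the simulated time as an explicit `time: u64` argument instead of each cache sampling its own cycle counter (the `let time = self.inner.cycle.get();` lines are commented out in the `l1` modules further down, and core.rs now samples the cycle once and passes it along). A minimal sketch of the resulting calling convention; `ToyCache` is hypothetical and only the shape of `fill(fetch, time)` mirrors the diff.

```rust
/// Hypothetical cache used only to illustrate the new calling convention.
struct ToyCache {
    last_fill_time: Option<u64>,
}

impl ToyCache {
    /// Same shape as the updated trait method `fill(&mut self, fetch, time)`.
    fn fill(&mut self, _fetch: u64 /* stand-in for mem_fetch::MemFetch */, time: u64) {
        self.last_fill_time = Some(time);
    }
}

fn main() {
    let mut l1i = ToyCache { last_fill_time: None };
    // Mirrors core.rs after this change: the caller samples its cycle counter
    // once and threads it into the cache, rather than the cache reading a clock.
    let current_cycle: u64 = 42;
    l1i.fill(0xdead_beef, current_cycle);
    assert_eq!(l1i.last_fill_time, Some(current_cycle));
}
```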
1 change: 1 addition & 0 deletions src/ported/cache_block.rs
@@ -154,6 +154,7 @@ impl LineCacheBlock {
#[inline]
pub fn set_last_access_time(&mut self, time: u64, _mask: &mem_fetch::MemAccessSectorMask) {
self.last_access_time = time;
// self.last_access_time = self.last_access_time.max(time);
}

#[inline]
15 changes: 10 additions & 5 deletions src/ported/core.rs
@@ -1172,7 +1172,7 @@ where
let mut warp = self.inner.warps[warp_id].try_borrow_mut().unwrap();
if did_exit {
// todo!("first warp did exit");
log::debug!("warp_id = {} exited", &warp_id);
// log::debug!("warp_id = {} exited", &warp_id);
// if warp_id == 3 {
// panic!("warp 3 exited");
// }
@@ -1239,9 +1239,13 @@
cache::RequestStatus::HIT
} else {
let mut events = Vec::new();
self.inner
.instr_l1_cache
.access(ppc as address, fetch, &mut events)
let time = self.inner.cycle.get();
self.inner.instr_l1_cache.access(
ppc as address,
fetch,
&mut events,
time,
)
};

log::debug!("L1I->access(addr={}) -> status = {:?}", ppc, status);
@@ -1645,8 +1649,9 @@
}

pub fn accept_fetch_response(&mut self, mut fetch: mem_fetch::MemFetch) {
let time = self.inner.cycle.get();
fetch.status = mem_fetch::Status::IN_SHADER_FETCHED;
self.inner.instr_l1_cache.fill(fetch);
self.inner.instr_l1_cache.fill(fetch, time);
}

pub fn accept_ldst_unit_response(&self, fetch: mem_fetch::MemFetch) {
4 changes: 2 additions & 2 deletions src/ported/l1/base.rs
@@ -577,7 +577,7 @@ where
/// bandwidth restrictions should be modeled in the caller.
/// TODO: fill could also accept the fetch by value, otherwise we drop the fetch!!
// pub fn fill(&mut self, fetch: &mut mem_fetch::MemFetch) {
pub fn fill(&mut self, mut fetch: mem_fetch::MemFetch) {
pub fn fill(&mut self, mut fetch: mem_fetch::MemFetch, time: u64) {
if self.cache_config.mshr_kind == mshr::Kind::SECTOR_ASSOC {
todo!("sector assoc cache");
let original_fetch = fetch.original_fetch.as_ref().unwrap();
Expand All @@ -603,7 +603,7 @@ where
fetch.data_size = pending.data_size;
fetch.access.addr = pending.addr;

let time = self.cycle.get();
// let time = self.cycle.get();
match self.cache_config.allocate_policy {
config::CacheAllocatePolicy::ON_MISS => {
self.tag_array
28 changes: 19 additions & 9 deletions src/ported/l1/data.rs
@@ -672,6 +672,7 @@ where
cache_index: Option<usize>,
fetch: mem_fetch::MemFetch,
events: &mut Vec<cache::Event>,
time: u64,
) -> cache::RequestStatus {
// dbg!(cache_index, probe_status);
// Each function pointer ( m_[rd/wr]_[hit/miss] ) is set in the
Expand All @@ -680,7 +681,7 @@ where
//
// Function pointers were used to avoid many long conditional
// branches resulting from many cache configuration options.
let time = self.inner.cycle.get();
// let time = self.inner.cycle.get();
let mut access_status = probe_status;
let data_size = fetch.data_size;

@@ -765,6 +766,7 @@
addr: address,
fetch: mem_fetch::MemFetch,
events: &mut Vec<cache::Event>,
time: u64,
) -> cache::RequestStatus {
let super::base::Base {
ref cache_config, ..
Expand All @@ -778,9 +780,9 @@ where
let block_addr = cache_config.block_addr(addr);

log::debug!(
"{}::data_cache::access({fetch}, write = {is_write}, size = {}, block = {block_addr})",
"{}::data_cache::access({fetch}, write = {is_write}, size = {}, block = {block_addr}, time = {})",
self.inner.name,
fetch.data_size,
fetch.data_size, time,
);

let dbg_fetch = fetch.clone();
Expand All @@ -791,8 +793,15 @@ where
.probe(block_addr, &fetch, is_write, true);
// dbg!((cache_index, probe_status));

let access_status =
self.process_tag_probe(is_write, probe_status, addr, cache_index, fetch, events);
let access_status = self.process_tag_probe(
is_write,
probe_status,
addr,
cache_index,
fetch,
events,
time,
);
// dbg!(&access_status);

log::debug!(
@@ -847,8 +856,8 @@
}

// fn fill(&mut self, fetch: &mut mem_fetch::MemFetch) {
fn fill(&mut self, fetch: mem_fetch::MemFetch) {
self.inner.fill(fetch)
fn fill(&mut self, fetch: mem_fetch::MemFetch, time: u64) {
self.inner.fill(fetch, time)
}

fn waiting_for_fill(&self, fetch: &mem_fetch::MemFetch) -> bool {
@@ -1228,11 +1237,12 @@ mod tests {
cluster_id,
);
// let status = l1.access(0x00000000, fetch.clone(), None);
let time = 0;
let mut events = Vec::new();
let status = l1.access(fetch.addr(), fetch.clone(), &mut events);
let status = l1.access(fetch.addr(), fetch.clone(), &mut events, time);
dbg!(&status);
let mut events = Vec::new();
let status = l1.access(fetch.addr(), fetch, &mut events);
let status = l1.access(fetch.addr(), fetch, &mut events, time);
dbg!(&status);

// let mut stats = STATS.lock().unwrap();
9 changes: 5 additions & 4 deletions src/ported/l1/readonly.rs
@@ -96,6 +96,7 @@ where
addr: address,
fetch: mem_fetch::MemFetch,
events: &mut Vec<cache::Event>,
time: u64,
) -> cache::RequestStatus {
use cache::RequestStatus as Status;

@@ -114,19 +115,19 @@
let block_addr = cache_config.block_addr(addr);

log::debug!(
"{}::readonly_cache::access({addr}, write = {}, data size = {}, control size = {}, block = {block_addr})",
"{}::readonly_cache::access({addr}, write = {}, data size = {}, control size = {}, block = {block_addr}, time={})",
self.inner.name,
fetch.is_write(),
fetch.data_size,
fetch.control_size,
time,
);

let is_probe = false;
let (cache_index, probe_status) =
tag_array.probe(block_addr, &fetch, fetch.is_write(), is_probe);
let mut status = Status::RESERVATION_FAIL;

let time = self.inner.cycle.get();
if probe_status == Status::HIT {
// update LRU state
tag_array::AccessStatus { status, .. } = tag_array.access(block_addr, &fetch, time);
Expand Down Expand Up @@ -175,8 +176,8 @@ where
status
}

fn fill(&mut self, fetch: mem_fetch::MemFetch) {
self.inner.fill(fetch);
fn fill(&mut self, fetch: mem_fetch::MemFetch, time: u64) {
self.inner.fill(fetch, time);
}
}

(The remaining changed files in this commit are not shown here.)
