implement populating l2 on cuda mem copies
romnn committed Aug 1, 2023
1 parent 5546daf commit ef76ee9
Showing 21 changed files with 358 additions and 161 deletions.
1 change: 1 addition & 0 deletions WIP.md
@@ -9,6 +9,7 @@
- generate plots and correlation stuff etc
- add flag for playground to run in accelsim compatibility mode

- DONE: support multiple kernel launches
- DONE: fix tracing of multiple kernels
- DONE: add transpose benchmarks
- DONE: most likely need to be modified to allow selecting an implementation)
5 changes: 3 additions & 2 deletions playground/sys/src/ref/data_cache.cc
@@ -586,8 +586,9 @@ enum cache_request_status data_cache::access(new_addr_type addr, mem_fetch *mf,
new_addr_type block_addr = m_config.block_addr(addr);
unsigned cache_index = (unsigned)-1;

logger->debug("data_cache::access({}, write = {}, size = {}, block = {})",
mem_fetch_ptr(mf), wr, mf->get_data_size(), block_addr);
logger->debug(
"data_cache::access({}, write = {}, size = {}, block = {}, time = {})",
mem_fetch_ptr(mf), wr, mf->get_data_size(), block_addr, time);
enum cache_request_status probe_status =
m_tag_array->probe(block_addr, cache_index, mf, mf->is_write(), true);

1 change: 0 additions & 1 deletion playground/sys/src/ref/ldst_unit.cc
@@ -191,7 +191,6 @@ mem_stage_stall_type ldst_unit::process_memory_access_queue_l1cache(
}

void ldst_unit::L1_latency_queue_cycle() {
// throw std::runtime_error("no l1 latency queue");
for (unsigned j = 0; j < m_config->m_L1D_config.l1_banks; j++) {
if ((l1_latency_queue[j][0]) != NULL) {
mem_fetch *mf_next = l1_latency_queue[j][0];
5 changes: 5 additions & 0 deletions playground/sys/src/ref/memory_partition_unit.cc
@@ -36,6 +36,11 @@ void memory_partition_unit::handle_memcpy_to_gpu(
unsigned p = global_sub_partition_id_to_local_id(global_subpart_id);
std::string mystring = mask.to_string<char, std::string::traits_type,
std::string::allocator_type>();
logger->trace(
"copy engine request received for address={}, local_subpart={}, "
"global_subpart={}, sector_mask={}",
addr, p, global_subpart_id, mystring.c_str());

MEMPART_DPRINTF(
"Copy Engine Request Received For Address=%zx, local_subpart=%u, "
"global_subpart=%u, sector_mask=%s \n",
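For context: the point of routing each copied chunk to its owning memory sub-partition is to pre-populate the L2, so that later loads to the copied region hit in cache instead of going to DRAM. Below is a minimal Rust sketch of that idea, built around the `force_tag_access` hook this commit adds to the `Cache` trait in src/ported/cache.rs; the `L2Cache`/`SubPartition` types and their wiring are purely illustrative, not the project's actual API.

```rust
use std::collections::HashMap;

/// Hypothetical sector mask: one bit per 32-byte sector of a 128-byte line.
type SectorMask = [bool; 4];

/// Illustrative stand-in for an L2 cache that can be force-populated.
#[derive(Default)]
struct L2Cache {
    /// block address -> per-sector "valid" bits
    lines: HashMap<u64, SectorMask>,
}

impl L2Cache {
    /// Same idea as the `force_tag_access(addr, time, mask)` hook added to the
    /// `Cache` trait: mark the tag/sectors for `addr` as present without
    /// issuing a real memory access.
    fn force_tag_access(&mut self, addr: u64, _time: u64, mask: SectorMask) {
        let block_addr = addr & !127; // 128-byte line granularity (assumed)
        let line = self.lines.entry(block_addr).or_default();
        for (sector, copied) in mask.iter().enumerate() {
            if *copied {
                line[sector] = true;
            }
        }
    }
}

/// Illustrative sub-partition handler for one 32-byte memcpy chunk.
/// (The reference C++ handler also receives the global sub-partition id.)
struct SubPartition {
    l2: L2Cache,
}

impl SubPartition {
    fn handle_memcpy_to_gpu(&mut self, addr: u64, time: u64, mask: SectorMask) {
        // Pre-populate the L2 tag array for the copied chunk.
        self.l2.force_tag_access(addr, time, mask);
    }
}

fn main() {
    let mut sub = SubPartition { l2: L2Cache::default() };
    let mut mask = [false; 4];
    mask[2] = true; // e.g. a chunk at offset 64 within its 128-byte line
    sub.handle_memcpy_to_gpu(0x140, 0, mask);
    assert!(sub.l2.lines[&0x100][2]);
}
```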
4 changes: 2 additions & 2 deletions playground/sys/src/ref/read_only_cache.cc
@@ -14,9 +14,9 @@ enum cache_request_status read_only_cache::access(

logger->debug(
"{}::read_only_cache::access({}, write = {}, data size = {}, control "
"size = {}, block = {})",
"size = {}, block = {}, time = {})",
name(), addr, mf->is_write(), mf->get_data_size(), mf->get_ctrl_size(),
block_addr);
block_addr, time);

unsigned cache_index = (unsigned)-1;
enum cache_request_status status =
8 changes: 4 additions & 4 deletions playground/sys/src/ref/tag_array.cc
@@ -234,8 +234,8 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time,
// mf->get_alloc_start_addr(), mem_fetch_ptr(mf));
m_dirty--;
}
logger->trace("tag_array::allocate(cache={}, tag={})", idx,
m_config.tag(addr));
logger->trace("tag_array::allocate(cache={}, tag={}, time={})", idx,
m_config.tag(addr), time);
m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr),
time, mf->get_access_sector_mask());
}
@@ -291,8 +291,8 @@ void tag_array::fill(new_addr_type addr, unsigned time,
// assert(status==MISS||status==SECTOR_MISS); // MSHR should have prevented
// redundant memory request
if (status == MISS) {
logger->trace("tag_array::allocate(cache={}, tag={})", idx,
m_config.tag(addr));
logger->trace("tag_array::allocate(cache={}, tag={}, time={})", idx,
m_config.tag(addr), time);
m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr), time,
mask);
} else if (status == SECTOR_MISS) {
37 changes: 23 additions & 14 deletions playground/sys/src/ref/trace_gpgpu_sim.cc
@@ -86,29 +86,38 @@ void trace_gpgpu_sim::init() {
}

void trace_gpgpu_sim::perf_memcpy_to_gpu(size_t dst_start_addr, size_t count) {
logger->info("memcopy: <unnamed> {:>15} ({:>5} f32) to address {:>20}", count,
count / 4, dst_start_addr);

unsigned id = m_allocations.size() + 1; // zero is reserved for instructions
m_allocations.insert(Allocation(id, dst_start_addr, dst_start_addr + count));

if (m_memory_config->m_perf_sim_memcpy) {
// assert(0 && "no sim memcopy");
// if(!m_config.trace_driven_mode) //in trace-driven mode, CUDA runtime
// can start nre data structure at any position assert (dst_start_addr %
// 32
//== 0);

// TODO: add this back in
// for (unsigned counter = 0; counter < count; counter += 32) {
// const unsigned wr_addr = dst_start_addr + counter;
// addrdec_t raw_addr;
// mem_access_sector_mask_t mask;
// mask.set(wr_addr % 128 / 32);
// m_memory_config->m_address_mapping.addrdec_tlx(wr_addr, &raw_addr);
// const unsigned partition_id =
// raw_addr.sub_partition /
// m_memory_config->m_n_sub_partition_per_memory_channel;
// m_memory_partition_unit[partition_id]->handle_memcpy_to_gpu(
// wr_addr, raw_addr.sub_partition, mask);
// }
for (size_t counter = 0; counter < count; counter += 32) {
const size_t wr_addr = dst_start_addr + counter;
addrdec_t raw_addr;

m_memory_config->m_address_mapping.addrdec_tlx(wr_addr, &raw_addr);
const unsigned partition_id =
raw_addr.sub_partition /
m_memory_config->m_n_sub_partition_per_memory_channel;

mem_access_sector_mask_t mask;
mask.set(wr_addr % 128 / 32);

logger->trace(
"memcopy to gpu: copy 32 byte chunk starting at {} to sub partition "
"unit {} of partition unit {} (mask {})",
wr_addr, raw_addr.sub_partition, partition_id, mask_to_string(mask));

m_memory_partition_unit[partition_id]->handle_memcpy_to_gpu(
wr_addr, raw_addr.sub_partition, mask);
}
}
}
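The loop above replaces the previously commented-out reference code: the copy is walked in 32-byte steps, each chunk address is decoded to a raw address and partition, and the sector mask is derived from the chunk's position within its 128-byte line (`wr_addr % 128 / 32`). Here is a small, self-contained Rust sketch of just the chunking and sector arithmetic; the constants and helper names (`LINE_SIZE`, `SECTOR_SIZE`, `memcpy_chunks`) are assumptions for illustration, and the address decoding that picks the partition unit is left out because it depends on the configured mapping.

```rust
const SECTOR_SIZE: u64 = 32; // bytes per sector (assumed; matches `% 128 / 32` above)
const LINE_SIZE: u64 = 128; // bytes per cache line (assumed)

/// Index of the 32-byte sector that `addr` falls into within its 128-byte line.
fn sector_index(addr: u64) -> usize {
    ((addr % LINE_SIZE) / SECTOR_SIZE) as usize
}

/// Walk a copy of `count` bytes starting at `dst_start_addr` in 32-byte chunks,
/// yielding each chunk's address and the sector it maps to.
fn memcpy_chunks(dst_start_addr: u64, count: u64) -> impl Iterator<Item = (u64, usize)> {
    (0..count).step_by(SECTOR_SIZE as usize).map(move |offset| {
        let wr_addr = dst_start_addr + offset;
        (wr_addr, sector_index(wr_addr))
    })
}

fn main() {
    // A 100-byte copy starting at 0x140 is split into four chunks that land in
    // sectors 2, 3, 0 and 1 of their respective lines.
    for (addr, sector) in memcpy_chunks(0x140, 100) {
        println!("chunk at {addr:#x} -> sector {sector}");
    }
}
```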

31 changes: 27 additions & 4 deletions src/ported/addrdec.rs
@@ -409,10 +409,10 @@ mod tests {
use color_eyre::eyre;

macro_rules! diff_assert_all_eq (
($a:expr, $b:expr) => {
($a:expr, $b:expr $(,)?) => {
::pretty_assertions_sorted::assert_eq!($a, $b);
};
($a:expr, $b:expr, $c:expr) => {
($a:expr, $b:expr, $c:expr $(,)?) => {
::pretty_assertions_sorted::assert_eq!($a, $b);
::pretty_assertions_sorted::assert_eq!($b, $c);
};
@@ -423,10 +423,10 @@
);

macro_rules! assert_all_eq (
($a:expr, $b:expr) => {
($a:expr, $b:expr $(,)?) => {
assert_eq!($a, $b);
};
($a:expr, $b:expr, $c:expr) => {
($a:expr, $b:expr, $c:expr $(,)?) => {
assert_eq!($a, $b);
assert_eq!($b, $c);
};
@@ -482,6 +482,29 @@
);
}

#[test]
fn test_tlx_sub_partition() {
use playground::addrdec::AddressTranslation;
let config = GPUConfig::default();
let mapping = config.address_mapping();
let ref_mapping = AddressTranslation::new(
config.num_mem_units as u32,
config.num_sub_partition_per_memory_channel as u32,
);
let addr = 140159034066112;

let tlx_addr = mapping.tlx(addr);
let ref_tlx_addr = ref_mapping.tlx(addr);

diff_assert_all_eq!(
super::DecodedAddress::from(ref_tlx_addr).sub_partition,
tlx_addr.sub_partition,
1,
);
dbg!(tlx_addr);
assert!(false);
}

#[test]
fn test_tlx() {
use playground::addrdec::AddressTranslation;
7 changes: 6 additions & 1 deletion src/ported/cache.rs
@@ -109,6 +109,7 @@ pub trait Cache: Component + CacheBandwidth {
addr: address,
fetch: mem_fetch::MemFetch,
events: &mut Vec<Event>,
time: u64,
) -> RequestStatus {
todo!("cache: access");
}
@@ -121,7 +122,7 @@
todo!("cache: next access");
}

fn fill(&mut self, fetch: mem_fetch::MemFetch) {
fn fill(&mut self, fetch: mem_fetch::MemFetch, time: u64) {
todo!("cache: fill");
}

Expand All @@ -133,6 +134,10 @@ pub trait Cache: Component + CacheBandwidth {
todo!("cache: invalidate");
}

fn force_tag_access(&mut self, addr: address, time: u64, mask: mem_fetch::MemAccessSectorMask) {
todo!("cache: invalidate");
}

fn waiting_for_fill(&self, fetch: &mem_fetch::MemFetch) -> bool {
todo!("cache: waiting for fill");
}
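Note that the trait methods now take the simulated time as an explicit `time: u64` argument instead of each cache sampling its own cycle counter (the `let time = self.inner.cycle.get();` lines are commented out in the `l1` modules further down, and core.rs now samples the cycle once and passes it along). A minimal sketch of the resulting calling convention; `ToyCache` is hypothetical and only the shape of `fill(fetch, time)` mirrors the diff.

```rust
/// Hypothetical cache used only to illustrate the new calling convention.
struct ToyCache {
    last_fill_time: Option<u64>,
}

impl ToyCache {
    /// Same shape as the updated trait method `fill(&mut self, fetch, time)`.
    fn fill(&mut self, _fetch: u64 /* stand-in for mem_fetch::MemFetch */, time: u64) {
        self.last_fill_time = Some(time);
    }
}

fn main() {
    let mut l1i = ToyCache { last_fill_time: None };
    // Mirrors core.rs after this change: the caller samples its cycle counter
    // once and threads it into the cache, rather than the cache reading a clock.
    let current_cycle: u64 = 42;
    l1i.fill(0xdead_beef, current_cycle);
    assert_eq!(l1i.last_fill_time, Some(current_cycle));
}
```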
1 change: 1 addition & 0 deletions src/ported/cache_block.rs
@@ -154,6 +154,7 @@ impl LineCacheBlock {
#[inline]
pub fn set_last_access_time(&mut self, time: u64, _mask: &mem_fetch::MemAccessSectorMask) {
self.last_access_time = time;
// self.last_access_time = self.last_access_time.max(time);
}

#[inline]
15 changes: 10 additions & 5 deletions src/ported/core.rs
@@ -1172,7 +1172,7 @@ where
let mut warp = self.inner.warps[warp_id].try_borrow_mut().unwrap();
if did_exit {
// todo!("first warp did exit");
log::debug!("warp_id = {} exited", &warp_id);
// log::debug!("warp_id = {} exited", &warp_id);
// if warp_id == 3 {
// panic!("warp 3 exited");
// }
@@ -1239,9 +1239,13 @@
cache::RequestStatus::HIT
} else {
let mut events = Vec::new();
self.inner
.instr_l1_cache
.access(ppc as address, fetch, &mut events)
let time = self.inner.cycle.get();
self.inner.instr_l1_cache.access(
ppc as address,
fetch,
&mut events,
time,
)
};

log::debug!("L1I->access(addr={}) -> status = {:?}", ppc, status);
@@ -1645,8 +1649,9 @@
}

pub fn accept_fetch_response(&mut self, mut fetch: mem_fetch::MemFetch) {
let time = self.inner.cycle.get();
fetch.status = mem_fetch::Status::IN_SHADER_FETCHED;
self.inner.instr_l1_cache.fill(fetch);
self.inner.instr_l1_cache.fill(fetch, time);
}

pub fn accept_ldst_unit_response(&self, fetch: mem_fetch::MemFetch) {
4 changes: 2 additions & 2 deletions src/ported/l1/base.rs
@@ -577,7 +577,7 @@ where
/// bandwidth restrictions should be modeled in the caller.
/// TODO: fill could also accept the fetch by value, otherwise we drop the fetch!!
// pub fn fill(&mut self, fetch: &mut mem_fetch::MemFetch) {
pub fn fill(&mut self, mut fetch: mem_fetch::MemFetch) {
pub fn fill(&mut self, mut fetch: mem_fetch::MemFetch, time: u64) {
if self.cache_config.mshr_kind == mshr::Kind::SECTOR_ASSOC {
todo!("sector assoc cache");
let original_fetch = fetch.original_fetch.as_ref().unwrap();
Expand All @@ -603,7 +603,7 @@ where
fetch.data_size = pending.data_size;
fetch.access.addr = pending.addr;

let time = self.cycle.get();
// let time = self.cycle.get();
match self.cache_config.allocate_policy {
config::CacheAllocatePolicy::ON_MISS => {
self.tag_array
28 changes: 19 additions & 9 deletions src/ported/l1/data.rs
@@ -672,6 +672,7 @@ where
cache_index: Option<usize>,
fetch: mem_fetch::MemFetch,
events: &mut Vec<cache::Event>,
time: u64,
) -> cache::RequestStatus {
// dbg!(cache_index, probe_status);
// Each function pointer ( m_[rd/wr]_[hit/miss] ) is set in the
Expand All @@ -680,7 +681,7 @@ where
//
// Function pointers were used to avoid many long conditional
// branches resulting from many cache configuration options.
let time = self.inner.cycle.get();
// let time = self.inner.cycle.get();
let mut access_status = probe_status;
let data_size = fetch.data_size;

@@ -765,6 +766,7 @@
addr: address,
fetch: mem_fetch::MemFetch,
events: &mut Vec<cache::Event>,
time: u64,
) -> cache::RequestStatus {
let super::base::Base {
ref cache_config, ..
Expand All @@ -778,9 +780,9 @@ where
let block_addr = cache_config.block_addr(addr);

log::debug!(
"{}::data_cache::access({fetch}, write = {is_write}, size = {}, block = {block_addr})",
"{}::data_cache::access({fetch}, write = {is_write}, size = {}, block = {block_addr}, time = {})",
self.inner.name,
fetch.data_size,
fetch.data_size, time,
);

let dbg_fetch = fetch.clone();
Expand All @@ -791,8 +793,15 @@ where
.probe(block_addr, &fetch, is_write, true);
// dbg!((cache_index, probe_status));

let access_status =
self.process_tag_probe(is_write, probe_status, addr, cache_index, fetch, events);
let access_status = self.process_tag_probe(
is_write,
probe_status,
addr,
cache_index,
fetch,
events,
time,
);
// dbg!(&access_status);

log::debug!(
@@ -847,8 +856,8 @@
}

// fn fill(&mut self, fetch: &mut mem_fetch::MemFetch) {
fn fill(&mut self, fetch: mem_fetch::MemFetch) {
self.inner.fill(fetch)
fn fill(&mut self, fetch: mem_fetch::MemFetch, time: u64) {
self.inner.fill(fetch, time)
}

fn waiting_for_fill(&self, fetch: &mem_fetch::MemFetch) -> bool {
@@ -1228,11 +1237,12 @@ mod tests {
cluster_id,
);
// let status = l1.access(0x00000000, fetch.clone(), None);
let time = 0;
let mut events = Vec::new();
let status = l1.access(fetch.addr(), fetch.clone(), &mut events);
let status = l1.access(fetch.addr(), fetch.clone(), &mut events, time);
dbg!(&status);
let mut events = Vec::new();
let status = l1.access(fetch.addr(), fetch, &mut events);
let status = l1.access(fetch.addr(), fetch, &mut events, time);
dbg!(&status);

// let mut stats = STATS.lock().unwrap();
9 changes: 5 additions & 4 deletions src/ported/l1/readonly.rs
@@ -96,6 +96,7 @@ where
addr: address,
fetch: mem_fetch::MemFetch,
events: &mut Vec<cache::Event>,
time: u64,
) -> cache::RequestStatus {
use cache::RequestStatus as Status;

@@ -114,19 +115,19 @@
let block_addr = cache_config.block_addr(addr);

log::debug!(
"{}::readonly_cache::access({addr}, write = {}, data size = {}, control size = {}, block = {block_addr})",
"{}::readonly_cache::access({addr}, write = {}, data size = {}, control size = {}, block = {block_addr}, time={})",
self.inner.name,
fetch.is_write(),
fetch.data_size,
fetch.control_size,
time,
);

let is_probe = false;
let (cache_index, probe_status) =
tag_array.probe(block_addr, &fetch, fetch.is_write(), is_probe);
let mut status = Status::RESERVATION_FAIL;

let time = self.inner.cycle.get();
if probe_status == Status::HIT {
// update LRU state
tag_array::AccessStatus { status, .. } = tag_array.access(block_addr, &fetch, time);
Expand Down Expand Up @@ -175,8 +176,8 @@ where
status
}

fn fill(&mut self, fetch: mem_fetch::MemFetch) {
self.inner.fill(fetch);
fn fill(&mut self, fetch: mem_fetch::MemFetch, time: u64) {
self.inner.fill(fetch, time);
}
}

(The remaining changed files in this commit are not shown here.)
