[feat] Add basic support for dynamically computing memory access patterns

SNU-ARC · Aug 7, 2018 · 4a30d33 · 4a30d33
1 parent 17e78d7
commit 4a30d33
Show file tree

Hide file tree

Showing 15 changed files with 561 additions and 100 deletions.
diff --git a/.gitignore b/.gitignore
@@ -10,6 +10,9 @@ CTestTestfile.cmake
 
 build/*
 
+*.trace
+*.tdg
+*.profile
 *.bc
 *.ll
 *.s

diff --git a/benchmark/MachSuite.py b/benchmark/MachSuite.py
@@ -73,6 +73,9 @@ def get_trace_result(self):
     def get_profile(self):
         return self.get_trace() + '.profile'
 
+    def get_profile(self):
+        return self.get_trace() + '.profile'
+
     def get_tdg(self, transform):
         return '{name}.{transform}.tdg'.format(name=self.get_name(), transform=transform)
 
@@ -192,41 +195,34 @@ def trace(self):
         self.benchmark.run_trace(self.get_trace())
         os.chdir(self.cwd)
 
-    def build_replay(self):
+    def transform(self, transform, debugs):
         pass_name = 'replay'
-        debugs = [
-            'ReplayPass',
-            # 'DataGraph',
-            # 'TDGSerializer'
-        ]
-        self.benchmark.build_replay(
-            pass_name=pass_name,
-            trace_file=self.get_trace_result(),
-            tdg_detail='integrated',
-            # tdg_detail='standalone',
-            output_tdg=self.get_tdg('replay'),
-            debugs=debugs,
-        )
+        if transform == 'adfa':
+            pass_name = 'abs-data-flow-acc-pass'
+        elif transform == 'stream':
+            pass_name = 'stream-pass'
+        elif transform == 'replay':
+            pass_name = 'replay'
+        else:
+            assert(False)
+
+        os.chdir(self.work_path)
 
-    def build_replay_abs_data_flow(self):
-        pass_name = 'abs-data-flow-acc-pass'
-        debugs = [
-            'ReplayPass',
-            'DynamicInstruction',
-            'TDGSerializer',
-            # 'AbstractDataFlowAcceleratorPass',
-            # 'LoopUtils',
-        ]
         self.benchmark.build_replay(
             pass_name=pass_name,
             trace_file=self.get_trace_result(),
+            profile_file=self.get_profile(),
             tdg_detail='integrated',
             # tdg_detail='standalone',
-            output_tdg=self.get_tdg('adfa'),
+            output_tdg=self.get_tdg(transform),
             debugs=debugs,
         )
 
-    def run_replay(self, transform, debugs):
+
+        os.chdir(self.cwd)
+
+    def simulate(self, transform, debugs):
+        os.chdir(self.work_path)
         gem5_outdir = self.benchmark.gem5_replay(
             standalone=0,
             output_tdg=self.get_tdg(transform),
@@ -238,29 +234,13 @@ def run_replay(self, transform, debugs):
             os.path.join(gem5_outdir, 'region.stats.txt'),
             self.get_result(transform),
         ])
-
-    def replay(self):
-        os.chdir(self.work_path)
-        # Basic replay.
-        # self.build_replay()
-        # debugs = [
-        #     # 'LLVMTraceCPU',
-        #     'RegionStats',
-        # ]
-        # self.run_replay('replay', debugs)
-        # Abstract data flow replay.
-        # self.build_replay_abs_data_flow()
-        debugs = [
-            # 'AbstractDataFlowAccelerator',
-            # 'LLVMTraceCPU',
-            'RegionStats',
-        ]
-        self.run_replay('adfa', debugs)
         os.chdir(self.cwd)
 
     def statistics(self):
         os.chdir(self.work_path)
-        debugs = []
+        debugs = [
+            'TraceStatisticPass',
+        ]
         self.benchmark.get_trace_statistics(
             trace_file=self.get_trace_result(),
             profile_file=self.get_profile(),
@@ -364,9 +344,48 @@ def run_benchmark(benchmark):
     print('start run benchmark ' + benchmark.get_name())
     # benchmark.baseline()
     # benchmark.build_raw_bc()
-    benchmark.trace()
-    benchmark.statistics()
-    # benchmark.replay()
+    # benchmark.trace()
+    # benchmark.statistics()
+
+    # Basic replay.
+    # debugs = [
+    #     'ReplayPass',
+    #     # 'DataGraph',
+    #     # 'TDGSerializer'
+    # ]
+    # benchmark.transform('replay', debugs)
+    # debugs = [
+    #     # 'LLVMTraceCPU',
+    #     'RegionStats',
+    # ]
+    # benchmark.simulate('replay', debugs)
+    # Abstract data flow replay.
+
+    # debugs = [
+    #     'ReplayPass',
+    #     'DynamicInstruction',
+    #     'TDGSerializer',
+    #     # 'AbstractDataFlowAcceleratorPass',
+    #     # 'LoopUtils',
+    # ]
+    # benchmark.transform('adfa', debugs)
+    # debugs = [
+    #     # 'AbstractDataFlowAccelerator',
+    #     # 'LLVMTraceCPU',
+    #     'RegionStats',
+    # ]
+    # benchmark.simulate('adfa', debugs)
+
+    # Stream.
+    debugs = [
+        'ReplayPass',
+        'StreamPass',
+    ]
+    benchmark.transform('stream', debugs)
+    debugs = [
+        # 'LLVMTraceCPU',
+    ]
+    # benchmark.simulate('stream', debugs)
 
 
 def main(folder):

diff --git a/benchmark/MachSuite/bfs/bulk/bfs.c b/benchmark/MachSuite/bfs/bulk/bfs.c
@@ -1,15 +1,15 @@
 /*
 Implementations based on:
-Harish and Narayanan. "Accelerating large graph algorithms on the GPU using CUDA." HiPC, 2007.
-Hong, Oguntebi, Olukotun. "Efficient Parallel Graph Exploration on Multi-Core CPU and GPU." PACT, 2011.
+Harish and Narayanan. "Accelerating large graph algorithms on the GPU using
+CUDA." HiPC, 2007. Hong, Oguntebi, Olukotun. "Efficient Parallel Graph
+Exploration on Multi-Core CPU and GPU." PACT, 2011.
 */
 
 #include "bfs.h"
 
 void bfs(node_t nodes[N_NODES], edge_t edges[N_EDGES],
-            node_index_t starting_node, level_t level[N_NODES],
-            edge_index_t level_counts[N_LEVELS])
-{
+         node_index_t starting_node, level_t level[N_NODES],
+         edge_index_t level_counts[N_LEVELS]) {
   node_index_t n;
   edge_index_t e;
   level_t horizon;
@@ -18,25 +18,28 @@ void bfs(node_t nodes[N_NODES], edge_t edges[N_EDGES],
   level[starting_node] = 0;
   level_counts[0] = 1;
 
-  loop_horizons: for( horizon=0; horizon<N_LEVELS; horizon++ ) {
+loop_horizons:
+  for (horizon = 0; horizon < N_LEVELS; horizon++) {
     cnt = 0;
-    // Add unmarked neighbors of the current horizon to the next horizon
-    loop_nodes: for( n=0; n<N_NODES; n++ ) {
-      if( level[n]==horizon ) {
+  // Add unmarked neighbors of the current horizon to the next horizon
+  loop_nodes:
+    for (n = 0; n < N_NODES; n++) {
+      if (level[n] == horizon) {
         edge_index_t tmp_begin = nodes[n].edge_begin;
         edge_index_t tmp_end = nodes[n].edge_end;
-        loop_neighbors: for( e=tmp_begin; e<tmp_end; e++ ) {
+      loop_neighbors:
+        for (e = tmp_begin; e < tmp_end; e++) {
           node_index_t tmp_dst = edges[e].dst;
           level_t tmp_level = level[tmp_dst];
 
-          if( tmp_level ==MAX_LEVEL ) { // Unmarked
-            level[tmp_dst] = horizon+1;
+          if (tmp_level == MAX_LEVEL) { // Unmarked
+            level[tmp_dst] = horizon + 1;
             ++cnt;
           }
         }
       }
     }
-    if( (level_counts[horizon+1]=cnt)==0 )
+    if ((level_counts[horizon + 1] = cnt) == 0)
       break;
   }
 }
diff --git a/benchmark/MachSuite/bfs/queue/bfs.c b/benchmark/MachSuite/bfs/queue/bfs.c
@@ -1,19 +1,25 @@
 /*
 Implementation based on:
-Hong, Oguntebi, Olukotun. "Efficient Parallel Graph Exploration on Multi-Core CPU and GPU." PACT, 2011.
+Hong, Oguntebi, Olukotun. "Efficient Parallel Graph Exploration on Multi-Core
+CPU and GPU." PACT, 2011.
 */
 
 #include "bfs.h"
 
-#define Q_PUSH(node) { queue[q_in==0?N_NODES-1:q_in-1]=node; q_in=(q_in+1)%N_NODES; }
+#define Q_PUSH(node)                                                           \
+  {                                                                            \
+    queue[q_in == 0 ? N_NODES - 1 : q_in - 1] = node;                          \
+    q_in = (q_in + 1) % N_NODES;                                               \
+  }
 #define Q_PEEK() (queue[q_out])
-#define Q_POP() { q_out = (q_out+1)%N_NODES; }
-#define Q_EMPTY() (q_in>q_out ? q_in==q_out+1 : (q_in==0)&&(q_out==N_NODES-1))
+#define Q_POP()                                                                \
+  { q_out = (q_out + 1) % N_NODES; }
+#define Q_EMPTY()                                                              \
+  (q_in > q_out ? q_in == q_out + 1 : (q_in == 0) && (q_out == N_NODES - 1))
 
 void bfs(node_t nodes[N_NODES], edge_t edges[N_EDGES],
-            node_index_t starting_node, level_t level[N_NODES],
-            edge_index_t level_counts[N_LEVELS])
-{
+         node_index_t starting_node, level_t level[N_NODES],
+         edge_index_t level_counts[N_LEVELS]) {
   node_index_t queue[N_NODES];
   node_index_t q_in, q_out;
   node_index_t dummy;
@@ -33,21 +39,24 @@ void bfs(node_t nodes[N_NODES], edge_t edges[N_EDGES],
 
   // printf("bfs called\n");
 
-  loop_queue: for( dummy=0; dummy<N_NODES; dummy++ ) { // Typically while(not_empty(queue)){
-    if( Q_EMPTY() )
+loop_queue:
+  for (dummy = 0; dummy < N_NODES;
+       dummy++) { // Typically while(not_empty(queue)){
+    if (Q_EMPTY())
       break;
-    n = Q_PEEK();
+    n = Q_PEEK();                                 // Data dependent load.
     Q_POP();
-    edge_index_t tmp_begin = nodes[n].edge_begin;
-    edge_index_t tmp_end = nodes[n].edge_end;
-    loop_neighbors: for( e=tmp_begin; e<tmp_end; e++ ) {
-      node_index_t tmp_dst = edges[e].dst;
+    edge_index_t tmp_begin = nodes[n].edge_begin; // Data dependent load.
+    edge_index_t tmp_end = nodes[n].edge_end;     // Data dependent load.
+  loop_neighbors:
+    for (e = tmp_begin; e < tmp_end; e++) {
+      node_index_t tmp_dst = edges[e].dst;        // Stream load.
       // level_t tmp_level = level[tmp_dst];
       // if( tmp_level ==MAX_LEVEL ) { // Unmarked
-        level_t tmp_level = level[n]+1;
-        level[tmp_dst] = tmp_level;
-        ++level_counts[tmp_level];
-        Q_PUSH(tmp_dst);
+      level_t tmp_level = level[n] + 1;           // Data dependent load.
+      level[tmp_dst] = tmp_level;                 // Data dependent store.
+      ++level_counts[tmp_level];                  // Data dependent load/store.
+      Q_PUSH(tmp_dst);                            // Data dependent store.
       // }
     }
   }

diff --git a/benchmark/MachSuite/bfs/queue/local_support.c b/benchmark/MachSuite/bfs/queue/local_support.c
@@ -7,15 +7,6 @@ int INPUT_SIZE = sizeof(struct bench_args_t);
 void run_benchmark( void *vargs ) {
   struct bench_args_t *args = (struct bench_args_t *)vargs;
   bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
-  bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
-  bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
-  bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
-  bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
-  // bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
-  // bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
-  // bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
-  // bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
-  // bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
 }
 
 /* Input format:

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -35,6 +35,7 @@ add_library(LLVMTDGPass MODULE
     LoopUnroller.cpp
     PostDominanceFrontier.cpp
     DynamicLoopTree.cpp
+    MemoryAccessPattern.cpp
     LocateAccelerableFunctions.cpp
     Replay.cpp
     TraceStatisticPass.cpp

diff --git a/src/DynamicLoopTree.cpp b/src/DynamicLoopTree.cpp
@@ -38,6 +38,9 @@ void DynamicLoopIteration::addInst(DynamicInstIter InstIter) {
       // We just entered the loop.
       this->Start = InstIter;
       this->Status = BUFFERING;
+      assert(this->StaticToDynamicMap.empty() &&
+             "StaticToDynamicMap should be empty in EMPTY state.");
+      this->StaticToDynamicMap.emplace(StaticInst, DynamicInst);
     }
     break;
   }
@@ -56,8 +59,17 @@ void DynamicLoopIteration::addInst(DynamicInstIter InstIter) {
     } else {
       // This instruction is in the loop.
       // Add to all the nest loops.
+      bool IsInNestedLoop = false;
       for (auto &Nest : this->NestLoopIters) {
         Nest.second->addInst(InstIter);
+        IsInNestedLoop =
+            IsInNestedLoop || Nest.second->getLoop()->contains(StaticInst);
+      }
+      if (!IsInNestedLoop) {
+        // If not in nested loop, add to my StaticToDynamicMap.
+        auto Emplaced =
+            this->StaticToDynamicMap.emplace(StaticInst, DynamicInst).second;
+        assert(Emplaced && "Multiple dynamic instructions in one iteration?");
       }
     }
     break;
@@ -105,4 +117,27 @@ DynamicLoopIteration *DynamicLoopIteration::getChildIter(llvm::Loop *Child) {
   assert(Child->getParentLoop() == this->Loop &&
          "Invalid child for this->Loop");
   return this->NestLoopIters.at(Child);
+}
+
+DynamicInstruction *
+DynamicLoopIteration::getDynamicInst(llvm::Instruction *StaticInst) const {
+  assert(
+      this->Loop->contains(StaticInst) &&
+      "Try getting dynamic instruction for static instruction not within this "
+      "loop.");
+
+  bool IsInNestedLoop = false;
+  for (auto &Nest : this->NestLoopIters) {
+    IsInNestedLoop =
+        IsInNestedLoop || Nest.second->getLoop()->contains(StaticInst);
+  }
+
+  assert(!IsInNestedLoop && "Try getting dynamic instruction for static "
+                            "instruction within nested loops.");
+
+  auto Iter = this->StaticToDynamicMap.find(StaticInst);
+  if (Iter == this->StaticToDynamicMap.end()) {
+    return nullptr;
+  }
+  return Iter->second;
 }
diff --git a/src/DynamicLoopTree.h b/src/DynamicLoopTree.h
@@ -43,6 +43,8 @@ class DynamicLoopIteration {
   DynamicInstIter begin() { return this->Start; }
   DynamicInstIter end() { return this->End; }
 
+  DynamicInstruction *getDynamicInst(llvm::Instruction *StaticInst) const;
+
 private:
   llvm::Loop *Loop;
   // Iterator pointing to the start and end of this iteration.
@@ -52,6 +54,10 @@ class DynamicLoopIteration {
   std::unordered_map<llvm::Loop *, DynamicLoopIteration *> NestLoopIters;
   // Pointer to the next iteration.
   DynamicLoopIteration *NextIter;
+
+  // Map from static instruction to the dynamic inst.
+  std::unordered_map<llvm::Instruction *, DynamicInstruction *>
+      StaticToDynamicMap;
   enum {
     EMPTY,
     BUFFERING,
-Original file line number
+Diff line change
@@ Expand Up / @@ -10,6 +10,9 @@ CTestTestfile.cmake @@
     build/*
+    *.trace
+    *.tdg
+    *.profile
     *.bc
     *.ll
     *.s
@@ Expand Down @@