Skip to content

Commit

Permalink
[feat] Add basic support to dynamically compute memory access pattern
Browse files Browse the repository at this point in the history
  • Loading branch information
seanzw committed Aug 7, 2018
1 parent 17e78d7 commit 4a30d33
Show file tree
Hide file tree
Showing 15 changed files with 561 additions and 100 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ CTestTestfile.cmake

build/*

*.trace
*.tdg
*.profile
*.bc
*.ll
*.s
Expand Down
113 changes: 66 additions & 47 deletions benchmark/MachSuite.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ def get_trace_result(self):
def get_profile(self):
    """Return the profile file name.

    The name is whatever get_trace() returns with '.profile' appended.
    This method used to be defined twice in a row with identical bodies;
    the later definition silently replaced the earlier one, so a single
    definition is kept.
    """
    return self.get_trace() + '.profile'

# Build the TDG file name for a transform: '<name>.<transform>.tdg', where
# <name> comes from get_name() and <transform> is the argument as given.
def get_tdg(self, transform):
return '{name}.{transform}.tdg'.format(name=self.get_name(), transform=transform)

Expand Down Expand Up @@ -192,41 +195,34 @@ def trace(self):
self.benchmark.run_trace(self.get_trace())
os.chdir(self.cwd)

def build_replay(self):
def transform(self, transform, debugs):
pass_name = 'replay'
debugs = [
'ReplayPass',
# 'DataGraph',
# 'TDGSerializer'
]
self.benchmark.build_replay(
pass_name=pass_name,
trace_file=self.get_trace_result(),
tdg_detail='integrated',
# tdg_detail='standalone',
output_tdg=self.get_tdg('replay'),
debugs=debugs,
)
if transform == 'adfa':
pass_name = 'abs-data-flow-acc-pass'
elif transform == 'stream':
pass_name = 'stream-pass'
elif transform == 'replay':
pass_name = 'replay'
else:
assert(False)

os.chdir(self.work_path)

def build_replay_abs_data_flow(self):
pass_name = 'abs-data-flow-acc-pass'
debugs = [
'ReplayPass',
'DynamicInstruction',
'TDGSerializer',
# 'AbstractDataFlowAcceleratorPass',
# 'LoopUtils',
]
self.benchmark.build_replay(
pass_name=pass_name,
trace_file=self.get_trace_result(),
profile_file=self.get_profile(),
tdg_detail='integrated',
# tdg_detail='standalone',
output_tdg=self.get_tdg('adfa'),
output_tdg=self.get_tdg(transform),
debugs=debugs,
)

def run_replay(self, transform, debugs):

os.chdir(self.cwd)

def simulate(self, transform, debugs):
os.chdir(self.work_path)
gem5_outdir = self.benchmark.gem5_replay(
standalone=0,
output_tdg=self.get_tdg(transform),
Expand All @@ -238,29 +234,13 @@ def run_replay(self, transform, debugs):
os.path.join(gem5_outdir, 'region.stats.txt'),
self.get_result(transform),
])

def replay(self):
os.chdir(self.work_path)
# Basic replay.
# self.build_replay()
# debugs = [
# # 'LLVMTraceCPU',
# 'RegionStats',
# ]
# self.run_replay('replay', debugs)
# Abstract data flow replay.
# self.build_replay_abs_data_flow()
debugs = [
# 'AbstractDataFlowAccelerator',
# 'LLVMTraceCPU',
'RegionStats',
]
self.run_replay('adfa', debugs)
os.chdir(self.cwd)

def statistics(self):
os.chdir(self.work_path)
debugs = []
debugs = [
'TraceStatisticPass',
]
self.benchmark.get_trace_statistics(
trace_file=self.get_trace_result(),
profile_file=self.get_profile(),
Expand Down Expand Up @@ -364,9 +344,48 @@ def run_benchmark(benchmark):
print('start run benchmark ' + benchmark.get_name())
# benchmark.baseline()
# benchmark.build_raw_bc()
benchmark.trace()
benchmark.statistics()
# benchmark.replay()
# benchmark.trace()
# benchmark.statistics()

# Basic replay.
# debugs = [
# 'ReplayPass',
# # 'DataGraph',
# # 'TDGSerializer'
# ]
# benchmark.transform('replay', debugs)
# debugs = [
# # 'LLVMTraceCPU',
# 'RegionStats',
# ]
# benchmark.simulate('replay', debugs)
# Abstract data flow replay.

# debugs = [
# 'ReplayPass',
# 'DynamicInstruction',
# 'TDGSerializer',
# # 'AbstractDataFlowAcceleratorPass',
# # 'LoopUtils',
# ]
# benchmark.transform('adfa', debugs)
# debugs = [
# # 'AbstractDataFlowAccelerator',
# # 'LLVMTraceCPU',
# 'RegionStats',
# ]
# benchmark.simulate('adfa', debugs)

# Stream.
debugs = [
'ReplayPass',
'StreamPass',
]
benchmark.transform('stream', debugs)
debugs = [
# 'LLVMTraceCPU',
]
# benchmark.simulate('stream', debugs)


def main(folder):
Expand Down
29 changes: 16 additions & 13 deletions benchmark/MachSuite/bfs/bulk/bfs.c
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
/*
Implementations based on:
Harish and Narayanan. "Accelerating large graph algorithms on the GPU using CUDA." HiPC, 2007.
Hong, Oguntebi, Olukotun. "Efficient Parallel Graph Exploration on Multi-Core CPU and GPU." PACT, 2011.
Harish and Narayanan. "Accelerating large graph algorithms on the GPU using
CUDA." HiPC, 2007. Hong, Oguntebi, Olukotun. "Efficient Parallel Graph
Exploration on Multi-Core CPU and GPU." PACT, 2011.
*/

#include "bfs.h"

void bfs(node_t nodes[N_NODES], edge_t edges[N_EDGES],
node_index_t starting_node, level_t level[N_NODES],
edge_index_t level_counts[N_LEVELS])
{
node_index_t starting_node, level_t level[N_NODES],
edge_index_t level_counts[N_LEVELS]) {
node_index_t n;
edge_index_t e;
level_t horizon;
Expand All @@ -18,25 +18,28 @@ void bfs(node_t nodes[N_NODES], edge_t edges[N_EDGES],
level[starting_node] = 0;
level_counts[0] = 1;

loop_horizons: for( horizon=0; horizon<N_LEVELS; horizon++ ) {
loop_horizons:
for (horizon = 0; horizon < N_LEVELS; horizon++) {
cnt = 0;
// Add unmarked neighbors of the current horizon to the next horizon
loop_nodes: for( n=0; n<N_NODES; n++ ) {
if( level[n]==horizon ) {
// Add unmarked neighbors of the current horizon to the next horizon
loop_nodes:
for (n = 0; n < N_NODES; n++) {
if (level[n] == horizon) {
edge_index_t tmp_begin = nodes[n].edge_begin;
edge_index_t tmp_end = nodes[n].edge_end;
loop_neighbors: for( e=tmp_begin; e<tmp_end; e++ ) {
loop_neighbors:
for (e = tmp_begin; e < tmp_end; e++) {
node_index_t tmp_dst = edges[e].dst;
level_t tmp_level = level[tmp_dst];

if( tmp_level ==MAX_LEVEL ) { // Unmarked
level[tmp_dst] = horizon+1;
if (tmp_level == MAX_LEVEL) { // Unmarked
level[tmp_dst] = horizon + 1;
++cnt;
}
}
}
}
if( (level_counts[horizon+1]=cnt)==0 )
if ((level_counts[horizon + 1] = cnt) == 0)
break;
}
}
45 changes: 27 additions & 18 deletions benchmark/MachSuite/bfs/queue/bfs.c
Original file line number Diff line number Diff line change
@@ -1,19 +1,25 @@
/*
Implementation based on:
Hong, Oguntebi, Olukotun. "Efficient Parallel Graph Exploration on Multi-Core CPU and GPU." PACT, 2011.
Hong, Oguntebi, Olukotun. "Efficient Parallel Graph Exploration on Multi-Core
CPU and GPU." PACT, 2011.
*/

#include "bfs.h"

#define Q_PUSH(node) { queue[q_in==0?N_NODES-1:q_in-1]=node; q_in=(q_in+1)%N_NODES; }
#define Q_PUSH(node) \
{ \
queue[q_in == 0 ? N_NODES - 1 : q_in - 1] = node; \
q_in = (q_in + 1) % N_NODES; \
}
#define Q_PEEK() (queue[q_out])
#define Q_POP() { q_out = (q_out+1)%N_NODES; }
#define Q_EMPTY() (q_in>q_out ? q_in==q_out+1 : (q_in==0)&&(q_out==N_NODES-1))
#define Q_POP() \
{ q_out = (q_out + 1) % N_NODES; }
#define Q_EMPTY() \
(q_in > q_out ? q_in == q_out + 1 : (q_in == 0) && (q_out == N_NODES - 1))

void bfs(node_t nodes[N_NODES], edge_t edges[N_EDGES],
node_index_t starting_node, level_t level[N_NODES],
edge_index_t level_counts[N_LEVELS])
{
node_index_t starting_node, level_t level[N_NODES],
edge_index_t level_counts[N_LEVELS]) {
node_index_t queue[N_NODES];
node_index_t q_in, q_out;
node_index_t dummy;
Expand All @@ -33,21 +39,24 @@ void bfs(node_t nodes[N_NODES], edge_t edges[N_EDGES],

// printf("bfs called\n");

loop_queue: for( dummy=0; dummy<N_NODES; dummy++ ) { // Typically while(not_empty(queue)){
if( Q_EMPTY() )
loop_queue:
for (dummy = 0; dummy < N_NODES;
dummy++) { // Typically while(not_empty(queue)){
if (Q_EMPTY())
break;
n = Q_PEEK();
n = Q_PEEK(); // Data dependent load.
Q_POP();
edge_index_t tmp_begin = nodes[n].edge_begin;
edge_index_t tmp_end = nodes[n].edge_end;
loop_neighbors: for( e=tmp_begin; e<tmp_end; e++ ) {
node_index_t tmp_dst = edges[e].dst;
edge_index_t tmp_begin = nodes[n].edge_begin; // Data dependent load.
edge_index_t tmp_end = nodes[n].edge_end; // Data dependent load.
loop_neighbors:
for (e = tmp_begin; e < tmp_end; e++) {
node_index_t tmp_dst = edges[e].dst; // Stream load.
// level_t tmp_level = level[tmp_dst];
// if( tmp_level ==MAX_LEVEL ) { // Unmarked
level_t tmp_level = level[n]+1;
level[tmp_dst] = tmp_level;
++level_counts[tmp_level];
Q_PUSH(tmp_dst);
level_t tmp_level = level[n] + 1; // Data dependent load.
level[tmp_dst] = tmp_level; // Data dependent store.
++level_counts[tmp_level]; // Data dependent load/store.
Q_PUSH(tmp_dst); // Data dependent store.
// }
}
}
Expand Down
9 changes: 0 additions & 9 deletions benchmark/MachSuite/bfs/queue/local_support.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,6 @@ int INPUT_SIZE = sizeof(struct bench_args_t);
void run_benchmark( void *vargs ) {
struct bench_args_t *args = (struct bench_args_t *)vargs;
bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
// bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
// bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
// bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
// bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
// bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
}

/* Input format:
Expand Down
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ add_library(LLVMTDGPass MODULE
LoopUnroller.cpp
PostDominanceFrontier.cpp
DynamicLoopTree.cpp
MemoryAccessPattern.cpp
LocateAccelerableFunctions.cpp
Replay.cpp
TraceStatisticPass.cpp
Expand Down
35 changes: 35 additions & 0 deletions src/DynamicLoopTree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ void DynamicLoopIteration::addInst(DynamicInstIter InstIter) {
// We just entered the loop.
this->Start = InstIter;
this->Status = BUFFERING;
assert(this->StaticToDynamicMap.empty() &&
"StaticToDynamicMap should be empty in EMPTY state.");
this->StaticToDynamicMap.emplace(StaticInst, DynamicInst);
}
break;
}
Expand All @@ -56,8 +59,17 @@ void DynamicLoopIteration::addInst(DynamicInstIter InstIter) {
} else {
// This instruction is in the loop.
// Add to all the nest loops.
bool IsInNestedLoop = false;
for (auto &Nest : this->NestLoopIters) {
Nest.second->addInst(InstIter);
IsInNestedLoop =
IsInNestedLoop || Nest.second->getLoop()->contains(StaticInst);
}
if (!IsInNestedLoop) {
// If not in nested loop, add to my StaticToDynamicMap.
auto Emplaced =
this->StaticToDynamicMap.emplace(StaticInst, DynamicInst).second;
assert(Emplaced && "Multiple dynamic instructions in one iteration?");
}
}
break;
Expand Down Expand Up @@ -105,4 +117,27 @@ DynamicLoopIteration *DynamicLoopIteration::getChildIter(llvm::Loop *Child) {
assert(Child->getParentLoop() == this->Loop &&
"Invalid child for this->Loop");
return this->NestLoopIters.at(Child);
}

// Look up the dynamic instruction recorded in this iteration for StaticInst.
//
// StaticInst must be contained in this iteration's loop and must not be
// contained in any of the nested loops tracked by NestLoopIters; both
// conditions are checked with assertions.
//
// Returns the mapped dynamic instruction, or nullptr when StaticToDynamicMap
// has no entry for StaticInst.
DynamicInstruction *
DynamicLoopIteration::getDynamicInst(llvm::Instruction *StaticInst) const {
  assert(
      this->Loop->contains(StaticInst) &&
      "Try getting dynamic instruction for static instruction not within this "
      "loop.");

#ifndef NDEBUG
  // The nested-loop scan exists only to feed the assertion below. Guarding it
  // avoids useless work and a set-but-unused variable when assertions are
  // compiled out.
  bool IsInNestedLoop = false;
  for (const auto &Nest : this->NestLoopIters) {
    if (Nest.second->getLoop()->contains(StaticInst)) {
      IsInNestedLoop = true;
      break; // One containing nested loop is enough to fail the check.
    }
  }

  assert(!IsInNestedLoop && "Try getting dynamic instruction for static "
                            "instruction within nested loops.");
#endif

  const auto Iter = this->StaticToDynamicMap.find(StaticInst);
  return Iter == this->StaticToDynamicMap.end() ? nullptr : Iter->second;
}
6 changes: 6 additions & 0 deletions src/DynamicLoopTree.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ class DynamicLoopIteration {
DynamicInstIter begin() { return this->Start; }
DynamicInstIter end() { return this->End; }

// Look up the dynamic instruction recorded for StaticInst in this iteration.
// StaticInst must be inside this loop and outside all of its nested loops
// (the definition asserts both). Returns nullptr when nothing is recorded
// for StaticInst.
DynamicInstruction *getDynamicInst(llvm::Instruction *StaticInst) const;

private:
llvm::Loop *Loop;
// Iterator pointing to the start and end of this iteration.
Expand All @@ -52,6 +54,10 @@ class DynamicLoopIteration {
std::unordered_map<llvm::Loop *, DynamicLoopIteration *> NestLoopIters;
// Pointer to the next iteration.
DynamicLoopIteration *NextIter;

// Map from static instruction to the dynamic inst recorded for it in this
// iteration. addInst() inserts the instruction that enters the loop (after
// asserting the map is empty), and afterwards only instructions that are not
// contained in a nested loop, asserting that each static instruction is
// inserted at most once.
std::unordered_map<llvm::Instruction *, DynamicInstruction *>
StaticToDynamicMap;
enum {
EMPTY,
BUFFERING,
Expand Down
Loading

0 comments on commit 4a30d33

Please sign in to comment.