Skip to content

Commit

Permalink
replaced long initialization loops with memcpys of small blocks
Browse files Browse the repository at this point in the history
  • Loading branch information
Daniel Molka committed Mar 23, 2017
1 parent 9561075 commit 9583ee1
Show file tree
Hide file tree
Showing 8 changed files with 45 additions and 19 deletions.
6 changes: 4 additions & 2 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@
###############################################################################
$$ TODO
$$ TODO Version 2.x
$$ TODO - add windows version
$$ TODO - add Mac OS version
$$ TODO - add AMD Zen support
$$ TODO
$$ TODO Version 2.0
$$ TODO - add results verification
$$ TODO - improve Haswell/Broadwell and Skylake support
$$ TODO - improve Haswell/Broadwell, KNL, and Skylake support

Version 1.6
- added Windows version

Version 1.5
- added Knights Landing support (AVX512F)
Expand Down
2 changes: 1 addition & 1 deletion config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
major=1
minor=6
# additional information, e.g., "BETA"
info="BETA"
info=""

# optional features
enable_cuda=1
Expand Down
2 changes: 2 additions & 0 deletions source_files/firestarter_global.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@
#define LOAD_HIGH 1 /* DO NOT CHANGE! the asm load-loop continues until the load-variable is != 1 */
#define LOAD_STOP 2

#define INIT_BLOCKSIZE 8192

/*
* watchdog timer
*/
Expand Down
12 changes: 8 additions & 4 deletions templates/avx512_functions_c.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,14 @@ def init_functions(file,architectures):
file.write(" unsigned long long addrMem = threaddata->addrMem;\n")
file.write(" int i;\n")
file.write("\n")
buffersize = (l1_size+l2_size+l3_size+ram_size) // 8
#TODO check if replacing modulo operation changes energy consumption
file.write(" // for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i%9267) * 0.24738995982e-4;\n")
file.write(" for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i&0x1FFF) * 0.27948995982e-4;\n")
# old version: one large loop that initializes individual elements
# buffersize = (l1_size+l2_size+l3_size+ram_size) // 8
# file.write(" //for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i%9267) * 0.24738995982e-4;\n")
# file.write(" for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i&0x1FFF) * 0.27948995982e-4;\n")
buffersize = (l1_size+l2_size+l3_size+ram_size)
file.write(" for (i = 0; i < INIT_BLOCKSIZE; i+=8) *((double*)(addrMem+i)) = 0.25 + (double)i * 0.27948995982e-4;\n")
file.write(" for (i = INIT_BLOCKSIZE; i <= "+str(buffersize)+" - INIT_BLOCKSIZE; i+= INIT_BLOCKSIZE) memcpy((void*)(addrMem+i),(void*)(addrMem+i-INIT_BLOCKSIZE),INIT_BLOCKSIZE);\n")
file.write(" for (; i <= "+str(buffersize)+"-8; i+=8) *((double*)(addrMem+i)) = 0.25 + (double)i * 0.27948995982e-4;\n")
file.write("\n")
flops_total=0
bytes_total=0
Expand Down
9 changes: 7 additions & 2 deletions templates/avx_functions_c.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,13 @@ def init_functions(file,architectures):
file.write(" unsigned long long addrMem = threaddata->addrMem;\n")
file.write(" int i;\n")
file.write("\n")
buffersize = (l1_size+l2_size+l3_size+ram_size) // 8
file.write(" for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = i * 1.654738925401e-15;\n")
# old version: one large loop that initializes individual elements
# buffersize = (l1_size+l2_size+l3_size+ram_size) // 8
# file.write(" for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = i * 1.654738925401e-15;\n")
buffersize = (l1_size+l2_size+l3_size+ram_size)
file.write(" for (i = 0; i < INIT_BLOCKSIZE; i+=8) *((double*)(addrMem+i)) = i * 1.654738925401e-10;\n")
file.write(" for (i = INIT_BLOCKSIZE; i <= "+str(buffersize)+" - INIT_BLOCKSIZE; i+= INIT_BLOCKSIZE) memcpy((void*)(addrMem+i),(void*)(addrMem+i-INIT_BLOCKSIZE),INIT_BLOCKSIZE);\n")
file.write(" for (; i <= "+str(buffersize)+"-8; i+=8) *((double*)(addrMem+i)) = i * 1.654738925401e-15;\n")
file.write("\n")
flops_total=0
bytes_total=0
Expand Down
12 changes: 8 additions & 4 deletions templates/fma4_functions_c.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,14 @@ def init_functions(file,architectures):
file.write(" unsigned long long addrMem = threaddata->addrMem;\n")
file.write(" int i;\n")
file.write("\n")
buffersize = (l1_size+l2_size+l3_size+ram_size) // 8
#TODO check if replacing modulo operation changes energy consumption
file.write(" // for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i%9267) * 0.24738995982e-4;\n")
file.write(" for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i&0x1FFF) * 0.27948995982e-4;\n")
# old version: one large loop that initializes individual elements
# buffersize = (l1_size+l2_size+l3_size+ram_size) // 8
# file.write(" // for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i%9267) * 0.24738995982e-4;\n")
# file.write(" for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i&0x1FFF) * 0.27948995982e-4;\n")
buffersize = (l1_size+l2_size+l3_size+ram_size)
file.write(" for (i = 0; i < INIT_BLOCKSIZE; i+=8) *((double*)(addrMem+i)) = 0.25 + (double)i * 0.27948995982e-4;\n")
file.write(" for (i = INIT_BLOCKSIZE; i <= "+str(buffersize)+" - INIT_BLOCKSIZE; i+= INIT_BLOCKSIZE) memcpy((void*)(addrMem+i),(void*)(addrMem+i-INIT_BLOCKSIZE),INIT_BLOCKSIZE);\n")
file.write(" for (; i <= "+str(buffersize)+"-8; i+=8) *((double*)(addrMem+i)) = 0.25 + (double)i * 0.27948995982e-4;\n")
file.write("\n")
flops_total=0
bytes_total=0
Expand Down
12 changes: 8 additions & 4 deletions templates/fma_functions_c.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,14 @@ def init_functions(file,architectures):
file.write(" unsigned long long addrMem = threaddata->addrMem;\n")
file.write(" int i;\n")
file.write("\n")
buffersize = (l1_size+l2_size+l3_size+ram_size) // 8
#TODO check if replacing modulo operation changes energy consumption
file.write(" // for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i%9267) * 0.24738995982e-4;\n")
file.write(" for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i&0x1FFF) * 0.27948995982e-4;\n")
# old version: one large loop that initializes individual elements
# buffersize = (l1_size+l2_size+l3_size+ram_size) // 8
# file.write(" // for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i%9267) * 0.24738995982e-4;\n")
# file.write(" for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i&0x1FFF) * 0.27948995982e-4;\n")
buffersize = (l1_size+l2_size+l3_size+ram_size)
file.write(" for (i = 0; i < INIT_BLOCKSIZE; i+=8) *((double*)(addrMem+i)) = 0.25 + (double)i * 0.27948995982e-4;\n")
file.write(" for (i = INIT_BLOCKSIZE; i <= "+str(buffersize)+" - INIT_BLOCKSIZE; i+= INIT_BLOCKSIZE) memcpy((void*)(addrMem+i),(void*)(addrMem+i-INIT_BLOCKSIZE),INIT_BLOCKSIZE);\n")
file.write(" for (; i <= "+str(buffersize)+"-8; i+=8) *((double*)(addrMem+i)) = 0.25 + (double)i * 0.27948995982e-4;\n")
file.write("\n")
flops_total=0
bytes_total=0
Expand Down
9 changes: 7 additions & 2 deletions templates/sse2_functions_c.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,13 @@ def init_functions(file,architectures):
file.write(" unsigned long long addrMem = threaddata->addrMem;\n")
file.write(" int i;\n")
file.write("\n")
buffersize = (l1_size+l2_size+l3_size+ram_size) // 8
file.write(" for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = i * 1.654738925401e-15;\n")
# old version: one large loop that initializes individual elements
# buffersize = (l1_size+l2_size+l3_size+ram_size) // 8
# file.write(" for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = i * 1.654738925401e-15;\n")
buffersize = (l1_size+l2_size+l3_size+ram_size)
file.write(" for (i = 0; i < INIT_BLOCKSIZE; i+=8) *((double*)(addrMem+i)) = i * 1.654738925401e-10;\n")
file.write(" for (i = INIT_BLOCKSIZE; i <= "+str(buffersize)+" - INIT_BLOCKSIZE; i+= INIT_BLOCKSIZE) memcpy((void*)(addrMem+i),(void*)(addrMem+i-INIT_BLOCKSIZE),INIT_BLOCKSIZE);\n")
file.write(" for (; i <= "+str(buffersize)+"-8; i+=8) *((double*)(addrMem+i)) = i * 1.654738925401e-15;\n")
file.write("\n")
flops_total=0
bytes_total=0
Expand Down

0 comments on commit 9583ee1

Please sign in to comment.