From 9583ee1ee20ad27a3117f7f607896ed0154b66d5 Mon Sep 17 00:00:00 2001 From: Daniel Molka Date: Thu, 23 Mar 2017 17:15:28 +0100 Subject: [PATCH] replaced long initialization loops with memcpys of small blocks --- CHANGELOG | 6 ++++-- config.cfg | 2 +- source_files/firestarter_global.h | 2 ++ templates/avx512_functions_c.py | 12 ++++++++---- templates/avx_functions_c.py | 9 +++++++-- templates/fma4_functions_c.py | 12 ++++++++---- templates/fma_functions_c.py | 12 ++++++++---- templates/sse2_functions_c.py | 9 +++++++-- 8 files changed, 45 insertions(+), 19 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index d3060654..f121745d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -20,13 +20,15 @@ ############################################################################### $$ TODO $$ TODO Version 2.x -$$ TODO - add windows version $$ TODO - add Mac OS version $$ TODO - add AMD Zen support $$ TODO $$ TODO Version 2.0 $$ TODO - add results verification -$$ TODO - improve Haswell/Broadwell and Skylake support +$$ TODO - improve Haswell/Broadwell, KNL, and Skylake support + +Version 1.6 + - added Windows version Version 1.5 - added Knights Landing support (AVX512F) diff --git a/config.cfg b/config.cfg index 6377fb43..90b30987 100644 --- a/config.cfg +++ b/config.cfg @@ -31,7 +31,7 @@ major=1 minor=6 # additional information, e.g., "BETA" -info="BETA" +info="" # optional features enable_cuda=1 diff --git a/source_files/firestarter_global.h b/source_files/firestarter_global.h index 410d1f9f..8a6c0c59 100644 --- a/source_files/firestarter_global.h +++ b/source_files/firestarter_global.h @@ -66,6 +66,8 @@ #define LOAD_HIGH 1 /* DO NOT CHANGE! the asm load-loop continues until the load-variable is != 1 */ #define LOAD_STOP 2 +#define INIT_BLOCKSIZE 8192 + /* * watchdog timer */ diff --git a/templates/avx512_functions_c.py b/templates/avx512_functions_c.py index 99b93ecd..be7fce58 100644 --- a/templates/avx512_functions_c.py +++ b/templates/avx512_functions_c.py @@ -41,10 +41,14 @@ def init_functions(file,architectures): file.write(" unsigned long long addrMem = threaddata->addrMem;\n") file.write(" int i;\n") file.write("\n") - buffersize = (l1_size+l2_size+l3_size+ram_size) // 8 - #TODO check if replacing modulo operation changes energy consumption - file.write(" // for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i%9267) * 0.24738995982e-4;\n") - file.write(" for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i&0x1FFF) * 0.27948995982e-4;\n") +# old version: one large loop that initializes indivisual elements +# buffersize = (l1_size+l2_size+l3_size+ram_size) // 8 +# file.write(" //for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i%9267) * 0.24738995982e-4;\n") +# file.write(" for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i&0x1FFF) * 0.27948995982e-4;\n") + buffersize = (l1_size+l2_size+l3_size+ram_size) + file.write(" for (i = 0; i < INIT_BLOCKSIZE; i+=8) *((double*)(addrMem+i)) = 0.25 + (double)i * 0.27948995982e-4;\n") + file.write(" for (i = INIT_BLOCKSIZE; i <= "+str(buffersize)+" - INIT_BLOCKSIZE; i+= INIT_BLOCKSIZE) memcpy((void*)(addrMem+i),(void*)(addrMem+i-INIT_BLOCKSIZE),INIT_BLOCKSIZE);\n") + file.write(" for (; i <= "+str(buffersize)+"-8; i+=8) *((double*)(addrMem+i)) = 0.25 + (double)i * 0.27948995982e-4;\n") file.write("\n") flops_total=0 bytes_total=0 diff --git a/templates/avx_functions_c.py b/templates/avx_functions_c.py index f3533903..f847d9ae 100644 --- a/templates/avx_functions_c.py +++ b/templates/avx_functions_c.py @@ -41,8 +41,13 @@ def init_functions(file,architectures): file.write(" unsigned long long addrMem = threaddata->addrMem;\n") file.write(" int i;\n") file.write("\n") - buffersize = (l1_size+l2_size+l3_size+ram_size) // 8 - file.write(" for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = i * 1.654738925401e-15;\n") +# old version: one large loop that initializes indivisual elements +# buffersize = (l1_size+l2_size+l3_size+ram_size) // 8 +# file.write(" for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = i * 1.654738925401e-15;\n") + buffersize = (l1_size+l2_size+l3_size+ram_size) + file.write(" for (i = 0; i < INIT_BLOCKSIZE; i+=8) *((double*)(addrMem+i)) = i * 1.654738925401e-10;\n") + file.write(" for (i = INIT_BLOCKSIZE; i <= "+str(buffersize)+" - INIT_BLOCKSIZE; i+= INIT_BLOCKSIZE) memcpy((void*)(addrMem+i),(void*)(addrMem+i-INIT_BLOCKSIZE),INIT_BLOCKSIZE);\n") + file.write(" for (; i <= "+str(buffersize)+"-8; i+=8) *((double*)(addrMem+i)) = i * 1.654738925401e-15;\n") file.write("\n") flops_total=0 bytes_total=0 diff --git a/templates/fma4_functions_c.py b/templates/fma4_functions_c.py index de6b3bb8..7acab360 100644 --- a/templates/fma4_functions_c.py +++ b/templates/fma4_functions_c.py @@ -41,10 +41,14 @@ def init_functions(file,architectures): file.write(" unsigned long long addrMem = threaddata->addrMem;\n") file.write(" int i;\n") file.write("\n") - buffersize = (l1_size+l2_size+l3_size+ram_size) // 8 - #TODO check if replacing modulo operation changes energy consumption - file.write(" // for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i%9267) * 0.24738995982e-4;\n") - file.write(" for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i&0x1FFF) * 0.27948995982e-4;\n") +# old version: one large loop that initializes indivisual elements +# buffersize = (l1_size+l2_size+l3_size+ram_size) // 8 +# file.write(" // for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i%9267) * 0.24738995982e-4;\n") +# file.write(" for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i&0x1FFF) * 0.27948995982e-4;\n") + buffersize = (l1_size+l2_size+l3_size+ram_size) + file.write(" for (i = 0; i < INIT_BLOCKSIZE; i+=8) *((double*)(addrMem+i)) = 0.25 + (double)i * 0.27948995982e-4;\n") + file.write(" for (i = INIT_BLOCKSIZE; i <= "+str(buffersize)+" - INIT_BLOCKSIZE; i+= INIT_BLOCKSIZE) memcpy((void*)(addrMem+i),(void*)(addrMem+i-INIT_BLOCKSIZE),INIT_BLOCKSIZE);\n") + file.write(" for (; i <= "+str(buffersize)+"-8; i+=8) *((double*)(addrMem+i)) = 0.25 + (double)i * 0.27948995982e-4;\n") file.write("\n") flops_total=0 bytes_total=0 diff --git a/templates/fma_functions_c.py b/templates/fma_functions_c.py index 4b1367c4..7f3c00e4 100644 --- a/templates/fma_functions_c.py +++ b/templates/fma_functions_c.py @@ -41,10 +41,14 @@ def init_functions(file,architectures): file.write(" unsigned long long addrMem = threaddata->addrMem;\n") file.write(" int i;\n") file.write("\n") - buffersize = (l1_size+l2_size+l3_size+ram_size) // 8 - #TODO check if replacing modulo operation changes energy consumption - file.write(" // for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i%9267) * 0.24738995982e-4;\n") - file.write(" for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i&0x1FFF) * 0.27948995982e-4;\n") +# old version: one large loop that initializes indivisual elements +# buffersize = (l1_size+l2_size+l3_size+ram_size) // 8 +# file.write(" // for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i%9267) * 0.24738995982e-4;\n") +# file.write(" for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = 0.25 + (double)(i&0x1FFF) * 0.27948995982e-4;\n") + buffersize = (l1_size+l2_size+l3_size+ram_size) + file.write(" for (i = 0; i < INIT_BLOCKSIZE; i+=8) *((double*)(addrMem+i)) = 0.25 + (double)i * 0.27948995982e-4;\n") + file.write(" for (i = INIT_BLOCKSIZE; i <= "+str(buffersize)+" - INIT_BLOCKSIZE; i+= INIT_BLOCKSIZE) memcpy((void*)(addrMem+i),(void*)(addrMem+i-INIT_BLOCKSIZE),INIT_BLOCKSIZE);\n") + file.write(" for (; i <= "+str(buffersize)+"-8; i+=8) *((double*)(addrMem+i)) = 0.25 + (double)i * 0.27948995982e-4;\n") file.write("\n") flops_total=0 bytes_total=0 diff --git a/templates/sse2_functions_c.py b/templates/sse2_functions_c.py index 86c40d50..2ca1f638 100644 --- a/templates/sse2_functions_c.py +++ b/templates/sse2_functions_c.py @@ -41,8 +41,13 @@ def init_functions(file,architectures): file.write(" unsigned long long addrMem = threaddata->addrMem;\n") file.write(" int i;\n") file.write("\n") - buffersize = (l1_size+l2_size+l3_size+ram_size) // 8 - file.write(" for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = i * 1.654738925401e-15;\n") +# old version: one large loop that initializes indivisual elements +# buffersize = (l1_size+l2_size+l3_size+ram_size) // 8 +# file.write(" for (i = 0; i<"+str(buffersize)+"; i++) ((double*)addrMem)[i] = i * 1.654738925401e-15;\n") + buffersize = (l1_size+l2_size+l3_size+ram_size) + file.write(" for (i = 0; i < INIT_BLOCKSIZE; i+=8) *((double*)(addrMem+i)) = i * 1.654738925401e-10;\n") + file.write(" for (i = INIT_BLOCKSIZE; i <= "+str(buffersize)+" - INIT_BLOCKSIZE; i+= INIT_BLOCKSIZE) memcpy((void*)(addrMem+i),(void*)(addrMem+i-INIT_BLOCKSIZE),INIT_BLOCKSIZE);\n") + file.write(" for (; i <= "+str(buffersize)+"-8; i+=8) *((double*)(addrMem+i)) = i * 1.654738925401e-15;\n") file.write("\n") flops_total=0 bytes_total=0