From acf384fae7f7e85f161390e5867628a219af0eaa Mon Sep 17 00:00:00 2001 From: Robert Schoene Date: Thu, 19 Oct 2017 10:04:00 +0200 Subject: [PATCH] FIRESTARTER 1.7 --- CHANGELOG | 4 ++++ config.cfg | 17 +++++++++++++---- templates/avx512_functions_c.py | 34 ++++++++++++++++++++++++++++++++- 3 files changed, 50 insertions(+), 5 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index f121745d..58c76f99 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -27,6 +27,10 @@ $$ TODO Version 2.0 $$ TODO - add results verification $$ TODO - improve Haswell/Broadwell, KNL, and Skylake support +Version 1.7 + - added Skylake-SP support + - minor fixes + Version 1.6 - added Windows version diff --git a/config.cfg b/config.cfg index 90b30987..54902be0 100644 --- a/config.cfg +++ b/config.cfg @@ -41,7 +41,7 @@ enable_mac=0 [ISA_AVX512] template= avx512 feature_req= avx512 -fallback= func_knl_xeonphi_avx512_4t +fallback= func_skl_xeonep_avx512_1t, func_skl_xeonep_avx512_2t flags = -mavx512f win64_incl = 1 @@ -121,8 +121,18 @@ lines= 1536 instr_groups= RAM_L,L3_LS_256,L2_LS_256,L1_2LS_256,REG proportion= 3,5,18,78,40 -# TODO Skylake server -# - use AVX-512 +# Skylake server +[Skylake-SP] +arch= skl +model= xeonep +threads= 1,2 +isa= avx512 +cpu_family= 6 +cpu_model= 85 +buffer_sizes= 32768,1048576,1441792,1048576000 +lines= 1536 +instr_groups= RAM_S,RAM_P,L3_S,L3_P,L2_S,L2_L,L1_S,L1_L,L1_BROADCAST,REG +proportion= 3,1,1,1,4,70,0,40,120,160 # Haswell/Broadwell desktop [Haswell] @@ -215,4 +225,3 @@ buffer_sizes= 16384,1048576,786432,104857600 lines= 1536 instr_groups= RAM_L,L3_L,L2_LS,L1_L,REG proportion= 1,1,5,90,45 - diff --git a/templates/avx512_functions_c.py b/templates/avx512_functions_c.py index cde355f2..46318456 100644 --- a/templates/avx512_functions_c.py +++ b/templates/avx512_functions_c.py @@ -61,6 +61,8 @@ def init_functions(file,architectures): flops=32 # two 512 bit FMA operations elif each.instr_groups[i] == 'L1_L': flops=32 # two 512 bit FMA operations + elif each.instr_groups[i] 
== 'L1_BROADCAST': + flops=16 # one 512 bit FMA operation elif each.instr_groups[i] == 'L1_S': flops=16 # one 512 bit FMA operation elif each.instr_groups[i] == 'L1_LS': @@ -234,6 +236,16 @@ def work_functions(file,architectures,version): d3_inst = 'xor %%'+str(shift_reg[(shift_pos+nr_shift_regs-1)%nr_shift_regs])+', %%'+str(temp_reg)+';' comment = '// REG ops only' mov_dst = mov_dst +1 + elif item == 'L1_BROADCAST': + d0_inst = 'vfmadd231pd %%zmm'+str(add_start+(add_dest-add_start+add_regs+1)%add_regs)+', %%zmm0, %%zmm'+str(add_dest)+';' + d1_inst = 'vbroadcastsd 64(%%'+l1_addr+'), %%zmm'+str(add_dest)+';' + l1_offset = l1_offset + each.cl_size + if l1_offset < l1_size*each.l1_cover: + d3_inst = 'add %%'+offset_reg+', %%'+l1_addr+';' + else: + l1_offset = 0 + d3_inst = 'mov %%'+pointer_reg+', %%'+l1_addr+';' + comment = '// L1 packed single load' elif item == 'L1_L': d0_inst = 'vfmadd231pd %%zmm'+str(add_start+(add_dest-add_start+add_regs+1)%add_regs)+', %%zmm0, %%zmm'+str(add_dest)+';' d1_inst = 'vfmadd231pd 64(%%'+l1_addr+'), %%zmm1, %%zmm'+str(add_dest)+';' @@ -279,7 +291,27 @@ def work_functions(file,architectures,version): d1_inst = 'vfmadd231pd 128(%%'+l2_addr+'), %%zmm0, %%zmm'+str(add_dest)+';' d3_inst = 'add %%'+str(offset_reg)+', %%'+l2_addr+';' comment = '// L2 load, L2 store' - elif item == 'RAM_L': + elif item == 'L3_L': + d0_inst = 'vfmadd231pd %%zmm'+str(add_start+(add_dest-add_start+add_regs+1)%add_regs)+', %%zmm0, %%zmm'+str(add_dest)+';' + d1_inst = 'vfmadd231pd 64(%%'+l3_addr+'), %%zmm1, %%zmm'+str(add_dest)+';' + d3_inst = 'add %%'+str(offset_reg)+', %%'+l3_addr+';' + comment = '// L3 load' + elif item == 'L3_S': + d0_inst = 'vmovapd %%zmm'+str(add_dest)+', 64(%%'+l3_addr+');' + d1_inst = 'vfmadd231pd %%zmm'+str(add_start+(add_dest-add_start+add_regs+1)%add_regs)+', %%zmm0, %%zmm'+str(add_dest)+';' + d3_inst = 'add %%'+str(offset_reg)+', %%'+l3_addr+';' + comment = '// L3 store' + elif item == 'L3_LS': + d0_inst = 'vmovapd 
%%zmm'+str(add_dest)+', 64(%%'+l3_addr+');' + d1_inst = 'vfmadd231pd 128(%%'+l3_addr+'), %%zmm0, %%zmm'+str(add_dest)+';' + d3_inst = 'add %%'+str(offset_reg)+', %%'+l3_addr+';' + comment = '// L3 load, L3 store' + elif item == 'L3_P': + d0_inst = 'vfmadd231pd 64(%%'+l1_addr+'), %%zmm0, %%zmm'+str(add_dest)+';' + d1_inst = 'prefetcht2 (%%'+l3_addr+');' + d3_inst = 'add %%'+str(offset_reg)+', %%'+l3_addr+';' + comment = '// L3 prefetch' + elif item == 'RAM_L': d0_inst = 'vfmadd231pd %%zmm'+str(add_start+(add_dest-add_start+add_regs+1)%add_regs)+', %%zmm0, %%zmm'+str(add_dest)+';' d1_inst = 'vfmadd231pd 64(%%'+ram_addr+'), %%zmm1, %%'+str(ram_reg)+';' d3_inst = 'add %%'+str(offset_reg)+', %%'+ram_addr+';'