diff --git a/CHANGELOG b/CHANGELOG index 70c9bf81..f26e56ee 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -19,6 +19,10 @@ # Contact: daniel.hackenberg@tu-dresden.de ############################################################################### +Version 1.5 + - added Knights Landing support (AVX512F) + - added error handling for restricted cpu sets for --bind and --threads options + Version 1.4 - added support for Skylake-H (FMA) - added support for Broadwell-E/EP (FMA) diff --git a/FIRESTARTER b/FIRESTARTER index 22cba54d..1136c093 100644 Binary files a/FIRESTARTER and b/FIRESTARTER differ diff --git a/FIRESTARTER_CUDA b/FIRESTARTER_CUDA index bcbbfbba..e76fabed 100644 Binary files a/FIRESTARTER_CUDA and b/FIRESTARTER_CUDA differ diff --git a/LICENSE b/LICENSE index 9cecc1d4..bc08fe2e 100644 --- a/LICENSE +++ b/LICENSE @@ -617,58 +617,3 @@ reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. 
- - {one line to give the program's name and a brief idea of what it does.} - Copyright (C) {year} {name of author} - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - {project} Copyright (C) {year} {fullname} - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -<http://www.gnu.org/licenses/>. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/Makefile b/Makefile index 83b90e7e..adb946d5 100644 --- a/Makefile +++ b/Makefile @@ -44,11 +44,11 @@ cuda: FIRESTARTER_CUDA all: linux cuda -FIRESTARTER: generic.o x86.o main.o work.o x86.o watchdog.o help.o sse2_functions.o avx_functions.o fma_functions.o fma4_functions.o - ${LINUX_CC} -o FIRESTARTER generic.o main.o work.o x86.o watchdog.o help.o sse2_functions.o avx_functions.o fma_functions.o fma4_functions.o ${LINUX_L_FLAGS} +FIRESTARTER: generic.o x86.o main.o work.o x86.o watchdog.o help.o sse2_functions.o avx_functions.o fma_functions.o fma4_functions.o avx512_functions.o + ${LINUX_CC} -o FIRESTARTER generic.o main.o work.o x86.o watchdog.o help.o sse2_functions.o avx_functions.o fma_functions.o fma4_functions.o avx512_functions.o ${LINUX_L_FLAGS} -FIRESTARTER_CUDA: generic.o x86.o work.o x86.o watchdog.o sse2_functions.o avx_functions.o fma_functions.o fma4_functions.o gpu.o main_cuda.o help_cuda.o - ${LINUX_CC} -o FIRESTARTER_CUDA generic.o main_cuda.o work.o x86.o watchdog.o help_cuda.o sse2_functions.o avx_functions.o fma_functions.o fma4_functions.o gpu.o ${LINUX_CUDA_L_FLAGS} +FIRESTARTER_CUDA: generic.o x86.o work.o x86.o watchdog.o sse2_functions.o avx_functions.o fma_functions.o fma4_functions.o avx512_functions.o gpu.o main_cuda.o help_cuda.o + ${LINUX_CC} -o FIRESTARTER_CUDA generic.o main_cuda.o work.o x86.o watchdog.o help_cuda.o sse2_functions.o avx_functions.o fma_functions.o fma4_functions.o avx512_functions.o gpu.o ${LINUX_CUDA_L_FLAGS} generic.o: generic.c cpu.h ${LINUX_CC} ${OPT} ${LINUX_C_FLAGS} -c generic.c @@ -77,6 +77,9 @@ main_cuda.o: main.c work.h cpu.h help_cuda.o: help.c help.h ${LINUX_CC} ${OPT} ${LINUX_C_FLAGS} -o help_cuda.o -c help.c -DCUDA +avx512_functions.o: avx512_functions.c + ${LINUX_CC} ${OPT} ${LINUX_C_FLAGS} -mavx512f -c avx512_functions.c + fma4_functions.o: fma4_functions.c ${LINUX_CC} ${OPT} ${LINUX_C_FLAGS} -mfma4 -mavx -c fma4_functions.c diff --git a/README b/README index 9b737581..7168d62d 100644 
--- a/README +++ b/README @@ -30,6 +30,7 @@ Supported CPU microarchitectures - Intel Ivy Bridge - Intel Haswell - Intel Skylake +- Intel Knights Landing - AMD Bulldozer (experimental) Since version 1.1 it is also possible to create alternating and repetitive @@ -54,8 +55,10 @@ Options: -c | --copyright display copyright information -w | --warranty display warranty information -q | --quiet disable output to stdout +-r | --report display additional information (overridden by -q) -a | --avail list available functions --i ID | --function=ID specify ID of the load-function to be used +-i ID | --function=ID specify integer ID of the load-function to be + used (as listed by --avail) -t TIMEOUT | --timeout=TIMEOUT set timeout (seconds) after which FIRESTARTER terminates itself, default: no timeout -l LOAD | --load=LOAD set the percentage of high load to LOAD (%), @@ -67,9 +70,12 @@ Options: load and an idle phase, the percentage of high load is defined by -l -n COUNT | --threads=COUNT specify the number of threads --b CPULIST | --bind=CPULIST select certain CPUs (overrides -n) + cannot be combined with -b | --bind, which + implicitly specifies the number of threads +-b CPULIST | --bind=CPULIST select certain CPUs CPULIST format: "x,y,z", "x-y", "x-y/step", and any combination of the above + cannot be combined with -n | --threads CUDA Options: -g | --gpus number of gpus to use (default: all) diff --git a/avx512_functions.c b/avx512_functions.c new file mode 100644 index 00000000..fe503168 --- /dev/null +++ b/avx512_functions.c @@ -0,0 +1,530 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2016 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of 
the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +#include "work.h" + + + + + /* init_knl_xeonphi_avx512_4t: writes 836608 doubles into the buffer that starts at threaddata->addrMem, then adds 10*32, 40*32, 8*16 and 3*16 to threaddata->flops, assigns 3*64 to threaddata->bytes, and finally multiplies both fields by 6. Always returns EXIT_SUCCESS. NOTE(review): flops is only incremented and scaled here, never reset, so the caller presumably zeroes it beforehand - confirm. */ int init_knl_xeonphi_avx512_4t(threaddata_t* threaddata) __attribute__((noinline)); +int init_knl_xeonphi_avx512_4t(threaddata_t* threaddata) +{ + unsigned long long addrMem = threaddata->addrMem; + int i; + for (i=0;i<836608;i++) *((double*)(addrMem+8*i)) = 0.25 + (double)(i%9267) * 0.24738995982e-4; /* 836608 elements of 8 bytes each = 6692864 bytes written; the value pattern repeats every 9267 elements because of i%9267 */ + + // lines with register operations + threaddata->flops+=10*32; // 2 512 bit FMA operations + + // lines with L1 operations + threaddata->flops+=40*32; // 2 512 bit FMA operations + + // lines with L2 operations + threaddata->flops+=8*16; // 1 512 bit FMA operation + + // lines with RAM operations + threaddata->flops+=3*16; // 1 512 bit FMA operation + threaddata->bytes=3*64; // 1 memory access + + threaddata->flops*=6; /* NOTE(review): the factor 6 presumably is the number of times the instruction groups counted above occur in one pass of the work loop - confirm against the kernel generator */ + threaddata->bytes*=6; + + return EXIT_SUCCESS; +} + +/** + * assembler implementation of processor and memory stress test + * uses AVX512F instruction set + * optimized for Intel Xeon Phi (Knights Landing) + * @input - addrMem: pointer to buffer + * @return EXIT_SUCCESS + */ +int asm_work_knl_xeonphi_avx512_4t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_knl_xeonphi_avx512_4t(threaddata_t* threaddata) +{ + if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS; + /* input: + * - addrMem -> rax + * register usage: + * - rax: stores original pointer to buffer, used to periodically reset other pointers + * - rbx: pointer to L1 buffer
+ * - rcx: pointer to L2 buffer + * - r8: pointer to L3 buffer + * - r9: pointer to RAM buffer + * - r10: counter for L2-pointer reset + * - r11: counter for L3-pointer reset + * - r12: counter for RAM-pointer reset + * - r13: register for temporary results + * - r14: stores cacheline width as increment for buffer addresses + * - r15: stores address of shared variable that controls load level + * - mm0: stores iteration counter + * - rdx, rsi, rdi: registers for shift operations + * - xmm*,zmm*: data registers for SIMD instructions + */ + __asm__ __volatile__( + "mov %%rax, %%rax;" // store start address of buffer + "mov %%rbx, %%r15;" // store address of shared variable that controls load level + "movq %%rcx, %%mm0;" // store iteration counter + "mov $64, %%r14;" // increment after each cache/memory access + //Initialize registers for shift operations + "mov $0xAAAAAAAA, %%edi;" + "mov $0xAAAAAAAA, %%esi;" + "mov $0xAAAAAAAA, %%edx;" + //Initialize AVX-Registers for FMA Operations + "vmovapd (%%rax), %%zmm0;" + "vmovapd (%%rax), %%zmm1;" + "vmovapd 384(%%rax), %%zmm2;" + "vmovapd 448(%%rax), %%zmm3;" + "vmovapd 512(%%rax), %%zmm4;" + "vmovapd 576(%%rax), %%zmm5;" + "vmovapd 640(%%rax), %%zmm6;" + "vmovapd 704(%%rax), %%zmm7;" + "vmovapd 768(%%rax), %%zmm8;" + "vmovapd 832(%%rax), %%zmm9;" + "vmovapd 896(%%rax), %%zmm10;" + "vmovapd 960(%%rax), %%zmm11;" + "vmovapd 1024(%%rax), %%zmm12;" + "vmovapd 1088(%%rax), %%zmm13;" + "vmovapd 1152(%%rax), %%zmm14;" + "vmovapd 1216(%%rax), %%zmm15;" + "vmovapd 1280(%%rax), %%zmm16;" + "vmovapd 1344(%%rax), %%zmm17;" + "vmovapd 1408(%%rax), %%zmm18;" + "vmovapd 1472(%%rax), %%zmm19;" + "vmovapd 1536(%%rax), %%zmm20;" + "vmovapd 1600(%%rax), %%zmm21;" + "vmovapd 1664(%%rax), %%zmm22;" + "vmovapd 1728(%%rax), %%zmm23;" + "vmovapd 1792(%%rax), %%zmm24;" + "vmovapd 1856(%%rax), %%zmm25;" + "vmovapd 1920(%%rax), %%zmm26;" + "vmovapd 1984(%%rax), %%zmm27;" + "vmovapd 2048(%%rax), %%zmm28;" + "vmovapd 2112(%%rax), %%zmm29;" + "vmovapd 
2176(%%rax), %%zmm30;" + "mov %%rax, %%rbx;" // address for L1-buffer + "mov %%rax, %%rcx;" + "add $8192, %%rcx;" // address for L2-buffer + "mov %%rax, %%r8;" + "add $131072, %%r8;" // address for L3-buffer + "mov %%rax, %%r9;" + "add $0, %%r9;" // address for RAM-buffer + "movabs $34, %%r10;" // reset-counter for L2-buffer with 48 cache line accesses per loop (102 KB) + "movabs $0, %%r11;" // reset-counter for L3-buffer with 0 cache line accesses per loop (0 KB) + "movabs $5688, %%r12;" // reset-counter for RAM-buffer with 18 cache line accesses per loop (6399 KB) + + ".align 64;" /* alignment in bytes */ + "_work_loop_knl_xeonphi_avx512_4t:" + /***************************************************************************************************************************************************** + decode 0 decode 1 decode 2 decode 3 */ + "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm3; prefetcht2 (%%r9); shl $1, %%edi; add %%r14, %%r9; " // RAM prefetch + "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd %%zmm6, %%zmm1, %%zmm26; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only + "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm6; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm8; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm9, 64(%%rcx); vfmadd231pd %%zmm10, %%zmm0, %%zmm9; shl $1, %%edi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd %%zmm12, %%zmm1, %%zmm27; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only + "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; 
vfmadd231pd 64(%%rbx), %%zmm1, %%zmm12; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm13; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm14; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm15, 64(%%rcx); vfmadd231pd %%zmm16, %%zmm0, %%zmm15; shl $1, %%edi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd %%zmm18, %%zmm1, %%zmm28; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only + "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm17; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm19; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm20, 64(%%rcx); vfmadd231pd %%zmm21, %%zmm0, %%zmm20; shr $1, %%edx; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm21; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd %%zmm24, %%zmm1, %%zmm29; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only + "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm23; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm25; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm2; prefetcht2 (%%r9); shr $1, %%edx; add %%r14, %%r9; " // RAM prefetch + "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm3; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; 
vfmadd231pd %%zmm6, %%zmm1, %%zmm30; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only + "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm6; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm8, 64(%%rcx); vfmadd231pd %%zmm9, %%zmm0, %%zmm8; shr $1, %%edx; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd %%zmm12, %%zmm1, %%zmm26; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only + "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm12; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm13, 64(%%rcx); vfmadd231pd %%zmm14, %%zmm0, %%zmm13; shr $1, %%esi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm14; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm15; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd %%zmm18, %%zmm1, %%zmm27; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only + "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm17; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm19, 64(%%rcx); vfmadd231pd %%zmm20, %%zmm0, %%zmm19; shr $1, %%esi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; 
vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm21; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd %%zmm24, %%zmm1, %%zmm28; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only + "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm23; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm25; prefetcht2 (%%r9); shr $1, %%esi; add %%r14, %%r9; " // RAM prefetch + "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm3; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd %%zmm6, %%zmm1, %%zmm29; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only + "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm6, 64(%%rcx); vfmadd231pd %%zmm7, %%zmm0, %%zmm6; shr $1, %%edi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm8; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd %%zmm12, %%zmm1, %%zmm30; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only + "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm12, 64(%%rcx); vfmadd231pd 
%%zmm13, %%zmm0, %%zmm12; shr $1, %%edi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm13; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm14; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm15; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm16; prefetcht2 (%%r9); shl $1, %%esi; add %%r14, %%r9; " // RAM prefetch + "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd %%zmm19, %%zmm1, %%zmm26; shl $1, %%edx; xor %%rsi, %%r13; " // REG ops only + "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm19; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm21; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm22, 64(%%rcx); vfmadd231pd %%zmm23, %%zmm0, %%zmm22; shl $1, %%esi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd %%zmm25, %%zmm1, %%zmm27; shl $1, %%edx; xor %%rsi, %%r13; " // REG ops only + "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm25; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm3; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm4, 64(%%rcx); vfmadd231pd 
%%zmm5, %%zmm0, %%zmm4; shl $1, %%esi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd %%zmm7, %%zmm1, %%zmm28; shl $1, %%edx; xor %%rsi, %%r13; " // REG ops only + "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm6; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm8; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm9, 64(%%rcx); vfmadd231pd %%zmm10, %%zmm0, %%zmm9; shl $1, %%edi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm10; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd %%zmm13, %%zmm1, %%zmm29; shl $1, %%edx; xor %%rsi, %%r13; " // REG ops only + "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm12; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm13; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm14; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm15; prefetcht2 (%%r9); shl $1, %%edi; add %%r14, %%r9; " // RAM prefetch + "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm16; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd %%zmm19, %%zmm1, %%zmm30; shl $1, %%edx; xor %%rsi, %%r13; " // REG ops only + "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm19; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 
64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm21, 64(%%rcx); vfmadd231pd %%zmm22, %%zmm0, %%zmm21; shl $1, %%edi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd %%zmm25, %%zmm1, %%zmm26; shl $1, %%edx; xor %%rsi, %%r13; " // REG ops only + "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm25; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm2, 64(%%rcx); vfmadd231pd %%zmm3, %%zmm0, %%zmm2; shr $1, %%edx; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm3; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm4; shl $1, %%esi; mov %%rax, %%rbx;" // L1 load + "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd %%zmm7, %%zmm1, %%zmm27; shl $1, %%edx; xor %%rsi, %%r13; " // REG ops only + "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm6; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm8, 64(%%rcx); vfmadd231pd %%zmm9, %%zmm0, %%zmm8; shr $1, %%edx; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm10; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd %%zmm13, %%zmm1, %%zmm28; shl $1, %%edx; xor %%rsi, %%r13; " // REG ops only + "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd 64(%%rbx), 
%%zmm1, %%zmm12; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm13; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm14; prefetcht2 (%%r9); shr $1, %%edx; add %%r14, %%r9; " // RAM prefetch + "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm15; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm16; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd %%zmm19, %%zmm1, %%zmm29; shl $1, %%edx; xor %%rsi, %%r13; " // REG ops only + "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm19, 64(%%rcx); vfmadd231pd %%zmm20, %%zmm0, %%zmm19; shr $1, %%esi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm21; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd %%zmm25, %%zmm1, %%zmm30; shl $1, %%edx; xor %%rsi, %%r13; " // REG ops only + "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm25, 64(%%rcx); vfmadd231pd %%zmm2, %%zmm0, %%zmm25; shr $1, %%esi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm3; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd 64(%%rbx), %%zmm1, 
%%zmm4; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm5; prefetcht2 (%%r9); shl $1, %%edx; add %%r14, %%r9; " // RAM prefetch + "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd %%zmm8, %%zmm1, %%zmm26; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only + "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm8; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm10; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm11, 64(%%rcx); vfmadd231pd %%zmm12, %%zmm0, %%zmm11; shl $1, %%edx; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd %%zmm14, %%zmm1, %%zmm27; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only + "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm13; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm14; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm15; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm16; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm17, 64(%%rcx); vfmadd231pd %%zmm18, %%zmm0, %%zmm17; shl $1, %%edx; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd %%zmm20, %%zmm1, %%zmm28; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only + "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm19; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; 
shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm21; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm22, 64(%%rcx); vfmadd231pd %%zmm23, %%zmm0, %%zmm22; shl $1, %%esi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm23; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd %%zmm2, %%zmm1, %%zmm29; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only + "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm25; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm3; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm4; prefetcht2 (%%r9); shl $1, %%esi; add %%r14, %%r9; " // RAM prefetch + "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd %%zmm8, %%zmm1, %%zmm30; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only + "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm8; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm10, 64(%%rcx); vfmadd231pd %%zmm11, %%zmm0, %%zmm10; shl $1, %%esi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd %%zmm14, %%zmm1, %%zmm26; shr $1, %%edi; xor 
%%rdx, %%r13; " // REG ops only + "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm13; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm14; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm15, 64(%%rcx); vfmadd231pd %%zmm16, %%zmm0, %%zmm15; shl $1, %%edi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm16; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm17; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd %%zmm20, %%zmm1, %%zmm27; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only + "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm19; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm21, 64(%%rcx); vfmadd231pd %%zmm22, %%zmm0, %%zmm21; shl $1, %%edi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm23; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd %%zmm2, %%zmm1, %%zmm28; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only + "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm25; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm3; prefetcht2 (%%r9); shl $1, %%edi; add %%r14, %%r9; " // RAM prefetch + "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm4; shl $1, %%esi; add 
%%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd %%zmm8, %%zmm1, %%zmm29; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only + "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm8, 64(%%rcx); vfmadd231pd %%zmm9, %%zmm0, %%zmm8; shr $1, %%edx; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm10; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd %%zmm14, %%zmm1, %%zmm30; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only + "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm13; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm14, 64(%%rcx); vfmadd231pd %%zmm15, %%zmm0, %%zmm14; shr $1, %%edx; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm15; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm16; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm17; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm18; prefetcht2 (%%r9); shr $1, %%edi; add %%r14, %%r9; " // RAM prefetch + "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd %%zmm21, %%zmm1, %%zmm26; shr $1, %%esi; xor %%rdi, %%r13; " // REG ops only + "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, 
%%rbx; " // L1 load + "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm21; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm23; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm24, 64(%%rcx); vfmadd231pd %%zmm25, %%zmm0, %%zmm24; shr $1, %%edi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd %%zmm3, %%zmm1, %%zmm27; shr $1, %%esi; xor %%rdi, %%r13; " // REG ops only + "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm3; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm4; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; mov %%rax, %%rbx;" // L1 load + "vmovapd %%zmm6, 64(%%rcx); vfmadd231pd %%zmm7, %%zmm0, %%zmm6; shr $1, %%edi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd %%zmm9, %%zmm1, %%zmm28; shr $1, %%esi; xor %%rdi, %%r13; " // REG ops only + "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm8; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm10; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm11, 64(%%rcx); vfmadd231pd %%zmm12, %%zmm0, %%zmm11; shl $1, %%edx; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm12; shr $1, %%edi; add %%r14, %%rbx; " // L1 load 
+ "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd %%zmm15, %%zmm1, %%zmm29; shr $1, %%esi; xor %%rdi, %%r13; " // REG ops only + "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm14; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm15; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm16; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm17; prefetcht2 (%%r9); shl $1, %%edx; add %%r14, %%r9; " // RAM prefetch + "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd %%zmm21, %%zmm1, %%zmm30; shr $1, %%esi; xor %%rdi, %%r13; " // REG ops only + "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm21; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm23, 64(%%rcx); vfmadd231pd %%zmm24, %%zmm0, %%zmm23; shl $1, %%edx; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd %%zmm3, %%zmm1, %%zmm26; shr $1, %%esi; xor %%rdi, %%r13; " // REG ops only + "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm3; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm4, 64(%%rcx); vfmadd231pd %%zmm5, %%zmm0, %%zmm4; shl $1, %%esi; add %%r14, %%rcx; " // L2 store + 
"vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm6; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd %%zmm9, %%zmm1, %%zmm27; shr $1, %%esi; xor %%rdi, %%r13; " // REG ops only + "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm8; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm10, 64(%%rcx); vfmadd231pd %%zmm11, %%zmm0, %%zmm10; shl $1, %%esi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm12; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd %%zmm15, %%zmm1, %%zmm28; shr $1, %%esi; xor %%rdi, %%r13; " // REG ops only + "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm14; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm15; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm16; prefetcht2 (%%r9); shl $1, %%esi; add %%r14, %%r9; " // RAM prefetch + "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm17; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd %%zmm21, %%zmm1, %%zmm29; shr $1, %%esi; xor %%rdi, %%r13; " // REG ops only + "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, %%rbx; " // L1 load 
+ "vmovapd %%zmm21, 64(%%rcx); vfmadd231pd %%zmm22, %%zmm0, %%zmm21; shl $1, %%edi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm23; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd %%zmm3, %%zmm1, %%zmm30; shr $1, %%esi; xor %%rdi, %%r13; " // REG ops only + "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm3, 64(%%rcx); vfmadd231pd %%zmm4, %%zmm0, %%zmm3; shl $1, %%edi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm4; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm6; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm7; prefetcht2 (%%r9); shr $1, %%esi; add %%r14, %%r9; " // RAM prefetch + "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd %%zmm10, %%zmm1, %%zmm26; shr $1, %%edx; xor %%rsi, %%r13; " // REG ops only + "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm10; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm12; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm13, 
64(%%rcx); vfmadd231pd %%zmm14, %%zmm0, %%zmm13; shr $1, %%esi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd %%zmm16, %%zmm1, %%zmm27; shr $1, %%edx; xor %%rsi, %%r13; " // REG ops only + "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm15; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm16; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm17; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm19, 64(%%rcx); vfmadd231pd %%zmm20, %%zmm0, %%zmm19; shr $1, %%esi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd %%zmm22, %%zmm1, %%zmm28; shr $1, %%edx; xor %%rsi, %%r13; " // REG ops only + "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm21; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm23; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm24, 64(%%rcx); vfmadd231pd %%zmm25, %%zmm0, %%zmm24; shr $1, %%edi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm25; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd %%zmm4, %%zmm1, %%zmm29; shr $1, %%edx; xor %%rsi, %%r13; " // REG ops only + "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm3; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm4; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd 
%%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm6; prefetcht2 (%%r9); shr $1, %%edi; add %%r14, %%r9; " // RAM prefetch + "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd %%zmm10, %%zmm1, %%zmm30; shr $1, %%edx; xor %%rsi, %%r13; " // REG ops only + "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm10; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm12, 64(%%rcx); vfmadd231pd %%zmm13, %%zmm0, %%zmm12; shr $1, %%edi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm13; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd %%zmm16, %%zmm1, %%zmm26; shr $1, %%edx; xor %%rsi, %%r13; " // REG ops only + "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm15; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm16; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm17, 64(%%rcx); vfmadd231pd %%zmm18, %%zmm0, %%zmm17; shl $1, %%edx; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm19; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd %%zmm22, %%zmm1, %%zmm27; shr $1, %%edx; xor %%rsi, %%r13; " // REG ops only + "vfmadd231pd 
%%zmm22, %%zmm0, %%zmm21; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm21; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm23, 64(%%rcx); vfmadd231pd %%zmm24, %%zmm0, %%zmm23; shl $1, %%edx; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm25; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd %%zmm4, %%zmm1, %%zmm28; shr $1, %%edx; xor %%rsi, %%r13; " // REG ops only + "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm3; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm4; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm5; prefetcht2 (%%r9); shl $1, %%edx; add %%r14, %%r9; " // RAM prefetch + "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm6; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; mov %%rax, %%rbx;" // L1 load + "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd %%zmm10, %%zmm1, %%zmm29; shr $1, %%edx; xor %%rsi, %%r13; " // REG ops only + "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm10, 64(%%rcx); vfmadd231pd %%zmm11, %%zmm0, %%zmm10; shl $1, %%esi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm12; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm14, %%zmm0, 
%%zmm13; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm13; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd %%zmm16, %%zmm1, %%zmm30; shr $1, %%edx; xor %%rsi, %%r13; " // REG ops only + "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm15; shl $1, %%edi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm16, 64(%%rcx); vfmadd231pd %%zmm17, %%zmm0, %%zmm16; shl $1, %%esi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm17; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm19; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm20; prefetcht2 (%%r9); shr $1, %%edx; add %%r14, %%r9; " // RAM prefetch + "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd %%zmm23, %%zmm1, %%zmm26; shl $1, %%edi; xor %%rdx, %%r13; " // REG ops only + "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm23; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm25; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm2, 64(%%rcx); vfmadd231pd %%zmm3, %%zmm0, %%zmm2; shr $1, %%edx; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd %%zmm5, %%zmm1, %%zmm27; shl $1, %%edi; xor %%rdx, %%r13; " // REG ops only + "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm4; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm6, %%zmm0, 
%%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm6; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm8, 64(%%rcx); vfmadd231pd %%zmm9, %%zmm0, %%zmm8; shr $1, %%edx; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd %%zmm11, %%zmm1, %%zmm28; shl $1, %%edi; xor %%rdx, %%r13; " // REG ops only + "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm10; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm12; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm13, 64(%%rcx); vfmadd231pd %%zmm14, %%zmm0, %%zmm13; shr $1, %%esi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm14; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd %%zmm17, %%zmm1, %%zmm29; shl $1, %%edi; xor %%rdx, %%r13; " // REG ops only + "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm16; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm17; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm19; prefetcht2 (%%r9); shr $1, %%esi; add %%r14, %%r9; " // RAM prefetch + "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; 
vfmadd231pd %%zmm23, %%zmm1, %%zmm30; shl $1, %%edi; xor %%rdx, %%r13; " // REG ops only + "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm23; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm25, 64(%%rcx); vfmadd231pd %%zmm2, %%zmm0, %%zmm25; shr $1, %%esi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd %%zmm5, %%zmm1, %%zmm26; shl $1, %%edi; xor %%rdx, %%r13; " // REG ops only + "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm4; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm6, 64(%%rcx); vfmadd231pd %%zmm7, %%zmm0, %%zmm6; shr $1, %%edi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm8; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd %%zmm11, %%zmm1, %%zmm27; shl $1, %%edi; xor %%rdx, %%r13; " // REG ops only + "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm10; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm12, 64(%%rcx); vfmadd231pd %%zmm13, %%zmm0, %%zmm12; shr $1, %%edi; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd 
64(%%rbx), %%zmm1, %%zmm13; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm14; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd %%zmm17, %%zmm1, %%zmm28; shl $1, %%edi; xor %%rdx, %%r13; " // REG ops only + "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm16; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm17; shl $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm18; prefetcht2 (%%r9); shr $1, %%edi; add %%r14, %%r9; " // RAM prefetch + "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm19; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd %%zmm23, %%zmm1, %%zmm29; shl $1, %%edi; xor %%rdx, %%r13; " // REG ops only + "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm23, 64(%%rcx); vfmadd231pd %%zmm24, %%zmm0, %%zmm23; shl $1, %%edx; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm25; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd %%zmm5, %%zmm1, %%zmm30; shl $1, %%edi; xor %%rdx, %%r13; " // REG ops only + "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm4; shl $1, %%esi; add %%r14, %%rbx; " // L1 load + "vmovapd %%zmm5, 64(%%rcx); vfmadd231pd 
%%zmm6, %%zmm0, %%zmm5; shl $1, %%edx; add %%r14, %%rcx; " // L2 store + "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm6; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load + "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm8; shr $1, %%edx; add %%r14, %%rbx; " // L1 load + "movq %%mm0, %%r13;" // restore iteration counter + //reset RAM counter + "sub $1, %%r12;" + "jnz _work_no_ram_reset_knl_xeonphi_avx512_4t;" + "movabs $5688, %%r12;" + "mov %%rax, %%r9;" + "add $0, %%r9;" + "_work_no_ram_reset_knl_xeonphi_avx512_4t:" + "inc %%r13;" // increment iteration counter + //reset L2-Cache counter + "sub $1, %%r10;" + "jnz _work_no_L2_reset_knl_xeonphi_avx512_4t;" + "movabs $34, %%r10;" + "mov %%rax, %%rcx;" + "add $8192, %%rcx;" + "_work_no_L2_reset_knl_xeonphi_avx512_4t:" + "movq %%r13, %%mm0;" // store iteration counter + "mov %%rax, %%rbx;" + "testq $1, (%%r15);" + "jnz _work_loop_knl_xeonphi_avx512_4t;" + "movq %%mm0, %%rax;" // restore iteration counter + : "=a" (threaddata->iterations) + : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations) + : "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm16", "%xmm17", "%xmm18", "%xmm19", "%xmm20", "%xmm21", "%xmm22", "%xmm23", "%xmm24", "%xmm25", "%xmm26", "%xmm27", "%xmm28", "%xmm29", "%xmm30", "%xmm31" + ); + return EXIT_SUCCESS; +} diff --git a/avx_functions.c b/avx_functions.c index 5a4c6faa..0d1fd4b4 100644 --- a/avx_functions.c +++ b/avx_functions.c @@ -23,12 +23,33 @@ - int init_snb_corei_avx_1t(unsigned long long addrMem) __attribute__((noinline)); -int 
init_snb_corei_avx_1t(unsigned long long addrMem) + + int init_snb_corei_avx_1t(threaddata_t* threaddata) __attribute__((noinline)); +int init_snb_corei_avx_1t(threaddata_t* threaddata) { + unsigned long long addrMem = threaddata->addrMem; int i; for (i=0;i<13340672;i++) *((double*)(addrMem+8*i)) = i* 1.654738925401e-15; + // lines with register operations + threaddata->flops+=45*4; // 1 256 bit operation + + // lines with L1 operations + threaddata->flops+=90*4; // 1 256 bit operation + + // lines with L2 operations + threaddata->flops+=10*4; // 1 256 bit operation + + // lines with L3 operations + threaddata->flops+=4*4; // 1 256 bit operation + + // lines with RAM operations + threaddata->flops+=2*4; // 1 256 bit operation + threaddata->bytes=2*64; // 1 memory access + + threaddata->flops*=10; + threaddata->bytes*=10; + return EXIT_SUCCESS; } @@ -39,11 +60,11 @@ int init_snb_corei_avx_1t(unsigned long long addrMem) * @input - addrMem: pointer to buffer * @return EXIT_SUCCESS */ -int asm_work_snb_corei_avx_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_snb_corei_avx_1t(unsigned long long addrMem, unsigned long long addrHigh) +int asm_work_snb_corei_avx_1t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_snb_corei_avx_1t(threaddata_t* threaddata) { - if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS; + if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS; /* input: * - addrMem -> rax * register usage: @@ -58,11 +79,13 @@ int asm_work_snb_corei_avx_1t(unsigned long long addrMem, unsigned long long add * - r11: temp register for initialization of SIMD-registers * - r12: stores cacheline width as increment for buffer addresses * - r13: stores address of shared variable that controls load level + * - r14: stores iteration counter * - mm*,xmm*,ymm*: data registers for SIMD instructions */ __asm__ __volatile__( - "mov %0, %%rax;" // store start address of buffer - 
"mov %1, %%r13;" // store address of shared variable that controls load level + "mov %%rax, %%rax;" // store start address of buffer + "mov %%rbx, %%r13;" // store address of shared variable that controls load level + "mov %%rcx, %%r14;" // store iteration counter "mov $64, %%r12;" // increment after each cache/memory access //Initialize AVX-Registers for Addition "vmovapd 0(%%rax), %%ymm0;" @@ -1654,26 +1677,48 @@ int asm_work_snb_corei_avx_1t(unsigned long long addrMem, unsigned long long add "mov %%rax, %%rdx;" "add $262144, %%rdx;" "_work_no_L3_reset_snb_corei_avx_1t:" + "inc %%r14;" // increment iteration counter "mov %%rax, %%rbx;" "mfence;" - "mov (%%r13), %%r11;" - "test $1, %%r11;" + "testq $1, (%%r13);" "jnz _work_loop_snb_corei_avx_1t;" - : - : "r"(addrMem), "r"(addrHigh) - : "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" + "movq %%r14, %%rax;" // restore iteration counter + : "=a" (threaddata->iterations) + : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations) + : "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" ); return EXIT_SUCCESS; } - int init_snb_corei_avx_2t(unsigned long long addrMem) __attribute__((noinline)); -int init_snb_corei_avx_2t(unsigned long long addrMem) + + int init_snb_corei_avx_2t(threaddata_t* threaddata) __attribute__((noinline)); +int init_snb_corei_avx_2t(threaddata_t* threaddata) { + unsigned long long addrMem = threaddata->addrMem; int i; for (i=0;i<6670336;i++) *((double*)(addrMem+8*i)) = i* 1.654738925401e-15; + // lines with 
register operations + threaddata->flops+=45*4; // 1 256 bit operation + + // lines with L1 operations + threaddata->flops+=90*4; // 1 256 bit operation + + // lines with L2 operations + threaddata->flops+=10*4; // 1 256 bit operation + + // lines with L3 operations + threaddata->flops+=4*4; // 1 256 bit operation + + // lines with RAM operations + threaddata->flops+=2*4; // 1 256 bit operation + threaddata->bytes=2*64; // 1 memory access + + threaddata->flops*=5; + threaddata->bytes*=5; + return EXIT_SUCCESS; } @@ -1684,11 +1729,11 @@ int init_snb_corei_avx_2t(unsigned long long addrMem) * @input - addrMem: pointer to buffer * @return EXIT_SUCCESS */ -int asm_work_snb_corei_avx_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_snb_corei_avx_2t(unsigned long long addrMem, unsigned long long addrHigh) +int asm_work_snb_corei_avx_2t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_snb_corei_avx_2t(threaddata_t* threaddata) { - if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS; + if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS; /* input: * - addrMem -> rax * register usage: @@ -1703,11 +1748,13 @@ int asm_work_snb_corei_avx_2t(unsigned long long addrMem, unsigned long long add * - r11: temp register for initialization of SIMD-registers * - r12: stores cacheline width as increment for buffer addresses * - r13: stores address of shared variable that controls load level + * - r14: stores iteration counter * - mm*,xmm*,ymm*: data registers for SIMD instructions */ __asm__ __volatile__( - "mov %0, %%rax;" // store start address of buffer - "mov %1, %%r13;" // store address of shared variable that controls load level + "mov %%rax, %%rax;" // store start address of buffer + "mov %%rbx, %%r13;" // store address of shared variable that controls load level + "mov %%rcx, %%r14;" // store iteration counter "mov $64, %%r12;" // increment after each cache/memory access 
//Initialize AVX-Registers for Addition "vmovapd 0(%%rax), %%ymm0;" @@ -2544,26 +2591,48 @@ int asm_work_snb_corei_avx_2t(unsigned long long addrMem, unsigned long long add "mov %%rax, %%rdx;" "add $131072, %%rdx;" "_work_no_L3_reset_snb_corei_avx_2t:" + "inc %%r14;" // increment iteration counter "mov %%rax, %%rbx;" "mfence;" - "mov (%%r13), %%r11;" - "test $1, %%r11;" + "testq $1, (%%r13);" "jnz _work_loop_snb_corei_avx_2t;" - : - : "r"(addrMem), "r"(addrHigh) - : "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" + "movq %%r14, %%rax;" // restore iteration counter + : "=a" (threaddata->iterations) + : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations) + : "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" ); return EXIT_SUCCESS; } - int init_snb_xeonep_avx_1t(unsigned long long addrMem) __attribute__((noinline)); -int init_snb_xeonep_avx_1t(unsigned long long addrMem) + + int init_snb_xeonep_avx_1t(threaddata_t* threaddata) __attribute__((noinline)); +int init_snb_xeonep_avx_1t(threaddata_t* threaddata) { + unsigned long long addrMem = threaddata->addrMem; int i; for (i=0;i<13471744;i++) *((double*)(addrMem+8*i)) = i* 1.654738925401e-15; + // lines with register operations + threaddata->flops+=30*4; // 1 256 bit operation + + // lines with L1 operations + threaddata->flops+=90*4; // 1 256 bit operation + + // lines with L2 operations + threaddata->flops+=10*4; // 1 256 bit operation + + // lines with L3 operations + threaddata->flops+=2*4; // 1 256 bit operation + + // 
lines with RAM operations + threaddata->flops+=3*4; // 1 256 bit operation + threaddata->bytes=3*64; // 1 memory access + + threaddata->flops*=11; + threaddata->bytes*=11; + return EXIT_SUCCESS; } @@ -2574,11 +2643,11 @@ int init_snb_xeonep_avx_1t(unsigned long long addrMem) * @input - addrMem: pointer to buffer * @return EXIT_SUCCESS */ -int asm_work_snb_xeonep_avx_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_snb_xeonep_avx_1t(unsigned long long addrMem, unsigned long long addrHigh) +int asm_work_snb_xeonep_avx_1t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_snb_xeonep_avx_1t(threaddata_t* threaddata) { - if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS; + if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS; /* input: * - addrMem -> rax * register usage: @@ -2593,11 +2662,13 @@ int asm_work_snb_xeonep_avx_1t(unsigned long long addrMem, unsigned long long ad * - r11: temp register for initialization of SIMD-registers * - r12: stores cacheline width as increment for buffer addresses * - r13: stores address of shared variable that controls load level + * - r14: stores iteration counter * - mm*,xmm*,ymm*: data registers for SIMD instructions */ __asm__ __volatile__( - "mov %0, %%rax;" // store start address of buffer - "mov %1, %%r13;" // store address of shared variable that controls load level + "mov %%rax, %%rax;" // store start address of buffer + "mov %%rbx, %%r13;" // store address of shared variable that controls load level + "mov %%rcx, %%r14;" // store iteration counter "mov $64, %%r12;" // increment after each cache/memory access //Initialize AVX-Registers for Addition "vmovapd 0(%%rax), %%ymm0;" @@ -4164,26 +4235,48 @@ int asm_work_snb_xeonep_avx_1t(unsigned long long addrMem, unsigned long long ad "mov %%rax, %%rdx;" "add $262144, %%rdx;" "_work_no_L3_reset_snb_xeonep_avx_1t:" + "inc %%r14;" // increment iteration counter "mov %%rax, %%rbx;" 
"mfence;" - "mov (%%r13), %%r11;" - "test $1, %%r11;" + "testq $1, (%%r13);" "jnz _work_loop_snb_xeonep_avx_1t;" - : - : "r"(addrMem), "r"(addrHigh) - : "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" + "movq %%r14, %%rax;" // restore iteration counter + : "=a" (threaddata->iterations) + : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations) + : "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" ); return EXIT_SUCCESS; } - int init_snb_xeonep_avx_2t(unsigned long long addrMem) __attribute__((noinline)); -int init_snb_xeonep_avx_2t(unsigned long long addrMem) + + int init_snb_xeonep_avx_2t(threaddata_t* threaddata) __attribute__((noinline)); +int init_snb_xeonep_avx_2t(threaddata_t* threaddata) { + unsigned long long addrMem = threaddata->addrMem; int i; for (i=0;i<6735872;i++) *((double*)(addrMem+8*i)) = i* 1.654738925401e-15; + // lines with register operations + threaddata->flops+=30*4; // 1 256 bit operation + + // lines with L1 operations + threaddata->flops+=90*4; // 1 256 bit operation + + // lines with L2 operations + threaddata->flops+=10*4; // 1 256 bit operation + + // lines with L3 operations + threaddata->flops+=2*4; // 1 256 bit operation + + // lines with RAM operations + threaddata->flops+=3*4; // 1 256 bit operation + threaddata->bytes=3*64; // 1 memory access + + threaddata->flops*=5; + threaddata->bytes*=5; + return EXIT_SUCCESS; } @@ -4194,11 +4287,11 @@ int init_snb_xeonep_avx_2t(unsigned long long addrMem) * @input - addrMem: pointer to buffer * 
@return EXIT_SUCCESS */ -int asm_work_snb_xeonep_avx_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_snb_xeonep_avx_2t(unsigned long long addrMem, unsigned long long addrHigh) +int asm_work_snb_xeonep_avx_2t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_snb_xeonep_avx_2t(threaddata_t* threaddata) { - if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS; + if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS; /* input: * - addrMem -> rax * register usage: @@ -4213,11 +4306,13 @@ int asm_work_snb_xeonep_avx_2t(unsigned long long addrMem, unsigned long long ad * - r11: temp register for initialization of SIMD-registers * - r12: stores cacheline width as increment for buffer addresses * - r13: stores address of shared variable that controls load level + * - r14: stores iteration counter * - mm*,xmm*,ymm*: data registers for SIMD instructions */ __asm__ __volatile__( - "mov %0, %%rax;" // store start address of buffer - "mov %1, %%r13;" // store address of shared variable that controls load level + "mov %%rax, %%rax;" // store start address of buffer + "mov %%rbx, %%r13;" // store address of shared variable that controls load level + "mov %%rcx, %%r14;" // store iteration counter "mov $64, %%r12;" // increment after each cache/memory access //Initialize AVX-Registers for Addition "vmovapd 0(%%rax), %%ymm0;" @@ -4974,14 +5069,15 @@ int asm_work_snb_xeonep_avx_2t(unsigned long long addrMem, unsigned long long ad "mov %%rax, %%rdx;" "add $131072, %%rdx;" "_work_no_L3_reset_snb_xeonep_avx_2t:" + "inc %%r14;" // increment iteration counter "mov %%rax, %%rbx;" "mfence;" - "mov (%%r13), %%r11;" - "test $1, %%r11;" + "testq $1, (%%r13);" "jnz _work_loop_snb_xeonep_avx_2t;" - : - : "r"(addrMem), "r"(addrHigh) - : "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", 
"%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" + "movq %%r14, %%rax;" // restore iteration counter + : "=a" (threaddata->iterations) + : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations) + : "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" ); return EXIT_SUCCESS; } diff --git a/cpu.h b/cpu.h index a8045448..d38105fc 100644 --- a/cpu.h +++ b/cpu.h @@ -81,6 +81,7 @@ #define FMA4 0x00010000 #define FMA 0x00020000 #define AES 0x00040000 +#define AVX512 0x00080000 #define MAX_CACHELEVELS 3 diff --git a/firestarter_global.h b/firestarter_global.h index cec4ab86..85f7ad1d 100644 --- a/firestarter_global.h +++ b/firestarter_global.h @@ -25,9 +25,9 @@ /* current version */ #define VERSION_MAJOR 1 -#define VERSION_MINOR 4 +#define VERSION_MINOR 5 #define VERSION_INFO "" -#define BUILDDATE "2016-04-08" +#define BUILDDATE "2016-11-11" #define COPYRIGHT_YEAR 2016 #if (defined(linux) || defined(__linux__)) && defined (AFFINITY) @@ -84,7 +84,7 @@ typedef struct mydata { struct threaddata *threaddata; cpu_info_t *cpuinfo; - int *thread_comm; + int *thread_comm; volatile unsigned int ack; unsigned int num_threads; } mydata_t; @@ -96,7 +96,12 @@ typedef struct threaddata char* bufferMem; unsigned long long addrMem; unsigned long long addrHigh; - unsigned long long buffersizeMem; + unsigned long long buffersizeMem; + unsigned long long iterations; + unsigned long long flops; + unsigned long long bytes; + unsigned long long start_tsc; + unsigned long long stop_tsc; unsigned int alignment; unsigned int cpu_id; unsigned int thread_id; diff --git a/fma4_functions.c b/fma4_functions.c index 1bfaddc3..472ede26 100644 --- a/fma4_functions.c +++ 
b/fma4_functions.c @@ -23,12 +23,33 @@ - int init_bld_opteron_fma4_1t(unsigned long long addrMem) __attribute__((noinline)); -int init_bld_opteron_fma4_1t(unsigned long long addrMem) + + int init_bld_opteron_fma4_1t(threaddata_t* threaddata) __attribute__((noinline)); +int init_bld_opteron_fma4_1t(threaddata_t* threaddata) { + unsigned long long addrMem = threaddata->addrMem; int i; for (i=0;i<13338624;i++) *((double*)(addrMem+8*i)) = 0.25 + (double)(i%9267) * 0.24738995982e-4; + // lines with register operations + threaddata->flops+=45*8; // 2 128 bit FMA operations + + // lines with L1 operations + threaddata->flops+=90*12; // 1 128 and 1 256 bit FMA operation + + // lines with L2 operations + threaddata->flops+=5*4; // 1 128 bit FMA operation + + // lines with L3 operations + threaddata->flops+=1*4; // 1 128 bit FMA operation + + // lines with RAM operations + threaddata->flops+=1*8; // 2 128 bit FMA operations + threaddata->bytes=1*64; // 1 memory access + + threaddata->flops*=10; + threaddata->bytes*=10; + return EXIT_SUCCESS; } @@ -39,10 +60,10 @@ int init_bld_opteron_fma4_1t(unsigned long long addrMem) * @input - addrMem: pointer to buffer * @return EXIT_SUCCESS */ -int asm_work_bld_opteron_fma4_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_bld_opteron_fma4_1t(unsigned long long addrMem, unsigned long long addrHigh) +int asm_work_bld_opteron_fma4_1t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_bld_opteron_fma4_1t(threaddata_t* threaddata) { - if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS; + if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS; /* input: * - addrMem -> rax * register usage: @@ -57,12 +78,14 @@ int asm_work_bld_opteron_fma4_1t(unsigned long long addrMem, unsigned long long * - r13: register for temporary results * - r14: stores cacheline width as increment for buffer addresses * - r15: stores address of shared variable that 
controls load level + * - mm0: stores iteration counter * - rdx, rsi, rdi: registers for shift operations - * - mm*,xmm*,xmm*: data registers for SIMD instructions + * - xmm*,xmm*: data registers for SIMD instructions */ __asm__ __volatile__( - "mov %0, %%rax;" // store start address of buffer - "mov %1, %%r15;" // store address of shared variable that controls load level + "mov %%rax, %%rax;" // store start address of buffer + "mov %%rbx, %%r15;" // store address of shared variable that controls load level + "movq %%rcx, %%mm0;" // store iteration counter "mov $64, %%r14;" // increment after each cache/memory access //Initialize registers for shift operations "mov $0xAAAAAAAA, %%edi;" @@ -1518,6 +1541,7 @@ int asm_work_bld_opteron_fma4_1t(unsigned long long addrMem, unsigned long long "vfmaddpd %%xmm8, %%xmm0, %%xmm7, %%xmm7; vfmaddpd %%xmm9, %%xmm1, %%xmm13, %%xmm13; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only "vfmaddpd %%xmm9, %%xmm0, %%xmm8, %%xmm8; vfmaddpd 32(%%rbx), %%ymm1, %%ymm8, %%ymm8; shl $1, %%edx; add %%r14, %%rbx; " // L1 load "vfmaddpd %%xmm10, %%xmm0, %%xmm9, %%xmm9; vfmaddpd 32(%%rbx), %%ymm1, %%ymm9, %%ymm9; shr $1, %%edi; add %%r14, %%rbx; " // L1 load + "movq %%mm0, %%r13;" // restore iteration counter //reset RAM counter "sub $1, %%r12;" "jnz _work_no_ram_reset_bld_opteron_fma4_1t;" @@ -1525,6 +1549,7 @@ int asm_work_bld_opteron_fma4_1t(unsigned long long addrMem, unsigned long long "mov %%rax, %%r9;" "add $786432, %%r9;" "_work_no_ram_reset_bld_opteron_fma4_1t:" + "inc %%r13;" // increment iteration counter //reset L2-Cache counter "sub $1, %%r10;" "jnz _work_no_L2_reset_bld_opteron_fma4_1t;" @@ -1532,6 +1557,7 @@ int asm_work_bld_opteron_fma4_1t(unsigned long long addrMem, unsigned long long "mov %%rax, %%rcx;" "add $16384, %%rcx;" "_work_no_L2_reset_bld_opteron_fma4_1t:" + "movq %%r13, %%mm0;" // store iteration counter //reset L3-Cache counter "sub $1, %%r11;" "jnz _work_no_L3_reset_bld_opteron_fma4_1t;" @@ -1540,12 +1566,12 @@ int 
asm_work_bld_opteron_fma4_1t(unsigned long long addrMem, unsigned long long "add $1048576, %%r8;" "_work_no_L3_reset_bld_opteron_fma4_1t:" "mov %%rax, %%rbx;" - "mov (%%r15), %%r13;" - "test $1, %%r13;" + "testq $1, (%%r15);" "jnz _work_loop_bld_opteron_fma4_1t;" - : - : "a"(addrMem), "b"(addrHigh) - : "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" + "movq %%mm0, %%rax;" // restore iteration counter + : "=a" (threaddata->iterations) + : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations) + : "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" ); return EXIT_SUCCESS; } diff --git a/fma_functions.c b/fma_functions.c index 8edf27df..39451246 100644 --- a/fma_functions.c +++ b/fma_functions.c @@ -23,12 +23,33 @@ - int init_skl_corei_fma_1t(unsigned long long addrMem) __attribute__((noinline)); -int init_skl_corei_fma_1t(unsigned long long addrMem) + + int init_skl_corei_fma_1t(threaddata_t* threaddata) __attribute__((noinline)); +int init_skl_corei_fma_1t(threaddata_t* threaddata) { + unsigned long long addrMem = threaddata->addrMem; int i; for (i=0;i<13340672;i++) *((double*)(addrMem+8*i)) = 0.25 + (double)(i%9267) * 0.24738995982e-4; + // lines with register operations + threaddata->flops+=40*16; // 2 256 bit FMA operations + + // lines with L1 operations + threaddata->flops+=78*16; // 2 256 bit FMA operations + + // lines with L2 operations + threaddata->flops+=18*8; // 1 256 bit FMA operation + + // lines with L3 operations + 
threaddata->flops+=5*8; // 1 256 bit FMA operation + + // lines with RAM operations + threaddata->flops+=3*16; // 2 256 bit FMA operations + threaddata->bytes=3*64; // 1 memory access + + threaddata->flops*=10; + threaddata->bytes*=10; + return EXIT_SUCCESS; } @@ -39,10 +60,10 @@ int init_skl_corei_fma_1t(unsigned long long addrMem) * @input - addrMem: pointer to buffer * @return EXIT_SUCCESS */ -int asm_work_skl_corei_fma_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_skl_corei_fma_1t(unsigned long long addrMem, unsigned long long addrHigh) +int asm_work_skl_corei_fma_1t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_skl_corei_fma_1t(threaddata_t* threaddata) { - if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS; + if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS; /* input: * - addrMem -> rax * register usage: @@ -57,12 +78,14 @@ int asm_work_skl_corei_fma_1t(unsigned long long addrMem, unsigned long long add * - r13: register for temporary results * - r14: stores cacheline width as increment for buffer addresses * - r15: stores address of shared variable that controls load level + * - mm0: stores iteration counter * - rdx, rsi, rdi: registers for shift operations - * - mm*,xmm*,ymm*: data registers for SIMD instructions + * - xmm*,ymm*: data registers for SIMD instructions */ __asm__ __volatile__( - "mov %0, %%rax;" // store start address of buffer - "mov %1, %%r15;" // store address of shared variable that controls load level + "mov %%rax, %%rax;" // store start address of buffer + "mov %%rbx, %%r15;" // store address of shared variable that controls load level + "movq %%rcx, %%mm0;" // store iteration counter "mov $64, %%r14;" // increment after each cache/memory access //Initialize registers for shift operations "mov $0xAAAAAAAA, %%edi;" @@ -1538,6 +1561,7 @@ int asm_work_skl_corei_fma_1t(unsigned long long addrMem, unsigned long long add 
"vfmadd231pd %%ymm10, %%ymm0, %%ymm9; vfmadd231pd %%ymm2, %%ymm1, %%ymm11; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only "vfmadd231pd 64(%%rbx), %%ymm0, %%ymm10; vfmadd231pd 96(%%rbx), %%ymm1, %%ymm12; vmovapd %%ymm10, 32(%%rbx); add $128, %%rbx; " // 2 L1 loads, L1 store "vfmadd231pd 64(%%rbx), %%ymm0, %%ymm2; vfmadd231pd 96(%%rbx), %%ymm1, %%ymm12; vmovapd %%ymm2, 32(%%rbx); add $128, %%rbx; " // 2 L1 loads, L1 store + "movq %%mm0, %%r13;" // restore iteration counter //reset RAM counter "sub $1, %%r12;" "jnz _work_no_ram_reset_skl_corei_fma_1t;" @@ -1545,6 +1569,7 @@ int asm_work_skl_corei_fma_1t(unsigned long long addrMem, unsigned long long add "mov %%rax, %%r9;" "add $1572864, %%r9;" "_work_no_ram_reset_skl_corei_fma_1t:" + "inc %%r13;" // increment iteration counter //reset L2-Cache counter "sub $1, %%r10;" "jnz _work_no_L2_reset_skl_corei_fma_1t;" @@ -1552,6 +1577,7 @@ int asm_work_skl_corei_fma_1t(unsigned long long addrMem, unsigned long long add "mov %%rax, %%rcx;" "add $32768, %%rcx;" "_work_no_L2_reset_skl_corei_fma_1t:" + "movq %%r13, %%mm0;" // store iteration counter //reset L3-Cache counter "sub $1, %%r11;" "jnz _work_no_L3_reset_skl_corei_fma_1t;" @@ -1560,24 +1586,45 @@ int asm_work_skl_corei_fma_1t(unsigned long long addrMem, unsigned long long add "add $262144, %%r8;" "_work_no_L3_reset_skl_corei_fma_1t:" "mov %%rax, %%rbx;" - "mov (%%r15), %%r13;" - "test $1, %%r13;" + "testq $1, (%%r15);" "jnz _work_loop_skl_corei_fma_1t;" - : - : "a"(addrMem), "b"(addrHigh) - : "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" + "movq %%mm0, %%rax;" // restore iteration counter + : "=a" (threaddata->iterations) + : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations) + : 
"%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" ); return EXIT_SUCCESS; } - int init_skl_corei_fma_2t(unsigned long long addrMem) __attribute__((noinline)); -int init_skl_corei_fma_2t(unsigned long long addrMem) + + int init_skl_corei_fma_2t(threaddata_t* threaddata) __attribute__((noinline)); +int init_skl_corei_fma_2t(threaddata_t* threaddata) { + unsigned long long addrMem = threaddata->addrMem; int i; for (i=0;i<6670336;i++) *((double*)(addrMem+8*i)) = 0.25 + (double)(i%9267) * 0.24738995982e-4; + // lines with register operations + threaddata->flops+=40*16; // 2 256 bit FMA operations + + // lines with L1 operations + threaddata->flops+=78*16; // 2 256 bit FMA operations + + // lines with L2 operations + threaddata->flops+=18*8; // 1 256 bit FMA operation + + // lines with L3 operations + threaddata->flops+=5*8; // 1 256 bit FMA operation + + // lines with RAM operations + threaddata->flops+=3*16; // 2 256 bit FMA operations + threaddata->bytes=3*64; // 1 memory access + + threaddata->flops*=5; + threaddata->bytes*=5; + return EXIT_SUCCESS; } @@ -1588,10 +1635,10 @@ int init_skl_corei_fma_2t(unsigned long long addrMem) * @input - addrMem: pointer to buffer * @return EXIT_SUCCESS */ -int asm_work_skl_corei_fma_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_skl_corei_fma_2t(unsigned long long addrMem, unsigned long long addrHigh) +int asm_work_skl_corei_fma_2t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_skl_corei_fma_2t(threaddata_t* threaddata) { - if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS; + if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS; /* input: * - addrMem -> rax * register usage: @@ 
-1606,12 +1653,14 @@ int asm_work_skl_corei_fma_2t(unsigned long long addrMem, unsigned long long add * - r13: register for temporary results * - r14: stores cacheline width as increment for buffer addresses * - r15: stores address of shared variable that controls load level + * - mm0: stores iteration counter * - rdx, rsi, rdi: registers for shift operations - * - mm*,xmm*,ymm*: data registers for SIMD instructions + * - xmm*,ymm*: data registers for SIMD instructions */ __asm__ __volatile__( - "mov %0, %%rax;" // store start address of buffer - "mov %1, %%r15;" // store address of shared variable that controls load level + "mov %%rax, %%rax;" // store start address of buffer + "mov %%rbx, %%r15;" // store address of shared variable that controls load level + "movq %%rcx, %%mm0;" // store iteration counter "mov $64, %%r14;" // increment after each cache/memory access //Initialize registers for shift operations "mov $0xAAAAAAAA, %%edi;" @@ -2367,6 +2416,7 @@ int asm_work_skl_corei_fma_2t(unsigned long long addrMem, unsigned long long add "vfmadd231pd %%ymm10, %%ymm0, %%ymm9; vfmadd231pd %%ymm2, %%ymm1, %%ymm12; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only "vfmadd231pd 64(%%rbx), %%ymm0, %%ymm10; vfmadd231pd 96(%%rbx), %%ymm1, %%ymm13; vmovapd %%ymm10, 32(%%rbx); add $128, %%rbx; " // 2 L1 loads, L1 store "vfmadd231pd 64(%%rbx), %%ymm0, %%ymm2; vfmadd231pd 96(%%rbx), %%ymm1, %%ymm13; vmovapd %%ymm2, 32(%%rbx); add $128, %%rbx; " // 2 L1 loads, L1 store + "movq %%mm0, %%r13;" // restore iteration counter //reset RAM counter "sub $1, %%r12;" "jnz _work_no_ram_reset_skl_corei_fma_2t;" @@ -2374,6 +2424,7 @@ int asm_work_skl_corei_fma_2t(unsigned long long addrMem, unsigned long long add "mov %%rax, %%r9;" "add $786432, %%r9;" "_work_no_ram_reset_skl_corei_fma_2t:" + "inc %%r13;" // increment iteration counter //reset L2-Cache counter "sub $1, %%r10;" "jnz _work_no_L2_reset_skl_corei_fma_2t;" @@ -2381,6 +2432,7 @@ int asm_work_skl_corei_fma_2t(unsigned long long 
addrMem, unsigned long long add "mov %%rax, %%rcx;" "add $16384, %%rcx;" "_work_no_L2_reset_skl_corei_fma_2t:" + "movq %%r13, %%mm0;" // store iteration counter //reset L3-Cache counter "sub $1, %%r11;" "jnz _work_no_L3_reset_skl_corei_fma_2t;" @@ -2389,24 +2441,45 @@ int asm_work_skl_corei_fma_2t(unsigned long long addrMem, unsigned long long add "add $131072, %%r8;" "_work_no_L3_reset_skl_corei_fma_2t:" "mov %%rax, %%rbx;" - "mov (%%r15), %%r13;" - "test $1, %%r13;" + "testq $1, (%%r15);" "jnz _work_loop_skl_corei_fma_2t;" - : - : "a"(addrMem), "b"(addrHigh) - : "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" + "movq %%mm0, %%rax;" // restore iteration counter + : "=a" (threaddata->iterations) + : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations) + : "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" ); return EXIT_SUCCESS; } - int init_hsw_corei_fma_1t(unsigned long long addrMem) __attribute__((noinline)); -int init_hsw_corei_fma_1t(unsigned long long addrMem) + + int init_hsw_corei_fma_1t(threaddata_t* threaddata) __attribute__((noinline)); +int init_hsw_corei_fma_1t(threaddata_t* threaddata) { + unsigned long long addrMem = threaddata->addrMem; int i; for (i=0;i<13340672;i++) *((double*)(addrMem+8*i)) = 0.25 + (double)(i%9267) * 0.24738995982e-4; + // lines with register operations + threaddata->flops+=40*16; // 2 256 bit FMA operations + + // lines with L1 operations + threaddata->flops+=90*8; // 1 256 bit FMA operation + + // 
lines with L2 operations + threaddata->flops+=9*8; // 1 256 bit FMA operation + + // lines with L3 operations + threaddata->flops+=3*8; // 1 256 bit FMA operation + + // lines with RAM operations + threaddata->flops+=2*16; // 2 256 bit FMA operations + threaddata->bytes=2*64; // 1 memory access + + threaddata->flops*=10; + threaddata->bytes*=10; + return EXIT_SUCCESS; } @@ -2417,10 +2490,10 @@ int init_hsw_corei_fma_1t(unsigned long long addrMem) * @input - addrMem: pointer to buffer * @return EXIT_SUCCESS */ -int asm_work_hsw_corei_fma_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_hsw_corei_fma_1t(unsigned long long addrMem, unsigned long long addrHigh) +int asm_work_hsw_corei_fma_1t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_hsw_corei_fma_1t(threaddata_t* threaddata) { - if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS; + if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS; /* input: * - addrMem -> rax * register usage: @@ -2435,12 +2508,14 @@ int asm_work_hsw_corei_fma_1t(unsigned long long addrMem, unsigned long long add * - r13: register for temporary results * - r14: stores cacheline width as increment for buffer addresses * - r15: stores address of shared variable that controls load level + * - mm0: stores iteration counter * - rdx, rsi, rdi: registers for shift operations - * - mm*,xmm*,ymm*: data registers for SIMD instructions + * - xmm*,ymm*: data registers for SIMD instructions */ __asm__ __volatile__( - "mov %0, %%rax;" // store start address of buffer - "mov %1, %%r15;" // store address of shared variable that controls load level + "mov %%rax, %%rax;" // store start address of buffer + "mov %%rbx, %%r15;" // store address of shared variable that controls load level + "movq %%rcx, %%mm0;" // store iteration counter "mov $64, %%r14;" // increment after each cache/memory access //Initialize registers for shift operations "mov $0xAAAAAAAA, 
%%edi;" @@ -3916,6 +3991,7 @@ int asm_work_hsw_corei_fma_1t(unsigned long long addrMem, unsigned long long add "vfmadd231pd %%ymm10, %%ymm0, %%ymm9; vfmadd231pd %%ymm2, %%ymm1, %%ymm11; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only "vmovapd %%xmm10, 64(%%rbx); vfmadd231pd 32(%%rbx), %%ymm0, %%ymm10; shr $1, %%esi; add %%r14, %%rbx; " // L1 load, L1 store "vmovapd %%xmm2, 64(%%rbx); vfmadd231pd 32(%%rbx), %%ymm0, %%ymm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load, L1 store + "movq %%mm0, %%r13;" // restore iteration counter //reset RAM counter "sub $1, %%r12;" "jnz _work_no_ram_reset_hsw_corei_fma_1t;" @@ -3923,6 +3999,7 @@ int asm_work_hsw_corei_fma_1t(unsigned long long addrMem, unsigned long long add "mov %%rax, %%r9;" "add $1572864, %%r9;" "_work_no_ram_reset_hsw_corei_fma_1t:" + "inc %%r13;" // increment iteration counter //reset L2-Cache counter "sub $1, %%r10;" "jnz _work_no_L2_reset_hsw_corei_fma_1t;" @@ -3930,6 +4007,7 @@ int asm_work_hsw_corei_fma_1t(unsigned long long addrMem, unsigned long long add "mov %%rax, %%rcx;" "add $32768, %%rcx;" "_work_no_L2_reset_hsw_corei_fma_1t:" + "movq %%r13, %%mm0;" // store iteration counter //reset L3-Cache counter "sub $1, %%r11;" "jnz _work_no_L3_reset_hsw_corei_fma_1t;" @@ -3938,24 +4016,45 @@ int asm_work_hsw_corei_fma_1t(unsigned long long addrMem, unsigned long long add "add $262144, %%r8;" "_work_no_L3_reset_hsw_corei_fma_1t:" "mov %%rax, %%rbx;" - "mov (%%r15), %%r13;" - "test $1, %%r13;" + "testq $1, (%%r15);" "jnz _work_loop_hsw_corei_fma_1t;" - : - : "a"(addrMem), "b"(addrHigh) - : "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" + "movq %%mm0, %%rax;" // restore iteration counter + : "=a" (threaddata->iterations) + : "a"(threaddata->addrMem), 
"b"(threaddata->addrHigh), "c" (threaddata->iterations) + : "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" ); return EXIT_SUCCESS; } - int init_hsw_corei_fma_2t(unsigned long long addrMem) __attribute__((noinline)); -int init_hsw_corei_fma_2t(unsigned long long addrMem) + + int init_hsw_corei_fma_2t(threaddata_t* threaddata) __attribute__((noinline)); +int init_hsw_corei_fma_2t(threaddata_t* threaddata) { + unsigned long long addrMem = threaddata->addrMem; int i; for (i=0;i<6670336;i++) *((double*)(addrMem+8*i)) = 0.25 + (double)(i%9267) * 0.24738995982e-4; + // lines with register operations + threaddata->flops+=40*16; // 2 256 bit FMA operations + + // lines with L1 operations + threaddata->flops+=90*8; // 1 256 bit FMA operation + + // lines with L2 operations + threaddata->flops+=9*8; // 1 256 bit FMA operation + + // lines with L3 operations + threaddata->flops+=3*8; // 1 256 bit FMA operation + + // lines with RAM operations + threaddata->flops+=2*16; // 2 256 bit FMA operations + threaddata->bytes=2*64; // 1 memory access + + threaddata->flops*=5; + threaddata->bytes*=5; + return EXIT_SUCCESS; } @@ -3966,10 +4065,10 @@ int init_hsw_corei_fma_2t(unsigned long long addrMem) * @input - addrMem: pointer to buffer * @return EXIT_SUCCESS */ -int asm_work_hsw_corei_fma_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_hsw_corei_fma_2t(unsigned long long addrMem, unsigned long long addrHigh) +int asm_work_hsw_corei_fma_2t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_hsw_corei_fma_2t(threaddata_t* threaddata) { - if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS; + if (*((unsigned long long*)threaddata->addrHigh) == 0) return 
EXIT_SUCCESS; /* input: * - addrMem -> rax * register usage: @@ -3984,12 +4083,14 @@ int asm_work_hsw_corei_fma_2t(unsigned long long addrMem, unsigned long long add * - r13: register for temporary results * - r14: stores cacheline width as increment for buffer addresses * - r15: stores address of shared variable that controls load level + * - mm0: stores iteration counter * - rdx, rsi, rdi: registers for shift operations - * - mm*,xmm*,ymm*: data registers for SIMD instructions + * - xmm*,ymm*: data registers for SIMD instructions */ __asm__ __volatile__( - "mov %0, %%rax;" // store start address of buffer - "mov %1, %%r15;" // store address of shared variable that controls load level + "mov %%rax, %%rax;" // store start address of buffer + "mov %%rbx, %%r15;" // store address of shared variable that controls load level + "movq %%rcx, %%mm0;" // store iteration counter "mov $64, %%r14;" // increment after each cache/memory access //Initialize registers for shift operations "mov $0xAAAAAAAA, %%edi;" @@ -4745,6 +4846,7 @@ int asm_work_hsw_corei_fma_2t(unsigned long long addrMem, unsigned long long add "vfmadd231pd %%ymm10, %%ymm0, %%ymm9; vfmadd231pd %%ymm2, %%ymm1, %%ymm12; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only "vmovapd %%xmm10, 64(%%rbx); vfmadd231pd 32(%%rbx), %%ymm0, %%ymm10; shr $1, %%esi; add %%r14, %%rbx; " // L1 load, L1 store "vmovapd %%xmm2, 64(%%rbx); vfmadd231pd 32(%%rbx), %%ymm0, %%ymm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load, L1 store + "movq %%mm0, %%r13;" // restore iteration counter //reset RAM counter "sub $1, %%r12;" "jnz _work_no_ram_reset_hsw_corei_fma_2t;" @@ -4752,6 +4854,7 @@ int asm_work_hsw_corei_fma_2t(unsigned long long addrMem, unsigned long long add "mov %%rax, %%r9;" "add $786432, %%r9;" "_work_no_ram_reset_hsw_corei_fma_2t:" + "inc %%r13;" // increment iteration counter //reset L2-Cache counter "sub $1, %%r10;" "jnz _work_no_L2_reset_hsw_corei_fma_2t;" @@ -4759,6 +4862,7 @@ int asm_work_hsw_corei_fma_2t(unsigned 
long long addrMem, unsigned long long add "mov %%rax, %%rcx;" "add $16384, %%rcx;" "_work_no_L2_reset_hsw_corei_fma_2t:" + "movq %%r13, %%mm0;" // store iteration counter //reset L3-Cache counter "sub $1, %%r11;" "jnz _work_no_L3_reset_hsw_corei_fma_2t;" @@ -4767,24 +4871,45 @@ int asm_work_hsw_corei_fma_2t(unsigned long long addrMem, unsigned long long add "add $131072, %%r8;" "_work_no_L3_reset_hsw_corei_fma_2t:" "mov %%rax, %%rbx;" - "mov (%%r15), %%r13;" - "test $1, %%r13;" + "testq $1, (%%r15);" "jnz _work_loop_hsw_corei_fma_2t;" - : - : "a"(addrMem), "b"(addrHigh) - : "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" + "movq %%mm0, %%rax;" // restore iteration counter + : "=a" (threaddata->iterations) + : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations) + : "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" ); return EXIT_SUCCESS; } - int init_hsw_xeonep_fma_1t(unsigned long long addrMem) __attribute__((noinline)); -int init_hsw_xeonep_fma_1t(unsigned long long addrMem) + + int init_hsw_xeonep_fma_1t(threaddata_t* threaddata) __attribute__((noinline)); +int init_hsw_xeonep_fma_1t(threaddata_t* threaddata) { + unsigned long long addrMem = threaddata->addrMem; int i; for (i=0;i<13471744;i++) *((double*)(addrMem+8*i)) = 0.25 + (double)(i%9267) * 0.24738995982e-4; + // lines with register operations + threaddata->flops+=35*16; // 2 256 bit FMA operations + + // lines with L1 operations + threaddata->flops+=79*8; // 1 256 bit FMA 
operation + + // lines with L2 operations + threaddata->flops+=9*8; // 1 256 bit FMA operation + + // lines with L3 operations + threaddata->flops+=1*8; // 1 256 bit FMA operation + + // lines with RAM operations + threaddata->flops+=2*16; // 2 256 bit FMA operations + threaddata->bytes=2*64; // 1 memory access + + threaddata->flops*=12; + threaddata->bytes*=12; + return EXIT_SUCCESS; } @@ -4795,10 +4920,10 @@ int init_hsw_xeonep_fma_1t(unsigned long long addrMem) * @input - addrMem: pointer to buffer * @return EXIT_SUCCESS */ -int asm_work_hsw_xeonep_fma_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_hsw_xeonep_fma_1t(unsigned long long addrMem, unsigned long long addrHigh) +int asm_work_hsw_xeonep_fma_1t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_hsw_xeonep_fma_1t(threaddata_t* threaddata) { - if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS; + if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS; /* input: * - addrMem -> rax * register usage: @@ -4813,12 +4938,14 @@ int asm_work_hsw_xeonep_fma_1t(unsigned long long addrMem, unsigned long long ad * - r13: register for temporary results * - r14: stores cacheline width as increment for buffer addresses * - r15: stores address of shared variable that controls load level + * - mm0: stores iteration counter * - rdx, rsi, rdi: registers for shift operations - * - mm*,xmm*,ymm*: data registers for SIMD instructions + * - xmm*,ymm*: data registers for SIMD instructions */ __asm__ __volatile__( - "mov %0, %%rax;" // store start address of buffer - "mov %1, %%r15;" // store address of shared variable that controls load level + "mov %%rax, %%rax;" // store start address of buffer + "mov %%rbx, %%r15;" // store address of shared variable that controls load level + "movq %%rcx, %%mm0;" // store iteration counter "mov $64, %%r14;" // increment after each cache/memory access //Initialize registers for shift 
operations "mov $0xAAAAAAAA, %%edi;" @@ -6366,6 +6493,7 @@ int asm_work_hsw_xeonep_fma_1t(unsigned long long addrMem, unsigned long long ad "vfmadd231pd %%ymm10, %%ymm0, %%ymm9; vfmadd231pd %%ymm2, %%ymm1, %%ymm13; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only "vmovapd %%xmm10, 64(%%rbx); vfmadd231pd 32(%%rbx), %%ymm0, %%ymm10; shr $1, %%esi; add %%r14, %%rbx; " // L1 load, L1 store "vmovapd %%xmm2, 64(%%rbx); vfmadd231pd 32(%%rbx), %%ymm0, %%ymm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load, L1 store + "movq %%mm0, %%r13;" // restore iteration counter //reset RAM counter "sub $1, %%r12;" "jnz _work_no_ram_reset_hsw_xeonep_fma_1t;" @@ -6373,6 +6501,7 @@ int asm_work_hsw_xeonep_fma_1t(unsigned long long addrMem, unsigned long long ad "mov %%rax, %%r9;" "add $2621440, %%r9;" "_work_no_ram_reset_hsw_xeonep_fma_1t:" + "inc %%r13;" // increment iteration counter //reset L2-Cache counter "sub $1, %%r10;" "jnz _work_no_L2_reset_hsw_xeonep_fma_1t;" @@ -6380,6 +6509,7 @@ int asm_work_hsw_xeonep_fma_1t(unsigned long long addrMem, unsigned long long ad "mov %%rax, %%rcx;" "add $32768, %%rcx;" "_work_no_L2_reset_hsw_xeonep_fma_1t:" + "movq %%r13, %%mm0;" // store iteration counter //reset L3-Cache counter "sub $1, %%r11;" "jnz _work_no_L3_reset_hsw_xeonep_fma_1t;" @@ -6388,24 +6518,45 @@ int asm_work_hsw_xeonep_fma_1t(unsigned long long addrMem, unsigned long long ad "add $262144, %%r8;" "_work_no_L3_reset_hsw_xeonep_fma_1t:" "mov %%rax, %%rbx;" - "mov (%%r15), %%r13;" - "test $1, %%r13;" + "testq $1, (%%r15);" "jnz _work_loop_hsw_xeonep_fma_1t;" - : - : "a"(addrMem), "b"(addrHigh) - : "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" + "movq %%mm0, %%rax;" // restore iteration counter + : "=a" 
(threaddata->iterations) + : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations) + : "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" ); return EXIT_SUCCESS; } - int init_hsw_xeonep_fma_2t(unsigned long long addrMem) __attribute__((noinline)); -int init_hsw_xeonep_fma_2t(unsigned long long addrMem) + + int init_hsw_xeonep_fma_2t(threaddata_t* threaddata) __attribute__((noinline)); +int init_hsw_xeonep_fma_2t(threaddata_t* threaddata) { + unsigned long long addrMem = threaddata->addrMem; int i; for (i=0;i<6735872;i++) *((double*)(addrMem+8*i)) = 0.25 + (double)(i%9267) * 0.24738995982e-4; + // lines with register operations + threaddata->flops+=35*16; // 2 256 bit FMA operations + + // lines with L1 operations + threaddata->flops+=79*8; // 1 256 bit FMA operation + + // lines with L2 operations + threaddata->flops+=9*8; // 1 256 bit FMA operation + + // lines with L3 operations + threaddata->flops+=1*8; // 1 256 bit FMA operation + + // lines with RAM operations + threaddata->flops+=2*16; // 2 256 bit FMA operations + threaddata->bytes=2*64; // 1 memory access + + threaddata->flops*=6; + threaddata->bytes*=6; + return EXIT_SUCCESS; } @@ -6416,10 +6567,10 @@ int init_hsw_xeonep_fma_2t(unsigned long long addrMem) * @input - addrMem: pointer to buffer * @return EXIT_SUCCESS */ -int asm_work_hsw_xeonep_fma_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_hsw_xeonep_fma_2t(unsigned long long addrMem, unsigned long long addrHigh) +int asm_work_hsw_xeonep_fma_2t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_hsw_xeonep_fma_2t(threaddata_t* threaddata) { - if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS; + if 
(*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS; /* input: * - addrMem -> rax * register usage: @@ -6434,12 +6585,14 @@ int asm_work_hsw_xeonep_fma_2t(unsigned long long addrMem, unsigned long long ad * - r13: register for temporary results * - r14: stores cacheline width as increment for buffer addresses * - r15: stores address of shared variable that controls load level + * - mm0: stores iteration counter * - rdx, rsi, rdi: registers for shift operations - * - mm*,xmm*,ymm*: data registers for SIMD instructions + * - xmm*,ymm*: data registers for SIMD instructions */ __asm__ __volatile__( - "mov %0, %%rax;" // store start address of buffer - "mov %1, %%r15;" // store address of shared variable that controls load level + "mov %%rax, %%rax;" // store start address of buffer + "mov %%rbx, %%r15;" // store address of shared variable that controls load level + "movq %%rcx, %%mm0;" // store iteration counter "mov $64, %%r14;" // increment after each cache/memory access //Initialize registers for shift operations "mov $0xAAAAAAAA, %%edi;" @@ -7231,6 +7384,7 @@ int asm_work_hsw_xeonep_fma_2t(unsigned long long addrMem, unsigned long long ad "vfmadd231pd %%ymm10, %%ymm0, %%ymm9; vfmadd231pd %%ymm2, %%ymm1, %%ymm13; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only "vmovapd %%xmm10, 64(%%rbx); vfmadd231pd 32(%%rbx), %%ymm0, %%ymm10; shr $1, %%esi; add %%r14, %%rbx; " // L1 load, L1 store "vmovapd %%xmm2, 64(%%rbx); vfmadd231pd 32(%%rbx), %%ymm0, %%ymm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load, L1 store + "movq %%mm0, %%r13;" // restore iteration counter //reset RAM counter "sub $1, %%r12;" "jnz _work_no_ram_reset_hsw_xeonep_fma_2t;" @@ -7238,6 +7392,7 @@ int asm_work_hsw_xeonep_fma_2t(unsigned long long addrMem, unsigned long long ad "mov %%rax, %%r9;" "add $1310720, %%r9;" "_work_no_ram_reset_hsw_xeonep_fma_2t:" + "inc %%r13;" // increment iteration counter //reset L2-Cache counter "sub $1, %%r10;" "jnz _work_no_L2_reset_hsw_xeonep_fma_2t;" 
@@ -7245,6 +7400,7 @@ int asm_work_hsw_xeonep_fma_2t(unsigned long long addrMem, unsigned long long ad "mov %%rax, %%rcx;" "add $16384, %%rcx;" "_work_no_L2_reset_hsw_xeonep_fma_2t:" + "movq %%r13, %%mm0;" // store iteration counter //reset L3-Cache counter "sub $1, %%r11;" "jnz _work_no_L3_reset_hsw_xeonep_fma_2t;" @@ -7253,12 +7409,12 @@ int asm_work_hsw_xeonep_fma_2t(unsigned long long addrMem, unsigned long long ad "add $131072, %%r8;" "_work_no_L3_reset_hsw_xeonep_fma_2t:" "mov %%rax, %%rbx;" - "mov (%%r15), %%r13;" - "test $1, %%r13;" + "testq $1, (%%r15);" "jnz _work_loop_hsw_xeonep_fma_2t;" - : - : "a"(addrMem), "b"(addrHigh) - : "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" + "movq %%mm0, %%rax;" // restore iteration counter + : "=a" (threaddata->iterations) + : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations) + : "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" ); return EXIT_SUCCESS; } diff --git a/generic.c b/generic.c index 7c3ed22f..f0efc62e 100644 --- a/generic.c +++ b/generic.c @@ -50,11 +50,14 @@ /* use old memcpy to avoid GLIBC 2.14 dependency */ __asm__(".symver memcpy, memcpy@GLIBC_2.2.5"); - /* buffer for some generic implementations */ +/* buffer for some generic implementations */ // TODO remove global variables to allow thread safe execution of detection static char output[_HW_DETECT_MAX_OUTPUT]; static char path[_HW_DETECT_MAX_OUTPUT]; +/* avoid multiple executions of the corresponding functions */ +static 
int num_packages_sav = 0, num_cores_per_package_sav = 0, num_threads_per_core_sav = 0, num_threads_per_package_sav = 0; + /** * list element for counting unique package_ids, core_ids etc. */ @@ -309,28 +312,22 @@ void generic_get_architecture(char* arch) */ int get_pkg(int cpu) { - int pkg=0; - char buffer[_HW_DETECT_MAX_OUTPUT]; - - if ((num_cpus() == 1) || (num_packages() == 1)) { return 0; } + int pkg=-1; + char buffer[10]; if (cpu == -1) { cpu = get_cpu(); } if (cpu != -1) { - sprintf(path, "/sys/devices/system/cpu/cpu%i/topology/physical_package_id", cpu); + if( read_file(path, buffer, sizeof(buffer)) ) pkg = atoi(buffer); - if( !read_file(path, buffer, sizeof(buffer)) ) { - pkg = -1; - } - else { - pkg = atoi(buffer); - } - + /* fallbacks if sysfs is not working */ if (pkg == -1) { + /* assume 0 if there is only one CPU or only one package */ + if ((num_cpus() == 1) || (num_packages() == 1)) { return 0; } /* get the physical package id from /proc/cpuinfo */ - if(!get_proc_cpuinfo_data("physical id", buffer, cpu)) { pkg = atoi(buffer); } + else if(!get_proc_cpuinfo_data("physical id", buffer, cpu)) { pkg = atoi(buffer); } /* if the number of cpus equals the number of packages assume pkg_id = cpu_id*/ else if (num_cpus() == num_packages()) { pkg = cpu; } /* if there is only one core per package assume pkg_id = core_id */ @@ -342,6 +339,7 @@ int get_pkg(int cpu) without correct topology information in sysfs*/ } } + return pkg; } @@ -353,28 +351,25 @@ int get_core_id(int cpu) int core=-1; char buffer[10]; - if (num_cpus() == 1) { return 0; } - if (cpu == -1) { cpu = get_cpu(); } if (cpu != -1) { sprintf(path, "/sys/devices/system/cpu/cpu%i/topology/core_id", cpu); + if(read_file(path, buffer, sizeof(buffer))) core = atoi(buffer); - if(!read_file(path, buffer, sizeof(buffer))) { - core = -1; - } - else { - core = atoi(buffer); + /* fallbacks if sysfs is not working */ + if (core == -1) + { + /* assume 0 if there is only one CPU */ + if (num_cpus() == 1) { return 
0; } + /* if each package contains only one cpu assume core_id = package_id = cpu_id */ + else if (num_cores_per_package() == 1) { core = 0; } + + /* NOTE core_id can't be determined without correct topology information in sysfs if there are multiple cores per package + TODO /proc/cpuinfo */ } } - if (core == -1) - { - /* if each package contains only one cpu assume core_id = package_id = cpu_id */ - if (num_cores_per_package() == 1) { core = 0; } - /* NOTE core_id can't be determined without correct topology information in sysfs if there are multiple cores per package - TODO /proc/cpuinfo */ - } return core; } @@ -464,7 +459,7 @@ void init_cpuinfo(cpu_info_t *cpuinfo,int print) cpuinfo->clockrate = get_cpu_clockrate(1, 0); /* setup supported feature list*/ - if(!strcmp(cpuinfo->architecture,"x86_64")) cpuinfo->features |=X86_64; + if(!strcmp(cpuinfo->architecture,"x86_64")) cpuinfo->features |= X86_64; if (feature_available("SMT")) cpuinfo->features |= SMT; if (feature_available("FPU")) cpuinfo->features |= FPU; if (feature_available("MMX")) cpuinfo->features |= MMX; @@ -479,8 +474,10 @@ void init_cpuinfo(cpu_info_t *cpuinfo,int print) if (feature_available("ABM")) cpuinfo->features |= ABM; if (feature_available("POPCNT")) cpuinfo->features |= POPCNT; if (feature_available("AVX")) cpuinfo->features |= AVX; + if (feature_available("AVX2")) cpuinfo->features |= AVX2; if (feature_available("FMA")) cpuinfo->features |= FMA; if (feature_available("AES")) cpuinfo->features |= AES; + if (feature_available("AVX512")) cpuinfo->features |= AVX512; /* determine cache details */ for (i=0; i<(unsigned int)num_caches(0); i++) @@ -543,6 +540,8 @@ void init_cpuinfo(cpu_info_t *cpuinfo,int print) if(cpuinfo->features&SSE4A) printf(" SSE4A"); if(cpuinfo->features&POPCNT) printf(" POPCNT"); if(cpuinfo->features&AVX) printf(" AVX"); + if(cpuinfo->features&AVX2) printf(" AVX2"); + if(cpuinfo->features&AVX512) printf(" AVX512"); if(cpuinfo->features&FMA) printf(" FMA"); 
if(cpuinfo->features&AES) printf(" AES"); if(cpuinfo->features&SMT) printf(" SMT"); @@ -1020,11 +1019,13 @@ int generic_cacheline_length(int cpu, int id) { int generic_num_packages() { struct dirent **namelist; - int ndir, m, num = -1; + int ndir, m; char tmppath[_HW_DETECT_MAX_OUTPUT]; char buf[20]; id_le * pkg_id_list = NULL; + if (num_packages_sav != 0) return num_packages_sav; + num_packages_sav = -1; strcpy(path, "/sys/devices/system/cpu/"); ndir = scandir(path, &namelist, 0, 0); @@ -1049,20 +1050,23 @@ int generic_num_packages() free(namelist[ndir]); } free(namelist); - num = id_total_count(pkg_id_list); + num_packages_sav = id_total_count(pkg_id_list); free_id_list(&pkg_id_list); } - return num; + return num_packages_sav; } int generic_num_cores_per_package() { struct dirent **namelist; - int ndir, m, n, num = 0, pkg_id_tocount = -1; + int ndir, m, n, pkg_id_tocount = -1; char tmppath[_HW_DETECT_MAX_OUTPUT]; char buf[20]; id_le *core_id_list = NULL; + if (num_cores_per_package_sav != 0) return num_cores_per_package_sav; + num_cores_per_package_sav=-1; + strcpy(path, "/sys/devices/system/cpu/"); ndir = scandir(path, &namelist, 0, 0); if(ndir >= 0) @@ -1099,22 +1103,25 @@ int generic_num_cores_per_package() free(namelist[ndir]); } free(namelist); - num = id_total_count(core_id_list); + num_cores_per_package_sav = id_total_count(core_id_list); free_id_list(&core_id_list); } - else return -1; + else num_cores_per_package_sav = -1; - if (num==0) return -1; - return num; + if (num_cores_per_package_sav == 0) num_cores_per_package_sav = -1; + + return num_cores_per_package_sav; } int generic_num_threads_per_core() { struct dirent **namelist; - int ndir, m, n, num = 0, pkg_id_tocount = -1, core_id_tocount = -1; + int ndir, m, n, pkg_id_tocount = -1, core_id_tocount = -1; char tmppath[_HW_DETECT_MAX_OUTPUT]; char buf[20]; + if (num_threads_per_core_sav != 0) return num_threads_per_core_sav; + strcpy(path, "/sys/devices/system/cpu/"); ndir = scandir(path, &namelist, 
0, 0); if(ndir >= 0) @@ -1144,7 +1151,7 @@ int generic_num_threads_per_core() if(m == core_id_tocount && n == pkg_id_tocount) /*FIXME: only counts threads from the first core_id and package_id that are found, assumes that every core has the same amount of threads*/ { - num++; + num_threads_per_core_sav++; } } } @@ -1152,23 +1159,23 @@ int generic_num_threads_per_core() } free(namelist); } - else return -1; + else num_threads_per_core_sav = -1; - if (num == 0) num = generic_num_threads_per_package() / generic_num_cores_per_package(); - if (num != generic_num_threads_per_package() / generic_num_cores_per_package()) return -1; + if (num_threads_per_core_sav == 0) num_threads_per_core_sav = generic_num_threads_per_package() / generic_num_cores_per_package(); + if (num_threads_per_core_sav != generic_num_threads_per_package() / generic_num_cores_per_package()) num_threads_per_core_sav = -1; - return num; + return num_threads_per_core_sav; } int generic_num_threads_per_package() { struct dirent **namelist; - int ndir, m, num = 0, pkg_id_tocount = -1; + int ndir, m, pkg_id_tocount = -1; char tmppath[_HW_DETECT_MAX_OUTPUT]; char buf[20]; - /*TODO proc/cpuinfo*/ + if (num_threads_per_package_sav != 0) return num_threads_per_package_sav; strcpy(path, "/sys/devices/system/cpu/"); ndir = scandir(path, &namelist, 0, 0); @@ -1192,7 +1199,7 @@ int generic_num_threads_per_package() if(m == pkg_id_tocount) /*FIXME: only counts threads from first package_id that is found and assumes that every package has the same amount of threads*/ { - num++; + num_threads_per_package_sav++; } } } @@ -1200,20 +1207,21 @@ int generic_num_threads_per_package() } free(namelist); } - else return -1; + else num_threads_per_package_sav = -1; - if (num == 0) return -1; - return num; + if (num_threads_per_package_sav == 0) num_threads_per_package_sav = -1; + + return num_threads_per_package_sav; } /* see cpu.h */ #if defined (__ARCH_UNKNOWN) - /* - * use generic implementations for unknown architectures - 
*/ +/* + * use generic implementations for unknown architectures + */ - void get_architecture(char * arch) { +void get_architecture(char * arch) { generic_get_architecture(arch); } diff --git a/help.c b/help.c index c2254618..40b9073f 100644 --- a/help.c +++ b/help.c @@ -42,8 +42,10 @@ void show_help(void) " -c | --copyright display copyright information\n" " -w | --warranty display warranty information\n" " -q | --quiet disable output to stdout\n" + " -r | --report display additional information (overridden by -q)\n" " -a | --avail list available functions\n" - " -i ID | --function=ID specify ID of the load-function to be used\n" + " -i ID | --function=ID specify integer ID of the load-function to be\n" + " used (as listed by --avail)\n" #ifdef CUDA " -f | --usegpufloat use single precision matrix multiplications instead of double\n" " -g | --gpus number of gpus to use (default: all)\n" @@ -61,10 +63,13 @@ void show_help(void) " high load is defined by -l\n" " will be overwriten if used with -s, -e and -n\n" " -n COUNT | --threads=COUNT specify the number of threads\n" + " cannot be combined with -b | --bind, which\n" + " implicitly specifies the number of threads\n" #if (defined(linux) || defined(__linux__)) && defined (AFFINITY) - " -b CPULIST | --bind=CPULIST select certain CPUs (overrides -n)\n" + " -b CPULIST | --bind=CPULIST select certain CPUs\n" " CPULIST format: \"x,y,z\", \"x-y\", \"x-y/step\",\n" " and any combination of the above\n" + " cannot be combined with -n | --threads\n" #endif "\n" "\nExamples:\n\n" diff --git a/main.c b/main.c index 5e9da302..50db75c1 100644 --- a/main.c +++ b/main.c @@ -48,6 +48,26 @@ #include "gpu.h" #endif +/* + * used for --bind option + */ +#define ADD_CPU_SET(cpu,cpuset) \ +do { \ + if (cpu_allowed(cpu)) { \ + CPU_SET(cpu, &cpuset); \ + } else { \ + if (cpu >= num_cpus() ) { \ + fprintf( stderr, "Error: The given bind argument (-b/--bind) includes CPU %d that is not available on this system.\n",cpu ); \ + } \ + else { \ 
+ fprintf( stderr, "Error: The given bind argument (-b/--bind) cannot be implemented with the cpuset given from the OS\n" ); \ + fprintf( stderr, "This can be caused by the taskset tool, cgroups, the batch system, or similar mechanisms.\n" ); \ + fprintf( stderr, "Please fix the argument to match the restrictions.\n" ); \ + } \ + exit( EACCES ); \ + } \ +} while (0) + mydata_t *mdp; /* global data structure */ cpu_info_t *cpuinfo = NULL; /* data structure for hardware detection */ unsigned long long LOADVAR = LOAD_HIGH; /* shared variable that specifies load level */ @@ -78,6 +98,11 @@ long long unsigned int STARTPERIOD = 0, ENDPERIOD = 0, NUMPERIODSTEPS = 0; */ char *fsbind = NULL; +/* + * temporary variables + */ +int tmp1,tmp2; + /* * worker threads */ @@ -126,6 +151,23 @@ static void *init() exit(127); } + if (verbose) { + printf(" using %i threads\n", NUM_THREADS); + #if (defined(linux) || defined(__linux__)) && defined (AFFINITY) + for (i = 0; i < NUM_THREADS; i++){ + /* avoid multiple sysfs accesses */ + tmp1=get_core_id(cpu_bind[i]); + tmp2=get_pkg(cpu_bind[i]); + if ((tmp1 != -1) && (tmp2 != -1)){ + printf(" - Thread %i runs on CPU %llu, core %i in package: %i\n", + i, cpu_bind[i], tmp1, tmp2); + } + } + #endif + printf("\n"); + fflush(stdout); + } + // create worker threads for (t = 0; t < NUM_THREADS; t++) { mdp->ack = 0; @@ -133,6 +175,9 @@ static void *init() mdp->threaddata[t].cpu_id = cpu_bind[t]; mdp->threaddata[t].data = mdp; mdp->threaddata[t].buffersizeMem = BUFFERSIZEMEM; + mdp->threaddata[t].iterations = 0; + mdp->threaddata[t].flops = 0; + mdp->threaddata[t].bytes = 0; mdp->threaddata[t].alignment = ALIGNMENT; mdp->threaddata[t].FUNCTION = FUNCTION; mdp->threaddata[t].period = PERIOD; @@ -159,20 +204,6 @@ static void *init() } mdp->ack = 0; - if (verbose) { - printf(" using %i threads\n", NUM_THREADS); - #if (defined(linux) || defined(__linux__)) && defined (AFFINITY) - for (i = 0; i < NUM_THREADS; i++){ - if ((get_pkg(cpu_bind[i]) != -1) && 
(get_core_id(cpu_bind[i]) != -1)){ - printf(" - Thread %i runs on CPU %llu, core %i in package: %i\n", - i, cpu_bind[i], get_core_id(cpu_bind[i]),get_pkg(cpu_bind[i])); - } - } - #endif - printf("\n"); - fflush(stdout); - } - return (void *) mdp; } @@ -183,36 +214,38 @@ static void list_functions(){ printf("\n available load-functions:\n"); printf(" ID | NAME | available on this system\n"); printf(" ----------------------------------------------------------------\n"); - if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","1","FUNC_SKL_COREI_FMA_1T "); - else printf(" %4.4s | %.30s | no\n","1","FUNC_SKL_COREI_FMA_1T "); - if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","2","FUNC_SKL_COREI_FMA_2T "); - else printf(" %4.4s | %.30s | no\n","2","FUNC_SKL_COREI_FMA_2T "); - if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","3","FUNC_HSW_COREI_FMA_1T "); - else printf(" %4.4s | %.30s | no\n","3","FUNC_HSW_COREI_FMA_1T "); - if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","4","FUNC_HSW_COREI_FMA_2T "); - else printf(" %4.4s | %.30s | no\n","4","FUNC_HSW_COREI_FMA_2T "); - if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","5","FUNC_HSW_XEONEP_FMA_1T "); - else printf(" %4.4s | %.30s | no\n","5","FUNC_HSW_XEONEP_FMA_1T "); - if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","6","FUNC_HSW_XEONEP_FMA_2T "); - else printf(" %4.4s | %.30s | no\n","6","FUNC_HSW_XEONEP_FMA_2T "); - if (feature_available("AVX")) printf(" %4.4s | %.30s | yes\n","7","FUNC_SNB_COREI_AVX_1T "); - else printf(" %4.4s | %.30s | no\n","7","FUNC_SNB_COREI_AVX_1T "); - if (feature_available("AVX")) printf(" %4.4s | %.30s | yes\n","8","FUNC_SNB_COREI_AVX_2T "); - else printf(" %4.4s | %.30s | no\n","8","FUNC_SNB_COREI_AVX_2T "); - if (feature_available("AVX")) printf(" %4.4s | %.30s | yes\n","9","FUNC_SNB_XEONEP_AVX_1T "); - else printf(" %4.4s | %.30s | no\n","9","FUNC_SNB_XEONEP_AVX_1T "); - if (feature_available("AVX")) printf(" 
%4.4s | %.30s | yes\n","10","FUNC_SNB_XEONEP_AVX_2T "); - else printf(" %4.4s | %.30s | no\n","10","FUNC_SNB_XEONEP_AVX_2T "); - if (feature_available("SSE2")) printf(" %4.4s | %.30s | yes\n","11","FUNC_NHM_COREI_SSE2_1T "); - else printf(" %4.4s | %.30s | no\n","11","FUNC_NHM_COREI_SSE2_1T "); - if (feature_available("SSE2")) printf(" %4.4s | %.30s | yes\n","12","FUNC_NHM_COREI_SSE2_2T "); - else printf(" %4.4s | %.30s | no\n","12","FUNC_NHM_COREI_SSE2_2T "); - if (feature_available("SSE2")) printf(" %4.4s | %.30s | yes\n","13","FUNC_NHM_XEONEP_SSE2_1T "); - else printf(" %4.4s | %.30s | no\n","13","FUNC_NHM_XEONEP_SSE2_1T "); - if (feature_available("SSE2")) printf(" %4.4s | %.30s | yes\n","14","FUNC_NHM_XEONEP_SSE2_2T "); - else printf(" %4.4s | %.30s | no\n","14","FUNC_NHM_XEONEP_SSE2_2T "); - if (feature_available("FMA4")) printf(" %4.4s | %.30s | yes\n","15","FUNC_BLD_OPTERON_FMA4_1T "); - else printf(" %4.4s | %.30s | no\n","15","FUNC_BLD_OPTERON_FMA4_1T "); + if (feature_available("AVX512")) printf(" %4.4s | %.30s | yes\n","1","FUNC_KNL_XEONPHI_AVX512_4T "); + else printf(" %4.4s | %.30s | no\n","1","FUNC_KNL_XEONPHI_AVX512_4T "); + if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","2","FUNC_SKL_COREI_FMA_1T "); + else printf(" %4.4s | %.30s | no\n","2","FUNC_SKL_COREI_FMA_1T "); + if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","3","FUNC_SKL_COREI_FMA_2T "); + else printf(" %4.4s | %.30s | no\n","3","FUNC_SKL_COREI_FMA_2T "); + if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","4","FUNC_HSW_COREI_FMA_1T "); + else printf(" %4.4s | %.30s | no\n","4","FUNC_HSW_COREI_FMA_1T "); + if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","5","FUNC_HSW_COREI_FMA_2T "); + else printf(" %4.4s | %.30s | no\n","5","FUNC_HSW_COREI_FMA_2T "); + if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","6","FUNC_HSW_XEONEP_FMA_1T "); + else printf(" %4.4s | %.30s | no\n","6","FUNC_HSW_XEONEP_FMA_1T "); + if 
(feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","7","FUNC_HSW_XEONEP_FMA_2T "); + else printf(" %4.4s | %.30s | no\n","7","FUNC_HSW_XEONEP_FMA_2T "); + if (feature_available("AVX")) printf(" %4.4s | %.30s | yes\n","8","FUNC_SNB_COREI_AVX_1T "); + else printf(" %4.4s | %.30s | no\n","8","FUNC_SNB_COREI_AVX_1T "); + if (feature_available("AVX")) printf(" %4.4s | %.30s | yes\n","9","FUNC_SNB_COREI_AVX_2T "); + else printf(" %4.4s | %.30s | no\n","9","FUNC_SNB_COREI_AVX_2T "); + if (feature_available("AVX")) printf(" %4.4s | %.30s | yes\n","10","FUNC_SNB_XEONEP_AVX_1T "); + else printf(" %4.4s | %.30s | no\n","10","FUNC_SNB_XEONEP_AVX_1T "); + if (feature_available("AVX")) printf(" %4.4s | %.30s | yes\n","11","FUNC_SNB_XEONEP_AVX_2T "); + else printf(" %4.4s | %.30s | no\n","11","FUNC_SNB_XEONEP_AVX_2T "); + if (feature_available("SSE2")) printf(" %4.4s | %.30s | yes\n","12","FUNC_NHM_COREI_SSE2_1T "); + else printf(" %4.4s | %.30s | no\n","12","FUNC_NHM_COREI_SSE2_1T "); + if (feature_available("SSE2")) printf(" %4.4s | %.30s | yes\n","13","FUNC_NHM_COREI_SSE2_2T "); + else printf(" %4.4s | %.30s | no\n","13","FUNC_NHM_COREI_SSE2_2T "); + if (feature_available("SSE2")) printf(" %4.4s | %.30s | yes\n","14","FUNC_NHM_XEONEP_SSE2_1T "); + else printf(" %4.4s | %.30s | no\n","14","FUNC_NHM_XEONEP_SSE2_1T "); + if (feature_available("SSE2")) printf(" %4.4s | %.30s | yes\n","15","FUNC_NHM_XEONEP_SSE2_2T "); + else printf(" %4.4s | %.30s | no\n","15","FUNC_NHM_XEONEP_SSE2_2T "); + if (feature_available("FMA4")) printf(" %4.4s | %.30s | yes\n","16","FUNC_BLD_OPTERON_FMA4_1T "); + else printf(" %4.4s | %.30s | no\n","16","FUNC_BLD_OPTERON_FMA4_1T "); return; } @@ -222,93 +255,99 @@ static int get_function(unsigned int id){ switch(id){ case 1: - if (feature_available("FMA")) func = FUNC_SKL_COREI_FMA_1T; + if (feature_available("AVX512")) func = FUNC_KNL_XEONPHI_AVX512_4T; else{ - fprintf(stderr, "\nError: Function 1 (\"FUNC_SKL_COREI_FMA_1T\") requires FMA, which is 
not supported by the processor.\n\n"); + fprintf(stderr, "\nError: Function 1 (\"FUNC_KNL_XEONPHI_AVX512_4T\") requires AVX512, which is not supported by the processor.\n\n"); } break; case 2: - if (feature_available("FMA")) func = FUNC_SKL_COREI_FMA_2T; + if (feature_available("FMA")) func = FUNC_SKL_COREI_FMA_1T; else{ - fprintf(stderr, "\nError: Function 2 (\"FUNC_SKL_COREI_FMA_2T\") requires FMA, which is not supported by the processor.\n\n"); + fprintf(stderr, "\nError: Function 2 (\"FUNC_SKL_COREI_FMA_1T\") requires FMA, which is not supported by the processor.\n\n"); } break; case 3: - if (feature_available("FMA")) func = FUNC_HSW_COREI_FMA_1T; + if (feature_available("FMA")) func = FUNC_SKL_COREI_FMA_2T; else{ - fprintf(stderr, "\nError: Function 3 (\"FUNC_HSW_COREI_FMA_1T\") requires FMA, which is not supported by the processor.\n\n"); + fprintf(stderr, "\nError: Function 3 (\"FUNC_SKL_COREI_FMA_2T\") requires FMA, which is not supported by the processor.\n\n"); } break; case 4: - if (feature_available("FMA")) func = FUNC_HSW_COREI_FMA_2T; + if (feature_available("FMA")) func = FUNC_HSW_COREI_FMA_1T; else{ - fprintf(stderr, "\nError: Function 4 (\"FUNC_HSW_COREI_FMA_2T\") requires FMA, which is not supported by the processor.\n\n"); + fprintf(stderr, "\nError: Function 4 (\"FUNC_HSW_COREI_FMA_1T\") requires FMA, which is not supported by the processor.\n\n"); } break; case 5: - if (feature_available("FMA")) func = FUNC_HSW_XEONEP_FMA_1T; + if (feature_available("FMA")) func = FUNC_HSW_COREI_FMA_2T; else{ - fprintf(stderr, "\nError: Function 5 (\"FUNC_HSW_XEONEP_FMA_1T\") requires FMA, which is not supported by the processor.\n\n"); + fprintf(stderr, "\nError: Function 5 (\"FUNC_HSW_COREI_FMA_2T\") requires FMA, which is not supported by the processor.\n\n"); } break; case 6: - if (feature_available("FMA")) func = FUNC_HSW_XEONEP_FMA_2T; + if (feature_available("FMA")) func = FUNC_HSW_XEONEP_FMA_1T; else{ - fprintf(stderr, "\nError: Function 6 
(\"FUNC_HSW_XEONEP_FMA_2T\") requires FMA, which is not supported by the processor.\n\n"); + fprintf(stderr, "\nError: Function 6 (\"FUNC_HSW_XEONEP_FMA_1T\") requires FMA, which is not supported by the processor.\n\n"); } break; case 7: - if (feature_available("AVX")) func = FUNC_SNB_COREI_AVX_1T; + if (feature_available("FMA")) func = FUNC_HSW_XEONEP_FMA_2T; else{ - fprintf(stderr, "\nError: Function 7 (\"FUNC_SNB_COREI_AVX_1T\") requires AVX, which is not supported by the processor.\n\n"); + fprintf(stderr, "\nError: Function 7 (\"FUNC_HSW_XEONEP_FMA_2T\") requires FMA, which is not supported by the processor.\n\n"); } break; case 8: - if (feature_available("AVX")) func = FUNC_SNB_COREI_AVX_2T; + if (feature_available("AVX")) func = FUNC_SNB_COREI_AVX_1T; else{ - fprintf(stderr, "\nError: Function 8 (\"FUNC_SNB_COREI_AVX_2T\") requires AVX, which is not supported by the processor.\n\n"); + fprintf(stderr, "\nError: Function 8 (\"FUNC_SNB_COREI_AVX_1T\") requires AVX, which is not supported by the processor.\n\n"); } break; case 9: - if (feature_available("AVX")) func = FUNC_SNB_XEONEP_AVX_1T; + if (feature_available("AVX")) func = FUNC_SNB_COREI_AVX_2T; else{ - fprintf(stderr, "\nError: Function 9 (\"FUNC_SNB_XEONEP_AVX_1T\") requires AVX, which is not supported by the processor.\n\n"); + fprintf(stderr, "\nError: Function 9 (\"FUNC_SNB_COREI_AVX_2T\") requires AVX, which is not supported by the processor.\n\n"); } break; case 10: - if (feature_available("AVX")) func = FUNC_SNB_XEONEP_AVX_2T; + if (feature_available("AVX")) func = FUNC_SNB_XEONEP_AVX_1T; else{ - fprintf(stderr, "\nError: Function 10 (\"FUNC_SNB_XEONEP_AVX_2T\") requires AVX, which is not supported by the processor.\n\n"); + fprintf(stderr, "\nError: Function 10 (\"FUNC_SNB_XEONEP_AVX_1T\") requires AVX, which is not supported by the processor.\n\n"); } break; case 11: - if (feature_available("SSE2")) func = FUNC_NHM_COREI_SSE2_1T; + if (feature_available("AVX")) func = FUNC_SNB_XEONEP_AVX_2T; 
else{ - fprintf(stderr, "\nError: Function 11 (\"FUNC_NHM_COREI_SSE2_1T\") requires SSE2, which is not supported by the processor.\n\n"); + fprintf(stderr, "\nError: Function 11 (\"FUNC_SNB_XEONEP_AVX_2T\") requires AVX, which is not supported by the processor.\n\n"); } break; case 12: - if (feature_available("SSE2")) func = FUNC_NHM_COREI_SSE2_2T; + if (feature_available("SSE2")) func = FUNC_NHM_COREI_SSE2_1T; else{ - fprintf(stderr, "\nError: Function 12 (\"FUNC_NHM_COREI_SSE2_2T\") requires SSE2, which is not supported by the processor.\n\n"); + fprintf(stderr, "\nError: Function 12 (\"FUNC_NHM_COREI_SSE2_1T\") requires SSE2, which is not supported by the processor.\n\n"); } break; case 13: - if (feature_available("SSE2")) func = FUNC_NHM_XEONEP_SSE2_1T; + if (feature_available("SSE2")) func = FUNC_NHM_COREI_SSE2_2T; else{ - fprintf(stderr, "\nError: Function 13 (\"FUNC_NHM_XEONEP_SSE2_1T\") requires SSE2, which is not supported by the processor.\n\n"); + fprintf(stderr, "\nError: Function 13 (\"FUNC_NHM_COREI_SSE2_2T\") requires SSE2, which is not supported by the processor.\n\n"); } break; case 14: - if (feature_available("SSE2")) func = FUNC_NHM_XEONEP_SSE2_2T; + if (feature_available("SSE2")) func = FUNC_NHM_XEONEP_SSE2_1T; else{ - fprintf(stderr, "\nError: Function 14 (\"FUNC_NHM_XEONEP_SSE2_2T\") requires SSE2, which is not supported by the processor.\n\n"); + fprintf(stderr, "\nError: Function 14 (\"FUNC_NHM_XEONEP_SSE2_1T\") requires SSE2, which is not supported by the processor.\n\n"); } break; case 15: + if (feature_available("SSE2")) func = FUNC_NHM_XEONEP_SSE2_2T; + else{ + fprintf(stderr, "\nError: Function 15 (\"FUNC_NHM_XEONEP_SSE2_2T\") requires SSE2, which is not supported by the processor.\n\n"); + } + break; + case 16: if (feature_available("FMA4")) func = FUNC_BLD_OPTERON_FMA4_1T; else{ - fprintf(stderr, "\nError: Function 15 (\"FUNC_BLD_OPTERON_FMA4_1T\") requires FMA4, which is not supported by the processor.\n\n"); + fprintf(stderr, 
"\nError: Function 16 (\"FUNC_BLD_OPTERON_FMA4_1T\") requires FMA4, which is not supported by the processor.\n\n"); } break; default: @@ -336,10 +375,10 @@ static void evaluate_environment() fprintf(stderr, "\nWarning: not enough CPUs for requested number of threads\n"); } - if (fsbind==NULL) { //use all CPUs if not defined otherwise + if (fsbind==NULL) { // no cpu binding defined #if (defined(linux) || defined(__linux__)) && defined (AFFINITY) CPU_ZERO(&cpuset); - if (NUM_THREADS==0){ + if (NUM_THREADS==0){ // use all CPUs if not defined otherwise for (i = 0; i < cpuinfo->num_cpus; i++) { if (cpu_allowed(i)) { CPU_SET(i, &cpuset); @@ -347,9 +386,26 @@ static void evaluate_environment() } } } - else{ - for (i = 0; i < cpuinfo->num_cpus; i++) { - if (cpu_allowed(i)) CPU_SET(i, &cpuset); + else{ // if -n / --threads is set + int current_cpu=0; + for (i = 0; i < NUM_THREADS; i++) { + /* search for available cpu */ + while(! cpu_allowed(current_cpu) ) { + current_cpu++; + + /* if reached end of avail cpus or max(int) */ + if (current_cpu >= cpuinfo->num_cpus || current_cpu < 0) + { + /* start at beginning */ + fprintf(stderr, "Error: You are requesting more threads than there are CPUs available in the given cpuset.\n"); + fprintf(stderr, "This can be caused by the taskset tool, cgroups, the batch system, or similar mechanisms.\n" ); \ + fprintf(stderr, "Please fix the -n/--threads argument to match the restrictions.\n"); + exit( EACCES ); + } + } + ADD_CPU_SET(current_cpu,cpuset); + /* next cpu for next thread (or one of the following) */ + current_cpu++; } } #ifdef CUDA @@ -365,8 +421,9 @@ static void evaluate_environment() #if (defined(linux) || defined(__linux__)) && defined (AFFINITY) else { // parse CPULIST for binding char *p,*q,*r,*s,*t; - int p_val,r_val,s_val,error=0; + int p_val=0,r_val=0,s_val=0,error=0; + CPU_ZERO(&cpuset); errno=0; p=strdup(fsbind); while(p!=NULL) { @@ -407,19 +464,15 @@ static void evaluate_environment() exit(127); } if ((s)&&(r)) for 
(i=p_val; (int)i<=r_val; i+=s_val) { - if (cpu_allowed(i)) { - CPU_SET(i,&cpuset); - NUM_THREADS++; - } + ADD_CPU_SET(i,cpuset); + NUM_THREADS++; } else if (r) for (i=p_val; (int)i<=r_val; i++) { - if (cpu_allowed(i)) { - CPU_SET(i,&cpuset); - NUM_THREADS++; - } + ADD_CPU_SET(i,cpuset); + NUM_THREADS++; } - else if (cpu_allowed(p_val)) { - CPU_SET(p_val,&cpuset); + else { + ADD_CPU_SET(p_val,cpuset); NUM_THREADS++; } p=q; @@ -470,6 +523,18 @@ static void evaluate_environment() switch (cpuinfo->family) { case 6: switch (cpuinfo->model) { + case 87: + if (feature_available("AVX512")) { + if (num_threads_per_core() == 4) FUNCTION = FUNC_KNL_XEONPHI_AVX512_4T; + if (FUNCTION == FUNC_NOT_DEFINED) { + fprintf(stderr, "Warning: no code path for %i threads per core!\n",num_threads_per_core()); + } + } + if (FUNCTION == FUNC_NOT_DEFINED) { + fprintf(stderr, "\nWarning: AVX512 is requiered for architecture \"KNL\", but is not supported!\n"); + } + break; + case 78: case 94: if (feature_available("FMA")) { if (num_threads_per_core() == 1) FUNCTION = FUNC_SKL_COREI_FMA_1T; @@ -484,6 +549,8 @@ static void evaluate_environment() break; case 60: case 61: + case 69: + case 70: case 71: if (feature_available("FMA")) { if (num_threads_per_core() == 1) FUNCTION = FUNC_HSW_COREI_FMA_1T; @@ -594,6 +661,22 @@ static void evaluate_environment() } } + /* use AVX512 as fallback if available*/ + if ((FUNCTION == FUNC_NOT_DEFINED)&&(feature_available("AVX512"))) { + /* use function for correct number of threads per core if available */ + if(num_threads_per_core() == 4) { + FUNCTION = FUNC_KNL_XEONPHI_AVX512_4T; + fprintf(stderr, "Warning: using function FUNC_KNL_XEONPHI_AVX512_4T as fallback.\n"); + fprintf(stderr, " You can use the parameter --function to try other functions.\n"); + } + /* use function for 4 threads per core if no function for actual number of thread per core exists*/ + if (FUNCTION == FUNC_NOT_DEFINED) + { + FUNCTION = FUNC_KNL_XEONPHI_AVX512_4T; + fprintf(stderr, 
"Warning: using function FUNC_KNL_XEONPHI_AVX512_4T as fallback.\n"); + fprintf(stderr, " You can use the parameter --function to try other functions.\n"); + } + } /* use FMA4 as fallback if available*/ if ((FUNCTION == FUNC_NOT_DEFINED)&&(feature_available("FMA4"))) { /* use function for correct number of threads per core if available */ @@ -680,11 +763,28 @@ static void evaluate_environment() switch (FUNCTION) { + case FUNC_KNL_XEONPHI_AVX512_4T: + if (verbose) printf("\n Taking AVX512 Path optimized for Knights_Landing - 4 thread(s) per core"); + + + + + BUFFERSIZE[0] = 8192; + BUFFERSIZE[1] = 131072; + BUFFERSIZE[2] = 0; + RAMBUFFERSIZE = 6553600; + if (verbose) { + printf("\n Used buffersizes per thread:\n"); + for (i = 0; i < cpuinfo->Cachelevels; i++) printf(" - L%d-Cache: %d Bytes\n", i + 1, BUFFERSIZE[i]); + printf(" - Memory: %llu Bytes\n\n", RAMBUFFERSIZE); + } + break; case FUNC_SKL_COREI_FMA_1T: if (verbose) printf("\n Taking FMA Path optimized for Skylake - 1 thread(s) per core"); + BUFFERSIZE[0] = 32768; BUFFERSIZE[1] = 262144; BUFFERSIZE[2] = 1572864; @@ -700,6 +800,7 @@ static void evaluate_environment() + BUFFERSIZE[0] = 16384; BUFFERSIZE[1] = 131072; BUFFERSIZE[2] = 786432; @@ -715,6 +816,7 @@ static void evaluate_environment() + BUFFERSIZE[0] = 32768; BUFFERSIZE[1] = 262144; BUFFERSIZE[2] = 1572864; @@ -730,6 +832,7 @@ static void evaluate_environment() + BUFFERSIZE[0] = 16384; BUFFERSIZE[1] = 131072; BUFFERSIZE[2] = 786432; @@ -745,6 +848,7 @@ static void evaluate_environment() + BUFFERSIZE[0] = 32768; BUFFERSIZE[1] = 262144; BUFFERSIZE[2] = 2621440; @@ -760,6 +864,7 @@ static void evaluate_environment() + BUFFERSIZE[0] = 16384; BUFFERSIZE[1] = 131072; BUFFERSIZE[2] = 1310720; @@ -775,6 +880,7 @@ static void evaluate_environment() + BUFFERSIZE[0] = 32768; BUFFERSIZE[1] = 262144; BUFFERSIZE[2] = 1572864; @@ -790,6 +896,7 @@ static void evaluate_environment() + BUFFERSIZE[0] = 16384; BUFFERSIZE[1] = 131072; BUFFERSIZE[2] = 786432; @@ -805,6 
+912,7 @@ static void evaluate_environment() + BUFFERSIZE[0] = 32768; BUFFERSIZE[1] = 262144; BUFFERSIZE[2] = 2621440; @@ -820,6 +928,7 @@ static void evaluate_environment() + BUFFERSIZE[0] = 16384; BUFFERSIZE[1] = 131072; BUFFERSIZE[2] = 1310720; @@ -835,6 +944,7 @@ static void evaluate_environment() + BUFFERSIZE[0] = 32768; BUFFERSIZE[1] = 262144; BUFFERSIZE[2] = 1572864; @@ -850,6 +960,7 @@ static void evaluate_environment() + BUFFERSIZE[0] = 16384; BUFFERSIZE[1] = 131072; BUFFERSIZE[2] = 786432; @@ -865,6 +976,7 @@ static void evaluate_environment() + BUFFERSIZE[0] = 32768; BUFFERSIZE[1] = 262144; BUFFERSIZE[2] = 2097152; @@ -880,6 +992,7 @@ static void evaluate_environment() + BUFFERSIZE[0] = 16384; BUFFERSIZE[1] = 131072; BUFFERSIZE[2] = 1048576; @@ -895,6 +1008,7 @@ static void evaluate_environment() + BUFFERSIZE[0] = 16384; BUFFERSIZE[1] = 1048576; BUFFERSIZE[2] = 786432; @@ -916,6 +1030,8 @@ static void evaluate_environment() int main(int argc, char *argv[]) { int i,c; + unsigned long long iterations=0; + #ifdef CUDA gpustruct * structpointer=malloc(sizeof(gpustruct)); structpointer->useDouble=1; //we want to use Doubles, if no -f Argument is given @@ -930,6 +1046,7 @@ int main(int argc, char *argv[]) {"version", no_argument, 0, 'v'}, {"warranty", no_argument, 0, 'w'}, {"quiet", no_argument, 0, 'q'}, + {"report", no_argument, 0, 'r'}, {"avail", no_argument, 0, 'a'}, {"function", required_argument, 0, 'i'}, #ifdef CUDA @@ -952,9 +1069,9 @@ int main(int argc, char *argv[]) { #if (defined(linux) || defined(__linux__)) && defined (AFFINITY) - c = getopt_long(argc, argv, "chvwqafb:i:t:l:p:n:m:g:", long_options, NULL); + c = getopt_long(argc, argv, "chvwqarfb:i:t:l:p:n:m:g:", long_options, NULL); #else - c = getopt_long(argc, argv, "chvwqafi:t:l:p:n:m:g:", long_options, NULL); + c = getopt_long(argc, argv, "chvwqarfi:t:l:p:n:m:g:", long_options, NULL); #endif if(c == -1) break; @@ -980,6 +1097,9 @@ int main(int argc, char *argv[]) FUNCTION=get_function((unsigned 
int)strtol(optarg,NULL,10)); if (FUNCTION==FUNC_UNKNOWN) return EXIT_FAILURE; break; + case 'r': + if (verbose) verbose = 2; + break; case 'q': #ifdef CUDA structpointer->verbose=0; @@ -987,10 +1107,18 @@ int main(int argc, char *argv[]) verbose = 0; break; case 'n': + if (fsbind!=NULL){ + printf("Error: -b/--bind and -n/--threads cannot be used together\n"); + return EXIT_FAILURE; + } NUM_THREADS=(unsigned int)strtol(optarg,NULL,10); break; #if (defined(linux) || defined(__linux__)) && defined (AFFINITY) case 'b': + if (NUM_THREADS){ + printf("Error: -b/--bind and -n/--threads cannot be used together\n"); + return EXIT_FAILURE; + } fsbind=strdup(optarg); break; #endif @@ -1082,6 +1210,30 @@ int main(int argc, char *argv[]) /* wait for threads after watchdog has requested termination */ for(i = 0; i < mdp->num_threads; i++) pthread_join(threads[i], NULL); + if (verbose == 2){ + unsigned long long start_tsc,stop_tsc; + double runtime; + + printf("\nperformance report:\n"); + + start_tsc=mdp->threaddata[0].start_tsc; + stop_tsc=mdp->threaddata[0].stop_tsc; + for(i = 0; i < mdp->num_threads; i++){ + printf("Thread %i: %llu iterations, tsc_delta: %llu\n",i,mdp->threaddata[i].iterations, mdp->threaddata[i].stop_tsc - mdp->threaddata[i].start_tsc ); + iterations+=mdp->threaddata[i].iterations; + if (start_tsc > mdp->threaddata[i].start_tsc) start_tsc = mdp->threaddata[i].start_tsc; + if (stop_tsc < mdp->threaddata[i].stop_tsc) stop_tsc = mdp->threaddata[i].stop_tsc; + } + printf("\ntotal iterations: %llu\n",iterations); + runtime=(double)(stop_tsc - start_tsc) / (double)cpuinfo->clockrate; + printf("runtime: %.2f seconds (%llu cycles)\n\n",runtime, stop_tsc - start_tsc); + + printf("estimated floating point performance: %.2f GFLOPS\n", (double)mdp->threaddata[0].flops*0.000000001*(double)iterations/runtime); + printf("estimated memory bandwidth: %.2f GB/s\n", (double)mdp->threaddata[0].bytes*0.000000001*(double)iterations/runtime); + + printf("\n"); + } + #ifdef CUDA 
free(structpointer); #endif diff --git a/sse2_functions.c b/sse2_functions.c index fd1abdec..f53d1030 100644 --- a/sse2_functions.c +++ b/sse2_functions.c @@ -23,12 +23,33 @@ - int init_nhm_corei_sse2_1t(unsigned long long addrMem) __attribute__((noinline)); -int init_nhm_corei_sse2_1t(unsigned long long addrMem) + + int init_nhm_corei_sse2_1t(threaddata_t* threaddata) __attribute__((noinline)); +int init_nhm_corei_sse2_1t(threaddata_t* threaddata) { + unsigned long long addrMem = threaddata->addrMem; int i; for (i = 0; i<13340672; i++) *((double*)(addrMem + 8*i)) = i * 1.654738925401e-15; + // lines with register operations + threaddata->flops+=2*2; // 1 128 bit operation + + // lines with L1 operations + threaddata->flops+=70*2; // 1 128 bit operation + + // lines with L2 operations + threaddata->flops+=0*2; // 1 128 bit operation + + // lines with L3 operations + threaddata->flops+=0*2; // 1 128 bit operation + + // lines with RAM operations + threaddata->flops+=1*2; // 1 128 bit operation + threaddata->bytes=1*64; // 1 memory access + + threaddata->flops*=21; + threaddata->bytes*=21; + return EXIT_SUCCESS; } @@ -39,10 +60,10 @@ int init_nhm_corei_sse2_1t(unsigned long long addrMem) * @input - addrMem: pointer to buffer * @return EXIT_SUCCESS */ -int asm_work_nhm_corei_sse2_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_nhm_corei_sse2_1t(unsigned long long addrMem, unsigned long long addrHigh) +int asm_work_nhm_corei_sse2_1t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_nhm_corei_sse2_1t(threaddata_t* threaddata) { - if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS; + if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS; /* input: * - addrMem -> rax * register usage: @@ -57,11 +78,13 @@ int asm_work_nhm_corei_sse2_1t(unsigned long long addrMem, unsigned long long ad * - r11: temp register for initialization of SIMD-registers * - r12: stores cacheline 
width as increment for buffer addresses * - r13: stores address of shared variable that controls load level + * - r14: stores iteration counter * - mm*,xmm*: data registers for SIMD instructions */ __asm__ __volatile__( - "mov %0, %%rax;" // store start address of buffer - "mov %1, %%r13;" // store address of shared variable that controls load level + "mov %%rax, %%rax;" // store start address of buffer + "mov %%rbx, %%r13;" // store address of shared variable that controls load level + "mov %%rcx, %%r14;" // store iteration counter "mov $64, %%r12;" // increment after each cache/memory access //Initialize SSE-Registers for Addition "movapd 0(%%rax), %%xmm0;" @@ -1640,25 +1663,47 @@ int asm_work_nhm_corei_sse2_1t(unsigned long long addrMem, unsigned long long ad "mov %%rax, %%rdi;" "add $1572864, %%rdi;" "_work_no_ram_reset_nhm_corei_sse2_1t:" + "inc %%r14;" // increment iteration counter "mov %%rax, %%rbx;" - "mov (%%r13), %%r11;" - "test $1, %%r11;" + "testq $1, (%%r13);" "jnz _work_loop_nhm_corei_sse2_1t;" - : - : "r"(addrMem), "r"(addrHigh) - : "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" + "movq %%r14, %%rax;" // restore iteration counter + : "=a" (threaddata->iterations) + : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations) + : "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" ); return EXIT_SUCCESS; } - int init_nhm_corei_sse2_2t(unsigned long long addrMem) __attribute__((noinline)); -int init_nhm_corei_sse2_2t(unsigned long long addrMem) + + int 
init_nhm_corei_sse2_2t(threaddata_t* threaddata) __attribute__((noinline)); +int init_nhm_corei_sse2_2t(threaddata_t* threaddata) { + unsigned long long addrMem = threaddata->addrMem; int i; for (i = 0; i<6670336; i++) *((double*)(addrMem + 8*i)) = i * 1.654738925401e-15; + // lines with register operations + threaddata->flops+=2*2; // 1 128 bit operation + + // lines with L1 operations + threaddata->flops+=70*2; // 1 128 bit operation + + // lines with L2 operations + threaddata->flops+=0*2; // 1 128 bit operation + + // lines with L3 operations + threaddata->flops+=0*2; // 1 128 bit operation + + // lines with RAM operations + threaddata->flops+=1*2; // 1 128 bit operation + threaddata->bytes=1*64; // 1 memory access + + threaddata->flops*=10; + threaddata->bytes*=10; + return EXIT_SUCCESS; } @@ -1669,10 +1714,10 @@ int init_nhm_corei_sse2_2t(unsigned long long addrMem) * @input - addrMem: pointer to buffer * @return EXIT_SUCCESS */ -int asm_work_nhm_corei_sse2_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_nhm_corei_sse2_2t(unsigned long long addrMem, unsigned long long addrHigh) +int asm_work_nhm_corei_sse2_2t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_nhm_corei_sse2_2t(threaddata_t* threaddata) { - if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS; + if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS; /* input: * - addrMem -> rax * register usage: @@ -1687,11 +1732,13 @@ int asm_work_nhm_corei_sse2_2t(unsigned long long addrMem, unsigned long long ad * - r11: temp register for initialization of SIMD-registers * - r12: stores cacheline width as increment for buffer addresses * - r13: stores address of shared variable that controls load level + * - r14: stores iteration counter * - mm*,xmm*: data registers for SIMD instructions */ __asm__ __volatile__( - "mov %0, %%rax;" // store start address of buffer - "mov %1, %%r13;" // store address of shared 
variable that controls load level + "mov %%rax, %%rax;" // store start address of buffer + "mov %%rbx, %%r13;" // store address of shared variable that controls load level + "mov %%rcx, %%r14;" // store iteration counter "mov $64, %%r12;" // increment after each cache/memory access //Initialize SSE-Registers for Addition "movapd 0(%%rax), %%xmm0;" @@ -2467,25 +2514,47 @@ int asm_work_nhm_corei_sse2_2t(unsigned long long addrMem, unsigned long long ad "mov %%rax, %%rdi;" "add $786432, %%rdi;" "_work_no_ram_reset_nhm_corei_sse2_2t:" + "inc %%r14;" // increment iteration counter "mov %%rax, %%rbx;" - "mov (%%r13), %%r11;" - "test $1, %%r11;" + "testq $1, (%%r13);" "jnz _work_loop_nhm_corei_sse2_2t;" - : - : "r"(addrMem), "r"(addrHigh) - : "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" + "movq %%r14, %%rax;" // restore iteration counter + : "=a" (threaddata->iterations) + : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations) + : "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" ); return EXIT_SUCCESS; } - int init_nhm_xeonep_sse2_1t(unsigned long long addrMem) __attribute__((noinline)); -int init_nhm_xeonep_sse2_1t(unsigned long long addrMem) + + int init_nhm_xeonep_sse2_1t(threaddata_t* threaddata) __attribute__((noinline)); +int init_nhm_xeonep_sse2_1t(threaddata_t* threaddata) { + unsigned long long addrMem = threaddata->addrMem; int i; for (i = 0; i<13406208; i++) *((double*)(addrMem + 8*i)) = i * 1.654738925401e-15; + // lines with register operations + 
threaddata->flops+=2*2; // 1 128 bit operation + + // lines with L1 operations + threaddata->flops+=60*2; // 1 128 bit operation + + // lines with L2 operations + threaddata->flops+=0*2; // 1 128 bit operation + + // lines with L3 operations + threaddata->flops+=0*2; // 1 128 bit operation + + // lines with RAM operations + threaddata->flops+=1*2; // 1 128 bit operation + threaddata->bytes=1*64; // 1 memory access + + threaddata->flops*=24; + threaddata->bytes*=24; + return EXIT_SUCCESS; } @@ -2496,10 +2565,10 @@ int init_nhm_xeonep_sse2_1t(unsigned long long addrMem) * @input - addrMem: pointer to buffer * @return EXIT_SUCCESS */ -int asm_work_nhm_xeonep_sse2_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_nhm_xeonep_sse2_1t(unsigned long long addrMem, unsigned long long addrHigh) +int asm_work_nhm_xeonep_sse2_1t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_nhm_xeonep_sse2_1t(threaddata_t* threaddata) { - if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS; + if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS; /* input: * - addrMem -> rax * register usage: @@ -2514,11 +2583,13 @@ int asm_work_nhm_xeonep_sse2_1t(unsigned long long addrMem, unsigned long long a * - r11: temp register for initialization of SIMD-registers * - r12: stores cacheline width as increment for buffer addresses * - r13: stores address of shared variable that controls load level + * - r14: stores iteration counter * - mm*,xmm*: data registers for SIMD instructions */ __asm__ __volatile__( - "mov %0, %%rax;" // store start address of buffer - "mov %1, %%r13;" // store address of shared variable that controls load level + "mov %%rax, %%rax;" // store start address of buffer + "mov %%rbx, %%r13;" // store address of shared variable that controls load level + "mov %%rcx, %%r14;" // store iteration counter "mov $64, %%r12;" // increment after each cache/memory access //Initialize 
SSE-Registers for Addition "movapd 0(%%rax), %%xmm0;" @@ -4076,25 +4147,47 @@ int asm_work_nhm_xeonep_sse2_1t(unsigned long long addrMem, unsigned long long a "mov %%rax, %%rdi;" "add $2097152, %%rdi;" "_work_no_ram_reset_nhm_xeonep_sse2_1t:" + "inc %%r14;" // increment iteration counter "mov %%rax, %%rbx;" - "mov (%%r13), %%r11;" - "test $1, %%r11;" + "testq $1, (%%r13);" "jnz _work_loop_nhm_xeonep_sse2_1t;" - : - : "r"(addrMem), "r"(addrHigh) - : "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" + "movq %%r14, %%rax;" // restore iteration counter + : "=a" (threaddata->iterations) + : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations) + : "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" ); return EXIT_SUCCESS; } - int init_nhm_xeonep_sse2_2t(unsigned long long addrMem) __attribute__((noinline)); -int init_nhm_xeonep_sse2_2t(unsigned long long addrMem) + + int init_nhm_xeonep_sse2_2t(threaddata_t* threaddata) __attribute__((noinline)); +int init_nhm_xeonep_sse2_2t(threaddata_t* threaddata) { + unsigned long long addrMem = threaddata->addrMem; int i; for (i = 0; i<6703104; i++) *((double*)(addrMem + 8*i)) = i * 1.654738925401e-15; + // lines with register operations + threaddata->flops+=2*2; // 1 128 bit operation + + // lines with L1 operations + threaddata->flops+=60*2; // 1 128 bit operation + + // lines with L2 operations + threaddata->flops+=0*2; // 1 128 bit operation + + // lines with L3 operations + threaddata->flops+=0*2; // 1 128 bit operation + + // lines 
with RAM operations + threaddata->flops+=1*2; // 1 128 bit operation + threaddata->bytes=1*64; // 1 memory access + + threaddata->flops*=12; + threaddata->bytes*=12; + return EXIT_SUCCESS; } @@ -4105,10 +4198,10 @@ int init_nhm_xeonep_sse2_2t(unsigned long long addrMem) * @input - addrMem: pointer to buffer * @return EXIT_SUCCESS */ -int asm_work_nhm_xeonep_sse2_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_nhm_xeonep_sse2_2t(unsigned long long addrMem, unsigned long long addrHigh) +int asm_work_nhm_xeonep_sse2_2t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_nhm_xeonep_sse2_2t(threaddata_t* threaddata) { - if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS; + if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS; /* input: * - addrMem -> rax * register usage: @@ -4123,11 +4216,13 @@ int asm_work_nhm_xeonep_sse2_2t(unsigned long long addrMem, unsigned long long a * - r11: temp register for initialization of SIMD-registers * - r12: stores cacheline width as increment for buffer addresses * - r13: stores address of shared variable that controls load level + * - r14: stores iteration counter * - mm*,xmm*: data registers for SIMD instructions */ __asm__ __volatile__( - "mov %0, %%rax;" // store start address of buffer - "mov %1, %%r13;" // store address of shared variable that controls load level + "mov %%rax, %%rax;" // store start address of buffer + "mov %%rbx, %%r13;" // store address of shared variable that controls load level + "mov %%rcx, %%r14;" // store iteration counter "mov $64, %%r12;" // increment after each cache/memory access //Initialize SSE-Registers for Addition "movapd 0(%%rax), %%xmm0;" @@ -4929,13 +5024,14 @@ int asm_work_nhm_xeonep_sse2_2t(unsigned long long addrMem, unsigned long long a "mov %%rax, %%rdi;" "add $1048576, %%rdi;" "_work_no_ram_reset_nhm_xeonep_sse2_2t:" + "inc %%r14;" // increment iteration counter "mov %%rax, %%rbx;" - 
"mov (%%r13), %%r11;" - "test $1, %%r11;" + "testq $1, (%%r13);" "jnz _work_loop_nhm_xeonep_sse2_2t;" - : - : "r"(addrMem), "r"(addrHigh) - : "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" + "movq %%r14, %%rax;" // restore iteration counter + : "=a" (threaddata->iterations) + : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations) + : "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" ); return EXIT_SUCCESS; } diff --git a/work.c b/work.c index e2dd94cd..5528bfc8 100644 --- a/work.c +++ b/work.c @@ -123,50 +123,53 @@ void *thread(void *threaddata) /* call init function */ switch (mydata->FUNCTION) { + case FUNC_KNL_XEONPHI_AVX512_4T: + tmp = init_knl_xeonphi_avx512_4t(mydata); + break; case FUNC_SKL_COREI_FMA_1T: - tmp = init_skl_corei_fma_1t(mydata->addrMem); + tmp = init_skl_corei_fma_1t(mydata); break; case FUNC_SKL_COREI_FMA_2T: - tmp = init_skl_corei_fma_2t(mydata->addrMem); + tmp = init_skl_corei_fma_2t(mydata); break; case FUNC_HSW_COREI_FMA_1T: - tmp = init_hsw_corei_fma_1t(mydata->addrMem); + tmp = init_hsw_corei_fma_1t(mydata); break; case FUNC_HSW_COREI_FMA_2T: - tmp = init_hsw_corei_fma_2t(mydata->addrMem); + tmp = init_hsw_corei_fma_2t(mydata); break; case FUNC_HSW_XEONEP_FMA_1T: - tmp = init_hsw_xeonep_fma_1t(mydata->addrMem); + tmp = init_hsw_xeonep_fma_1t(mydata); break; case FUNC_HSW_XEONEP_FMA_2T: - tmp = init_hsw_xeonep_fma_2t(mydata->addrMem); + tmp = init_hsw_xeonep_fma_2t(mydata); break; case FUNC_SNB_COREI_AVX_1T: - tmp = 
init_snb_corei_avx_1t(mydata->addrMem); + tmp = init_snb_corei_avx_1t(mydata); break; case FUNC_SNB_COREI_AVX_2T: - tmp = init_snb_corei_avx_2t(mydata->addrMem); + tmp = init_snb_corei_avx_2t(mydata); break; case FUNC_SNB_XEONEP_AVX_1T: - tmp = init_snb_xeonep_avx_1t(mydata->addrMem); + tmp = init_snb_xeonep_avx_1t(mydata); break; case FUNC_SNB_XEONEP_AVX_2T: - tmp = init_snb_xeonep_avx_2t(mydata->addrMem); + tmp = init_snb_xeonep_avx_2t(mydata); break; case FUNC_NHM_COREI_SSE2_1T: - tmp = init_nhm_corei_sse2_1t(mydata->addrMem); + tmp = init_nhm_corei_sse2_1t(mydata); break; case FUNC_NHM_COREI_SSE2_2T: - tmp = init_nhm_corei_sse2_2t(mydata->addrMem); + tmp = init_nhm_corei_sse2_2t(mydata); break; case FUNC_NHM_XEONEP_SSE2_1T: - tmp = init_nhm_xeonep_sse2_1t(mydata->addrMem); + tmp = init_nhm_xeonep_sse2_1t(mydata); break; case FUNC_NHM_XEONEP_SSE2_2T: - tmp = init_nhm_xeonep_sse2_2t(mydata->addrMem); + tmp = init_nhm_xeonep_sse2_2t(mydata); break; case FUNC_BLD_OPTERON_FMA4_1T: - tmp = init_bld_opteron_fma4_1t(mydata->addrMem); + tmp = init_bld_opteron_fma4_1t(mydata); break; default: fprintf(stderr, "Error: unknown function %i\n", mydata->FUNCTION); @@ -188,6 +191,9 @@ void *thread(void *threaddata) old = THREAD_WORK; global_data->ack = id + 1; + /* record thread's start timestamp */ + ((threaddata_t *)threaddata)->start_tsc = timestamp(); + /* will be terminated by watchdog * watchdog also alters mydata->addrHigh to switch between high and low load function */ @@ -201,50 +207,53 @@ void *thread(void *threaddata) #endif switch (mydata->FUNCTION) { + case FUNC_KNL_XEONPHI_AVX512_4T: + tmp=asm_work_knl_xeonphi_avx512_4t(mydata); + break; case FUNC_SKL_COREI_FMA_1T: - tmp=asm_work_skl_corei_fma_1t(mydata->addrMem,mydata->addrHigh); + tmp=asm_work_skl_corei_fma_1t(mydata); break; case FUNC_SKL_COREI_FMA_2T: - tmp=asm_work_skl_corei_fma_2t(mydata->addrMem,mydata->addrHigh); + tmp=asm_work_skl_corei_fma_2t(mydata); break; case FUNC_HSW_COREI_FMA_1T: - 
tmp=asm_work_hsw_corei_fma_1t(mydata->addrMem,mydata->addrHigh); + tmp=asm_work_hsw_corei_fma_1t(mydata); break; case FUNC_HSW_COREI_FMA_2T: - tmp=asm_work_hsw_corei_fma_2t(mydata->addrMem,mydata->addrHigh); + tmp=asm_work_hsw_corei_fma_2t(mydata); break; case FUNC_HSW_XEONEP_FMA_1T: - tmp=asm_work_hsw_xeonep_fma_1t(mydata->addrMem,mydata->addrHigh); + tmp=asm_work_hsw_xeonep_fma_1t(mydata); break; case FUNC_HSW_XEONEP_FMA_2T: - tmp=asm_work_hsw_xeonep_fma_2t(mydata->addrMem,mydata->addrHigh); + tmp=asm_work_hsw_xeonep_fma_2t(mydata); break; case FUNC_SNB_COREI_AVX_1T: - tmp=asm_work_snb_corei_avx_1t(mydata->addrMem,mydata->addrHigh); + tmp=asm_work_snb_corei_avx_1t(mydata); break; case FUNC_SNB_COREI_AVX_2T: - tmp=asm_work_snb_corei_avx_2t(mydata->addrMem,mydata->addrHigh); + tmp=asm_work_snb_corei_avx_2t(mydata); break; case FUNC_SNB_XEONEP_AVX_1T: - tmp=asm_work_snb_xeonep_avx_1t(mydata->addrMem,mydata->addrHigh); + tmp=asm_work_snb_xeonep_avx_1t(mydata); break; case FUNC_SNB_XEONEP_AVX_2T: - tmp=asm_work_snb_xeonep_avx_2t(mydata->addrMem,mydata->addrHigh); + tmp=asm_work_snb_xeonep_avx_2t(mydata); break; case FUNC_NHM_COREI_SSE2_1T: - tmp=asm_work_nhm_corei_sse2_1t(mydata->addrMem,mydata->addrHigh); + tmp=asm_work_nhm_corei_sse2_1t(mydata); break; case FUNC_NHM_COREI_SSE2_2T: - tmp=asm_work_nhm_corei_sse2_2t(mydata->addrMem,mydata->addrHigh); + tmp=asm_work_nhm_corei_sse2_2t(mydata); break; case FUNC_NHM_XEONEP_SSE2_1T: - tmp=asm_work_nhm_xeonep_sse2_1t(mydata->addrMem,mydata->addrHigh); + tmp=asm_work_nhm_xeonep_sse2_1t(mydata); break; case FUNC_NHM_XEONEP_SSE2_2T: - tmp=asm_work_nhm_xeonep_sse2_2t(mydata->addrMem,mydata->addrHigh); + tmp=asm_work_nhm_xeonep_sse2_2t(mydata); break; case FUNC_BLD_OPTERON_FMA4_1T: - tmp=asm_work_bld_opteron_fma4_1t(mydata->addrMem,mydata->addrHigh); + tmp=asm_work_bld_opteron_fma4_1t(mydata); break; default: fprintf(stderr,"Error: unknown function %i\n",mydata->FUNCTION); @@ -274,6 +283,8 @@ void *thread(void *threaddata) /* 
terminate if master signals end of run */ if(*((volatile unsigned long long *)(mydata->addrHigh)) == LOAD_STOP) { + ((threaddata_t *)threaddata) -> stop_tsc = timestamp(); + pthread_exit(NULL); } } // end while diff --git a/work.h b/work.h index 9fd3f4a6..a3908d24 100644 --- a/work.h +++ b/work.h @@ -25,21 +25,22 @@ #include "firestarter_global.h" #include -#define FUNC_SKL_COREI_FMA_1T 1 -#define FUNC_SKL_COREI_FMA_2T 2 -#define FUNC_HSW_COREI_FMA_1T 3 -#define FUNC_HSW_COREI_FMA_2T 4 -#define FUNC_HSW_XEONEP_FMA_1T 5 -#define FUNC_HSW_XEONEP_FMA_2T 6 -#define FUNC_SNB_COREI_AVX_1T 7 -#define FUNC_SNB_COREI_AVX_2T 8 -#define FUNC_SNB_XEONEP_AVX_1T 9 -#define FUNC_SNB_XEONEP_AVX_2T 10 -#define FUNC_NHM_COREI_SSE2_1T 11 -#define FUNC_NHM_COREI_SSE2_2T 12 -#define FUNC_NHM_XEONEP_SSE2_1T 13 -#define FUNC_NHM_XEONEP_SSE2_2T 14 -#define FUNC_BLD_OPTERON_FMA4_1T 15 +#define FUNC_KNL_XEONPHI_AVX512_4T 1 +#define FUNC_SKL_COREI_FMA_1T 2 +#define FUNC_SKL_COREI_FMA_2T 3 +#define FUNC_HSW_COREI_FMA_1T 4 +#define FUNC_HSW_COREI_FMA_2T 5 +#define FUNC_HSW_XEONEP_FMA_1T 6 +#define FUNC_HSW_XEONEP_FMA_2T 7 +#define FUNC_SNB_COREI_AVX_1T 8 +#define FUNC_SNB_COREI_AVX_2T 9 +#define FUNC_SNB_XEONEP_AVX_1T 10 +#define FUNC_SNB_XEONEP_AVX_2T 11 +#define FUNC_NHM_COREI_SSE2_1T 12 +#define FUNC_NHM_COREI_SSE2_2T 13 +#define FUNC_NHM_XEONEP_SSE2_1T 14 +#define FUNC_NHM_XEONEP_SSE2_2T 15 +#define FUNC_BLD_OPTERON_FMA4_1T 16 /* @@ -55,99 +56,105 @@ extern void *thread(void *threaddata); /* * init functions */ -int init_skl_corei_fma_1t(unsigned long long addrMem) __attribute__((noinline)); -int init_skl_corei_fma_1t(unsigned long long addrMem); +int init_knl_xeonphi_avx512_4t(threaddata_t* threaddata) __attribute__((noinline)); +int init_knl_xeonphi_avx512_4t(threaddata_t* threaddata); -int init_skl_corei_fma_2t(unsigned long long addrMem) __attribute__((noinline)); -int init_skl_corei_fma_2t(unsigned long long addrMem); +int init_skl_corei_fma_1t(threaddata_t* threaddata) 
__attribute__((noinline)); +int init_skl_corei_fma_1t(threaddata_t* threaddata); -int init_hsw_corei_fma_1t(unsigned long long addrMem) __attribute__((noinline)); -int init_hsw_corei_fma_1t(unsigned long long addrMem); +int init_skl_corei_fma_2t(threaddata_t* threaddata) __attribute__((noinline)); +int init_skl_corei_fma_2t(threaddata_t* threaddata); -int init_hsw_corei_fma_2t(unsigned long long addrMem) __attribute__((noinline)); -int init_hsw_corei_fma_2t(unsigned long long addrMem); +int init_hsw_corei_fma_1t(threaddata_t* threaddata) __attribute__((noinline)); +int init_hsw_corei_fma_1t(threaddata_t* threaddata); -int init_hsw_xeonep_fma_1t(unsigned long long addrMem) __attribute__((noinline)); -int init_hsw_xeonep_fma_1t(unsigned long long addrMem); +int init_hsw_corei_fma_2t(threaddata_t* threaddata) __attribute__((noinline)); +int init_hsw_corei_fma_2t(threaddata_t* threaddata); -int init_hsw_xeonep_fma_2t(unsigned long long addrMem) __attribute__((noinline)); -int init_hsw_xeonep_fma_2t(unsigned long long addrMem); +int init_hsw_xeonep_fma_1t(threaddata_t* threaddata) __attribute__((noinline)); +int init_hsw_xeonep_fma_1t(threaddata_t* threaddata); -int init_snb_corei_avx_1t(unsigned long long addrMem) __attribute__((noinline)); -int init_snb_corei_avx_1t(unsigned long long addrMem); +int init_hsw_xeonep_fma_2t(threaddata_t* threaddata) __attribute__((noinline)); +int init_hsw_xeonep_fma_2t(threaddata_t* threaddata); -int init_snb_corei_avx_2t(unsigned long long addrMem) __attribute__((noinline)); -int init_snb_corei_avx_2t(unsigned long long addrMem); +int init_snb_corei_avx_1t(threaddata_t* threaddata) __attribute__((noinline)); +int init_snb_corei_avx_1t(threaddata_t* threaddata); -int init_snb_xeonep_avx_1t(unsigned long long addrMem) __attribute__((noinline)); -int init_snb_xeonep_avx_1t(unsigned long long addrMem); +int init_snb_corei_avx_2t(threaddata_t* threaddata) __attribute__((noinline)); +int init_snb_corei_avx_2t(threaddata_t* threaddata); -int 
init_snb_xeonep_avx_2t(unsigned long long addrMem) __attribute__((noinline)); -int init_snb_xeonep_avx_2t(unsigned long long addrMem); +int init_snb_xeonep_avx_1t(threaddata_t* threaddata) __attribute__((noinline)); +int init_snb_xeonep_avx_1t(threaddata_t* threaddata); -int init_nhm_corei_sse2_1t(unsigned long long addrMem) __attribute__((noinline)); -int init_nhm_corei_sse2_1t(unsigned long long addrMem); +int init_snb_xeonep_avx_2t(threaddata_t* threaddata) __attribute__((noinline)); +int init_snb_xeonep_avx_2t(threaddata_t* threaddata); -int init_nhm_corei_sse2_2t(unsigned long long addrMem) __attribute__((noinline)); -int init_nhm_corei_sse2_2t(unsigned long long addrMem); +int init_nhm_corei_sse2_1t(threaddata_t* threaddata) __attribute__((noinline)); +int init_nhm_corei_sse2_1t(threaddata_t* threaddata); -int init_nhm_xeonep_sse2_1t(unsigned long long addrMem) __attribute__((noinline)); -int init_nhm_xeonep_sse2_1t(unsigned long long addrMem); +int init_nhm_corei_sse2_2t(threaddata_t* threaddata) __attribute__((noinline)); +int init_nhm_corei_sse2_2t(threaddata_t* threaddata); -int init_nhm_xeonep_sse2_2t(unsigned long long addrMem) __attribute__((noinline)); -int init_nhm_xeonep_sse2_2t(unsigned long long addrMem); +int init_nhm_xeonep_sse2_1t(threaddata_t* threaddata) __attribute__((noinline)); +int init_nhm_xeonep_sse2_1t(threaddata_t* threaddata); -int init_bld_opteron_fma4_1t(unsigned long long addrMem) __attribute__((noinline)); -int init_bld_opteron_fma4_1t(unsigned long long addrMem); +int init_nhm_xeonep_sse2_2t(threaddata_t* threaddata) __attribute__((noinline)); +int init_nhm_xeonep_sse2_2t(threaddata_t* threaddata); + +int init_bld_opteron_fma4_1t(threaddata_t* threaddata) __attribute__((noinline)); +int init_bld_opteron_fma4_1t(threaddata_t* threaddata); /* * stress test functions */ -int asm_work_skl_corei_fma_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_skl_corei_fma_1t(unsigned long long 
addrMem, unsigned long long addrHigh); +int asm_work_knl_xeonphi_avx512_4t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_knl_xeonphi_avx512_4t(threaddata_t* threaddata); + +int asm_work_skl_corei_fma_1t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_skl_corei_fma_1t(threaddata_t* threaddata); -int asm_work_skl_corei_fma_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_skl_corei_fma_2t(unsigned long long addrMem, unsigned long long addrHigh); +int asm_work_skl_corei_fma_2t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_skl_corei_fma_2t(threaddata_t* threaddata); -int asm_work_hsw_corei_fma_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_hsw_corei_fma_1t(unsigned long long addrMem, unsigned long long addrHigh); +int asm_work_hsw_corei_fma_1t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_hsw_corei_fma_1t(threaddata_t* threaddata); -int asm_work_hsw_corei_fma_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_hsw_corei_fma_2t(unsigned long long addrMem, unsigned long long addrHigh); +int asm_work_hsw_corei_fma_2t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_hsw_corei_fma_2t(threaddata_t* threaddata); -int asm_work_hsw_xeonep_fma_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_hsw_xeonep_fma_1t(unsigned long long addrMem, unsigned long long addrHigh); +int asm_work_hsw_xeonep_fma_1t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_hsw_xeonep_fma_1t(threaddata_t* threaddata); -int asm_work_hsw_xeonep_fma_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_hsw_xeonep_fma_2t(unsigned long long addrMem, unsigned long long addrHigh); +int asm_work_hsw_xeonep_fma_2t(threaddata_t* threaddata) 
__attribute__((noinline)); +int asm_work_hsw_xeonep_fma_2t(threaddata_t* threaddata); -int asm_work_snb_corei_avx_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_snb_corei_avx_1t(unsigned long long addrMem, unsigned long long addrHigh); +int asm_work_snb_corei_avx_1t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_snb_corei_avx_1t(threaddata_t* threaddata); -int asm_work_snb_corei_avx_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_snb_corei_avx_2t(unsigned long long addrMem, unsigned long long addrHigh); +int asm_work_snb_corei_avx_2t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_snb_corei_avx_2t(threaddata_t* threaddata); -int asm_work_snb_xeonep_avx_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_snb_xeonep_avx_1t(unsigned long long addrMem, unsigned long long addrHigh); +int asm_work_snb_xeonep_avx_1t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_snb_xeonep_avx_1t(threaddata_t* threaddata); -int asm_work_snb_xeonep_avx_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_snb_xeonep_avx_2t(unsigned long long addrMem, unsigned long long addrHigh); +int asm_work_snb_xeonep_avx_2t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_snb_xeonep_avx_2t(threaddata_t* threaddata); -int asm_work_nhm_corei_sse2_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_nhm_corei_sse2_1t(unsigned long long addrMem, unsigned long long addrHigh); +int asm_work_nhm_corei_sse2_1t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_nhm_corei_sse2_1t(threaddata_t* threaddata); -int asm_work_nhm_corei_sse2_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_nhm_corei_sse2_2t(unsigned long long 
addrMem, unsigned long long addrHigh); +int asm_work_nhm_corei_sse2_2t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_nhm_corei_sse2_2t(threaddata_t* threaddata); -int asm_work_nhm_xeonep_sse2_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_nhm_xeonep_sse2_1t(unsigned long long addrMem, unsigned long long addrHigh); +int asm_work_nhm_xeonep_sse2_1t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_nhm_xeonep_sse2_1t(threaddata_t* threaddata); -int asm_work_nhm_xeonep_sse2_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_nhm_xeonep_sse2_2t(unsigned long long addrMem, unsigned long long addrHigh); +int asm_work_nhm_xeonep_sse2_2t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_nhm_xeonep_sse2_2t(threaddata_t* threaddata); -int asm_work_bld_opteron_fma4_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline)); -int asm_work_bld_opteron_fma4_1t(unsigned long long addrMem, unsigned long long addrHigh); +int asm_work_bld_opteron_fma4_1t(threaddata_t* threaddata) __attribute__((noinline)); +int asm_work_bld_opteron_fma4_1t(threaddata_t* threaddata); /* diff --git a/x86.c b/x86.c index b231214f..fc765bcc 100644 --- a/x86.c +++ b/x86.c @@ -53,9 +53,6 @@ static int has_invariant_rdtsc(); #endif #endif -/** used to store Registers {R|E}AX, {R|E}BX, {R|E}CX and {R|E}DX */ -static unsigned long long a,b,c,d; - /* * declarations of x86 specific functions, only used within this file */ @@ -77,11 +74,11 @@ static int has_htt(); /** 64 Bit implementations */ #if defined _64_BIT -static unsigned long long reg_a,reg_b,reg_c,reg_d; - static void cpuid(unsigned long long *a, unsigned long long *b, unsigned long long *c, unsigned long long *d) { + unsigned long long reg_a,reg_b,reg_c,reg_d; + __asm__ __volatile__( "cpuid;" : "=a" (reg_a), "=b" (reg_b), "=c" (reg_c), "=d" (reg_d) @@ -101,6 +98,8 @@ 
static int has_cpuid() unsigned long long timestamp() { + unsigned long long reg_a,reg_d; + if (!has_rdtsc()) return 0; __asm__ __volatile__("rdtsc;": "=a" (reg_a), "=d" (reg_d)); return (reg_d<<32)|(reg_a&0xffffffffULL); @@ -110,11 +109,11 @@ unsigned long long timestamp() /** 32 Bit implementations */ #if defined(_32_BIT) -/* 32 Bit Registers */ -static unsigned int reg_a,reg_b,reg_c,reg_d; static void cpuid(unsigned long long *a, unsigned long long *b, unsigned long long *c, unsigned long long *d) { + unsigned int reg_a,reg_b,reg_c,reg_d; + __asm__ __volatile__( "cpuid;" : "=a" (reg_a), "=b" (reg_b), "=c" (reg_c), "=d" (reg_d) @@ -165,6 +164,8 @@ static int has_cpuid() unsigned long long timestamp() { + unsigned int reg_a,reg_d; + if (!has_rdtsc()) return 0; __asm__ __volatile__("rdtsc;": "=a" (reg_a) , "=d" (reg_d)); // upper 32 Bit in EDX, lower 32 Bit in EAX @@ -206,6 +207,8 @@ void get_architecture(char* arch, size_t len) int has_rdtsc() { + unsigned long long a,b,c,d; + if (!has_cpuid()) return 0; a=0; @@ -223,6 +226,7 @@ int has_rdtsc() int has_invariant_rdtsc() { + unsigned long long a,b,c,d; char tmp[_HW_DETECT_MAX_OUTPUT]; int res=0; @@ -281,6 +285,8 @@ int has_invariant_rdtsc() static int has_htt() { + unsigned long long a,b,c,d; + if (!has_cpuid()) return 0; a=0; cpuid(&a,&b,&c,&d); @@ -295,6 +301,7 @@ static int has_htt() int get_cpu_vendor(char* vendor, size_t len) { + unsigned long long a,b,c,d; char tmp_vendor[13]; if (!has_cpuid()) return generic_get_cpu_vendor(vendor); @@ -312,6 +319,7 @@ int get_cpu_vendor(char* vendor, size_t len) int get_cpu_name(char* name, size_t len) { + unsigned long long a,b,c,d; char tmp[48]; char* start; @@ -361,6 +369,8 @@ int get_cpu_name(char* name, size_t len) int get_cpu_family() { + unsigned long long a,b,c,d; + if (!has_cpuid()) return generic_get_cpu_family(); a=0; cpuid(&a,&b,&c,&d); @@ -375,6 +385,8 @@ int get_cpu_family() } int get_cpu_model() { + unsigned long long a,b,c,d; + if (!has_cpuid()) return 
generic_get_cpu_model(); a=0; cpuid(&a,&b,&c,&d); @@ -389,6 +401,8 @@ int get_cpu_model() } int get_cpu_stepping() { + unsigned long long a,b,c,d; + if (!has_cpuid()) return generic_get_cpu_stepping(); a=0; cpuid(&a,&b,&c,&d); @@ -404,6 +418,7 @@ int get_cpu_stepping() int get_cpu_isa_extensions(char* features, size_t len) { + unsigned long long a,b,c,d; unsigned long long max,max_ext; char tmp[16]; @@ -451,6 +466,14 @@ int get_cpu_isa_extensions(char* features, size_t len) if (c&(1<<23)) strncat(features,"POPCNT ",(len-strlen(features))-1); } + if (max>=7) + { + a=7;c=0; + cpuid(&a,&b,&c,&d); + + if (b&(1<<5)) strncat(features,"AVX2 ", (len-strlen(features))-1); + if (b&(1<<16)) strncat(features,"AVX512 ", (len-strlen(features))-1); + } if (max_ext>=0x80000001) { a=0x80000001; @@ -591,6 +614,7 @@ unsigned long long get_cpu_clockrate(int check,int cpu) */ int num_caches(int cpu) { + unsigned long long a,b,c,d; unsigned long long max,max_ext; char tmp[16]; int num; @@ -660,6 +684,7 @@ int num_caches(int cpu) //TODO use sysfs if available to determine cache sharing int cache_info(int cpu,int id, char* output, size_t len) { + unsigned long long a,b,c,d; unsigned long long max,max_ext; char tmp[16]; @@ -1120,6 +1145,7 @@ int num_packages() int num_cores_per_package() { + unsigned long long a,b,c,d; char tmp[16]; int num=-1; @@ -1167,7 +1193,7 @@ int num_cores_per_package() /* consistency checks */ /* more cores than cpus is not possible -> some cores are deactivated */ if (num>num_cpus()) num=num_cpus(); - /* if the number of packages is known this cann be checked for multi-socket systems, too + /* if the number of packages is known this can be checked for multi-socket systems, too NOTE depends on valid entries in sysfs */ if ((generic_num_packages()!=-1)&&(generic_num_packages()*num>num_cpus())) num=num_cpus()/generic_num_packages(); @@ -1185,6 +1211,7 @@ int num_threads_per_core() int num_threads_per_package() { + unsigned long long a,b,c,d; int num=-1; char tmp[16];