diff --git a/CHANGELOG b/CHANGELOG
index 70c9bf81..f26e56ee 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -19,6 +19,10 @@
# Contact: daniel.hackenberg@tu-dresden.de
###############################################################################
+Version 1.5
+ - added Knights Landing support (AVX512F)
+ - added error handling for restricted cpu sets for --bind and --threads options
+
Version 1.4
- added support for Skylake-H (FMA)
- added support for Broadwell-E/EP (FMA)
diff --git a/FIRESTARTER b/FIRESTARTER
index 22cba54d..1136c093 100644
Binary files a/FIRESTARTER and b/FIRESTARTER differ
diff --git a/FIRESTARTER_CUDA b/FIRESTARTER_CUDA
index bcbbfbba..e76fabed 100644
Binary files a/FIRESTARTER_CUDA and b/FIRESTARTER_CUDA differ
diff --git a/LICENSE b/LICENSE
index 9cecc1d4..bc08fe2e 100644
--- a/LICENSE
+++ b/LICENSE
@@ -617,58 +617,3 @@ reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
-
- END OF TERMS AND CONDITIONS
-
- How to Apply These Terms to Your New Programs
-
- If you develop a new program, and you want it to be of the greatest
-possible use to the public, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these terms.
-
- To do so, attach the following notices to the program. It is safest
-to attach them to the start of each source file to most effectively
-state the exclusion of warranty; and each file should have at least
-the "copyright" line and a pointer to where the full notice is found.
-
- {one line to give the program's name and a brief idea of what it does.}
- Copyright (C) {year} {name of author}
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
-
-Also add information on how to contact you by electronic and paper mail.
-
- If the program does terminal interaction, make it output a short
-notice like this when it starts in an interactive mode:
-
- {project} Copyright (C) {year} {fullname}
- This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
- This is free software, and you are welcome to redistribute it
- under certain conditions; type `show c' for details.
-
-The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License. Of course, your program's commands
-might be different; for a GUI interface, you would use an "about box".
-
- You should also get your employer (if you work as a programmer) or school,
-if any, to sign a "copyright disclaimer" for the program, if necessary.
-For more information on this, and how to apply and follow the GNU GPL, see
-.
-
- The GNU General Public License does not permit incorporating your program
-into proprietary programs. If your program is a subroutine library, you
-may consider it more useful to permit linking proprietary applications with
-the library. If this is what you want to do, use the GNU Lesser General
-Public License instead of this License. But first, please read
-.
diff --git a/Makefile b/Makefile
index 83b90e7e..adb946d5 100644
--- a/Makefile
+++ b/Makefile
@@ -44,11 +44,11 @@ cuda: FIRESTARTER_CUDA
all: linux cuda
-FIRESTARTER: generic.o x86.o main.o work.o x86.o watchdog.o help.o sse2_functions.o avx_functions.o fma_functions.o fma4_functions.o
- ${LINUX_CC} -o FIRESTARTER generic.o main.o work.o x86.o watchdog.o help.o sse2_functions.o avx_functions.o fma_functions.o fma4_functions.o ${LINUX_L_FLAGS}
+FIRESTARTER: generic.o x86.o main.o work.o watchdog.o help.o sse2_functions.o avx_functions.o fma_functions.o fma4_functions.o avx512_functions.o
+ ${LINUX_CC} -o FIRESTARTER generic.o main.o work.o x86.o watchdog.o help.o sse2_functions.o avx_functions.o fma_functions.o fma4_functions.o avx512_functions.o ${LINUX_L_FLAGS}
-FIRESTARTER_CUDA: generic.o x86.o work.o x86.o watchdog.o sse2_functions.o avx_functions.o fma_functions.o fma4_functions.o gpu.o main_cuda.o help_cuda.o
- ${LINUX_CC} -o FIRESTARTER_CUDA generic.o main_cuda.o work.o x86.o watchdog.o help_cuda.o sse2_functions.o avx_functions.o fma_functions.o fma4_functions.o gpu.o ${LINUX_CUDA_L_FLAGS}
+FIRESTARTER_CUDA: generic.o x86.o work.o watchdog.o sse2_functions.o avx_functions.o fma_functions.o fma4_functions.o avx512_functions.o gpu.o main_cuda.o help_cuda.o
+ ${LINUX_CC} -o FIRESTARTER_CUDA generic.o main_cuda.o work.o x86.o watchdog.o help_cuda.o sse2_functions.o avx_functions.o fma_functions.o fma4_functions.o avx512_functions.o gpu.o ${LINUX_CUDA_L_FLAGS}
generic.o: generic.c cpu.h
${LINUX_CC} ${OPT} ${LINUX_C_FLAGS} -c generic.c
@@ -77,6 +77,9 @@ main_cuda.o: main.c work.h cpu.h
help_cuda.o: help.c help.h
${LINUX_CC} ${OPT} ${LINUX_C_FLAGS} -o help_cuda.o -c help.c -DCUDA
+avx512_functions.o: avx512_functions.c
+ ${LINUX_CC} ${OPT} ${LINUX_C_FLAGS} -mavx512f -c avx512_functions.c
+
fma4_functions.o: fma4_functions.c
${LINUX_CC} ${OPT} ${LINUX_C_FLAGS} -mfma4 -mavx -c fma4_functions.c
diff --git a/README b/README
index 9b737581..7168d62d 100644
--- a/README
+++ b/README
@@ -30,6 +30,7 @@ Supported CPU microarchitectures
- Intel Ivy Bridge
- Intel Haswell
- Intel Skylake
+- Intel Knights Landing
- AMD Bulldozer (experimental)
Since version 1.1 it is also possible to create alternating and repetitive
@@ -54,8 +55,10 @@ Options:
-c | --copyright display copyright information
-w | --warranty display warranty information
-q | --quiet disable output to stdout
+-r | --report display additional information (overridden by -q)
-a | --avail list available functions
--i ID | --function=ID specify ID of the load-function to be used
+-i ID | --function=ID specify integer ID of the load-function to be
+ used (as listed by --avail)
-t TIMEOUT | --timeout=TIMEOUT set timeout (seconds) after which FIRESTARTER
terminates itself, default: no timeout
-l LOAD | --load=LOAD set the percentage of high load to LOAD (%),
@@ -67,9 +70,12 @@ Options:
load and an idle phase, the percentage of
high load is defined by -l
-n COUNT | --threads=COUNT specify the number of threads
--b CPULIST | --bind=CPULIST select certain CPUs (overrides -n)
+ cannot be combined with -b | --bind, which
+ implicitly specifies the number of threads
+-b CPULIST | --bind=CPULIST select certain CPUs
CPULIST format: "x,y,z", "x-y", "x-y/step",
and any combination of the above
+ cannot be combined with -n | --threads
CUDA Options:
-g | --gpus number of gpus to use (default: all)
diff --git a/avx512_functions.c b/avx512_functions.c
new file mode 100644
index 00000000..fe503168
--- /dev/null
+++ b/avx512_functions.c
@@ -0,0 +1,530 @@
+/******************************************************************************
+ * FIRESTARTER - A Processor Stress Test Utility
+ * Copyright (C) 2016 TU Dresden, Center for Information Services and High
+ * Performance Computing
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Contact: daniel.hackenberg@tu-dresden.de
+ *****************************************************************************/
+
+#include "work.h"
+
+
+
+
+int init_knl_xeonphi_avx512_4t(threaddata_t* threaddata) __attribute__((noinline));
+int init_knl_xeonphi_avx512_4t(threaddata_t* threaddata)
+{
+ unsigned long long addrMem = threaddata->addrMem;
+ int i;
+ for (i=0;i<836608;i++) *((double*)(addrMem+8*i)) = 0.25 + (double)(i%9267) * 0.24738995982e-4;
+
+ // lines with register operations
+ threaddata->flops+=10*32; // 2 512 bit FMA operations
+
+ // lines with L1 operations
+ threaddata->flops+=40*32; // 2 512 bit FMA operations
+
+ // lines with L2 operations
+ threaddata->flops+=8*16; // 1 512 bit FMA operation
+
+ // lines with RAM operations
+ threaddata->flops+=3*16; // 1 512 bit FMA operation
+ threaddata->bytes=3*64; // 1 memory access
+
+ threaddata->flops*=6;
+ threaddata->bytes*=6;
+
+ return EXIT_SUCCESS;
+}
+
+/**
+ * assembler implementation of processor and memory stress test
+ * uses AVX512F instruction set
+ * optimized for Intel Xeon Phi (Knights Landing)
+ * @input - addrMem: pointer to buffer
+ * @return EXIT_SUCCESS
+ */
+int asm_work_knl_xeonphi_avx512_4t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_knl_xeonphi_avx512_4t(threaddata_t* threaddata)
+{
+ if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS;
+ /* input:
+ * - addrMem -> rax
+ * register usage:
+ * - rax: stores original pointer to buffer, used to periodically reset other pointers
+ * - rbx: pointer to L1 buffer
+ * - rcx: pointer to L2 buffer
+ * - r8: pointer to L3 buffer
+ * - r9: pointer to RAM buffer
+ * - r10: counter for L2-pointer reset
+ * - r11: counter for L3-pointer reset
+ * - r12: counter for RAM-pointer reset
+ * - r13: register for temporary results
+ * - r14: stores cacheline width as increment for buffer addresses
+ * - r15: stores address of shared variable that controls load level
+ * - mm0: stores iteration counter
+ * - rdx, rsi, rdi: registers for shift operations
+ * - xmm*,zmm*: data registers for SIMD instructions
+ */
+ __asm__ __volatile__(
+ "mov %%rax, %%rax;" // store start address of buffer
+ "mov %%rbx, %%r15;" // store address of shared variable that controls load level
+ "movq %%rcx, %%mm0;" // store iteration counter
+ "mov $64, %%r14;" // increment after each cache/memory access
+ //Initialize registers for shift operations
+ "mov $0xAAAAAAAA, %%edi;"
+ "mov $0xAAAAAAAA, %%esi;"
+ "mov $0xAAAAAAAA, %%edx;"
+ //Initialize AVX-Registers for FMA Operations
+ "vmovapd (%%rax), %%zmm0;"
+ "vmovapd (%%rax), %%zmm1;"
+ "vmovapd 384(%%rax), %%zmm2;"
+ "vmovapd 448(%%rax), %%zmm3;"
+ "vmovapd 512(%%rax), %%zmm4;"
+ "vmovapd 576(%%rax), %%zmm5;"
+ "vmovapd 640(%%rax), %%zmm6;"
+ "vmovapd 704(%%rax), %%zmm7;"
+ "vmovapd 768(%%rax), %%zmm8;"
+ "vmovapd 832(%%rax), %%zmm9;"
+ "vmovapd 896(%%rax), %%zmm10;"
+ "vmovapd 960(%%rax), %%zmm11;"
+ "vmovapd 1024(%%rax), %%zmm12;"
+ "vmovapd 1088(%%rax), %%zmm13;"
+ "vmovapd 1152(%%rax), %%zmm14;"
+ "vmovapd 1216(%%rax), %%zmm15;"
+ "vmovapd 1280(%%rax), %%zmm16;"
+ "vmovapd 1344(%%rax), %%zmm17;"
+ "vmovapd 1408(%%rax), %%zmm18;"
+ "vmovapd 1472(%%rax), %%zmm19;"
+ "vmovapd 1536(%%rax), %%zmm20;"
+ "vmovapd 1600(%%rax), %%zmm21;"
+ "vmovapd 1664(%%rax), %%zmm22;"
+ "vmovapd 1728(%%rax), %%zmm23;"
+ "vmovapd 1792(%%rax), %%zmm24;"
+ "vmovapd 1856(%%rax), %%zmm25;"
+ "vmovapd 1920(%%rax), %%zmm26;"
+ "vmovapd 1984(%%rax), %%zmm27;"
+ "vmovapd 2048(%%rax), %%zmm28;"
+ "vmovapd 2112(%%rax), %%zmm29;"
+ "vmovapd 2176(%%rax), %%zmm30;"
+ "mov %%rax, %%rbx;" // address for L1-buffer
+ "mov %%rax, %%rcx;"
+ "add $8192, %%rcx;" // address for L2-buffer
+ "mov %%rax, %%r8;"
+ "add $131072, %%r8;" // address for L3-buffer
+ "mov %%rax, %%r9;"
+ "add $0, %%r9;" // address for RAM-buffer
+ "movabs $34, %%r10;" // reset-counter for L2-buffer with 48 cache line accesses per loop (102 KB)
+ "movabs $0, %%r11;" // reset-counter for L3-buffer with 0 cache line accesses per loop (0 KB)
+ "movabs $5688, %%r12;" // reset-counter for RAM-buffer with 18 cache line accesses per loop (6399 KB)
+
+ ".align 64;" /* alignment in bytes */
+ "_work_loop_knl_xeonphi_avx512_4t:"
+ /*****************************************************************************************************************************************************
+ decode 0 decode 1 decode 2 decode 3 */
+ "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm3; prefetcht2 (%%r9); shl $1, %%edi; add %%r14, %%r9; " // RAM prefetch
+ "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd %%zmm6, %%zmm1, %%zmm26; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm6; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm8; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm9, 64(%%rcx); vfmadd231pd %%zmm10, %%zmm0, %%zmm9; shl $1, %%edi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd %%zmm12, %%zmm1, %%zmm27; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm12; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm13; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm14; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm15, 64(%%rcx); vfmadd231pd %%zmm16, %%zmm0, %%zmm15; shl $1, %%edi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd %%zmm18, %%zmm1, %%zmm28; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm17; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm19; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm20, 64(%%rcx); vfmadd231pd %%zmm21, %%zmm0, %%zmm20; shr $1, %%edx; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm21; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd %%zmm24, %%zmm1, %%zmm29; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm23; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm25; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm2; prefetcht2 (%%r9); shr $1, %%edx; add %%r14, %%r9; " // RAM prefetch
+ "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm3; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd %%zmm6, %%zmm1, %%zmm30; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm6; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm8, 64(%%rcx); vfmadd231pd %%zmm9, %%zmm0, %%zmm8; shr $1, %%edx; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd %%zmm12, %%zmm1, %%zmm26; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm12; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm13, 64(%%rcx); vfmadd231pd %%zmm14, %%zmm0, %%zmm13; shr $1, %%esi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm14; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm15; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd %%zmm18, %%zmm1, %%zmm27; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm17; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm19, 64(%%rcx); vfmadd231pd %%zmm20, %%zmm0, %%zmm19; shr $1, %%esi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm21; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd %%zmm24, %%zmm1, %%zmm28; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm23; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm25; prefetcht2 (%%r9); shr $1, %%esi; add %%r14, %%r9; " // RAM prefetch
+ "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm3; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd %%zmm6, %%zmm1, %%zmm29; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm6, 64(%%rcx); vfmadd231pd %%zmm7, %%zmm0, %%zmm6; shr $1, %%edi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm8; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd %%zmm12, %%zmm1, %%zmm30; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm12, 64(%%rcx); vfmadd231pd %%zmm13, %%zmm0, %%zmm12; shr $1, %%edi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm13; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm14; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm15; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm16; prefetcht2 (%%r9); shl $1, %%esi; add %%r14, %%r9; " // RAM prefetch
+ "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd %%zmm19, %%zmm1, %%zmm26; shl $1, %%edx; xor %%rsi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm19; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm21; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm22, 64(%%rcx); vfmadd231pd %%zmm23, %%zmm0, %%zmm22; shl $1, %%esi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd %%zmm25, %%zmm1, %%zmm27; shl $1, %%edx; xor %%rsi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm25; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm3; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm4, 64(%%rcx); vfmadd231pd %%zmm5, %%zmm0, %%zmm4; shl $1, %%esi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd %%zmm7, %%zmm1, %%zmm28; shl $1, %%edx; xor %%rsi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm6; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm8; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm9, 64(%%rcx); vfmadd231pd %%zmm10, %%zmm0, %%zmm9; shl $1, %%edi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm10; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd %%zmm13, %%zmm1, %%zmm29; shl $1, %%edx; xor %%rsi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm12; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm13; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm14; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm15; prefetcht2 (%%r9); shl $1, %%edi; add %%r14, %%r9; " // RAM prefetch
+ "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm16; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd %%zmm19, %%zmm1, %%zmm30; shl $1, %%edx; xor %%rsi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm19; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm21, 64(%%rcx); vfmadd231pd %%zmm22, %%zmm0, %%zmm21; shl $1, %%edi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd %%zmm25, %%zmm1, %%zmm26; shl $1, %%edx; xor %%rsi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm25; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm2, 64(%%rcx); vfmadd231pd %%zmm3, %%zmm0, %%zmm2; shr $1, %%edx; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm3; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm4; shl $1, %%esi; mov %%rax, %%rbx;" // L1 load
+ "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd %%zmm7, %%zmm1, %%zmm27; shl $1, %%edx; xor %%rsi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm6; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm8, 64(%%rcx); vfmadd231pd %%zmm9, %%zmm0, %%zmm8; shr $1, %%edx; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm10; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd %%zmm13, %%zmm1, %%zmm28; shl $1, %%edx; xor %%rsi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm12; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm13; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm14; prefetcht2 (%%r9); shr $1, %%edx; add %%r14, %%r9; " // RAM prefetch
+ "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm15; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm16; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd %%zmm19, %%zmm1, %%zmm29; shl $1, %%edx; xor %%rsi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm19, 64(%%rcx); vfmadd231pd %%zmm20, %%zmm0, %%zmm19; shr $1, %%esi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm21; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd %%zmm25, %%zmm1, %%zmm30; shl $1, %%edx; xor %%rsi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm25, 64(%%rcx); vfmadd231pd %%zmm2, %%zmm0, %%zmm25; shr $1, %%esi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm3; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm4; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm5; prefetcht2 (%%r9); shl $1, %%edx; add %%r14, %%r9; " // RAM prefetch
+ "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd %%zmm8, %%zmm1, %%zmm26; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm8; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm10; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm11, 64(%%rcx); vfmadd231pd %%zmm12, %%zmm0, %%zmm11; shl $1, %%edx; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd %%zmm14, %%zmm1, %%zmm27; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm13; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm14; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm15; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm16; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm17, 64(%%rcx); vfmadd231pd %%zmm18, %%zmm0, %%zmm17; shl $1, %%edx; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd %%zmm20, %%zmm1, %%zmm28; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm19; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm21; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm22, 64(%%rcx); vfmadd231pd %%zmm23, %%zmm0, %%zmm22; shl $1, %%esi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm23; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd %%zmm2, %%zmm1, %%zmm29; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm25; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm3; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm4; prefetcht2 (%%r9); shl $1, %%esi; add %%r14, %%r9; " // RAM prefetch
+ "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd %%zmm8, %%zmm1, %%zmm30; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm8; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm10, 64(%%rcx); vfmadd231pd %%zmm11, %%zmm0, %%zmm10; shl $1, %%esi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd %%zmm14, %%zmm1, %%zmm26; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm13; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm14; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm15, 64(%%rcx); vfmadd231pd %%zmm16, %%zmm0, %%zmm15; shl $1, %%edi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm16; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm17; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd %%zmm20, %%zmm1, %%zmm27; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm19; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm21, 64(%%rcx); vfmadd231pd %%zmm22, %%zmm0, %%zmm21; shl $1, %%edi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm23; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd %%zmm2, %%zmm1, %%zmm28; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm25; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm3; prefetcht2 (%%r9); shl $1, %%edi; add %%r14, %%r9; " // RAM prefetch
+ "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm4; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd %%zmm8, %%zmm1, %%zmm29; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm8, 64(%%rcx); vfmadd231pd %%zmm9, %%zmm0, %%zmm8; shr $1, %%edx; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm10; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd %%zmm14, %%zmm1, %%zmm30; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm13; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm14, 64(%%rcx); vfmadd231pd %%zmm15, %%zmm0, %%zmm14; shr $1, %%edx; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm15; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm16; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm17; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm18; prefetcht2 (%%r9); shr $1, %%edi; add %%r14, %%r9; " // RAM prefetch
+ "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd %%zmm21, %%zmm1, %%zmm26; shr $1, %%esi; xor %%rdi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm21; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm23; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm24, 64(%%rcx); vfmadd231pd %%zmm25, %%zmm0, %%zmm24; shr $1, %%edi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd %%zmm3, %%zmm1, %%zmm27; shr $1, %%esi; xor %%rdi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm3; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm4; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; mov %%rax, %%rbx;" // L1 load
+ "vmovapd %%zmm6, 64(%%rcx); vfmadd231pd %%zmm7, %%zmm0, %%zmm6; shr $1, %%edi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd %%zmm9, %%zmm1, %%zmm28; shr $1, %%esi; xor %%rdi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm8; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm10; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm11, 64(%%rcx); vfmadd231pd %%zmm12, %%zmm0, %%zmm11; shl $1, %%edx; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm12; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd %%zmm15, %%zmm1, %%zmm29; shr $1, %%esi; xor %%rdi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm14; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm15; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm16; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm17; prefetcht2 (%%r9); shl $1, %%edx; add %%r14, %%r9; " // RAM prefetch
+ "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd %%zmm21, %%zmm1, %%zmm30; shr $1, %%esi; xor %%rdi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm21; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm23, 64(%%rcx); vfmadd231pd %%zmm24, %%zmm0, %%zmm23; shl $1, %%edx; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd %%zmm3, %%zmm1, %%zmm26; shr $1, %%esi; xor %%rdi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm3; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm4, 64(%%rcx); vfmadd231pd %%zmm5, %%zmm0, %%zmm4; shl $1, %%esi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm6; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd %%zmm9, %%zmm1, %%zmm27; shr $1, %%esi; xor %%rdi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm8; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm10, 64(%%rcx); vfmadd231pd %%zmm11, %%zmm0, %%zmm10; shl $1, %%esi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm12; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd %%zmm15, %%zmm1, %%zmm28; shr $1, %%esi; xor %%rdi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm14; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm15; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm16; prefetcht2 (%%r9); shl $1, %%esi; add %%r14, %%r9; " // RAM prefetch
+ "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm17; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd %%zmm21, %%zmm1, %%zmm29; shr $1, %%esi; xor %%rdi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm21, 64(%%rcx); vfmadd231pd %%zmm22, %%zmm0, %%zmm21; shl $1, %%edi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm23; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd %%zmm3, %%zmm1, %%zmm30; shr $1, %%esi; xor %%rdi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm3, 64(%%rcx); vfmadd231pd %%zmm4, %%zmm0, %%zmm3; shl $1, %%edi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm4; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm6; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm7; prefetcht2 (%%r9); shr $1, %%esi; add %%r14, %%r9; " // RAM prefetch
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd %%zmm10, %%zmm1, %%zmm26; shr $1, %%edx; xor %%rsi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm10; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm12; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm13, 64(%%rcx); vfmadd231pd %%zmm14, %%zmm0, %%zmm13; shr $1, %%esi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd %%zmm16, %%zmm1, %%zmm27; shr $1, %%edx; xor %%rsi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm15; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm16; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm17; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm19, 64(%%rcx); vfmadd231pd %%zmm20, %%zmm0, %%zmm19; shr $1, %%esi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd %%zmm22, %%zmm1, %%zmm28; shr $1, %%edx; xor %%rsi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm21; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm23; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm24, 64(%%rcx); vfmadd231pd %%zmm25, %%zmm0, %%zmm24; shr $1, %%edi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm25; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd %%zmm4, %%zmm1, %%zmm29; shr $1, %%edx; xor %%rsi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm3; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm4; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm6; prefetcht2 (%%r9); shr $1, %%edi; add %%r14, %%r9; " // RAM prefetch
+ "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd %%zmm10, %%zmm1, %%zmm30; shr $1, %%edx; xor %%rsi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm10; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm12, 64(%%rcx); vfmadd231pd %%zmm13, %%zmm0, %%zmm12; shr $1, %%edi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm13; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd %%zmm16, %%zmm1, %%zmm26; shr $1, %%edx; xor %%rsi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm15; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm16; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm17, 64(%%rcx); vfmadd231pd %%zmm18, %%zmm0, %%zmm17; shl $1, %%edx; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm19; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd %%zmm22, %%zmm1, %%zmm27; shr $1, %%edx; xor %%rsi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm21; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm23, 64(%%rcx); vfmadd231pd %%zmm24, %%zmm0, %%zmm23; shl $1, %%edx; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm25; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd %%zmm4, %%zmm1, %%zmm28; shr $1, %%edx; xor %%rsi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm3; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm4; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm5; prefetcht2 (%%r9); shl $1, %%edx; add %%r14, %%r9; " // RAM prefetch
+ "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm6; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; mov %%rax, %%rbx;" // L1 load
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd %%zmm10, %%zmm1, %%zmm29; shr $1, %%edx; xor %%rsi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm9; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm10, 64(%%rcx); vfmadd231pd %%zmm11, %%zmm0, %%zmm10; shl $1, %%esi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm12; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm13; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd %%zmm16, %%zmm1, %%zmm30; shr $1, %%edx; xor %%rsi, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm15; shl $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm16, 64(%%rcx); vfmadd231pd %%zmm17, %%zmm0, %%zmm16; shl $1, %%esi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm17; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm19; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm20; prefetcht2 (%%r9); shr $1, %%edx; add %%r14, %%r9; " // RAM prefetch
+ "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd %%zmm23, %%zmm1, %%zmm26; shl $1, %%edi; xor %%rdx, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm23; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm25; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm2, 64(%%rcx); vfmadd231pd %%zmm3, %%zmm0, %%zmm2; shr $1, %%edx; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd %%zmm5, %%zmm1, %%zmm27; shl $1, %%edi; xor %%rdx, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm4; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm6; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm8, 64(%%rcx); vfmadd231pd %%zmm9, %%zmm0, %%zmm8; shr $1, %%edx; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd %%zmm11, %%zmm1, %%zmm28; shl $1, %%edi; xor %%rdx, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm10; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm13, %%zmm0, %%zmm12; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm12; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm13, 64(%%rcx); vfmadd231pd %%zmm14, %%zmm0, %%zmm13; shr $1, %%esi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm14; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd %%zmm17, %%zmm1, %%zmm29; shl $1, %%edi; xor %%rdx, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm16; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm17; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm19, %%zmm0, %%zmm18; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm18; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm19; prefetcht2 (%%r9); shr $1, %%esi; add %%r14, %%r9; " // RAM prefetch
+ "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd %%zmm23, %%zmm1, %%zmm30; shl $1, %%edi; xor %%rdx, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm24, %%zmm0, %%zmm23; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm23; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm25, 64(%%rcx); vfmadd231pd %%zmm2, %%zmm0, %%zmm25; shr $1, %%esi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd %%zmm5, %%zmm1, %%zmm26; shl $1, %%edi; xor %%rdx, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm4; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm6, %%zmm0, %%zmm5; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm5; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm6, 64(%%rcx); vfmadd231pd %%zmm7, %%zmm0, %%zmm6; shr $1, %%edi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm8; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm10, %%zmm0, %%zmm9; vfmadd231pd %%zmm11, %%zmm1, %%zmm27; shl $1, %%edi; xor %%rdx, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm11, %%zmm0, %%zmm10; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm10; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm12, %%zmm0, %%zmm11; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm11; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm12, 64(%%rcx); vfmadd231pd %%zmm13, %%zmm0, %%zmm12; shr $1, %%edi; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm14, %%zmm0, %%zmm13; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm13; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm15, %%zmm0, %%zmm14; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm14; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm16, %%zmm0, %%zmm15; vfmadd231pd %%zmm17, %%zmm1, %%zmm28; shl $1, %%edi; xor %%rdx, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm17, %%zmm0, %%zmm16; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm16; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm18, %%zmm0, %%zmm17; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm17; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd 64(%%rbx), %%zmm0, %%zmm18; prefetcht2 (%%r9); shr $1, %%edi; add %%r14, %%r9; " // RAM prefetch
+ "vfmadd231pd %%zmm20, %%zmm0, %%zmm19; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm19; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm21, %%zmm0, %%zmm20; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm20; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm22, %%zmm0, %%zmm21; vfmadd231pd %%zmm23, %%zmm1, %%zmm29; shl $1, %%edi; xor %%rdx, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm23, %%zmm0, %%zmm22; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm22; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm23, 64(%%rcx); vfmadd231pd %%zmm24, %%zmm0, %%zmm23; shl $1, %%edx; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm25, %%zmm0, %%zmm24; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm24; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm2, %%zmm0, %%zmm25; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm25; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm3, %%zmm0, %%zmm2; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm4, %%zmm0, %%zmm3; vfmadd231pd %%zmm5, %%zmm1, %%zmm30; shl $1, %%edi; xor %%rdx, %%r13; " // REG ops only
+ "vfmadd231pd %%zmm5, %%zmm0, %%zmm4; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm4; shl $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vmovapd %%zmm5, 64(%%rcx); vfmadd231pd %%zmm6, %%zmm0, %%zmm5; shl $1, %%edx; add %%r14, %%rcx; " // L2 store
+ "vfmadd231pd %%zmm7, %%zmm0, %%zmm6; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm6; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm8, %%zmm0, %%zmm7; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm7; shr $1, %%esi; add %%r14, %%rbx; " // L1 load
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm8; vfmadd231pd 64(%%rbx), %%zmm1, %%zmm8; shr $1, %%edx; add %%r14, %%rbx; " // L1 load
+ "movq %%mm0, %%r13;" // restore iteration counter
+ //reset RAM counter
+ "sub $1, %%r12;"
+ "jnz _work_no_ram_reset_knl_xeonphi_avx512_4t;"
+ "movabs $5688, %%r12;"
+ "mov %%rax, %%r9;"
+ "add $0, %%r9;"
+ "_work_no_ram_reset_knl_xeonphi_avx512_4t:"
+ "inc %%r13;" // increment iteration counter
+ //reset L2-Cache counter
+ "sub $1, %%r10;"
+ "jnz _work_no_L2_reset_knl_xeonphi_avx512_4t;"
+ "movabs $34, %%r10;"
+ "mov %%rax, %%rcx;"
+ "add $8192, %%rcx;"
+ "_work_no_L2_reset_knl_xeonphi_avx512_4t:"
+ "movq %%r13, %%mm0;" // store iteration counter
+ "mov %%rax, %%rbx;"
+ "testq $1, (%%r15);"
+ "jnz _work_loop_knl_xeonphi_avx512_4t;"
+ "movq %%mm0, %%rax;" // restore iteration counter
+ : "=a" (threaddata->iterations)
+ : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations)
+ : "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm16", "%xmm17", "%xmm18", "%xmm19", "%xmm20", "%xmm21", "%xmm22", "%xmm23", "%xmm24", "%xmm25", "%xmm26", "%xmm27", "%xmm28", "%xmm29", "%xmm30", "%xmm31"
+ );
+ return EXIT_SUCCESS;
+}
diff --git a/avx_functions.c b/avx_functions.c
index 5a4c6faa..0d1fd4b4 100644
--- a/avx_functions.c
+++ b/avx_functions.c
@@ -23,12 +23,33 @@
- int init_snb_corei_avx_1t(unsigned long long addrMem) __attribute__((noinline));
-int init_snb_corei_avx_1t(unsigned long long addrMem)
+
+ int init_snb_corei_avx_1t(threaddata_t* threaddata) __attribute__((noinline));
+int init_snb_corei_avx_1t(threaddata_t* threaddata)
{
+ unsigned long long addrMem = threaddata->addrMem;
int i;
for (i=0;i<13340672;i++) *((double*)(addrMem+8*i)) = i* 1.654738925401e-15;
+ // lines with register operations
+ threaddata->flops+=45*4; // 1 256 bit operation
+
+ // lines with L1 operations
+ threaddata->flops+=90*4; // 1 256 bit operation
+
+ // lines with L2 operations
+ threaddata->flops+=10*4; // 1 256 bit operation
+
+ // lines with L3 operations
+ threaddata->flops+=4*4; // 1 256 bit operation
+
+ // lines with RAM operations
+ threaddata->flops+=2*4; // 1 256 bit operation
+ threaddata->bytes=2*64; // 1 memory access
+
+ threaddata->flops*=10;
+ threaddata->bytes*=10;
+
return EXIT_SUCCESS;
}
@@ -39,11 +60,11 @@ int init_snb_corei_avx_1t(unsigned long long addrMem)
* @input - addrMem: pointer to buffer
* @return EXIT_SUCCESS
*/
-int asm_work_snb_corei_avx_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_snb_corei_avx_1t(unsigned long long addrMem, unsigned long long addrHigh)
+int asm_work_snb_corei_avx_1t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_snb_corei_avx_1t(threaddata_t* threaddata)
{
- if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS;
+ if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS;
/* input:
* - addrMem -> rax
* register usage:
@@ -58,11 +79,13 @@ int asm_work_snb_corei_avx_1t(unsigned long long addrMem, unsigned long long add
* - r11: temp register for initialization of SIMD-registers
* - r12: stores cacheline width as increment for buffer addresses
* - r13: stores address of shared variable that controls load level
+ * - r14: stores iteration counter
* - mm*,xmm*,ymm*: data registers for SIMD instructions
*/
__asm__ __volatile__(
- "mov %0, %%rax;" // store start address of buffer
- "mov %1, %%r13;" // store address of shared variable that controls load level
+ "mov %%rax, %%rax;" // store start address of buffer
+ "mov %%rbx, %%r13;" // store address of shared variable that controls load level
+ "mov %%rcx, %%r14;" // store iteration counter
"mov $64, %%r12;" // increment after each cache/memory access
//Initialize AVX-Registers for Addition
"vmovapd 0(%%rax), %%ymm0;"
@@ -1654,26 +1677,48 @@ int asm_work_snb_corei_avx_1t(unsigned long long addrMem, unsigned long long add
"mov %%rax, %%rdx;"
"add $262144, %%rdx;"
"_work_no_L3_reset_snb_corei_avx_1t:"
+ "inc %%r14;" // increment iteration counter
"mov %%rax, %%rbx;"
"mfence;"
- "mov (%%r13), %%r11;"
- "test $1, %%r11;"
+ "testq $1, (%%r13);"
"jnz _work_loop_snb_corei_avx_1t;"
- :
- : "r"(addrMem), "r"(addrHigh)
- : "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
+ "movq %%r14, %%rax;" // restore iteration counter
+ : "=a" (threaddata->iterations)
+ : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations)
+ : "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
);
return EXIT_SUCCESS;
}
- int init_snb_corei_avx_2t(unsigned long long addrMem) __attribute__((noinline));
-int init_snb_corei_avx_2t(unsigned long long addrMem)
+
+ int init_snb_corei_avx_2t(threaddata_t* threaddata) __attribute__((noinline));
+int init_snb_corei_avx_2t(threaddata_t* threaddata)
{
+ unsigned long long addrMem = threaddata->addrMem;
int i;
for (i=0;i<6670336;i++) *((double*)(addrMem+8*i)) = i* 1.654738925401e-15;
+ // lines with register operations
+ threaddata->flops+=45*4; // 1 256 bit operation
+
+ // lines with L1 operations
+ threaddata->flops+=90*4; // 1 256 bit operation
+
+ // lines with L2 operations
+ threaddata->flops+=10*4; // 1 256 bit operation
+
+ // lines with L3 operations
+ threaddata->flops+=4*4; // 1 256 bit operation
+
+ // lines with RAM operations
+ threaddata->flops+=2*4; // 1 256 bit operation
+ threaddata->bytes=2*64; // 1 memory access
+
+ threaddata->flops*=5;
+ threaddata->bytes*=5;
+
return EXIT_SUCCESS;
}
@@ -1684,11 +1729,11 @@ int init_snb_corei_avx_2t(unsigned long long addrMem)
* @input - addrMem: pointer to buffer
* @return EXIT_SUCCESS
*/
-int asm_work_snb_corei_avx_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_snb_corei_avx_2t(unsigned long long addrMem, unsigned long long addrHigh)
+int asm_work_snb_corei_avx_2t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_snb_corei_avx_2t(threaddata_t* threaddata)
{
- if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS;
+ if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS;
/* input:
* - addrMem -> rax
* register usage:
@@ -1703,11 +1748,13 @@ int asm_work_snb_corei_avx_2t(unsigned long long addrMem, unsigned long long add
* - r11: temp register for initialization of SIMD-registers
* - r12: stores cacheline width as increment for buffer addresses
* - r13: stores address of shared variable that controls load level
+ * - r14: stores iteration counter
* - mm*,xmm*,ymm*: data registers for SIMD instructions
*/
__asm__ __volatile__(
- "mov %0, %%rax;" // store start address of buffer
- "mov %1, %%r13;" // store address of shared variable that controls load level
+ "mov %%rax, %%rax;" // store start address of buffer
+ "mov %%rbx, %%r13;" // store address of shared variable that controls load level
+ "mov %%rcx, %%r14;" // store iteration counter
"mov $64, %%r12;" // increment after each cache/memory access
//Initialize AVX-Registers for Addition
"vmovapd 0(%%rax), %%ymm0;"
@@ -2544,26 +2591,48 @@ int asm_work_snb_corei_avx_2t(unsigned long long addrMem, unsigned long long add
"mov %%rax, %%rdx;"
"add $131072, %%rdx;"
"_work_no_L3_reset_snb_corei_avx_2t:"
+ "inc %%r14;" // increment iteration counter
"mov %%rax, %%rbx;"
"mfence;"
- "mov (%%r13), %%r11;"
- "test $1, %%r11;"
+ "testq $1, (%%r13);"
"jnz _work_loop_snb_corei_avx_2t;"
- :
- : "r"(addrMem), "r"(addrHigh)
- : "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
+ "movq %%r14, %%rax;" // restore iteration counter
+ : "=a" (threaddata->iterations)
+ : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations)
+ : "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
);
return EXIT_SUCCESS;
}
- int init_snb_xeonep_avx_1t(unsigned long long addrMem) __attribute__((noinline));
-int init_snb_xeonep_avx_1t(unsigned long long addrMem)
+
+ int init_snb_xeonep_avx_1t(threaddata_t* threaddata) __attribute__((noinline));
+int init_snb_xeonep_avx_1t(threaddata_t* threaddata)
{
+ unsigned long long addrMem = threaddata->addrMem;
int i;
for (i=0;i<13471744;i++) *((double*)(addrMem+8*i)) = i* 1.654738925401e-15;
+ // lines with register operations
+ threaddata->flops+=30*4; // 1 256 bit operation
+
+ // lines with L1 operations
+ threaddata->flops+=90*4; // 1 256 bit operation
+
+ // lines with L2 operations
+ threaddata->flops+=10*4; // 1 256 bit operation
+
+ // lines with L3 operations
+ threaddata->flops+=2*4; // 1 256 bit operation
+
+ // lines with RAM operations
+ threaddata->flops+=3*4; // 1 256 bit operation
+ threaddata->bytes=3*64; // 1 memory access
+
+ threaddata->flops*=11;
+ threaddata->bytes*=11;
+
return EXIT_SUCCESS;
}
@@ -2574,11 +2643,11 @@ int init_snb_xeonep_avx_1t(unsigned long long addrMem)
* @input - addrMem: pointer to buffer
* @return EXIT_SUCCESS
*/
-int asm_work_snb_xeonep_avx_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_snb_xeonep_avx_1t(unsigned long long addrMem, unsigned long long addrHigh)
+int asm_work_snb_xeonep_avx_1t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_snb_xeonep_avx_1t(threaddata_t* threaddata)
{
- if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS;
+ if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS;
/* input:
* - addrMem -> rax
* register usage:
@@ -2593,11 +2662,13 @@ int asm_work_snb_xeonep_avx_1t(unsigned long long addrMem, unsigned long long ad
* - r11: temp register for initialization of SIMD-registers
* - r12: stores cacheline width as increment for buffer addresses
* - r13: stores address of shared variable that controls load level
+ * - r14: stores iteration counter
* - mm*,xmm*,ymm*: data registers for SIMD instructions
*/
__asm__ __volatile__(
- "mov %0, %%rax;" // store start address of buffer
- "mov %1, %%r13;" // store address of shared variable that controls load level
+ "mov %%rax, %%rax;" // store start address of buffer
+ "mov %%rbx, %%r13;" // store address of shared variable that controls load level
+ "mov %%rcx, %%r14;" // store iteration counter
"mov $64, %%r12;" // increment after each cache/memory access
//Initialize AVX-Registers for Addition
"vmovapd 0(%%rax), %%ymm0;"
@@ -4164,26 +4235,48 @@ int asm_work_snb_xeonep_avx_1t(unsigned long long addrMem, unsigned long long ad
"mov %%rax, %%rdx;"
"add $262144, %%rdx;"
"_work_no_L3_reset_snb_xeonep_avx_1t:"
+ "inc %%r14;" // increment iteration counter
"mov %%rax, %%rbx;"
"mfence;"
- "mov (%%r13), %%r11;"
- "test $1, %%r11;"
+ "testq $1, (%%r13);"
"jnz _work_loop_snb_xeonep_avx_1t;"
- :
- : "r"(addrMem), "r"(addrHigh)
- : "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
+ "movq %%r14, %%rax;" // restore iteration counter
+ : "=a" (threaddata->iterations)
+ : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations)
+ : "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
);
return EXIT_SUCCESS;
}
- int init_snb_xeonep_avx_2t(unsigned long long addrMem) __attribute__((noinline));
-int init_snb_xeonep_avx_2t(unsigned long long addrMem)
+
+ int init_snb_xeonep_avx_2t(threaddata_t* threaddata) __attribute__((noinline));
+int init_snb_xeonep_avx_2t(threaddata_t* threaddata)
{
+ unsigned long long addrMem = threaddata->addrMem;
int i;
for (i=0;i<6735872;i++) *((double*)(addrMem+8*i)) = i* 1.654738925401e-15;
+ // lines with register operations
+ threaddata->flops+=30*4; // 1 256 bit operation
+
+ // lines with L1 operations
+ threaddata->flops+=90*4; // 1 256 bit operation
+
+ // lines with L2 operations
+ threaddata->flops+=10*4; // 1 256 bit operation
+
+ // lines with L3 operations
+ threaddata->flops+=2*4; // 1 256 bit operation
+
+ // lines with RAM operations
+ threaddata->flops+=3*4; // 1 256 bit operation
+ threaddata->bytes=3*64; // 1 memory access
+
+ threaddata->flops*=5;
+ threaddata->bytes*=5;
+
return EXIT_SUCCESS;
}
@@ -4194,11 +4287,11 @@ int init_snb_xeonep_avx_2t(unsigned long long addrMem)
* @input - addrMem: pointer to buffer
* @return EXIT_SUCCESS
*/
-int asm_work_snb_xeonep_avx_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_snb_xeonep_avx_2t(unsigned long long addrMem, unsigned long long addrHigh)
+int asm_work_snb_xeonep_avx_2t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_snb_xeonep_avx_2t(threaddata_t* threaddata)
{
- if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS;
+ if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS;
/* input:
* - addrMem -> rax
* register usage:
@@ -4213,11 +4306,13 @@ int asm_work_snb_xeonep_avx_2t(unsigned long long addrMem, unsigned long long ad
* - r11: temp register for initialization of SIMD-registers
* - r12: stores cacheline width as increment for buffer addresses
* - r13: stores address of shared variable that controls load level
+ * - r14: stores iteration counter
* - mm*,xmm*,ymm*: data registers for SIMD instructions
*/
__asm__ __volatile__(
- "mov %0, %%rax;" // store start address of buffer
- "mov %1, %%r13;" // store address of shared variable that controls load level
+ "mov %%rax, %%rax;" // store start address of buffer
+ "mov %%rbx, %%r13;" // store address of shared variable that controls load level
+ "mov %%rcx, %%r14;" // store iteration counter
"mov $64, %%r12;" // increment after each cache/memory access
//Initialize AVX-Registers for Addition
"vmovapd 0(%%rax), %%ymm0;"
@@ -4974,14 +5069,15 @@ int asm_work_snb_xeonep_avx_2t(unsigned long long addrMem, unsigned long long ad
"mov %%rax, %%rdx;"
"add $131072, %%rdx;"
"_work_no_L3_reset_snb_xeonep_avx_2t:"
+ "inc %%r14;" // increment iteration counter
"mov %%rax, %%rbx;"
"mfence;"
- "mov (%%r13), %%r11;"
- "test $1, %%r11;"
+ "testq $1, (%%r13);"
"jnz _work_loop_snb_xeonep_avx_2t;"
- :
- : "r"(addrMem), "r"(addrHigh)
- : "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
+ "movq %%r14, %%rax;" // restore iteration counter
+ : "=a" (threaddata->iterations)
+ : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations)
+ : "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
);
return EXIT_SUCCESS;
}
diff --git a/cpu.h b/cpu.h
index a8045448..d38105fc 100644
--- a/cpu.h
+++ b/cpu.h
@@ -81,6 +81,7 @@
#define FMA4 0x00010000
#define FMA 0x00020000
#define AES 0x00040000
+#define AVX512 0x00080000
#define MAX_CACHELEVELS 3
diff --git a/firestarter_global.h b/firestarter_global.h
index cec4ab86..85f7ad1d 100644
--- a/firestarter_global.h
+++ b/firestarter_global.h
@@ -25,9 +25,9 @@
/* current version */
#define VERSION_MAJOR 1
-#define VERSION_MINOR 4
+#define VERSION_MINOR 5
#define VERSION_INFO ""
-#define BUILDDATE "2016-04-08"
+#define BUILDDATE "2016-11-11"
#define COPYRIGHT_YEAR 2016
#if (defined(linux) || defined(__linux__)) && defined (AFFINITY)
@@ -84,7 +84,7 @@ typedef struct mydata
{
struct threaddata *threaddata;
cpu_info_t *cpuinfo;
- int *thread_comm;
+ int *thread_comm;
volatile unsigned int ack;
unsigned int num_threads;
} mydata_t;
@@ -96,7 +96,12 @@ typedef struct threaddata
char* bufferMem;
unsigned long long addrMem;
unsigned long long addrHigh;
- unsigned long long buffersizeMem;
+ unsigned long long buffersizeMem;
+ unsigned long long iterations;
+ unsigned long long flops;
+ unsigned long long bytes;
+ unsigned long long start_tsc;
+ unsigned long long stop_tsc;
unsigned int alignment;
unsigned int cpu_id;
unsigned int thread_id;
diff --git a/fma4_functions.c b/fma4_functions.c
index 1bfaddc3..472ede26 100644
--- a/fma4_functions.c
+++ b/fma4_functions.c
@@ -23,12 +23,33 @@
- int init_bld_opteron_fma4_1t(unsigned long long addrMem) __attribute__((noinline));
-int init_bld_opteron_fma4_1t(unsigned long long addrMem)
+
+ int init_bld_opteron_fma4_1t(threaddata_t* threaddata) __attribute__((noinline));
+int init_bld_opteron_fma4_1t(threaddata_t* threaddata)
{
+ unsigned long long addrMem = threaddata->addrMem;
int i;
for (i=0;i<13338624;i++) *((double*)(addrMem+8*i)) = 0.25 + (double)(i%9267) * 0.24738995982e-4;
+ // lines with register operations
+ threaddata->flops+=45*8; // 2 128 bit FMA operations
+
+ // lines with L1 operations
+ threaddata->flops+=90*12; // 1 128 and 1 256 bit FMA operation
+
+ // lines with L2 operations
+ threaddata->flops+=5*4; // 1 128 bit FMA operation
+
+ // lines with L3 operations
+ threaddata->flops+=1*4; // 1 128 bit FMA operation
+
+ // lines with RAM operations
+ threaddata->flops+=1*8; // 2 128 bit FMA operations
+ threaddata->bytes=1*64; // 1 memory access
+
+ threaddata->flops*=10;
+ threaddata->bytes*=10;
+
return EXIT_SUCCESS;
}
@@ -39,10 +60,10 @@ int init_bld_opteron_fma4_1t(unsigned long long addrMem)
* @input - addrMem: pointer to buffer
* @return EXIT_SUCCESS
*/
-int asm_work_bld_opteron_fma4_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_bld_opteron_fma4_1t(unsigned long long addrMem, unsigned long long addrHigh)
+int asm_work_bld_opteron_fma4_1t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_bld_opteron_fma4_1t(threaddata_t* threaddata)
{
- if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS;
+ if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS;
/* input:
* - addrMem -> rax
* register usage:
@@ -57,12 +78,14 @@ int asm_work_bld_opteron_fma4_1t(unsigned long long addrMem, unsigned long long
* - r13: register for temporary results
* - r14: stores cacheline width as increment for buffer addresses
* - r15: stores address of shared variable that controls load level
+ * - mm0: stores iteration counter
* - rdx, rsi, rdi: registers for shift operations
- * - mm*,xmm*,xmm*: data registers for SIMD instructions
+ * - xmm*,ymm*: data registers for SIMD instructions
*/
__asm__ __volatile__(
- "mov %0, %%rax;" // store start address of buffer
- "mov %1, %%r15;" // store address of shared variable that controls load level
+ "mov %%rax, %%rax;" // store start address of buffer
+ "mov %%rbx, %%r15;" // store address of shared variable that controls load level
+ "movq %%rcx, %%mm0;" // store iteration counter
"mov $64, %%r14;" // increment after each cache/memory access
//Initialize registers for shift operations
"mov $0xAAAAAAAA, %%edi;"
@@ -1518,6 +1541,7 @@ int asm_work_bld_opteron_fma4_1t(unsigned long long addrMem, unsigned long long
"vfmaddpd %%xmm8, %%xmm0, %%xmm7, %%xmm7; vfmaddpd %%xmm9, %%xmm1, %%xmm13, %%xmm13; shl $1, %%esi; xor %%rdi, %%r13; " // REG ops only
"vfmaddpd %%xmm9, %%xmm0, %%xmm8, %%xmm8; vfmaddpd 32(%%rbx), %%ymm1, %%ymm8, %%ymm8; shl $1, %%edx; add %%r14, %%rbx; " // L1 load
"vfmaddpd %%xmm10, %%xmm0, %%xmm9, %%xmm9; vfmaddpd 32(%%rbx), %%ymm1, %%ymm9, %%ymm9; shr $1, %%edi; add %%r14, %%rbx; " // L1 load
+ "movq %%mm0, %%r13;" // restore iteration counter
//reset RAM counter
"sub $1, %%r12;"
"jnz _work_no_ram_reset_bld_opteron_fma4_1t;"
@@ -1525,6 +1549,7 @@ int asm_work_bld_opteron_fma4_1t(unsigned long long addrMem, unsigned long long
"mov %%rax, %%r9;"
"add $786432, %%r9;"
"_work_no_ram_reset_bld_opteron_fma4_1t:"
+ "inc %%r13;" // increment iteration counter
//reset L2-Cache counter
"sub $1, %%r10;"
"jnz _work_no_L2_reset_bld_opteron_fma4_1t;"
@@ -1532,6 +1557,7 @@ int asm_work_bld_opteron_fma4_1t(unsigned long long addrMem, unsigned long long
"mov %%rax, %%rcx;"
"add $16384, %%rcx;"
"_work_no_L2_reset_bld_opteron_fma4_1t:"
+ "movq %%r13, %%mm0;" // store iteration counter
//reset L3-Cache counter
"sub $1, %%r11;"
"jnz _work_no_L3_reset_bld_opteron_fma4_1t;"
@@ -1540,12 +1566,12 @@ int asm_work_bld_opteron_fma4_1t(unsigned long long addrMem, unsigned long long
"add $1048576, %%r8;"
"_work_no_L3_reset_bld_opteron_fma4_1t:"
"mov %%rax, %%rbx;"
- "mov (%%r15), %%r13;"
- "test $1, %%r13;"
+ "testq $1, (%%r15);"
"jnz _work_loop_bld_opteron_fma4_1t;"
- :
- : "a"(addrMem), "b"(addrHigh)
- : "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
+ "movq %%mm0, %%rax;" // restore iteration counter
+ : "=a" (threaddata->iterations)
+ : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations)
+ : "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
);
return EXIT_SUCCESS;
}
diff --git a/fma_functions.c b/fma_functions.c
index 8edf27df..39451246 100644
--- a/fma_functions.c
+++ b/fma_functions.c
@@ -23,12 +23,33 @@
- int init_skl_corei_fma_1t(unsigned long long addrMem) __attribute__((noinline));
-int init_skl_corei_fma_1t(unsigned long long addrMem)
+
+ int init_skl_corei_fma_1t(threaddata_t* threaddata) __attribute__((noinline));
+int init_skl_corei_fma_1t(threaddata_t* threaddata)
{
+ unsigned long long addrMem = threaddata->addrMem;
int i;
for (i=0;i<13340672;i++) *((double*)(addrMem+8*i)) = 0.25 + (double)(i%9267) * 0.24738995982e-4;
+ // lines with register operations
+ threaddata->flops+=40*16; // 2 256 bit FMA operations
+
+ // lines with L1 operations
+ threaddata->flops+=78*16; // 2 256 bit FMA operations
+
+ // lines with L2 operations
+ threaddata->flops+=18*8; // 1 256 bit FMA operation
+
+ // lines with L3 operations
+ threaddata->flops+=5*8; // 1 256 bit FMA operation
+
+ // lines with RAM operations
+ threaddata->flops+=3*16; // 2 256 bit FMA operations
+ threaddata->bytes=3*64; // 1 memory access
+
+ threaddata->flops*=10;
+ threaddata->bytes*=10;
+
return EXIT_SUCCESS;
}
@@ -39,10 +60,10 @@ int init_skl_corei_fma_1t(unsigned long long addrMem)
* @input - addrMem: pointer to buffer
* @return EXIT_SUCCESS
*/
-int asm_work_skl_corei_fma_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_skl_corei_fma_1t(unsigned long long addrMem, unsigned long long addrHigh)
+int asm_work_skl_corei_fma_1t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_skl_corei_fma_1t(threaddata_t* threaddata)
{
- if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS;
+ if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS;
/* input:
* - addrMem -> rax
* register usage:
@@ -57,12 +78,14 @@ int asm_work_skl_corei_fma_1t(unsigned long long addrMem, unsigned long long add
* - r13: register for temporary results
* - r14: stores cacheline width as increment for buffer addresses
* - r15: stores address of shared variable that controls load level
+ * - mm0: stores iteration counter
* - rdx, rsi, rdi: registers for shift operations
- * - mm*,xmm*,ymm*: data registers for SIMD instructions
+ * - xmm*,ymm*: data registers for SIMD instructions
*/
__asm__ __volatile__(
- "mov %0, %%rax;" // store start address of buffer
- "mov %1, %%r15;" // store address of shared variable that controls load level
+ "mov %%rax, %%rax;" // store start address of buffer
+ "mov %%rbx, %%r15;" // store address of shared variable that controls load level
+ "movq %%rcx, %%mm0;" // store iteration counter
"mov $64, %%r14;" // increment after each cache/memory access
//Initialize registers for shift operations
"mov $0xAAAAAAAA, %%edi;"
@@ -1538,6 +1561,7 @@ int asm_work_skl_corei_fma_1t(unsigned long long addrMem, unsigned long long add
"vfmadd231pd %%ymm10, %%ymm0, %%ymm9; vfmadd231pd %%ymm2, %%ymm1, %%ymm11; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only
"vfmadd231pd 64(%%rbx), %%ymm0, %%ymm10; vfmadd231pd 96(%%rbx), %%ymm1, %%ymm12; vmovapd %%ymm10, 32(%%rbx); add $128, %%rbx; " // 2 L1 loads, L1 store
"vfmadd231pd 64(%%rbx), %%ymm0, %%ymm2; vfmadd231pd 96(%%rbx), %%ymm1, %%ymm12; vmovapd %%ymm2, 32(%%rbx); add $128, %%rbx; " // 2 L1 loads, L1 store
+ "movq %%mm0, %%r13;" // restore iteration counter
//reset RAM counter
"sub $1, %%r12;"
"jnz _work_no_ram_reset_skl_corei_fma_1t;"
@@ -1545,6 +1569,7 @@ int asm_work_skl_corei_fma_1t(unsigned long long addrMem, unsigned long long add
"mov %%rax, %%r9;"
"add $1572864, %%r9;"
"_work_no_ram_reset_skl_corei_fma_1t:"
+ "inc %%r13;" // increment iteration counter
//reset L2-Cache counter
"sub $1, %%r10;"
"jnz _work_no_L2_reset_skl_corei_fma_1t;"
@@ -1552,6 +1577,7 @@ int asm_work_skl_corei_fma_1t(unsigned long long addrMem, unsigned long long add
"mov %%rax, %%rcx;"
"add $32768, %%rcx;"
"_work_no_L2_reset_skl_corei_fma_1t:"
+ "movq %%r13, %%mm0;" // store iteration counter
//reset L3-Cache counter
"sub $1, %%r11;"
"jnz _work_no_L3_reset_skl_corei_fma_1t;"
@@ -1560,24 +1586,45 @@ int asm_work_skl_corei_fma_1t(unsigned long long addrMem, unsigned long long add
"add $262144, %%r8;"
"_work_no_L3_reset_skl_corei_fma_1t:"
"mov %%rax, %%rbx;"
- "mov (%%r15), %%r13;"
- "test $1, %%r13;"
+ "testq $1, (%%r15);"
"jnz _work_loop_skl_corei_fma_1t;"
- :
- : "a"(addrMem), "b"(addrHigh)
- : "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
+ "movq %%mm0, %%rax;" // restore iteration counter
+ : "=a" (threaddata->iterations)
+ : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations)
+ : "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
);
return EXIT_SUCCESS;
}
- int init_skl_corei_fma_2t(unsigned long long addrMem) __attribute__((noinline));
-int init_skl_corei_fma_2t(unsigned long long addrMem)
+
+ int init_skl_corei_fma_2t(threaddata_t* threaddata) __attribute__((noinline));
+int init_skl_corei_fma_2t(threaddata_t* threaddata)
{
+ unsigned long long addrMem = threaddata->addrMem;
int i;
for (i=0;i<6670336;i++) *((double*)(addrMem+8*i)) = 0.25 + (double)(i%9267) * 0.24738995982e-4;
+ // lines with register operations
+ threaddata->flops+=40*16; // 2 256 bit FMA operations
+
+ // lines with L1 operations
+ threaddata->flops+=78*16; // 2 256 bit FMA operations
+
+ // lines with L2 operations
+ threaddata->flops+=18*8; // 1 256 bit FMA operation
+
+ // lines with L3 operations
+ threaddata->flops+=5*8; // 1 256 bit FMA operation
+
+ // lines with RAM operations
+ threaddata->flops+=3*16; // 2 256 bit FMA operations
+ threaddata->bytes=3*64; // 1 memory access
+
+ threaddata->flops*=5;
+ threaddata->bytes*=5;
+
return EXIT_SUCCESS;
}
@@ -1588,10 +1635,10 @@ int init_skl_corei_fma_2t(unsigned long long addrMem)
* @input - addrMem: pointer to buffer
* @return EXIT_SUCCESS
*/
-int asm_work_skl_corei_fma_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_skl_corei_fma_2t(unsigned long long addrMem, unsigned long long addrHigh)
+int asm_work_skl_corei_fma_2t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_skl_corei_fma_2t(threaddata_t* threaddata)
{
- if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS;
+ if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS;
/* input:
* - addrMem -> rax
* register usage:
@@ -1606,12 +1653,14 @@ int asm_work_skl_corei_fma_2t(unsigned long long addrMem, unsigned long long add
* - r13: register for temporary results
* - r14: stores cacheline width as increment for buffer addresses
* - r15: stores address of shared variable that controls load level
+ * - mm0: stores iteration counter
* - rdx, rsi, rdi: registers for shift operations
- * - mm*,xmm*,ymm*: data registers for SIMD instructions
+ * - xmm*,ymm*: data registers for SIMD instructions
*/
__asm__ __volatile__(
- "mov %0, %%rax;" // store start address of buffer
- "mov %1, %%r15;" // store address of shared variable that controls load level
+ "mov %%rax, %%rax;" // store start address of buffer
+ "mov %%rbx, %%r15;" // store address of shared variable that controls load level
+ "movq %%rcx, %%mm0;" // store iteration counter
"mov $64, %%r14;" // increment after each cache/memory access
//Initialize registers for shift operations
"mov $0xAAAAAAAA, %%edi;"
@@ -2367,6 +2416,7 @@ int asm_work_skl_corei_fma_2t(unsigned long long addrMem, unsigned long long add
"vfmadd231pd %%ymm10, %%ymm0, %%ymm9; vfmadd231pd %%ymm2, %%ymm1, %%ymm12; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only
"vfmadd231pd 64(%%rbx), %%ymm0, %%ymm10; vfmadd231pd 96(%%rbx), %%ymm1, %%ymm13; vmovapd %%ymm10, 32(%%rbx); add $128, %%rbx; " // 2 L1 loads, L1 store
"vfmadd231pd 64(%%rbx), %%ymm0, %%ymm2; vfmadd231pd 96(%%rbx), %%ymm1, %%ymm13; vmovapd %%ymm2, 32(%%rbx); add $128, %%rbx; " // 2 L1 loads, L1 store
+ "movq %%mm0, %%r13;" // restore iteration counter
//reset RAM counter
"sub $1, %%r12;"
"jnz _work_no_ram_reset_skl_corei_fma_2t;"
@@ -2374,6 +2424,7 @@ int asm_work_skl_corei_fma_2t(unsigned long long addrMem, unsigned long long add
"mov %%rax, %%r9;"
"add $786432, %%r9;"
"_work_no_ram_reset_skl_corei_fma_2t:"
+ "inc %%r13;" // increment iteration counter
//reset L2-Cache counter
"sub $1, %%r10;"
"jnz _work_no_L2_reset_skl_corei_fma_2t;"
@@ -2381,6 +2432,7 @@ int asm_work_skl_corei_fma_2t(unsigned long long addrMem, unsigned long long add
"mov %%rax, %%rcx;"
"add $16384, %%rcx;"
"_work_no_L2_reset_skl_corei_fma_2t:"
+ "movq %%r13, %%mm0;" // store iteration counter
//reset L3-Cache counter
"sub $1, %%r11;"
"jnz _work_no_L3_reset_skl_corei_fma_2t;"
@@ -2389,24 +2441,45 @@ int asm_work_skl_corei_fma_2t(unsigned long long addrMem, unsigned long long add
"add $131072, %%r8;"
"_work_no_L3_reset_skl_corei_fma_2t:"
"mov %%rax, %%rbx;"
- "mov (%%r15), %%r13;"
- "test $1, %%r13;"
+ "testq $1, (%%r15);"
"jnz _work_loop_skl_corei_fma_2t;"
- :
- : "a"(addrMem), "b"(addrHigh)
- : "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
+ "movq %%mm0, %%rax;" // restore iteration counter
+ : "=a" (threaddata->iterations)
+ : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations)
+ : "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
);
return EXIT_SUCCESS;
}
- int init_hsw_corei_fma_1t(unsigned long long addrMem) __attribute__((noinline));
-int init_hsw_corei_fma_1t(unsigned long long addrMem)
+
+ int init_hsw_corei_fma_1t(threaddata_t* threaddata) __attribute__((noinline));
+int init_hsw_corei_fma_1t(threaddata_t* threaddata)
{
+ unsigned long long addrMem = threaddata->addrMem;
int i;
for (i=0;i<13340672;i++) *((double*)(addrMem+8*i)) = 0.25 + (double)(i%9267) * 0.24738995982e-4;
+ // lines with register operations
+ threaddata->flops+=40*16; // 2 256 bit FMA operations
+
+ // lines with L1 operations
+ threaddata->flops+=90*8; // 1 256 bit FMA operation
+
+ // lines with L2 operations
+ threaddata->flops+=9*8; // 1 256 bit FMA operation
+
+ // lines with L3 operations
+ threaddata->flops+=3*8; // 1 256 bit FMA operation
+
+ // lines with RAM operations
+ threaddata->flops+=2*16; // 2 256 bit FMA operations
+ threaddata->bytes=2*64; // 1 memory access
+
+ threaddata->flops*=10;
+ threaddata->bytes*=10;
+
return EXIT_SUCCESS;
}
@@ -2417,10 +2490,10 @@ int init_hsw_corei_fma_1t(unsigned long long addrMem)
* @input - addrMem: pointer to buffer
* @return EXIT_SUCCESS
*/
-int asm_work_hsw_corei_fma_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_hsw_corei_fma_1t(unsigned long long addrMem, unsigned long long addrHigh)
+int asm_work_hsw_corei_fma_1t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_hsw_corei_fma_1t(threaddata_t* threaddata)
{
- if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS;
+ if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS;
/* input:
* - addrMem -> rax
* register usage:
@@ -2435,12 +2508,14 @@ int asm_work_hsw_corei_fma_1t(unsigned long long addrMem, unsigned long long add
* - r13: register for temporary results
* - r14: stores cacheline width as increment for buffer addresses
* - r15: stores address of shared variable that controls load level
+ * - mm0: stores iteration counter
* - rdx, rsi, rdi: registers for shift operations
- * - mm*,xmm*,ymm*: data registers for SIMD instructions
+ * - xmm*,ymm*: data registers for SIMD instructions
*/
__asm__ __volatile__(
- "mov %0, %%rax;" // store start address of buffer
- "mov %1, %%r15;" // store address of shared variable that controls load level
+ "mov %%rax, %%rax;" // store start address of buffer
+ "mov %%rbx, %%r15;" // store address of shared variable that controls load level
+ "movq %%rcx, %%mm0;" // store iteration counter
"mov $64, %%r14;" // increment after each cache/memory access
//Initialize registers for shift operations
"mov $0xAAAAAAAA, %%edi;"
@@ -3916,6 +3991,7 @@ int asm_work_hsw_corei_fma_1t(unsigned long long addrMem, unsigned long long add
"vfmadd231pd %%ymm10, %%ymm0, %%ymm9; vfmadd231pd %%ymm2, %%ymm1, %%ymm11; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only
"vmovapd %%xmm10, 64(%%rbx); vfmadd231pd 32(%%rbx), %%ymm0, %%ymm10; shr $1, %%esi; add %%r14, %%rbx; " // L1 load, L1 store
"vmovapd %%xmm2, 64(%%rbx); vfmadd231pd 32(%%rbx), %%ymm0, %%ymm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load, L1 store
+ "movq %%mm0, %%r13;" // restore iteration counter
//reset RAM counter
"sub $1, %%r12;"
"jnz _work_no_ram_reset_hsw_corei_fma_1t;"
@@ -3923,6 +3999,7 @@ int asm_work_hsw_corei_fma_1t(unsigned long long addrMem, unsigned long long add
"mov %%rax, %%r9;"
"add $1572864, %%r9;"
"_work_no_ram_reset_hsw_corei_fma_1t:"
+ "inc %%r13;" // increment iteration counter
//reset L2-Cache counter
"sub $1, %%r10;"
"jnz _work_no_L2_reset_hsw_corei_fma_1t;"
@@ -3930,6 +4007,7 @@ int asm_work_hsw_corei_fma_1t(unsigned long long addrMem, unsigned long long add
"mov %%rax, %%rcx;"
"add $32768, %%rcx;"
"_work_no_L2_reset_hsw_corei_fma_1t:"
+ "movq %%r13, %%mm0;" // store iteration counter
//reset L3-Cache counter
"sub $1, %%r11;"
"jnz _work_no_L3_reset_hsw_corei_fma_1t;"
@@ -3938,24 +4016,45 @@ int asm_work_hsw_corei_fma_1t(unsigned long long addrMem, unsigned long long add
"add $262144, %%r8;"
"_work_no_L3_reset_hsw_corei_fma_1t:"
"mov %%rax, %%rbx;"
- "mov (%%r15), %%r13;"
- "test $1, %%r13;"
+ "testq $1, (%%r15);"
"jnz _work_loop_hsw_corei_fma_1t;"
- :
- : "a"(addrMem), "b"(addrHigh)
- : "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
+ "movq %%mm0, %%rax;" // restore iteration counter
+ : "=a" (threaddata->iterations)
+ : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations)
+ : "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
);
return EXIT_SUCCESS;
}
- int init_hsw_corei_fma_2t(unsigned long long addrMem) __attribute__((noinline));
-int init_hsw_corei_fma_2t(unsigned long long addrMem)
+
+ int init_hsw_corei_fma_2t(threaddata_t* threaddata) __attribute__((noinline));
+int init_hsw_corei_fma_2t(threaddata_t* threaddata)
{
+ unsigned long long addrMem = threaddata->addrMem;
int i;
for (i=0;i<6670336;i++) *((double*)(addrMem+8*i)) = 0.25 + (double)(i%9267) * 0.24738995982e-4;
+ // lines with register operations
+ threaddata->flops+=40*16; // 2 256 bit FMA operations
+
+ // lines with L1 operations
+ threaddata->flops+=90*8; // 1 256 bit FMA operation
+
+ // lines with L2 operations
+ threaddata->flops+=9*8; // 1 256 bit FMA operation
+
+ // lines with L3 operations
+ threaddata->flops+=3*8; // 1 256 bit FMA operation
+
+ // lines with RAM operations
+ threaddata->flops+=2*16; // 2 256 bit FMA operations
+ threaddata->bytes=2*64; // 1 memory access
+
+ threaddata->flops*=5;
+ threaddata->bytes*=5;
+
return EXIT_SUCCESS;
}
@@ -3966,10 +4065,10 @@ int init_hsw_corei_fma_2t(unsigned long long addrMem)
* @input - addrMem: pointer to buffer
* @return EXIT_SUCCESS
*/
-int asm_work_hsw_corei_fma_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_hsw_corei_fma_2t(unsigned long long addrMem, unsigned long long addrHigh)
+int asm_work_hsw_corei_fma_2t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_hsw_corei_fma_2t(threaddata_t* threaddata)
{
- if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS;
+ if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS;
/* input:
* - addrMem -> rax
* register usage:
@@ -3984,12 +4083,14 @@ int asm_work_hsw_corei_fma_2t(unsigned long long addrMem, unsigned long long add
* - r13: register for temporary results
* - r14: stores cacheline width as increment for buffer addresses
* - r15: stores address of shared variable that controls load level
+ * - mm0: stores iteration counter
* - rdx, rsi, rdi: registers for shift operations
- * - mm*,xmm*,ymm*: data registers for SIMD instructions
+ * - xmm*,ymm*: data registers for SIMD instructions
*/
__asm__ __volatile__(
- "mov %0, %%rax;" // store start address of buffer
- "mov %1, %%r15;" // store address of shared variable that controls load level
+ "mov %%rax, %%rax;" // store start address of buffer
+ "mov %%rbx, %%r15;" // store address of shared variable that controls load level
+ "movq %%rcx, %%mm0;" // store iteration counter
"mov $64, %%r14;" // increment after each cache/memory access
//Initialize registers for shift operations
"mov $0xAAAAAAAA, %%edi;"
@@ -4745,6 +4846,7 @@ int asm_work_hsw_corei_fma_2t(unsigned long long addrMem, unsigned long long add
"vfmadd231pd %%ymm10, %%ymm0, %%ymm9; vfmadd231pd %%ymm2, %%ymm1, %%ymm12; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only
"vmovapd %%xmm10, 64(%%rbx); vfmadd231pd 32(%%rbx), %%ymm0, %%ymm10; shr $1, %%esi; add %%r14, %%rbx; " // L1 load, L1 store
"vmovapd %%xmm2, 64(%%rbx); vfmadd231pd 32(%%rbx), %%ymm0, %%ymm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load, L1 store
+ "movq %%mm0, %%r13;" // restore iteration counter
//reset RAM counter
"sub $1, %%r12;"
"jnz _work_no_ram_reset_hsw_corei_fma_2t;"
@@ -4752,6 +4854,7 @@ int asm_work_hsw_corei_fma_2t(unsigned long long addrMem, unsigned long long add
"mov %%rax, %%r9;"
"add $786432, %%r9;"
"_work_no_ram_reset_hsw_corei_fma_2t:"
+ "inc %%r13;" // increment iteration counter
//reset L2-Cache counter
"sub $1, %%r10;"
"jnz _work_no_L2_reset_hsw_corei_fma_2t;"
@@ -4759,6 +4862,7 @@ int asm_work_hsw_corei_fma_2t(unsigned long long addrMem, unsigned long long add
"mov %%rax, %%rcx;"
"add $16384, %%rcx;"
"_work_no_L2_reset_hsw_corei_fma_2t:"
+ "movq %%r13, %%mm0;" // store iteration counter
//reset L3-Cache counter
"sub $1, %%r11;"
"jnz _work_no_L3_reset_hsw_corei_fma_2t;"
@@ -4767,24 +4871,45 @@ int asm_work_hsw_corei_fma_2t(unsigned long long addrMem, unsigned long long add
"add $131072, %%r8;"
"_work_no_L3_reset_hsw_corei_fma_2t:"
"mov %%rax, %%rbx;"
- "mov (%%r15), %%r13;"
- "test $1, %%r13;"
+ "testq $1, (%%r15);"
"jnz _work_loop_hsw_corei_fma_2t;"
- :
- : "a"(addrMem), "b"(addrHigh)
- : "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
+ "movq %%mm0, %%rax;" // restore iteration counter
+ : "=a" (threaddata->iterations)
+ : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations)
+ : "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
);
return EXIT_SUCCESS;
}
- int init_hsw_xeonep_fma_1t(unsigned long long addrMem) __attribute__((noinline));
-int init_hsw_xeonep_fma_1t(unsigned long long addrMem)
+
+ int init_hsw_xeonep_fma_1t(threaddata_t* threaddata) __attribute__((noinline));
+int init_hsw_xeonep_fma_1t(threaddata_t* threaddata)
{
+ unsigned long long addrMem = threaddata->addrMem;
int i;
for (i=0;i<13471744;i++) *((double*)(addrMem+8*i)) = 0.25 + (double)(i%9267) * 0.24738995982e-4;
+ // lines with register operations
+ threaddata->flops+=35*16; // 2 256 bit FMA operations
+
+ // lines with L1 operations
+ threaddata->flops+=79*8; // 1 256 bit FMA operation
+
+ // lines with L2 operations
+ threaddata->flops+=9*8; // 1 256 bit FMA operation
+
+ // lines with L3 operations
+ threaddata->flops+=1*8; // 1 256 bit FMA operation
+
+ // lines with RAM operations
+ threaddata->flops+=2*16; // 2 256 bit FMA operations
+ threaddata->bytes=2*64; // 1 memory access
+
+ threaddata->flops*=12;
+ threaddata->bytes*=12;
+
return EXIT_SUCCESS;
}
@@ -4795,10 +4920,10 @@ int init_hsw_xeonep_fma_1t(unsigned long long addrMem)
* @input - addrMem: pointer to buffer
* @return EXIT_SUCCESS
*/
-int asm_work_hsw_xeonep_fma_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_hsw_xeonep_fma_1t(unsigned long long addrMem, unsigned long long addrHigh)
+int asm_work_hsw_xeonep_fma_1t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_hsw_xeonep_fma_1t(threaddata_t* threaddata)
{
- if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS;
+ if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS;
/* input:
* - addrMem -> rax
* register usage:
@@ -4813,12 +4938,14 @@ int asm_work_hsw_xeonep_fma_1t(unsigned long long addrMem, unsigned long long ad
* - r13: register for temporary results
* - r14: stores cacheline width as increment for buffer addresses
* - r15: stores address of shared variable that controls load level
+ * - mm0: stores iteration counter
* - rdx, rsi, rdi: registers for shift operations
- * - mm*,xmm*,ymm*: data registers for SIMD instructions
+ * - xmm*,ymm*: data registers for SIMD instructions
*/
__asm__ __volatile__(
- "mov %0, %%rax;" // store start address of buffer
- "mov %1, %%r15;" // store address of shared variable that controls load level
+ "mov %%rax, %%rax;" // store start address of buffer
+ "mov %%rbx, %%r15;" // store address of shared variable that controls load level
+ "movq %%rcx, %%mm0;" // store iteration counter
"mov $64, %%r14;" // increment after each cache/memory access
//Initialize registers for shift operations
"mov $0xAAAAAAAA, %%edi;"
@@ -6366,6 +6493,7 @@ int asm_work_hsw_xeonep_fma_1t(unsigned long long addrMem, unsigned long long ad
"vfmadd231pd %%ymm10, %%ymm0, %%ymm9; vfmadd231pd %%ymm2, %%ymm1, %%ymm13; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only
"vmovapd %%xmm10, 64(%%rbx); vfmadd231pd 32(%%rbx), %%ymm0, %%ymm10; shr $1, %%esi; add %%r14, %%rbx; " // L1 load, L1 store
"vmovapd %%xmm2, 64(%%rbx); vfmadd231pd 32(%%rbx), %%ymm0, %%ymm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load, L1 store
+ "movq %%mm0, %%r13;" // restore iteration counter
//reset RAM counter
"sub $1, %%r12;"
"jnz _work_no_ram_reset_hsw_xeonep_fma_1t;"
@@ -6373,6 +6501,7 @@ int asm_work_hsw_xeonep_fma_1t(unsigned long long addrMem, unsigned long long ad
"mov %%rax, %%r9;"
"add $2621440, %%r9;"
"_work_no_ram_reset_hsw_xeonep_fma_1t:"
+ "inc %%r13;" // increment iteration counter
//reset L2-Cache counter
"sub $1, %%r10;"
"jnz _work_no_L2_reset_hsw_xeonep_fma_1t;"
@@ -6380,6 +6509,7 @@ int asm_work_hsw_xeonep_fma_1t(unsigned long long addrMem, unsigned long long ad
"mov %%rax, %%rcx;"
"add $32768, %%rcx;"
"_work_no_L2_reset_hsw_xeonep_fma_1t:"
+ "movq %%r13, %%mm0;" // store iteration counter
//reset L3-Cache counter
"sub $1, %%r11;"
"jnz _work_no_L3_reset_hsw_xeonep_fma_1t;"
@@ -6388,24 +6518,45 @@ int asm_work_hsw_xeonep_fma_1t(unsigned long long addrMem, unsigned long long ad
"add $262144, %%r8;"
"_work_no_L3_reset_hsw_xeonep_fma_1t:"
"mov %%rax, %%rbx;"
- "mov (%%r15), %%r13;"
- "test $1, %%r13;"
+ "testq $1, (%%r15);"
"jnz _work_loop_hsw_xeonep_fma_1t;"
- :
- : "a"(addrMem), "b"(addrHigh)
- : "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
+ "movq %%mm0, %%rax;" // restore iteration counter
+ : "=a" (threaddata->iterations)
+ : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations)
+ : "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
);
return EXIT_SUCCESS;
}
- int init_hsw_xeonep_fma_2t(unsigned long long addrMem) __attribute__((noinline));
-int init_hsw_xeonep_fma_2t(unsigned long long addrMem)
+
+ int init_hsw_xeonep_fma_2t(threaddata_t* threaddata) __attribute__((noinline));
+int init_hsw_xeonep_fma_2t(threaddata_t* threaddata)
{
+ unsigned long long addrMem = threaddata->addrMem;
int i;
for (i=0;i<6735872;i++) *((double*)(addrMem+8*i)) = 0.25 + (double)(i%9267) * 0.24738995982e-4;
+ // lines with register operations
+ threaddata->flops+=35*16; // 2 256 bit FMA operations
+
+ // lines with L1 operations
+ threaddata->flops+=79*8; // 1 256 bit FMA operation
+
+ // lines with L2 operations
+ threaddata->flops+=9*8; // 1 256 bit FMA operation
+
+ // lines with L3 operations
+ threaddata->flops+=1*8; // 1 256 bit FMA operation
+
+ // lines with RAM operations
+ threaddata->flops+=2*16; // 2 256 bit FMA operations
+ threaddata->bytes=2*64; // 1 memory access
+
+ threaddata->flops*=6;
+ threaddata->bytes*=6;
+
return EXIT_SUCCESS;
}
@@ -6416,10 +6567,10 @@ int init_hsw_xeonep_fma_2t(unsigned long long addrMem)
* @input - addrMem: pointer to buffer
* @return EXIT_SUCCESS
*/
-int asm_work_hsw_xeonep_fma_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_hsw_xeonep_fma_2t(unsigned long long addrMem, unsigned long long addrHigh)
+int asm_work_hsw_xeonep_fma_2t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_hsw_xeonep_fma_2t(threaddata_t* threaddata)
{
- if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS;
+ if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS;
/* input:
* - addrMem -> rax
* register usage:
@@ -6434,12 +6585,14 @@ int asm_work_hsw_xeonep_fma_2t(unsigned long long addrMem, unsigned long long ad
* - r13: register for temporary results
* - r14: stores cacheline width as increment for buffer addresses
* - r15: stores address of shared variable that controls load level
+ * - mm0: stores iteration counter
* - rdx, rsi, rdi: registers for shift operations
- * - mm*,xmm*,ymm*: data registers for SIMD instructions
+ * - xmm*,ymm*: data registers for SIMD instructions
*/
__asm__ __volatile__(
- "mov %0, %%rax;" // store start address of buffer
- "mov %1, %%r15;" // store address of shared variable that controls load level
+ "mov %%rax, %%rax;" // store start address of buffer
+ "mov %%rbx, %%r15;" // store address of shared variable that controls load level
+ "movq %%rcx, %%mm0;" // store iteration counter
"mov $64, %%r14;" // increment after each cache/memory access
//Initialize registers for shift operations
"mov $0xAAAAAAAA, %%edi;"
@@ -7231,6 +7384,7 @@ int asm_work_hsw_xeonep_fma_2t(unsigned long long addrMem, unsigned long long ad
"vfmadd231pd %%ymm10, %%ymm0, %%ymm9; vfmadd231pd %%ymm2, %%ymm1, %%ymm13; shr $1, %%edi; xor %%rdx, %%r13; " // REG ops only
"vmovapd %%xmm10, 64(%%rbx); vfmadd231pd 32(%%rbx), %%ymm0, %%ymm10; shr $1, %%esi; add %%r14, %%rbx; " // L1 load, L1 store
"vmovapd %%xmm2, 64(%%rbx); vfmadd231pd 32(%%rbx), %%ymm0, %%ymm2; shr $1, %%edx; add %%r14, %%rbx; " // L1 load, L1 store
+ "movq %%mm0, %%r13;" // restore iteration counter
//reset RAM counter
"sub $1, %%r12;"
"jnz _work_no_ram_reset_hsw_xeonep_fma_2t;"
@@ -7238,6 +7392,7 @@ int asm_work_hsw_xeonep_fma_2t(unsigned long long addrMem, unsigned long long ad
"mov %%rax, %%r9;"
"add $1310720, %%r9;"
"_work_no_ram_reset_hsw_xeonep_fma_2t:"
+ "inc %%r13;" // increment iteration counter
//reset L2-Cache counter
"sub $1, %%r10;"
"jnz _work_no_L2_reset_hsw_xeonep_fma_2t;"
@@ -7245,6 +7400,7 @@ int asm_work_hsw_xeonep_fma_2t(unsigned long long addrMem, unsigned long long ad
"mov %%rax, %%rcx;"
"add $16384, %%rcx;"
"_work_no_L2_reset_hsw_xeonep_fma_2t:"
+ "movq %%r13, %%mm0;" // store iteration counter
//reset L3-Cache counter
"sub $1, %%r11;"
"jnz _work_no_L3_reset_hsw_xeonep_fma_2t;"
@@ -7253,12 +7409,12 @@ int asm_work_hsw_xeonep_fma_2t(unsigned long long addrMem, unsigned long long ad
"add $131072, %%r8;"
"_work_no_L3_reset_hsw_xeonep_fma_2t:"
"mov %%rax, %%rbx;"
- "mov (%%r15), %%r13;"
- "test $1, %%r13;"
+ "testq $1, (%%r15);"
"jnz _work_loop_hsw_xeonep_fma_2t;"
- :
- : "a"(addrMem), "b"(addrHigh)
- : "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
+ "movq %%mm0, %%rax;" // restore iteration counter
+ : "=a" (threaddata->iterations)
+ : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations)
+ : "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%rdx", "%rsi", "%rdi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
);
return EXIT_SUCCESS;
}
diff --git a/generic.c b/generic.c
index 7c3ed22f..f0efc62e 100644
--- a/generic.c
+++ b/generic.c
@@ -50,11 +50,14 @@
/* use old memcpy to avoid GLIBC 2.14 dependency */
__asm__(".symver memcpy, memcpy@GLIBC_2.2.5");
- /* buffer for some generic implementations */
+/* buffer for some generic implementations */
// TODO remove global variables to allow thread safe execution of detection
static char output[_HW_DETECT_MAX_OUTPUT];
static char path[_HW_DETECT_MAX_OUTPUT];
+/* avoid multiple executions of the corresponding functions */
+static int num_packages_sav = 0, num_cores_per_package_sav = 0, num_threads_per_core_sav = 0, num_threads_per_package_sav = 0;
+
/**
* list element for counting unique package_ids, core_ids etc.
*/
@@ -309,28 +312,22 @@ void generic_get_architecture(char* arch)
*/
int get_pkg(int cpu)
{
- int pkg=0;
- char buffer[_HW_DETECT_MAX_OUTPUT];
-
- if ((num_cpus() == 1) || (num_packages() == 1)) { return 0; }
+ int pkg=-1;
+ char buffer[10];
if (cpu == -1) { cpu = get_cpu(); }
if (cpu != -1)
{
-
sprintf(path, "/sys/devices/system/cpu/cpu%i/topology/physical_package_id", cpu);
+ if( read_file(path, buffer, sizeof(buffer)) ) pkg = atoi(buffer);
- if( !read_file(path, buffer, sizeof(buffer)) ) {
- pkg = -1;
- }
- else {
- pkg = atoi(buffer);
- }
-
+ /* fallbacks if sysfs is not working */
if (pkg == -1)
{
+ /* assume 0 if there is only one CPU or only one package */
+ if ((num_cpus() == 1) || (num_packages() == 1)) { return 0; }
/* get the physical package id from /proc/cpuinfo */
- if(!get_proc_cpuinfo_data("physical id", buffer, cpu)) { pkg = atoi(buffer); }
+ else if(!get_proc_cpuinfo_data("physical id", buffer, cpu)) { pkg = atoi(buffer); }
/* if the number of cpus equals the number of packages assume pkg_id = cpu_id*/
else if (num_cpus() == num_packages()) { pkg = cpu; }
/* if there is only one core per package assume pkg_id = core_id */
@@ -342,6 +339,7 @@ int get_pkg(int cpu)
without correct topology information in sysfs*/
}
}
+
return pkg;
}
@@ -353,28 +351,25 @@ int get_core_id(int cpu)
int core=-1;
char buffer[10];
- if (num_cpus() == 1) { return 0; }
-
if (cpu == -1) { cpu = get_cpu(); }
if (cpu != -1)
{
sprintf(path, "/sys/devices/system/cpu/cpu%i/topology/core_id", cpu);
+ if(read_file(path, buffer, sizeof(buffer))) core = atoi(buffer);
- if(!read_file(path, buffer, sizeof(buffer))) {
- core = -1;
- }
- else {
- core = atoi(buffer);
+ /* fallbacks if sysfs is not working */
+ if (core == -1)
+ {
+ /* assume 0 if there is only one CPU */
+ if (num_cpus() == 1) { return 0; }
+ /* if each package contains only one cpu assume core_id = package_id = cpu_id */
+ else if (num_cores_per_package() == 1) { core = 0; }
+
+ /* NOTE core_id can't be determined without correct topology information in sysfs if there are multiple cores per package
+ TODO /proc/cpuinfo */
}
}
- if (core == -1)
- {
- /* if each package contains only one cpu assume core_id = package_id = cpu_id */
- if (num_cores_per_package() == 1) { core = 0; }
- /* NOTE core_id can't be determined without correct topology information in sysfs if there are multiple cores per package
- TODO /proc/cpuinfo */
- }
return core;
}
@@ -464,7 +459,7 @@ void init_cpuinfo(cpu_info_t *cpuinfo,int print)
cpuinfo->clockrate = get_cpu_clockrate(1, 0);
/* setup supported feature list*/
- if(!strcmp(cpuinfo->architecture,"x86_64")) cpuinfo->features |=X86_64;
+ if(!strcmp(cpuinfo->architecture,"x86_64")) cpuinfo->features |= X86_64;
if (feature_available("SMT")) cpuinfo->features |= SMT;
if (feature_available("FPU")) cpuinfo->features |= FPU;
if (feature_available("MMX")) cpuinfo->features |= MMX;
@@ -479,8 +474,10 @@ void init_cpuinfo(cpu_info_t *cpuinfo,int print)
if (feature_available("ABM")) cpuinfo->features |= ABM;
if (feature_available("POPCNT")) cpuinfo->features |= POPCNT;
if (feature_available("AVX")) cpuinfo->features |= AVX;
+ if (feature_available("AVX2")) cpuinfo->features |= AVX2;
if (feature_available("FMA")) cpuinfo->features |= FMA;
if (feature_available("AES")) cpuinfo->features |= AES;
+ if (feature_available("AVX512")) cpuinfo->features |= AVX512;
/* determine cache details */
for (i=0; i<(unsigned int)num_caches(0); i++)
@@ -543,6 +540,8 @@ void init_cpuinfo(cpu_info_t *cpuinfo,int print)
if(cpuinfo->features&SSE4A) printf(" SSE4A");
if(cpuinfo->features&POPCNT) printf(" POPCNT");
if(cpuinfo->features&AVX) printf(" AVX");
+ if(cpuinfo->features&AVX2) printf(" AVX2");
+ if(cpuinfo->features&AVX512) printf(" AVX512");
if(cpuinfo->features&FMA) printf(" FMA");
if(cpuinfo->features&AES) printf(" AES");
if(cpuinfo->features&SMT) printf(" SMT");
@@ -1020,11 +1019,13 @@ int generic_cacheline_length(int cpu, int id) {
int generic_num_packages()
{
struct dirent **namelist;
- int ndir, m, num = -1;
+ int ndir, m;
char tmppath[_HW_DETECT_MAX_OUTPUT];
char buf[20];
id_le * pkg_id_list = NULL;
+ if (num_packages_sav != 0) return num_packages_sav;
+ num_packages_sav = -1;
strcpy(path, "/sys/devices/system/cpu/");
ndir = scandir(path, &namelist, 0, 0);
@@ -1049,20 +1050,23 @@ int generic_num_packages()
free(namelist[ndir]);
}
free(namelist);
- num = id_total_count(pkg_id_list);
+ num_packages_sav = id_total_count(pkg_id_list);
free_id_list(&pkg_id_list);
}
- return num;
+ return num_packages_sav;
}
int generic_num_cores_per_package()
{
struct dirent **namelist;
- int ndir, m, n, num = 0, pkg_id_tocount = -1;
+ int ndir, m, n, pkg_id_tocount = -1;
char tmppath[_HW_DETECT_MAX_OUTPUT];
char buf[20];
id_le *core_id_list = NULL;
+ if (num_cores_per_package_sav != 0) return num_cores_per_package_sav;
+ num_cores_per_package_sav=-1;
+
strcpy(path, "/sys/devices/system/cpu/");
ndir = scandir(path, &namelist, 0, 0);
if(ndir >= 0)
@@ -1099,22 +1103,25 @@ int generic_num_cores_per_package()
free(namelist[ndir]);
}
free(namelist);
- num = id_total_count(core_id_list);
+ num_cores_per_package_sav = id_total_count(core_id_list);
free_id_list(&core_id_list);
}
- else return -1;
+ else num_cores_per_package_sav = -1;
- if (num==0) return -1;
- return num;
+ if (num_cores_per_package_sav == 0) num_cores_per_package_sav = -1;
+
+ return num_cores_per_package_sav;
}
int generic_num_threads_per_core()
{
struct dirent **namelist;
- int ndir, m, n, num = 0, pkg_id_tocount = -1, core_id_tocount = -1;
+ int ndir, m, n, pkg_id_tocount = -1, core_id_tocount = -1;
char tmppath[_HW_DETECT_MAX_OUTPUT];
char buf[20];
+ if (num_threads_per_core_sav != 0) return num_threads_per_core_sav;
+
strcpy(path, "/sys/devices/system/cpu/");
ndir = scandir(path, &namelist, 0, 0);
if(ndir >= 0)
@@ -1144,7 +1151,7 @@ int generic_num_threads_per_core()
if(m == core_id_tocount && n == pkg_id_tocount) /*FIXME: only counts threads from the first core_id and package_id that are found, assumes that every core has the same amount of threads*/
{
- num++;
+ num_threads_per_core_sav++;
}
}
}
@@ -1152,23 +1159,23 @@ int generic_num_threads_per_core()
}
free(namelist);
}
- else return -1;
+ else num_threads_per_core_sav = -1;
- if (num == 0) num = generic_num_threads_per_package() / generic_num_cores_per_package();
- if (num != generic_num_threads_per_package() / generic_num_cores_per_package()) return -1;
+ if (num_threads_per_core_sav == 0) num_threads_per_core_sav = generic_num_threads_per_package() / generic_num_cores_per_package();
+ if (num_threads_per_core_sav != generic_num_threads_per_package() / generic_num_cores_per_package()) num_threads_per_core_sav = -1;
- return num;
+ return num_threads_per_core_sav;
}
int generic_num_threads_per_package()
{
struct dirent **namelist;
- int ndir, m, num = 0, pkg_id_tocount = -1;
+ int ndir, m, pkg_id_tocount = -1;
char tmppath[_HW_DETECT_MAX_OUTPUT];
char buf[20];
- /*TODO proc/cpuinfo*/
+ if (num_threads_per_package_sav != 0) return num_threads_per_package_sav;
strcpy(path, "/sys/devices/system/cpu/");
ndir = scandir(path, &namelist, 0, 0);
@@ -1192,7 +1199,7 @@ int generic_num_threads_per_package()
if(m == pkg_id_tocount) /*FIXME: only counts threads from first package_id that is found and assumes that every package has the same amount of threads*/
{
- num++;
+ num_threads_per_package_sav++;
}
}
}
@@ -1200,20 +1207,21 @@ int generic_num_threads_per_package()
}
free(namelist);
}
- else return -1;
+ else num_threads_per_package_sav = -1;
- if (num == 0) return -1;
- return num;
+ if (num_threads_per_package_sav == 0) num_threads_per_package_sav = -1;
+
+ return num_threads_per_package_sav;
}
/* see cpu.h */
#if defined (__ARCH_UNKNOWN)
- /*
- * use generic implementations for unknown architectures
- */
+/*
+ * use generic implementations for unknown architectures
+ */
- void get_architecture(char * arch) {
+void get_architecture(char * arch) {
generic_get_architecture(arch);
}
diff --git a/help.c b/help.c
index c2254618..40b9073f 100644
--- a/help.c
+++ b/help.c
@@ -42,8 +42,10 @@ void show_help(void)
" -c | --copyright display copyright information\n"
" -w | --warranty display warranty information\n"
" -q | --quiet disable output to stdout\n"
+ " -r | --report display additional information (overridden by -q)\n"
" -a | --avail list available functions\n"
- " -i ID | --function=ID specify ID of the load-function to be used\n"
+ " -i ID | --function=ID specify integer ID of the load-function to be\n"
+ " used (as listed by --avail)\n"
#ifdef CUDA
" -f | --usegpufloat use single precision matrix multiplications instead of double\n"
" -g | --gpus number of gpus to use (default: all)\n"
@@ -61,10 +63,13 @@ void show_help(void)
" high load is defined by -l\n"
" will be overwriten if used with -s, -e and -n\n"
" -n COUNT | --threads=COUNT specify the number of threads\n"
+ " cannot be combined with -b | --bind, which\n"
+ " implicitly specifies the number of threads\n"
#if (defined(linux) || defined(__linux__)) && defined (AFFINITY)
- " -b CPULIST | --bind=CPULIST select certain CPUs (overrides -n)\n"
+ " -b CPULIST | --bind=CPULIST select certain CPUs\n"
" CPULIST format: \"x,y,z\", \"x-y\", \"x-y/step\",\n"
" and any combination of the above\n"
+ " cannot be combined with -n | --threads\n"
#endif
"\n"
"\nExamples:\n\n"
diff --git a/main.c b/main.c
index 5e9da302..50db75c1 100644
--- a/main.c
+++ b/main.c
@@ -48,6 +48,26 @@
#include "gpu.h"
#endif
+/*
+ * used for --bind option
+ */
+#define ADD_CPU_SET(cpu,cpuset) \
+do { \
+ if (cpu_allowed(cpu)) { \
+ CPU_SET(cpu, &cpuset); \
+ } else { \
+ if (cpu >= num_cpus() ) { \
+ fprintf( stderr, "Error: The given bind argument (-b/--bind) includes CPU %d that is not available on this system.\n",cpu ); \
+ } \
+ else { \
+ fprintf( stderr, "Error: The given bind argument (-b/--bind) cannot be implemented with the cpuset given from the OS\n" ); \
+ fprintf( stderr, "This can be caused by the taskset tool, cgroups, the batch system, or similar mechanisms.\n" ); \
+ fprintf( stderr, "Please fix the argument to match the restrictions.\n" ); \
+ } \
+ exit( EACCES ); \
+ } \
+} while (0)
+
mydata_t *mdp; /* global data structure */
cpu_info_t *cpuinfo = NULL; /* data structure for hardware detection */
unsigned long long LOADVAR = LOAD_HIGH; /* shared variable that specifies load level */
@@ -78,6 +98,11 @@ long long unsigned int STARTPERIOD = 0, ENDPERIOD = 0, NUMPERIODSTEPS = 0;
*/
char *fsbind = NULL;
+/*
+ * temporary variables
+ */
+int tmp1,tmp2;
+
/*
* worker threads
*/
@@ -126,6 +151,23 @@ static void *init()
exit(127);
}
+ if (verbose) {
+ printf(" using %i threads\n", NUM_THREADS);
+ #if (defined(linux) || defined(__linux__)) && defined (AFFINITY)
+ for (i = 0; i < NUM_THREADS; i++){
+ /* avoid multiple sysfs accesses */
+ tmp1=get_core_id(cpu_bind[i]);
+ tmp2=get_pkg(cpu_bind[i]);
+ if ((tmp1 != -1) && (tmp2 != -1)){
+ printf(" - Thread %i runs on CPU %llu, core %i in package: %i\n",
+ i, cpu_bind[i], tmp1, tmp2);
+ }
+ }
+ #endif
+ printf("\n");
+ fflush(stdout);
+ }
+
// create worker threads
for (t = 0; t < NUM_THREADS; t++) {
mdp->ack = 0;
@@ -133,6 +175,9 @@ static void *init()
mdp->threaddata[t].cpu_id = cpu_bind[t];
mdp->threaddata[t].data = mdp;
mdp->threaddata[t].buffersizeMem = BUFFERSIZEMEM;
+ mdp->threaddata[t].iterations = 0;
+ mdp->threaddata[t].flops = 0;
+ mdp->threaddata[t].bytes = 0;
mdp->threaddata[t].alignment = ALIGNMENT;
mdp->threaddata[t].FUNCTION = FUNCTION;
mdp->threaddata[t].period = PERIOD;
@@ -159,20 +204,6 @@ static void *init()
}
mdp->ack = 0;
- if (verbose) {
- printf(" using %i threads\n", NUM_THREADS);
- #if (defined(linux) || defined(__linux__)) && defined (AFFINITY)
- for (i = 0; i < NUM_THREADS; i++){
- if ((get_pkg(cpu_bind[i]) != -1) && (get_core_id(cpu_bind[i]) != -1)){
- printf(" - Thread %i runs on CPU %llu, core %i in package: %i\n",
- i, cpu_bind[i], get_core_id(cpu_bind[i]),get_pkg(cpu_bind[i]));
- }
- }
- #endif
- printf("\n");
- fflush(stdout);
- }
-
return (void *) mdp;
}
@@ -183,36 +214,38 @@ static void list_functions(){
printf("\n available load-functions:\n");
printf(" ID | NAME | available on this system\n");
printf(" ----------------------------------------------------------------\n");
- if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","1","FUNC_SKL_COREI_FMA_1T ");
- else printf(" %4.4s | %.30s | no\n","1","FUNC_SKL_COREI_FMA_1T ");
- if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","2","FUNC_SKL_COREI_FMA_2T ");
- else printf(" %4.4s | %.30s | no\n","2","FUNC_SKL_COREI_FMA_2T ");
- if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","3","FUNC_HSW_COREI_FMA_1T ");
- else printf(" %4.4s | %.30s | no\n","3","FUNC_HSW_COREI_FMA_1T ");
- if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","4","FUNC_HSW_COREI_FMA_2T ");
- else printf(" %4.4s | %.30s | no\n","4","FUNC_HSW_COREI_FMA_2T ");
- if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","5","FUNC_HSW_XEONEP_FMA_1T ");
- else printf(" %4.4s | %.30s | no\n","5","FUNC_HSW_XEONEP_FMA_1T ");
- if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","6","FUNC_HSW_XEONEP_FMA_2T ");
- else printf(" %4.4s | %.30s | no\n","6","FUNC_HSW_XEONEP_FMA_2T ");
- if (feature_available("AVX")) printf(" %4.4s | %.30s | yes\n","7","FUNC_SNB_COREI_AVX_1T ");
- else printf(" %4.4s | %.30s | no\n","7","FUNC_SNB_COREI_AVX_1T ");
- if (feature_available("AVX")) printf(" %4.4s | %.30s | yes\n","8","FUNC_SNB_COREI_AVX_2T ");
- else printf(" %4.4s | %.30s | no\n","8","FUNC_SNB_COREI_AVX_2T ");
- if (feature_available("AVX")) printf(" %4.4s | %.30s | yes\n","9","FUNC_SNB_XEONEP_AVX_1T ");
- else printf(" %4.4s | %.30s | no\n","9","FUNC_SNB_XEONEP_AVX_1T ");
- if (feature_available("AVX")) printf(" %4.4s | %.30s | yes\n","10","FUNC_SNB_XEONEP_AVX_2T ");
- else printf(" %4.4s | %.30s | no\n","10","FUNC_SNB_XEONEP_AVX_2T ");
- if (feature_available("SSE2")) printf(" %4.4s | %.30s | yes\n","11","FUNC_NHM_COREI_SSE2_1T ");
- else printf(" %4.4s | %.30s | no\n","11","FUNC_NHM_COREI_SSE2_1T ");
- if (feature_available("SSE2")) printf(" %4.4s | %.30s | yes\n","12","FUNC_NHM_COREI_SSE2_2T ");
- else printf(" %4.4s | %.30s | no\n","12","FUNC_NHM_COREI_SSE2_2T ");
- if (feature_available("SSE2")) printf(" %4.4s | %.30s | yes\n","13","FUNC_NHM_XEONEP_SSE2_1T ");
- else printf(" %4.4s | %.30s | no\n","13","FUNC_NHM_XEONEP_SSE2_1T ");
- if (feature_available("SSE2")) printf(" %4.4s | %.30s | yes\n","14","FUNC_NHM_XEONEP_SSE2_2T ");
- else printf(" %4.4s | %.30s | no\n","14","FUNC_NHM_XEONEP_SSE2_2T ");
- if (feature_available("FMA4")) printf(" %4.4s | %.30s | yes\n","15","FUNC_BLD_OPTERON_FMA4_1T ");
- else printf(" %4.4s | %.30s | no\n","15","FUNC_BLD_OPTERON_FMA4_1T ");
+ if (feature_available("AVX512")) printf(" %4.4s | %.30s | yes\n","1","FUNC_KNL_XEONPHI_AVX512_4T ");
+ else printf(" %4.4s | %.30s | no\n","1","FUNC_KNL_XEONPHI_AVX512_4T ");
+ if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","2","FUNC_SKL_COREI_FMA_1T ");
+ else printf(" %4.4s | %.30s | no\n","2","FUNC_SKL_COREI_FMA_1T ");
+ if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","3","FUNC_SKL_COREI_FMA_2T ");
+ else printf(" %4.4s | %.30s | no\n","3","FUNC_SKL_COREI_FMA_2T ");
+ if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","4","FUNC_HSW_COREI_FMA_1T ");
+ else printf(" %4.4s | %.30s | no\n","4","FUNC_HSW_COREI_FMA_1T ");
+ if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","5","FUNC_HSW_COREI_FMA_2T ");
+ else printf(" %4.4s | %.30s | no\n","5","FUNC_HSW_COREI_FMA_2T ");
+ if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","6","FUNC_HSW_XEONEP_FMA_1T ");
+ else printf(" %4.4s | %.30s | no\n","6","FUNC_HSW_XEONEP_FMA_1T ");
+ if (feature_available("FMA")) printf(" %4.4s | %.30s | yes\n","7","FUNC_HSW_XEONEP_FMA_2T ");
+ else printf(" %4.4s | %.30s | no\n","7","FUNC_HSW_XEONEP_FMA_2T ");
+ if (feature_available("AVX")) printf(" %4.4s | %.30s | yes\n","8","FUNC_SNB_COREI_AVX_1T ");
+ else printf(" %4.4s | %.30s | no\n","8","FUNC_SNB_COREI_AVX_1T ");
+ if (feature_available("AVX")) printf(" %4.4s | %.30s | yes\n","9","FUNC_SNB_COREI_AVX_2T ");
+ else printf(" %4.4s | %.30s | no\n","9","FUNC_SNB_COREI_AVX_2T ");
+ if (feature_available("AVX")) printf(" %4.4s | %.30s | yes\n","10","FUNC_SNB_XEONEP_AVX_1T ");
+ else printf(" %4.4s | %.30s | no\n","10","FUNC_SNB_XEONEP_AVX_1T ");
+ if (feature_available("AVX")) printf(" %4.4s | %.30s | yes\n","11","FUNC_SNB_XEONEP_AVX_2T ");
+ else printf(" %4.4s | %.30s | no\n","11","FUNC_SNB_XEONEP_AVX_2T ");
+ if (feature_available("SSE2")) printf(" %4.4s | %.30s | yes\n","12","FUNC_NHM_COREI_SSE2_1T ");
+ else printf(" %4.4s | %.30s | no\n","12","FUNC_NHM_COREI_SSE2_1T ");
+ if (feature_available("SSE2")) printf(" %4.4s | %.30s | yes\n","13","FUNC_NHM_COREI_SSE2_2T ");
+ else printf(" %4.4s | %.30s | no\n","13","FUNC_NHM_COREI_SSE2_2T ");
+ if (feature_available("SSE2")) printf(" %4.4s | %.30s | yes\n","14","FUNC_NHM_XEONEP_SSE2_1T ");
+ else printf(" %4.4s | %.30s | no\n","14","FUNC_NHM_XEONEP_SSE2_1T ");
+ if (feature_available("SSE2")) printf(" %4.4s | %.30s | yes\n","15","FUNC_NHM_XEONEP_SSE2_2T ");
+ else printf(" %4.4s | %.30s | no\n","15","FUNC_NHM_XEONEP_SSE2_2T ");
+ if (feature_available("FMA4")) printf(" %4.4s | %.30s | yes\n","16","FUNC_BLD_OPTERON_FMA4_1T ");
+ else printf(" %4.4s | %.30s | no\n","16","FUNC_BLD_OPTERON_FMA4_1T ");
return;
}
@@ -222,93 +255,99 @@ static int get_function(unsigned int id){
switch(id){
case 1:
- if (feature_available("FMA")) func = FUNC_SKL_COREI_FMA_1T;
+ if (feature_available("AVX512")) func = FUNC_KNL_XEONPHI_AVX512_4T;
else{
- fprintf(stderr, "\nError: Function 1 (\"FUNC_SKL_COREI_FMA_1T\") requires FMA, which is not supported by the processor.\n\n");
+ fprintf(stderr, "\nError: Function 1 (\"FUNC_KNL_XEONPHI_AVX512_4T\") requires AVX512, which is not supported by the processor.\n\n");
}
break;
case 2:
- if (feature_available("FMA")) func = FUNC_SKL_COREI_FMA_2T;
+ if (feature_available("FMA")) func = FUNC_SKL_COREI_FMA_1T;
else{
- fprintf(stderr, "\nError: Function 2 (\"FUNC_SKL_COREI_FMA_2T\") requires FMA, which is not supported by the processor.\n\n");
+ fprintf(stderr, "\nError: Function 2 (\"FUNC_SKL_COREI_FMA_1T\") requires FMA, which is not supported by the processor.\n\n");
}
break;
case 3:
- if (feature_available("FMA")) func = FUNC_HSW_COREI_FMA_1T;
+ if (feature_available("FMA")) func = FUNC_SKL_COREI_FMA_2T;
else{
- fprintf(stderr, "\nError: Function 3 (\"FUNC_HSW_COREI_FMA_1T\") requires FMA, which is not supported by the processor.\n\n");
+ fprintf(stderr, "\nError: Function 3 (\"FUNC_SKL_COREI_FMA_2T\") requires FMA, which is not supported by the processor.\n\n");
}
break;
case 4:
- if (feature_available("FMA")) func = FUNC_HSW_COREI_FMA_2T;
+ if (feature_available("FMA")) func = FUNC_HSW_COREI_FMA_1T;
else{
- fprintf(stderr, "\nError: Function 4 (\"FUNC_HSW_COREI_FMA_2T\") requires FMA, which is not supported by the processor.\n\n");
+ fprintf(stderr, "\nError: Function 4 (\"FUNC_HSW_COREI_FMA_1T\") requires FMA, which is not supported by the processor.\n\n");
}
break;
case 5:
- if (feature_available("FMA")) func = FUNC_HSW_XEONEP_FMA_1T;
+ if (feature_available("FMA")) func = FUNC_HSW_COREI_FMA_2T;
else{
- fprintf(stderr, "\nError: Function 5 (\"FUNC_HSW_XEONEP_FMA_1T\") requires FMA, which is not supported by the processor.\n\n");
+ fprintf(stderr, "\nError: Function 5 (\"FUNC_HSW_COREI_FMA_2T\") requires FMA, which is not supported by the processor.\n\n");
}
break;
case 6:
- if (feature_available("FMA")) func = FUNC_HSW_XEONEP_FMA_2T;
+ if (feature_available("FMA")) func = FUNC_HSW_XEONEP_FMA_1T;
else{
- fprintf(stderr, "\nError: Function 6 (\"FUNC_HSW_XEONEP_FMA_2T\") requires FMA, which is not supported by the processor.\n\n");
+ fprintf(stderr, "\nError: Function 6 (\"FUNC_HSW_XEONEP_FMA_1T\") requires FMA, which is not supported by the processor.\n\n");
}
break;
case 7:
- if (feature_available("AVX")) func = FUNC_SNB_COREI_AVX_1T;
+ if (feature_available("FMA")) func = FUNC_HSW_XEONEP_FMA_2T;
else{
- fprintf(stderr, "\nError: Function 7 (\"FUNC_SNB_COREI_AVX_1T\") requires AVX, which is not supported by the processor.\n\n");
+ fprintf(stderr, "\nError: Function 7 (\"FUNC_HSW_XEONEP_FMA_2T\") requires FMA, which is not supported by the processor.\n\n");
}
break;
case 8:
- if (feature_available("AVX")) func = FUNC_SNB_COREI_AVX_2T;
+ if (feature_available("AVX")) func = FUNC_SNB_COREI_AVX_1T;
else{
- fprintf(stderr, "\nError: Function 8 (\"FUNC_SNB_COREI_AVX_2T\") requires AVX, which is not supported by the processor.\n\n");
+ fprintf(stderr, "\nError: Function 8 (\"FUNC_SNB_COREI_AVX_1T\") requires AVX, which is not supported by the processor.\n\n");
}
break;
case 9:
- if (feature_available("AVX")) func = FUNC_SNB_XEONEP_AVX_1T;
+ if (feature_available("AVX")) func = FUNC_SNB_COREI_AVX_2T;
else{
- fprintf(stderr, "\nError: Function 9 (\"FUNC_SNB_XEONEP_AVX_1T\") requires AVX, which is not supported by the processor.\n\n");
+ fprintf(stderr, "\nError: Function 9 (\"FUNC_SNB_COREI_AVX_2T\") requires AVX, which is not supported by the processor.\n\n");
}
break;
case 10:
- if (feature_available("AVX")) func = FUNC_SNB_XEONEP_AVX_2T;
+ if (feature_available("AVX")) func = FUNC_SNB_XEONEP_AVX_1T;
else{
- fprintf(stderr, "\nError: Function 10 (\"FUNC_SNB_XEONEP_AVX_2T\") requires AVX, which is not supported by the processor.\n\n");
+ fprintf(stderr, "\nError: Function 10 (\"FUNC_SNB_XEONEP_AVX_1T\") requires AVX, which is not supported by the processor.\n\n");
}
break;
case 11:
- if (feature_available("SSE2")) func = FUNC_NHM_COREI_SSE2_1T;
+ if (feature_available("AVX")) func = FUNC_SNB_XEONEP_AVX_2T;
else{
- fprintf(stderr, "\nError: Function 11 (\"FUNC_NHM_COREI_SSE2_1T\") requires SSE2, which is not supported by the processor.\n\n");
+ fprintf(stderr, "\nError: Function 11 (\"FUNC_SNB_XEONEP_AVX_2T\") requires AVX, which is not supported by the processor.\n\n");
}
break;
case 12:
- if (feature_available("SSE2")) func = FUNC_NHM_COREI_SSE2_2T;
+ if (feature_available("SSE2")) func = FUNC_NHM_COREI_SSE2_1T;
else{
- fprintf(stderr, "\nError: Function 12 (\"FUNC_NHM_COREI_SSE2_2T\") requires SSE2, which is not supported by the processor.\n\n");
+ fprintf(stderr, "\nError: Function 12 (\"FUNC_NHM_COREI_SSE2_1T\") requires SSE2, which is not supported by the processor.\n\n");
}
break;
case 13:
- if (feature_available("SSE2")) func = FUNC_NHM_XEONEP_SSE2_1T;
+ if (feature_available("SSE2")) func = FUNC_NHM_COREI_SSE2_2T;
else{
- fprintf(stderr, "\nError: Function 13 (\"FUNC_NHM_XEONEP_SSE2_1T\") requires SSE2, which is not supported by the processor.\n\n");
+ fprintf(stderr, "\nError: Function 13 (\"FUNC_NHM_COREI_SSE2_2T\") requires SSE2, which is not supported by the processor.\n\n");
}
break;
case 14:
- if (feature_available("SSE2")) func = FUNC_NHM_XEONEP_SSE2_2T;
+ if (feature_available("SSE2")) func = FUNC_NHM_XEONEP_SSE2_1T;
else{
- fprintf(stderr, "\nError: Function 14 (\"FUNC_NHM_XEONEP_SSE2_2T\") requires SSE2, which is not supported by the processor.\n\n");
+ fprintf(stderr, "\nError: Function 14 (\"FUNC_NHM_XEONEP_SSE2_1T\") requires SSE2, which is not supported by the processor.\n\n");
}
break;
case 15:
+ if (feature_available("SSE2")) func = FUNC_NHM_XEONEP_SSE2_2T;
+ else{
+ fprintf(stderr, "\nError: Function 15 (\"FUNC_NHM_XEONEP_SSE2_2T\") requires SSE2, which is not supported by the processor.\n\n");
+ }
+ break;
+ case 16:
if (feature_available("FMA4")) func = FUNC_BLD_OPTERON_FMA4_1T;
else{
- fprintf(stderr, "\nError: Function 15 (\"FUNC_BLD_OPTERON_FMA4_1T\") requires FMA4, which is not supported by the processor.\n\n");
+ fprintf(stderr, "\nError: Function 16 (\"FUNC_BLD_OPTERON_FMA4_1T\") requires FMA4, which is not supported by the processor.\n\n");
}
break;
default:
@@ -336,10 +375,10 @@ static void evaluate_environment()
fprintf(stderr, "\nWarning: not enough CPUs for requested number of threads\n");
}
- if (fsbind==NULL) { //use all CPUs if not defined otherwise
+ if (fsbind==NULL) { // no cpu binding defined
#if (defined(linux) || defined(__linux__)) && defined (AFFINITY)
CPU_ZERO(&cpuset);
- if (NUM_THREADS==0){
+ if (NUM_THREADS==0){ // use all CPUs if not defined otherwise
for (i = 0; i < cpuinfo->num_cpus; i++) {
if (cpu_allowed(i)) {
CPU_SET(i, &cpuset);
@@ -347,9 +386,26 @@ static void evaluate_environment()
}
}
}
- else{
- for (i = 0; i < cpuinfo->num_cpus; i++) {
- if (cpu_allowed(i)) CPU_SET(i, &cpuset);
+ else{ // if -n / --threads is set
+ int current_cpu=0;
+ for (i = 0; i < NUM_THREADS; i++) {
+ /* search for available cpu */
+ while(! cpu_allowed(current_cpu) ) {
+ current_cpu++;
+
+ /* if reached end of avail cpus or max(int) */
+ if (current_cpu >= cpuinfo->num_cpus || current_cpu < 0)
+ {
+ /* no allowed CPU left for this thread: report and abort */
+ fprintf(stderr, "Error: You are requesting more threads than there are CPUs available in the given cpuset.\n");
+ fprintf(stderr, "This can be caused by the taskset tool, cgroups, the batch system, or similar mechanisms.\n" );
+ fprintf(stderr, "Please fix the -n/--threads argument to match the restrictions.\n");
+ exit( EACCES );
+ }
+ }
+ ADD_CPU_SET(current_cpu,cpuset);
+ /* next cpu for next thread (or one of the following) */
+ current_cpu++;
}
}
#ifdef CUDA
@@ -365,8 +421,9 @@ static void evaluate_environment()
#if (defined(linux) || defined(__linux__)) && defined (AFFINITY)
else { // parse CPULIST for binding
char *p,*q,*r,*s,*t;
- int p_val,r_val,s_val,error=0;
+ int p_val=0,r_val=0,s_val=0,error=0;
+ CPU_ZERO(&cpuset);
errno=0;
p=strdup(fsbind);
while(p!=NULL) {
@@ -407,19 +464,15 @@ static void evaluate_environment()
exit(127);
}
if ((s)&&(r)) for (i=p_val; (int)i<=r_val; i+=s_val) {
- if (cpu_allowed(i)) {
- CPU_SET(i,&cpuset);
- NUM_THREADS++;
- }
+ ADD_CPU_SET(i,cpuset);
+ NUM_THREADS++;
}
else if (r) for (i=p_val; (int)i<=r_val; i++) {
- if (cpu_allowed(i)) {
- CPU_SET(i,&cpuset);
- NUM_THREADS++;
- }
+ ADD_CPU_SET(i,cpuset);
+ NUM_THREADS++;
}
- else if (cpu_allowed(p_val)) {
- CPU_SET(p_val,&cpuset);
+ else {
+ ADD_CPU_SET(p_val,cpuset);
NUM_THREADS++;
}
p=q;
@@ -470,6 +523,18 @@ static void evaluate_environment()
switch (cpuinfo->family) {
case 6:
switch (cpuinfo->model) {
+ case 87:
+ if (feature_available("AVX512")) {
+ if (num_threads_per_core() == 4) FUNCTION = FUNC_KNL_XEONPHI_AVX512_4T;
+ if (FUNCTION == FUNC_NOT_DEFINED) {
+ fprintf(stderr, "Warning: no code path for %i threads per core!\n",num_threads_per_core());
+ }
+ }
+ if (FUNCTION == FUNC_NOT_DEFINED) {
+ fprintf(stderr, "\nWarning: AVX512 is required for architecture \"KNL\", but is not supported!\n");
+ }
+ break;
+ case 78:
case 94:
if (feature_available("FMA")) {
if (num_threads_per_core() == 1) FUNCTION = FUNC_SKL_COREI_FMA_1T;
@@ -484,6 +549,8 @@ static void evaluate_environment()
break;
case 60:
case 61:
+ case 69:
+ case 70:
case 71:
if (feature_available("FMA")) {
if (num_threads_per_core() == 1) FUNCTION = FUNC_HSW_COREI_FMA_1T;
@@ -594,6 +661,22 @@ static void evaluate_environment()
}
}
+ /* use AVX512 as fallback if available*/
+ if ((FUNCTION == FUNC_NOT_DEFINED)&&(feature_available("AVX512"))) {
+ /* use function for correct number of threads per core if available */
+ if(num_threads_per_core() == 4) {
+ FUNCTION = FUNC_KNL_XEONPHI_AVX512_4T;
+ fprintf(stderr, "Warning: using function FUNC_KNL_XEONPHI_AVX512_4T as fallback.\n");
+ fprintf(stderr, " You can use the parameter --function to try other functions.\n");
+ }
+ /* use function for 4 threads per core if no function for actual number of threads per core exists*/
+ if (FUNCTION == FUNC_NOT_DEFINED)
+ {
+ FUNCTION = FUNC_KNL_XEONPHI_AVX512_4T;
+ fprintf(stderr, "Warning: using function FUNC_KNL_XEONPHI_AVX512_4T as fallback.\n");
+ fprintf(stderr, " You can use the parameter --function to try other functions.\n");
+ }
+ }
/* use FMA4 as fallback if available*/
if ((FUNCTION == FUNC_NOT_DEFINED)&&(feature_available("FMA4"))) {
/* use function for correct number of threads per core if available */
@@ -680,11 +763,28 @@ static void evaluate_environment()
switch (FUNCTION) {
+ case FUNC_KNL_XEONPHI_AVX512_4T:
+ if (verbose) printf("\n Taking AVX512 Path optimized for Knights_Landing - 4 thread(s) per core");
+
+
+
+
+ BUFFERSIZE[0] = 8192;
+ BUFFERSIZE[1] = 131072;
+ BUFFERSIZE[2] = 0;
+ RAMBUFFERSIZE = 6553600;
+ if (verbose) {
+ printf("\n Used buffersizes per thread:\n");
+ for (i = 0; i < cpuinfo->Cachelevels; i++) printf(" - L%d-Cache: %d Bytes\n", i + 1, BUFFERSIZE[i]);
+ printf(" - Memory: %llu Bytes\n\n", RAMBUFFERSIZE);
+ }
+ break;
case FUNC_SKL_COREI_FMA_1T:
if (verbose) printf("\n Taking FMA Path optimized for Skylake - 1 thread(s) per core");
+
BUFFERSIZE[0] = 32768;
BUFFERSIZE[1] = 262144;
BUFFERSIZE[2] = 1572864;
@@ -700,6 +800,7 @@ static void evaluate_environment()
+
BUFFERSIZE[0] = 16384;
BUFFERSIZE[1] = 131072;
BUFFERSIZE[2] = 786432;
@@ -715,6 +816,7 @@ static void evaluate_environment()
+
BUFFERSIZE[0] = 32768;
BUFFERSIZE[1] = 262144;
BUFFERSIZE[2] = 1572864;
@@ -730,6 +832,7 @@ static void evaluate_environment()
+
BUFFERSIZE[0] = 16384;
BUFFERSIZE[1] = 131072;
BUFFERSIZE[2] = 786432;
@@ -745,6 +848,7 @@ static void evaluate_environment()
+
BUFFERSIZE[0] = 32768;
BUFFERSIZE[1] = 262144;
BUFFERSIZE[2] = 2621440;
@@ -760,6 +864,7 @@ static void evaluate_environment()
+
BUFFERSIZE[0] = 16384;
BUFFERSIZE[1] = 131072;
BUFFERSIZE[2] = 1310720;
@@ -775,6 +880,7 @@ static void evaluate_environment()
+
BUFFERSIZE[0] = 32768;
BUFFERSIZE[1] = 262144;
BUFFERSIZE[2] = 1572864;
@@ -790,6 +896,7 @@ static void evaluate_environment()
+
BUFFERSIZE[0] = 16384;
BUFFERSIZE[1] = 131072;
BUFFERSIZE[2] = 786432;
@@ -805,6 +912,7 @@ static void evaluate_environment()
+
BUFFERSIZE[0] = 32768;
BUFFERSIZE[1] = 262144;
BUFFERSIZE[2] = 2621440;
@@ -820,6 +928,7 @@ static void evaluate_environment()
+
BUFFERSIZE[0] = 16384;
BUFFERSIZE[1] = 131072;
BUFFERSIZE[2] = 1310720;
@@ -835,6 +944,7 @@ static void evaluate_environment()
+
BUFFERSIZE[0] = 32768;
BUFFERSIZE[1] = 262144;
BUFFERSIZE[2] = 1572864;
@@ -850,6 +960,7 @@ static void evaluate_environment()
+
BUFFERSIZE[0] = 16384;
BUFFERSIZE[1] = 131072;
BUFFERSIZE[2] = 786432;
@@ -865,6 +976,7 @@ static void evaluate_environment()
+
BUFFERSIZE[0] = 32768;
BUFFERSIZE[1] = 262144;
BUFFERSIZE[2] = 2097152;
@@ -880,6 +992,7 @@ static void evaluate_environment()
+
BUFFERSIZE[0] = 16384;
BUFFERSIZE[1] = 131072;
BUFFERSIZE[2] = 1048576;
@@ -895,6 +1008,7 @@ static void evaluate_environment()
+
BUFFERSIZE[0] = 16384;
BUFFERSIZE[1] = 1048576;
BUFFERSIZE[2] = 786432;
@@ -916,6 +1030,8 @@ static void evaluate_environment()
int main(int argc, char *argv[])
{
int i,c;
+ unsigned long long iterations=0;
+
#ifdef CUDA
gpustruct * structpointer=malloc(sizeof(gpustruct));
structpointer->useDouble=1; //we want to use Doubles, if no -f Argument is given
@@ -930,6 +1046,7 @@ int main(int argc, char *argv[])
{"version", no_argument, 0, 'v'},
{"warranty", no_argument, 0, 'w'},
{"quiet", no_argument, 0, 'q'},
+ {"report", no_argument, 0, 'r'},
{"avail", no_argument, 0, 'a'},
{"function", required_argument, 0, 'i'},
#ifdef CUDA
@@ -952,9 +1069,9 @@ int main(int argc, char *argv[])
{
#if (defined(linux) || defined(__linux__)) && defined (AFFINITY)
- c = getopt_long(argc, argv, "chvwqafb:i:t:l:p:n:m:g:", long_options, NULL);
+ c = getopt_long(argc, argv, "chvwqarfb:i:t:l:p:n:m:g:", long_options, NULL);
#else
- c = getopt_long(argc, argv, "chvwqafi:t:l:p:n:m:g:", long_options, NULL);
+ c = getopt_long(argc, argv, "chvwqarfi:t:l:p:n:m:g:", long_options, NULL);
#endif
if(c == -1) break;
@@ -980,6 +1097,9 @@ int main(int argc, char *argv[])
FUNCTION=get_function((unsigned int)strtol(optarg,NULL,10));
if (FUNCTION==FUNC_UNKNOWN) return EXIT_FAILURE;
break;
+ case 'r':
+ if (verbose) verbose = 2;
+ break;
case 'q':
#ifdef CUDA
structpointer->verbose=0;
@@ -987,10 +1107,18 @@ int main(int argc, char *argv[])
verbose = 0;
break;
case 'n':
+ if (fsbind!=NULL){
+ printf("Error: -b/--bind and -n/--threads cannot be used together\n");
+ return EXIT_FAILURE;
+ }
NUM_THREADS=(unsigned int)strtol(optarg,NULL,10);
break;
#if (defined(linux) || defined(__linux__)) && defined (AFFINITY)
case 'b':
+ if (NUM_THREADS){
+ printf("Error: -b/--bind and -n/--threads cannot be used together\n");
+ return EXIT_FAILURE;
+ }
fsbind=strdup(optarg);
break;
#endif
@@ -1082,6 +1210,30 @@ int main(int argc, char *argv[])
/* wait for threads after watchdog has requested termination */
for(i = 0; i < mdp->num_threads; i++) pthread_join(threads[i], NULL);
+ if (verbose == 2){
+ unsigned long long start_tsc,stop_tsc;
+ double runtime;
+
+ printf("\nperformance report:\n");
+
+ start_tsc=mdp->threaddata[0].start_tsc;
+ stop_tsc=mdp->threaddata[0].stop_tsc;
+ for(i = 0; i < mdp->num_threads; i++){
+ printf("Thread %i: %llu iterations, tsc_delta: %llu\n",i,mdp->threaddata[i].iterations, mdp->threaddata[i].stop_tsc - mdp->threaddata[i].start_tsc );
+ iterations+=mdp->threaddata[i].iterations;
+ if (start_tsc > mdp->threaddata[i].start_tsc) start_tsc = mdp->threaddata[i].start_tsc;
+ if (stop_tsc < mdp->threaddata[i].stop_tsc) stop_tsc = mdp->threaddata[i].stop_tsc;
+ }
+ printf("\ntotal iterations: %llu\n",iterations);
+ runtime=(double)(stop_tsc - start_tsc) / (double)cpuinfo->clockrate;
+ printf("runtime: %.2f seconds (%llu cycles)\n\n",runtime, stop_tsc - start_tsc);
+
+ printf("estimated floating point performance: %.2f GFLOPS\n", (double)mdp->threaddata[0].flops*0.000000001*(double)iterations/runtime);
+ printf("estimated memory bandwidth: %.2f GB/s\n", (double)mdp->threaddata[0].bytes*0.000000001*(double)iterations/runtime);
+
+ printf("\n");
+ }
+
#ifdef CUDA
free(structpointer);
#endif
diff --git a/sse2_functions.c b/sse2_functions.c
index fd1abdec..f53d1030 100644
--- a/sse2_functions.c
+++ b/sse2_functions.c
@@ -23,12 +23,33 @@
- int init_nhm_corei_sse2_1t(unsigned long long addrMem) __attribute__((noinline));
-int init_nhm_corei_sse2_1t(unsigned long long addrMem)
+
+ int init_nhm_corei_sse2_1t(threaddata_t* threaddata) __attribute__((noinline));
+int init_nhm_corei_sse2_1t(threaddata_t* threaddata)
{
+ unsigned long long addrMem = threaddata->addrMem;
int i;
for (i = 0; i<13340672; i++) *((double*)(addrMem + 8*i)) = i * 1.654738925401e-15;
+ // lines with register operations
+ threaddata->flops+=2*2; // 1 128 bit operation
+
+ // lines with L1 operations
+ threaddata->flops+=70*2; // 1 128 bit operation
+
+ // lines with L2 operations
+ threaddata->flops+=0*2; // 1 128 bit operation
+
+ // lines with L3 operations
+ threaddata->flops+=0*2; // 1 128 bit operation
+
+ // lines with RAM operations
+ threaddata->flops+=1*2; // 1 128 bit operation
+ threaddata->bytes=1*64; // 1 memory access
+
+ threaddata->flops*=21;
+ threaddata->bytes*=21;
+
return EXIT_SUCCESS;
}
@@ -39,10 +60,10 @@ int init_nhm_corei_sse2_1t(unsigned long long addrMem)
* @input - addrMem: pointer to buffer
* @return EXIT_SUCCESS
*/
-int asm_work_nhm_corei_sse2_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_nhm_corei_sse2_1t(unsigned long long addrMem, unsigned long long addrHigh)
+int asm_work_nhm_corei_sse2_1t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_nhm_corei_sse2_1t(threaddata_t* threaddata)
{
- if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS;
+ if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS;
/* input:
* - addrMem -> rax
* register usage:
@@ -57,11 +78,13 @@ int asm_work_nhm_corei_sse2_1t(unsigned long long addrMem, unsigned long long ad
* - r11: temp register for initialization of SIMD-registers
* - r12: stores cacheline width as increment for buffer addresses
* - r13: stores address of shared variable that controls load level
+ * - r14: stores iteration counter
* - mm*,xmm*: data registers for SIMD instructions
*/
__asm__ __volatile__(
- "mov %0, %%rax;" // store start address of buffer
- "mov %1, %%r13;" // store address of shared variable that controls load level
+ "mov %%rax, %%rax;" // store start address of buffer
+ "mov %%rbx, %%r13;" // store address of shared variable that controls load level
+ "mov %%rcx, %%r14;" // store iteration counter
"mov $64, %%r12;" // increment after each cache/memory access
//Initialize SSE-Registers for Addition
"movapd 0(%%rax), %%xmm0;"
@@ -1640,25 +1663,47 @@ int asm_work_nhm_corei_sse2_1t(unsigned long long addrMem, unsigned long long ad
"mov %%rax, %%rdi;"
"add $1572864, %%rdi;"
"_work_no_ram_reset_nhm_corei_sse2_1t:"
+ "inc %%r14;" // increment iteration counter
"mov %%rax, %%rbx;"
- "mov (%%r13), %%r11;"
- "test $1, %%r11;"
+ "testq $1, (%%r13);"
"jnz _work_loop_nhm_corei_sse2_1t;"
- :
- : "r"(addrMem), "r"(addrHigh)
- : "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
+ "movq %%r14, %%rax;" // restore iteration counter
+ : "=a" (threaddata->iterations)
+ : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations)
+ : "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
);
return EXIT_SUCCESS;
}
- int init_nhm_corei_sse2_2t(unsigned long long addrMem) __attribute__((noinline));
-int init_nhm_corei_sse2_2t(unsigned long long addrMem)
+
+ int init_nhm_corei_sse2_2t(threaddata_t* threaddata) __attribute__((noinline));
+int init_nhm_corei_sse2_2t(threaddata_t* threaddata)
{
+ unsigned long long addrMem = threaddata->addrMem;
int i;
for (i = 0; i<6670336; i++) *((double*)(addrMem + 8*i)) = i * 1.654738925401e-15;
+ // lines with register operations
+ threaddata->flops+=2*2; // 1 128 bit operation
+
+ // lines with L1 operations
+ threaddata->flops+=70*2; // 1 128 bit operation
+
+ // lines with L2 operations
+ threaddata->flops+=0*2; // 1 128 bit operation
+
+ // lines with L3 operations
+ threaddata->flops+=0*2; // 1 128 bit operation
+
+ // lines with RAM operations
+ threaddata->flops+=1*2; // 1 128 bit operation
+ threaddata->bytes=1*64; // 1 memory access
+
+ threaddata->flops*=10;
+ threaddata->bytes*=10;
+
return EXIT_SUCCESS;
}
@@ -1669,10 +1714,10 @@ int init_nhm_corei_sse2_2t(unsigned long long addrMem)
* @input - addrMem: pointer to buffer
* @return EXIT_SUCCESS
*/
-int asm_work_nhm_corei_sse2_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_nhm_corei_sse2_2t(unsigned long long addrMem, unsigned long long addrHigh)
+int asm_work_nhm_corei_sse2_2t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_nhm_corei_sse2_2t(threaddata_t* threaddata)
{
- if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS;
+ if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS;
/* input:
* - addrMem -> rax
* register usage:
@@ -1687,11 +1732,13 @@ int asm_work_nhm_corei_sse2_2t(unsigned long long addrMem, unsigned long long ad
* - r11: temp register for initialization of SIMD-registers
* - r12: stores cacheline width as increment for buffer addresses
* - r13: stores address of shared variable that controls load level
+ * - r14: stores iteration counter
* - mm*,xmm*: data registers for SIMD instructions
*/
__asm__ __volatile__(
- "mov %0, %%rax;" // store start address of buffer
- "mov %1, %%r13;" // store address of shared variable that controls load level
+ "mov %%rax, %%rax;" // store start address of buffer
+ "mov %%rbx, %%r13;" // store address of shared variable that controls load level
+ "mov %%rcx, %%r14;" // store iteration counter
"mov $64, %%r12;" // increment after each cache/memory access
//Initialize SSE-Registers for Addition
"movapd 0(%%rax), %%xmm0;"
@@ -2467,25 +2514,47 @@ int asm_work_nhm_corei_sse2_2t(unsigned long long addrMem, unsigned long long ad
"mov %%rax, %%rdi;"
"add $786432, %%rdi;"
"_work_no_ram_reset_nhm_corei_sse2_2t:"
+ "inc %%r14;" // increment iteration counter
"mov %%rax, %%rbx;"
- "mov (%%r13), %%r11;"
- "test $1, %%r11;"
+ "testq $1, (%%r13);"
"jnz _work_loop_nhm_corei_sse2_2t;"
- :
- : "r"(addrMem), "r"(addrHigh)
- : "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
+ "movq %%r14, %%rax;" // restore iteration counter
+ : "=a" (threaddata->iterations)
+ : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations)
+ : "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
);
return EXIT_SUCCESS;
}
- int init_nhm_xeonep_sse2_1t(unsigned long long addrMem) __attribute__((noinline));
-int init_nhm_xeonep_sse2_1t(unsigned long long addrMem)
+
+ int init_nhm_xeonep_sse2_1t(threaddata_t* threaddata) __attribute__((noinline));
+int init_nhm_xeonep_sse2_1t(threaddata_t* threaddata)
{
+ unsigned long long addrMem = threaddata->addrMem;
int i;
for (i = 0; i<13406208; i++) *((double*)(addrMem + 8*i)) = i * 1.654738925401e-15;
+ // lines with register operations
+ threaddata->flops+=2*2; // 1 128 bit operation
+
+ // lines with L1 operations
+ threaddata->flops+=60*2; // 1 128 bit operation
+
+ // lines with L2 operations
+ threaddata->flops+=0*2; // 1 128 bit operation
+
+ // lines with L3 operations
+ threaddata->flops+=0*2; // 1 128 bit operation
+
+ // lines with RAM operations
+ threaddata->flops+=1*2; // 1 128 bit operation
+ threaddata->bytes=1*64; // 1 memory access
+
+ threaddata->flops*=24;
+ threaddata->bytes*=24;
+
return EXIT_SUCCESS;
}
@@ -2496,10 +2565,10 @@ int init_nhm_xeonep_sse2_1t(unsigned long long addrMem)
* @input - addrMem: pointer to buffer
* @return EXIT_SUCCESS
*/
-int asm_work_nhm_xeonep_sse2_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_nhm_xeonep_sse2_1t(unsigned long long addrMem, unsigned long long addrHigh)
+int asm_work_nhm_xeonep_sse2_1t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_nhm_xeonep_sse2_1t(threaddata_t* threaddata)
{
- if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS;
+ if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS;
/* input:
* - addrMem -> rax
* register usage:
@@ -2514,11 +2583,13 @@ int asm_work_nhm_xeonep_sse2_1t(unsigned long long addrMem, unsigned long long a
* - r11: temp register for initialization of SIMD-registers
* - r12: stores cacheline width as increment for buffer addresses
* - r13: stores address of shared variable that controls load level
+ * - r14: stores iteration counter
* - mm*,xmm*: data registers for SIMD instructions
*/
__asm__ __volatile__(
- "mov %0, %%rax;" // store start address of buffer
- "mov %1, %%r13;" // store address of shared variable that controls load level
+ "mov %%rax, %%rax;" // store start address of buffer
+ "mov %%rbx, %%r13;" // store address of shared variable that controls load level
+ "mov %%rcx, %%r14;" // store iteration counter
"mov $64, %%r12;" // increment after each cache/memory access
//Initialize SSE-Registers for Addition
"movapd 0(%%rax), %%xmm0;"
@@ -4076,25 +4147,47 @@ int asm_work_nhm_xeonep_sse2_1t(unsigned long long addrMem, unsigned long long a
"mov %%rax, %%rdi;"
"add $2097152, %%rdi;"
"_work_no_ram_reset_nhm_xeonep_sse2_1t:"
+ "inc %%r14;" // increment iteration counter
"mov %%rax, %%rbx;"
- "mov (%%r13), %%r11;"
- "test $1, %%r11;"
+ "testq $1, (%%r13);"
"jnz _work_loop_nhm_xeonep_sse2_1t;"
- :
- : "r"(addrMem), "r"(addrHigh)
- : "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
+ "movq %%r14, %%rax;" // restore iteration counter
+ : "=a" (threaddata->iterations)
+ : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations)
+ : "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
);
return EXIT_SUCCESS;
}
- int init_nhm_xeonep_sse2_2t(unsigned long long addrMem) __attribute__((noinline));
-int init_nhm_xeonep_sse2_2t(unsigned long long addrMem)
+
+ int init_nhm_xeonep_sse2_2t(threaddata_t* threaddata) __attribute__((noinline));
+int init_nhm_xeonep_sse2_2t(threaddata_t* threaddata)
{
+ unsigned long long addrMem = threaddata->addrMem;
int i;
for (i = 0; i<6703104; i++) *((double*)(addrMem + 8*i)) = i * 1.654738925401e-15;
+ // lines with register operations
+ threaddata->flops+=2*2; // 1 128 bit operation
+
+ // lines with L1 operations
+ threaddata->flops+=60*2; // 1 128 bit operation
+
+ // lines with L2 operations
+ threaddata->flops+=0*2; // 1 128 bit operation
+
+ // lines with L3 operations
+ threaddata->flops+=0*2; // 1 128 bit operation
+
+ // lines with RAM operations
+ threaddata->flops+=1*2; // 1 128 bit operation
+ threaddata->bytes=1*64; // 1 memory access
+
+ threaddata->flops*=12;
+ threaddata->bytes*=12;
+
return EXIT_SUCCESS;
}
@@ -4105,10 +4198,10 @@ int init_nhm_xeonep_sse2_2t(unsigned long long addrMem)
* @input - addrMem: pointer to buffer
* @return EXIT_SUCCESS
*/
-int asm_work_nhm_xeonep_sse2_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_nhm_xeonep_sse2_2t(unsigned long long addrMem, unsigned long long addrHigh)
+int asm_work_nhm_xeonep_sse2_2t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_nhm_xeonep_sse2_2t(threaddata_t* threaddata)
{
- if (*((unsigned long long*)addrHigh) == 0) return EXIT_SUCCESS;
+ if (*((unsigned long long*)threaddata->addrHigh) == 0) return EXIT_SUCCESS;
/* input:
* - addrMem -> rax
* register usage:
@@ -4123,11 +4216,13 @@ int asm_work_nhm_xeonep_sse2_2t(unsigned long long addrMem, unsigned long long a
* - r11: temp register for initialization of SIMD-registers
* - r12: stores cacheline width as increment for buffer addresses
* - r13: stores address of shared variable that controls load level
+ * - r14: stores iteration counter
* - mm*,xmm*: data registers for SIMD instructions
*/
__asm__ __volatile__(
- "mov %0, %%rax;" // store start address of buffer
- "mov %1, %%r13;" // store address of shared variable that controls load level
+ "mov %%rax, %%rax;" // store start address of buffer
+ "mov %%rbx, %%r13;" // store address of shared variable that controls load level
+ "mov %%rcx, %%r14;" // store iteration counter
"mov $64, %%r12;" // increment after each cache/memory access
//Initialize SSE-Registers for Addition
"movapd 0(%%rax), %%xmm0;"
@@ -4929,13 +5024,14 @@ int asm_work_nhm_xeonep_sse2_2t(unsigned long long addrMem, unsigned long long a
"mov %%rax, %%rdi;"
"add $1048576, %%rdi;"
"_work_no_ram_reset_nhm_xeonep_sse2_2t:"
+ "inc %%r14;" // increment iteration counter
"mov %%rax, %%rbx;"
- "mov (%%r13), %%r11;"
- "test $1, %%r11;"
+ "testq $1, (%%r13);"
"jnz _work_loop_nhm_xeonep_sse2_2t;"
- :
- : "r"(addrMem), "r"(addrHigh)
- : "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
+ "movq %%r14, %%rax;" // restore iteration counter
+ : "=a" (threaddata->iterations)
+ : "a"(threaddata->addrMem), "b"(threaddata->addrHigh), "c" (threaddata->iterations)
+ : "%rdx", "%rdi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
);
return EXIT_SUCCESS;
}
diff --git a/work.c b/work.c
index e2dd94cd..5528bfc8 100644
--- a/work.c
+++ b/work.c
@@ -123,50 +123,53 @@ void *thread(void *threaddata)
/* call init function */
switch (mydata->FUNCTION)
{
+ case FUNC_KNL_XEONPHI_AVX512_4T:
+ tmp = init_knl_xeonphi_avx512_4t(mydata);
+ break;
case FUNC_SKL_COREI_FMA_1T:
- tmp = init_skl_corei_fma_1t(mydata->addrMem);
+ tmp = init_skl_corei_fma_1t(mydata);
break;
case FUNC_SKL_COREI_FMA_2T:
- tmp = init_skl_corei_fma_2t(mydata->addrMem);
+ tmp = init_skl_corei_fma_2t(mydata);
break;
case FUNC_HSW_COREI_FMA_1T:
- tmp = init_hsw_corei_fma_1t(mydata->addrMem);
+ tmp = init_hsw_corei_fma_1t(mydata);
break;
case FUNC_HSW_COREI_FMA_2T:
- tmp = init_hsw_corei_fma_2t(mydata->addrMem);
+ tmp = init_hsw_corei_fma_2t(mydata);
break;
case FUNC_HSW_XEONEP_FMA_1T:
- tmp = init_hsw_xeonep_fma_1t(mydata->addrMem);
+ tmp = init_hsw_xeonep_fma_1t(mydata);
break;
case FUNC_HSW_XEONEP_FMA_2T:
- tmp = init_hsw_xeonep_fma_2t(mydata->addrMem);
+ tmp = init_hsw_xeonep_fma_2t(mydata);
break;
case FUNC_SNB_COREI_AVX_1T:
- tmp = init_snb_corei_avx_1t(mydata->addrMem);
+ tmp = init_snb_corei_avx_1t(mydata);
break;
case FUNC_SNB_COREI_AVX_2T:
- tmp = init_snb_corei_avx_2t(mydata->addrMem);
+ tmp = init_snb_corei_avx_2t(mydata);
break;
case FUNC_SNB_XEONEP_AVX_1T:
- tmp = init_snb_xeonep_avx_1t(mydata->addrMem);
+ tmp = init_snb_xeonep_avx_1t(mydata);
break;
case FUNC_SNB_XEONEP_AVX_2T:
- tmp = init_snb_xeonep_avx_2t(mydata->addrMem);
+ tmp = init_snb_xeonep_avx_2t(mydata);
break;
case FUNC_NHM_COREI_SSE2_1T:
- tmp = init_nhm_corei_sse2_1t(mydata->addrMem);
+ tmp = init_nhm_corei_sse2_1t(mydata);
break;
case FUNC_NHM_COREI_SSE2_2T:
- tmp = init_nhm_corei_sse2_2t(mydata->addrMem);
+ tmp = init_nhm_corei_sse2_2t(mydata);
break;
case FUNC_NHM_XEONEP_SSE2_1T:
- tmp = init_nhm_xeonep_sse2_1t(mydata->addrMem);
+ tmp = init_nhm_xeonep_sse2_1t(mydata);
break;
case FUNC_NHM_XEONEP_SSE2_2T:
- tmp = init_nhm_xeonep_sse2_2t(mydata->addrMem);
+ tmp = init_nhm_xeonep_sse2_2t(mydata);
break;
case FUNC_BLD_OPTERON_FMA4_1T:
- tmp = init_bld_opteron_fma4_1t(mydata->addrMem);
+ tmp = init_bld_opteron_fma4_1t(mydata);
break;
default:
fprintf(stderr, "Error: unknown function %i\n", mydata->FUNCTION);
@@ -188,6 +191,9 @@ void *thread(void *threaddata)
old = THREAD_WORK;
global_data->ack = id + 1;
+ /* record thread's start timestamp */
+ ((threaddata_t *)threaddata)->start_tsc = timestamp();
+
/* will be terminated by watchdog
* watchdog also alters mydata->addrHigh to switch between high and low load function
*/
@@ -201,50 +207,53 @@ void *thread(void *threaddata)
#endif
switch (mydata->FUNCTION)
{
+ case FUNC_KNL_XEONPHI_AVX512_4T:
+ tmp=asm_work_knl_xeonphi_avx512_4t(mydata);
+ break;
case FUNC_SKL_COREI_FMA_1T:
- tmp=asm_work_skl_corei_fma_1t(mydata->addrMem,mydata->addrHigh);
+ tmp=asm_work_skl_corei_fma_1t(mydata);
break;
case FUNC_SKL_COREI_FMA_2T:
- tmp=asm_work_skl_corei_fma_2t(mydata->addrMem,mydata->addrHigh);
+ tmp=asm_work_skl_corei_fma_2t(mydata);
break;
case FUNC_HSW_COREI_FMA_1T:
- tmp=asm_work_hsw_corei_fma_1t(mydata->addrMem,mydata->addrHigh);
+ tmp=asm_work_hsw_corei_fma_1t(mydata);
break;
case FUNC_HSW_COREI_FMA_2T:
- tmp=asm_work_hsw_corei_fma_2t(mydata->addrMem,mydata->addrHigh);
+ tmp=asm_work_hsw_corei_fma_2t(mydata);
break;
case FUNC_HSW_XEONEP_FMA_1T:
- tmp=asm_work_hsw_xeonep_fma_1t(mydata->addrMem,mydata->addrHigh);
+ tmp=asm_work_hsw_xeonep_fma_1t(mydata);
break;
case FUNC_HSW_XEONEP_FMA_2T:
- tmp=asm_work_hsw_xeonep_fma_2t(mydata->addrMem,mydata->addrHigh);
+ tmp=asm_work_hsw_xeonep_fma_2t(mydata);
break;
case FUNC_SNB_COREI_AVX_1T:
- tmp=asm_work_snb_corei_avx_1t(mydata->addrMem,mydata->addrHigh);
+ tmp=asm_work_snb_corei_avx_1t(mydata);
break;
case FUNC_SNB_COREI_AVX_2T:
- tmp=asm_work_snb_corei_avx_2t(mydata->addrMem,mydata->addrHigh);
+ tmp=asm_work_snb_corei_avx_2t(mydata);
break;
case FUNC_SNB_XEONEP_AVX_1T:
- tmp=asm_work_snb_xeonep_avx_1t(mydata->addrMem,mydata->addrHigh);
+ tmp=asm_work_snb_xeonep_avx_1t(mydata);
break;
case FUNC_SNB_XEONEP_AVX_2T:
- tmp=asm_work_snb_xeonep_avx_2t(mydata->addrMem,mydata->addrHigh);
+ tmp=asm_work_snb_xeonep_avx_2t(mydata);
break;
case FUNC_NHM_COREI_SSE2_1T:
- tmp=asm_work_nhm_corei_sse2_1t(mydata->addrMem,mydata->addrHigh);
+ tmp=asm_work_nhm_corei_sse2_1t(mydata);
break;
case FUNC_NHM_COREI_SSE2_2T:
- tmp=asm_work_nhm_corei_sse2_2t(mydata->addrMem,mydata->addrHigh);
+ tmp=asm_work_nhm_corei_sse2_2t(mydata);
break;
case FUNC_NHM_XEONEP_SSE2_1T:
- tmp=asm_work_nhm_xeonep_sse2_1t(mydata->addrMem,mydata->addrHigh);
+ tmp=asm_work_nhm_xeonep_sse2_1t(mydata);
break;
case FUNC_NHM_XEONEP_SSE2_2T:
- tmp=asm_work_nhm_xeonep_sse2_2t(mydata->addrMem,mydata->addrHigh);
+ tmp=asm_work_nhm_xeonep_sse2_2t(mydata);
break;
case FUNC_BLD_OPTERON_FMA4_1T:
- tmp=asm_work_bld_opteron_fma4_1t(mydata->addrMem,mydata->addrHigh);
+ tmp=asm_work_bld_opteron_fma4_1t(mydata);
break;
default:
fprintf(stderr,"Error: unknown function %i\n",mydata->FUNCTION);
@@ -274,6 +283,8 @@ void *thread(void *threaddata)
/* terminate if master signals end of run */
if(*((volatile unsigned long long *)(mydata->addrHigh)) == LOAD_STOP) {
+ ((threaddata_t *)threaddata) -> stop_tsc = timestamp();
+
pthread_exit(NULL);
}
} // end while
diff --git a/work.h b/work.h
index 9fd3f4a6..a3908d24 100644
--- a/work.h
+++ b/work.h
@@ -25,21 +25,22 @@
#include "firestarter_global.h"
#include
-#define FUNC_SKL_COREI_FMA_1T 1
-#define FUNC_SKL_COREI_FMA_2T 2
-#define FUNC_HSW_COREI_FMA_1T 3
-#define FUNC_HSW_COREI_FMA_2T 4
-#define FUNC_HSW_XEONEP_FMA_1T 5
-#define FUNC_HSW_XEONEP_FMA_2T 6
-#define FUNC_SNB_COREI_AVX_1T 7
-#define FUNC_SNB_COREI_AVX_2T 8
-#define FUNC_SNB_XEONEP_AVX_1T 9
-#define FUNC_SNB_XEONEP_AVX_2T 10
-#define FUNC_NHM_COREI_SSE2_1T 11
-#define FUNC_NHM_COREI_SSE2_2T 12
-#define FUNC_NHM_XEONEP_SSE2_1T 13
-#define FUNC_NHM_XEONEP_SSE2_2T 14
-#define FUNC_BLD_OPTERON_FMA4_1T 15
+#define FUNC_KNL_XEONPHI_AVX512_4T 1
+#define FUNC_SKL_COREI_FMA_1T 2
+#define FUNC_SKL_COREI_FMA_2T 3
+#define FUNC_HSW_COREI_FMA_1T 4
+#define FUNC_HSW_COREI_FMA_2T 5
+#define FUNC_HSW_XEONEP_FMA_1T 6
+#define FUNC_HSW_XEONEP_FMA_2T 7
+#define FUNC_SNB_COREI_AVX_1T 8
+#define FUNC_SNB_COREI_AVX_2T 9
+#define FUNC_SNB_XEONEP_AVX_1T 10
+#define FUNC_SNB_XEONEP_AVX_2T 11
+#define FUNC_NHM_COREI_SSE2_1T 12
+#define FUNC_NHM_COREI_SSE2_2T 13
+#define FUNC_NHM_XEONEP_SSE2_1T 14
+#define FUNC_NHM_XEONEP_SSE2_2T 15
+#define FUNC_BLD_OPTERON_FMA4_1T 16
/*
@@ -55,99 +56,105 @@ extern void *thread(void *threaddata);
/*
* init functions
*/
-int init_skl_corei_fma_1t(unsigned long long addrMem) __attribute__((noinline));
-int init_skl_corei_fma_1t(unsigned long long addrMem);
+int init_knl_xeonphi_avx512_4t(threaddata_t* threaddata) __attribute__((noinline));
+int init_knl_xeonphi_avx512_4t(threaddata_t* threaddata);
-int init_skl_corei_fma_2t(unsigned long long addrMem) __attribute__((noinline));
-int init_skl_corei_fma_2t(unsigned long long addrMem);
+int init_skl_corei_fma_1t(threaddata_t* threaddata) __attribute__((noinline));
+int init_skl_corei_fma_1t(threaddata_t* threaddata);
-int init_hsw_corei_fma_1t(unsigned long long addrMem) __attribute__((noinline));
-int init_hsw_corei_fma_1t(unsigned long long addrMem);
+int init_skl_corei_fma_2t(threaddata_t* threaddata) __attribute__((noinline));
+int init_skl_corei_fma_2t(threaddata_t* threaddata);
-int init_hsw_corei_fma_2t(unsigned long long addrMem) __attribute__((noinline));
-int init_hsw_corei_fma_2t(unsigned long long addrMem);
+int init_hsw_corei_fma_1t(threaddata_t* threaddata) __attribute__((noinline));
+int init_hsw_corei_fma_1t(threaddata_t* threaddata);
-int init_hsw_xeonep_fma_1t(unsigned long long addrMem) __attribute__((noinline));
-int init_hsw_xeonep_fma_1t(unsigned long long addrMem);
+int init_hsw_corei_fma_2t(threaddata_t* threaddata) __attribute__((noinline));
+int init_hsw_corei_fma_2t(threaddata_t* threaddata);
-int init_hsw_xeonep_fma_2t(unsigned long long addrMem) __attribute__((noinline));
-int init_hsw_xeonep_fma_2t(unsigned long long addrMem);
+int init_hsw_xeonep_fma_1t(threaddata_t* threaddata) __attribute__((noinline));
+int init_hsw_xeonep_fma_1t(threaddata_t* threaddata);
-int init_snb_corei_avx_1t(unsigned long long addrMem) __attribute__((noinline));
-int init_snb_corei_avx_1t(unsigned long long addrMem);
+int init_hsw_xeonep_fma_2t(threaddata_t* threaddata) __attribute__((noinline));
+int init_hsw_xeonep_fma_2t(threaddata_t* threaddata);
-int init_snb_corei_avx_2t(unsigned long long addrMem) __attribute__((noinline));
-int init_snb_corei_avx_2t(unsigned long long addrMem);
+int init_snb_corei_avx_1t(threaddata_t* threaddata) __attribute__((noinline));
+int init_snb_corei_avx_1t(threaddata_t* threaddata);
-int init_snb_xeonep_avx_1t(unsigned long long addrMem) __attribute__((noinline));
-int init_snb_xeonep_avx_1t(unsigned long long addrMem);
+int init_snb_corei_avx_2t(threaddata_t* threaddata) __attribute__((noinline));
+int init_snb_corei_avx_2t(threaddata_t* threaddata);
-int init_snb_xeonep_avx_2t(unsigned long long addrMem) __attribute__((noinline));
-int init_snb_xeonep_avx_2t(unsigned long long addrMem);
+int init_snb_xeonep_avx_1t(threaddata_t* threaddata) __attribute__((noinline));
+int init_snb_xeonep_avx_1t(threaddata_t* threaddata);
-int init_nhm_corei_sse2_1t(unsigned long long addrMem) __attribute__((noinline));
-int init_nhm_corei_sse2_1t(unsigned long long addrMem);
+int init_snb_xeonep_avx_2t(threaddata_t* threaddata) __attribute__((noinline));
+int init_snb_xeonep_avx_2t(threaddata_t* threaddata);
-int init_nhm_corei_sse2_2t(unsigned long long addrMem) __attribute__((noinline));
-int init_nhm_corei_sse2_2t(unsigned long long addrMem);
+int init_nhm_corei_sse2_1t(threaddata_t* threaddata) __attribute__((noinline));
+int init_nhm_corei_sse2_1t(threaddata_t* threaddata);
-int init_nhm_xeonep_sse2_1t(unsigned long long addrMem) __attribute__((noinline));
-int init_nhm_xeonep_sse2_1t(unsigned long long addrMem);
+int init_nhm_corei_sse2_2t(threaddata_t* threaddata) __attribute__((noinline));
+int init_nhm_corei_sse2_2t(threaddata_t* threaddata);
-int init_nhm_xeonep_sse2_2t(unsigned long long addrMem) __attribute__((noinline));
-int init_nhm_xeonep_sse2_2t(unsigned long long addrMem);
+int init_nhm_xeonep_sse2_1t(threaddata_t* threaddata) __attribute__((noinline));
+int init_nhm_xeonep_sse2_1t(threaddata_t* threaddata);
-int init_bld_opteron_fma4_1t(unsigned long long addrMem) __attribute__((noinline));
-int init_bld_opteron_fma4_1t(unsigned long long addrMem);
+int init_nhm_xeonep_sse2_2t(threaddata_t* threaddata) __attribute__((noinline));
+int init_nhm_xeonep_sse2_2t(threaddata_t* threaddata);
+
+int init_bld_opteron_fma4_1t(threaddata_t* threaddata) __attribute__((noinline));
+int init_bld_opteron_fma4_1t(threaddata_t* threaddata);
/*
* stress test functions
*/
-int asm_work_skl_corei_fma_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_skl_corei_fma_1t(unsigned long long addrMem, unsigned long long addrHigh);
+int asm_work_knl_xeonphi_avx512_4t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_knl_xeonphi_avx512_4t(threaddata_t* threaddata);
+
+int asm_work_skl_corei_fma_1t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_skl_corei_fma_1t(threaddata_t* threaddata);
-int asm_work_skl_corei_fma_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_skl_corei_fma_2t(unsigned long long addrMem, unsigned long long addrHigh);
+int asm_work_skl_corei_fma_2t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_skl_corei_fma_2t(threaddata_t* threaddata);
-int asm_work_hsw_corei_fma_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_hsw_corei_fma_1t(unsigned long long addrMem, unsigned long long addrHigh);
+int asm_work_hsw_corei_fma_1t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_hsw_corei_fma_1t(threaddata_t* threaddata);
-int asm_work_hsw_corei_fma_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_hsw_corei_fma_2t(unsigned long long addrMem, unsigned long long addrHigh);
+int asm_work_hsw_corei_fma_2t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_hsw_corei_fma_2t(threaddata_t* threaddata);
-int asm_work_hsw_xeonep_fma_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_hsw_xeonep_fma_1t(unsigned long long addrMem, unsigned long long addrHigh);
+int asm_work_hsw_xeonep_fma_1t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_hsw_xeonep_fma_1t(threaddata_t* threaddata);
-int asm_work_hsw_xeonep_fma_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_hsw_xeonep_fma_2t(unsigned long long addrMem, unsigned long long addrHigh);
+int asm_work_hsw_xeonep_fma_2t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_hsw_xeonep_fma_2t(threaddata_t* threaddata);
-int asm_work_snb_corei_avx_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_snb_corei_avx_1t(unsigned long long addrMem, unsigned long long addrHigh);
+int asm_work_snb_corei_avx_1t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_snb_corei_avx_1t(threaddata_t* threaddata);
-int asm_work_snb_corei_avx_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_snb_corei_avx_2t(unsigned long long addrMem, unsigned long long addrHigh);
+int asm_work_snb_corei_avx_2t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_snb_corei_avx_2t(threaddata_t* threaddata);
-int asm_work_snb_xeonep_avx_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_snb_xeonep_avx_1t(unsigned long long addrMem, unsigned long long addrHigh);
+int asm_work_snb_xeonep_avx_1t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_snb_xeonep_avx_1t(threaddata_t* threaddata);
-int asm_work_snb_xeonep_avx_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_snb_xeonep_avx_2t(unsigned long long addrMem, unsigned long long addrHigh);
+int asm_work_snb_xeonep_avx_2t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_snb_xeonep_avx_2t(threaddata_t* threaddata);
-int asm_work_nhm_corei_sse2_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_nhm_corei_sse2_1t(unsigned long long addrMem, unsigned long long addrHigh);
+int asm_work_nhm_corei_sse2_1t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_nhm_corei_sse2_1t(threaddata_t* threaddata);
-int asm_work_nhm_corei_sse2_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_nhm_corei_sse2_2t(unsigned long long addrMem, unsigned long long addrHigh);
+int asm_work_nhm_corei_sse2_2t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_nhm_corei_sse2_2t(threaddata_t* threaddata);
-int asm_work_nhm_xeonep_sse2_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_nhm_xeonep_sse2_1t(unsigned long long addrMem, unsigned long long addrHigh);
+int asm_work_nhm_xeonep_sse2_1t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_nhm_xeonep_sse2_1t(threaddata_t* threaddata);
-int asm_work_nhm_xeonep_sse2_2t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_nhm_xeonep_sse2_2t(unsigned long long addrMem, unsigned long long addrHigh);
+int asm_work_nhm_xeonep_sse2_2t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_nhm_xeonep_sse2_2t(threaddata_t* threaddata);
-int asm_work_bld_opteron_fma4_1t(unsigned long long addrMem, unsigned long long addrHigh) __attribute__((noinline));
-int asm_work_bld_opteron_fma4_1t(unsigned long long addrMem, unsigned long long addrHigh);
+int asm_work_bld_opteron_fma4_1t(threaddata_t* threaddata) __attribute__((noinline));
+int asm_work_bld_opteron_fma4_1t(threaddata_t* threaddata);
/*
diff --git a/x86.c b/x86.c
index b231214f..fc765bcc 100644
--- a/x86.c
+++ b/x86.c
@@ -53,9 +53,6 @@ static int has_invariant_rdtsc();
#endif
#endif
-/** used to store Registers {R|E}AX, {R|E}BX, {R|E}CX and {R|E}DX */
-static unsigned long long a,b,c,d;
-
/*
* declarations of x86 specific functions, only used within this file
*/
@@ -77,11 +74,11 @@ static int has_htt();
/** 64 Bit implementations */
#if defined _64_BIT
-static unsigned long long reg_a,reg_b,reg_c,reg_d;
-
static void cpuid(unsigned long long *a, unsigned long long *b, unsigned long long *c, unsigned long long *d)
{
+ unsigned long long reg_a,reg_b,reg_c,reg_d;
+
__asm__ __volatile__(
"cpuid;"
: "=a" (reg_a), "=b" (reg_b), "=c" (reg_c), "=d" (reg_d)
@@ -101,6 +98,8 @@ static int has_cpuid()
unsigned long long timestamp()
{
+ unsigned long long reg_a,reg_d;
+
if (!has_rdtsc()) return 0;
__asm__ __volatile__("rdtsc;": "=a" (reg_a), "=d" (reg_d));
return (reg_d<<32)|(reg_a&0xffffffffULL);
@@ -110,11 +109,11 @@ unsigned long long timestamp()
/** 32 Bit implementations */
#if defined(_32_BIT)
-/* 32 Bit Registers */
-static unsigned int reg_a,reg_b,reg_c,reg_d;
static void cpuid(unsigned long long *a, unsigned long long *b, unsigned long long *c, unsigned long long *d)
{
+ unsigned int reg_a,reg_b,reg_c,reg_d;
+
__asm__ __volatile__(
"cpuid;"
: "=a" (reg_a), "=b" (reg_b), "=c" (reg_c), "=d" (reg_d)
@@ -165,6 +164,8 @@ static int has_cpuid()
unsigned long long timestamp()
{
+ unsigned int reg_a,reg_d;
+
if (!has_rdtsc()) return 0;
__asm__ __volatile__("rdtsc;": "=a" (reg_a) , "=d" (reg_d));
// upper 32 Bit in EDX, lower 32 Bit in EAX
@@ -206,6 +207,8 @@ void get_architecture(char* arch, size_t len)
int has_rdtsc()
{
+ unsigned long long a,b,c,d;
+
if (!has_cpuid()) return 0;
a=0;
@@ -223,6 +226,7 @@ int has_rdtsc()
int has_invariant_rdtsc()
{
+ unsigned long long a,b,c,d;
char tmp[_HW_DETECT_MAX_OUTPUT];
int res=0;
@@ -281,6 +285,8 @@ int has_invariant_rdtsc()
static int has_htt()
{
+ unsigned long long a,b,c,d;
+
if (!has_cpuid()) return 0;
a=0;
cpuid(&a,&b,&c,&d);
@@ -295,6 +301,7 @@ static int has_htt()
int get_cpu_vendor(char* vendor, size_t len)
{
+ unsigned long long a,b,c,d;
char tmp_vendor[13];
if (!has_cpuid()) return generic_get_cpu_vendor(vendor);
@@ -312,6 +319,7 @@ int get_cpu_vendor(char* vendor, size_t len)
int get_cpu_name(char* name, size_t len)
{
+ unsigned long long a,b,c,d;
char tmp[48];
char* start;
@@ -361,6 +369,8 @@ int get_cpu_name(char* name, size_t len)
int get_cpu_family()
{
+ unsigned long long a,b,c,d;
+
if (!has_cpuid()) return generic_get_cpu_family();
a=0;
cpuid(&a,&b,&c,&d);
@@ -375,6 +385,8 @@ int get_cpu_family()
}
int get_cpu_model()
{
+ unsigned long long a,b,c,d;
+
if (!has_cpuid()) return generic_get_cpu_model();
a=0;
cpuid(&a,&b,&c,&d);
@@ -389,6 +401,8 @@ int get_cpu_model()
}
int get_cpu_stepping()
{
+ unsigned long long a,b,c,d;
+
if (!has_cpuid()) return generic_get_cpu_stepping();
a=0;
cpuid(&a,&b,&c,&d);
@@ -404,6 +418,7 @@ int get_cpu_stepping()
int get_cpu_isa_extensions(char* features, size_t len)
{
+ unsigned long long a,b,c,d;
unsigned long long max,max_ext;
char tmp[16];
@@ -451,6 +466,14 @@ int get_cpu_isa_extensions(char* features, size_t len)
if (c&(1<<23)) strncat(features,"POPCNT ",(len-strlen(features))-1);
}
+ if (max>=7)
+ {
+ a=7;c=0;
+ cpuid(&a,&b,&c,&d);
+
+ if (b&(1<<5)) strncat(features,"AVX2 ", (len-strlen(features))-1);
+ if (b&(1<<16)) strncat(features,"AVX512 ", (len-strlen(features))-1);
+ }
if (max_ext>=0x80000001)
{
a=0x80000001;
@@ -591,6 +614,7 @@ unsigned long long get_cpu_clockrate(int check,int cpu)
*/
int num_caches(int cpu)
{
+ unsigned long long a,b,c,d;
unsigned long long max,max_ext;
char tmp[16];
int num;
@@ -660,6 +684,7 @@ int num_caches(int cpu)
//TODO use sysfs if available to determine cache sharing
int cache_info(int cpu,int id, char* output, size_t len)
{
+ unsigned long long a,b,c,d;
unsigned long long max,max_ext;
char tmp[16];
@@ -1120,6 +1145,7 @@ int num_packages()
int num_cores_per_package()
{
+ unsigned long long a,b,c,d;
char tmp[16];
int num=-1;
@@ -1167,7 +1193,7 @@ int num_cores_per_package()
/* consistency checks */
/* more cores than cpus is not possible -> some cores are deactivated */
if (num>num_cpus()) num=num_cpus();
- /* if the number of packages is known this cann be checked for multi-socket systems, too
+ /* if the number of packages is known this can be checked for multi-socket systems, too
NOTE depends on valid entries in sysfs */
if ((generic_num_packages()!=-1)&&(generic_num_packages()*num>num_cpus())) num=num_cpus()/generic_num_packages();
@@ -1185,6 +1211,7 @@ int num_threads_per_core()
int num_threads_per_package()
{
+ unsigned long long a,b,c,d;
int num=-1;
char tmp[16];