diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..61fbc4039 --- /dev/null +++ b/.gitignore @@ -0,0 +1,46 @@ +build/* +CMakeCache.txt +CMakeFiles/ +CTestTestfile.cmake +DartConfiguration.tcl +Makefile +bin/ +cmake_install.cmake +src/CMakeFiles/ +src/CTestTestfile.cmake +src/Drivers/CMakeFiles/ +src/Drivers/CTestTestfile.cmake +src/Drivers/Makefile +src/Drivers/cmake_install.cmake +src/Drivers/tests/CMakeFiles/ +src/Drivers/tests/CTestTestfile.cmake +src/Drivers/tests/Makefile +src/Drivers/tests/cmake_install.cmake +src/Makefile +src/Particle/tests/CMakeFiles/ +src/Particle/tests/CTestTestfile.cmake +src/Particle/tests/Makefile +src/Particle/tests/cmake_install.cmake +src/QMCWaveFunctions/CMakeFiles/ +src/QMCWaveFunctions/CTestTestfile.cmake +src/QMCWaveFunctions/Makefile +src/QMCWaveFunctions/cmake_install.cmake +src/QMCWaveFunctions/tests/CMakeFiles/ +src/QMCWaveFunctions/tests/CTestTestfile.cmake +src/QMCWaveFunctions/tests/Makefile +src/QMCWaveFunctions/tests/cmake_install.cmake +src/Utilities/CMakeFiles/ +src/Utilities/CTestTestfile.cmake +src/Utilities/Makefile +src/Utilities/cmake_install.cmake +src/Utilities/tests/CMakeFiles/ +src/Utilities/tests/CTestTestfile.cmake +src/Utilities/tests/Makefile +src/Utilities/tests/cmake_install.cmake +src/cmake_install.cmake +src/config.h CMakeCache.txt +CMakeFiles/ +CTestTestfile.cmake +DartConfiguration.tcl +Makefile + diff --git a/Benchmark_base/plot.gp b/Benchmark_base/plot.gp new file mode 100644 index 000000000..dd39a0a5e --- /dev/null +++ b/Benchmark_base/plot.gp @@ -0,0 +1,25 @@ +set term png size 1700,700 enhanced font "Terminal,12" +set output "plot.png" +#set grid + +set datafile separator ";" +set auto x + +set style data histogram +set style fill solid border -1 +set boxwidth 0.9 + +set xtic rotate by -44 scale 0 +set logscale y +set multiplot layout 1, 1 rowsfirst + +set key left +set ylabel "Time (s)" +set xlabel "Running on 'Nb' mpi process and 'Nb' omp threads" +set title "Scalability analysis of the miniQMC application with differents problems size" +plot "scalabilite_bench211.dat" using 2:xticlabels(stringcolumn(1)) t "miniQMC+211",\ + "scalabilite_bench221.dat" using 2:xticlabels(stringcolumn(1)) t "miniQMC+221",\ + "scalabilite_bench222.dat" using 2:xticlabels(stringcolumn(1)) t "miniQMC+222",\ + "scalabilite_bench422.dat" using 2:xticlabels(stringcolumn(1)) t "miniQMC+422" + + diff --git a/Benchmark_base/plot.png b/Benchmark_base/plot.png new file mode 100644 index 000000000..da0d44a1d Binary files /dev/null and b/Benchmark_base/plot.png differ diff --git a/Benchmark_base/scalabilite_bench.dat b/Benchmark_base/scalabilite_bench.dat new file mode 100644 index 000000000..98a094a98 --- /dev/null +++ b/Benchmark_base/scalabilite_bench.dat @@ -0,0 +1,31 @@ +1 1; 0.00 +; +2 1; 2.16 +; +1 2; 0.00 +; +4 1; 2.16 +; +1 4; 0.00 +; +2 2; 2.15 +; +8 1; 2.17 +; +1 8; 0.00 +; +2 4; 2.15 +; +4 2; 2.16 +; +16 1; 2.23 +; +1 16; 0.00 +; +2 8; 2.15 +; +4 4; ; +8 2; ; +32 1; 0.14 +; +1 32; \ No newline at end of file diff --git a/Benchmark_base/scalabilite_bench211.dat b/Benchmark_base/scalabilite_bench211.dat new file mode 100644 index 000000000..6e8639067 --- /dev/null +++ b/Benchmark_base/scalabilite_bench211.dat @@ -0,0 +1,40 @@ +1 1; 1.61 +1 2; 1.55 +1 4; 1.79 +1 8; 2.02 +1 16; 2.15 +1 32; 2.94 +1 48; 3.35 +2 1; 1.79 +2 2; 2.29 +2 4; 4.36 +2 8; 8.23 +2 16; 16.06 +2 24; 24.11 +3 16; 4.13 +4 1; 1.67 +4 2; 1.84 +4 4; 1.93 +4 8; 2.88 +4 12; 3.02 +6 8; 2.75 +8 1; 1.75 +8 2; 1.95 +8 4; 2.55 +8 6; 2.74 +12 4; 2.77 +16 1; 1.83 +16 2; 2.43 +16 3; 2.76 +24 2; 2.80 +32 1; 0.11 +48 1; 0.12 + + + + + + + + + diff --git a/Benchmark_base/scalabilite_bench221.dat b/Benchmark_base/scalabilite_bench221.dat new file mode 100644 index 000000000..c9406194e --- /dev/null +++ b/Benchmark_base/scalabilite_bench221.dat @@ -0,0 +1,41 @@ +1 1; 5.03 +1 2; 5.28 +1 4; 6.32 +1 8; 6.00 +1 16; 6.38 +1 32; 10.34 +1 48; 12.62 +2 1; 5.22 +2 2; 7.68 +2 4; 15.03 +2 8; 29.63 +2 16; 59.78 +2 24; 89.99 +3 16; 14.64 +4 1; 5.17 +4 2; 5.74 +4 4; 6.01 +4 8; 9.05 +4 12; 10.70 +6 8; 10.65 +8 1; 5.42 +8 2; 5.71 +8 4; 8.48 +8 6; 10.62 +12 4; 10.67 +16 1; 5.61 +16 2; 8.29 +16 3; 10.66 +24 2; 10.59 +32 1; 0.10 +48 1; 0.09 + + + + + + + + + + diff --git a/Benchmark_base/scalabilite_bench222.dat b/Benchmark_base/scalabilite_bench222.dat new file mode 100644 index 000000000..6b8cd2694 --- /dev/null +++ b/Benchmark_base/scalabilite_bench222.dat @@ -0,0 +1,34 @@ +1 1; 21.50 +1 2; 22.36 +1 4; 22.43 +1 8; 23.33 +1 16; 25.20 +1 32; 41.16 +1 48; 58.76 +2 1; 87.26 +2 2; 33.48 +2 4; 65.97 +2 8; 132.41 +2 16; 269.17 +2 24; 402.38 +3 16; 70.17 +4 1; 21.49 +4 2; 22.81 +4 4; 23.95 +4 8; 38.13 +4 12; 52.21 +6 8; 51.85 +8 1; 22.58 +8 2; 23.49 +8 4; 37.01 +8 6; 51.94 +12 4; 52.10 +16 1; 23.40 +16 2; 36.36 +16 3; 52.22 +24 2; 52.61 +32 1; 00.10 +48 1; 00.11 + + + diff --git a/Benchmark_base/scalabilite_bench422.dat b/Benchmark_base/scalabilite_bench422.dat new file mode 100644 index 000000000..d6f805206 --- /dev/null +++ b/Benchmark_base/scalabilite_bench422.dat @@ -0,0 +1,40 @@ +1 1; 101.33 +1 2; 101.38 +1 4; 106.76 +1 8; 112.05 +1 16; 120.38 +1 32; 210.00 +1 48; 312.79 +2 1; 103.69 +2 2; 165.27 +2 4; 336.69 +2 8; 676.17 +2 16; 1395.95 +2 24; 2059.04 +3 16; 385.52 +4 1; 100.67 +4 2; 109.86 +4 4; 116.53 +4 8; 197.72 +4 12; 290.18 +6 8; 289.73 +8 1; 110.60 +8 2; 116.01 +8 4; 189.65 +8 6; 289.75 +12 4; 288.98 +16 1; 114.77 +16 2; 193.47 +16 3; 290.29 +24 2; 290.24 +32 1; 0.15 +48 1; 0.15 + + + + + + + + + diff --git a/Benchmarks_omp/plot.gp b/Benchmarks_omp/plot.gp new file mode 100644 index 000000000..46198bedc --- /dev/null +++ b/Benchmarks_omp/plot.gp @@ -0,0 +1,26 @@ +set term png size 1700,700 enhanced font "Terminal,12" +set output "plot.png" +#set grid + +set datafile separator ";" +set auto x + +set style data histogram +set style fill solid border -1 +set boxwidth 0.9 + +set xtic rotate by -44 scale 0 +set logscale y +set yrange [0.01:10000] +set multiplot layout 1, 1 rowsfirst + +set key left +set ylabel "Time (s)" +set xlabel "Running on 'Nb' mpi process and 'Nb' omp threads" +set title "Scalability analysis of the miniQMC application with differents problems size" +plot "scalabilite_bench211.dat" using 2:xticlabels(stringcolumn(1)) t "miniQMC+211",\ + "scalabilite_bench221.dat" using 2:xticlabels(stringcolumn(1)) t "miniQMC+221",\ + "scalabilite_bench222.dat" using 2:xticlabels(stringcolumn(1)) t "miniQMC+222",\ + "scalabilite_bench422.dat" using 2:xticlabels(stringcolumn(1)) t "miniQMC+422",\ + + diff --git a/Benchmarks_omp/plot.png b/Benchmarks_omp/plot.png new file mode 100644 index 000000000..9a05d2f70 Binary files /dev/null and b/Benchmarks_omp/plot.png differ diff --git a/Benchmarks_omp/scalabilite_bench211.dat b/Benchmarks_omp/scalabilite_bench211.dat new file mode 100644 index 000000000..02ca179fe --- /dev/null +++ b/Benchmarks_omp/scalabilite_bench211.dat @@ -0,0 +1,37 @@ +1 1; 2.30 +1 2; 2.77 +1 4; 2.69 +1 8; 2.93 +1 16; 2.88 +1 32; 4.37 +1 48; 4.61 +2 1; 2.48 +2 2; 3.20 +2 4; 6.27 +2 8; 12.05 +2 16; 23.58 +2 24; 35.24 +3 16; 5.56 +4 1; 2.54 +4 2; 2.79 +4 4; 2.79 +4 8; 4.06 +4 12; 4.33 +6 8; 3.97 +8 1; 2.46 +8 2; 2.96 +8 4; 3.73 +8 6; 4.15 +12 4; 3.99 +16 1; 2.61 +16 2; 3.43 +16 3; 3.92 +24 2; 3.92 +32 1; 0.11 +48 1; 0.10 +; + + + + + diff --git a/Benchmarks_omp/scalabilite_bench221.dat b/Benchmarks_omp/scalabilite_bench221.dat new file mode 100644 index 000000000..8140086d4 --- /dev/null +++ b/Benchmarks_omp/scalabilite_bench221.dat @@ -0,0 +1,40 @@ +1 1; 6.96 +1 2; 7.32 +1 4; 7.84 +1 8; 8.14 +1 16; 8.54 +1 32; 13.57 +1 48; 15.81 +2 1; 7.16 +2 2; 10.96 +2 4; 21.46 +2 8; 43.23 +2 16; 86.13 +2 24; 129.11 +3 16; 19.22 +4 1; 6.97 +4 2; 7.79 +4 4; 8.12 +4 8; 12.19 +4 12; 14.48 +6 8; 14.49 +8 1; 7.56 +8 2; 7.76 +8 4; 11.50 +8 6; 14.34 +12 4; 14.11 +16 1; 7.67 +16 2; 11.28 +16 3; 14.21 +24 2; 14.19 +32 1; 0.10 +48 1; 0.11 +; + + + + + + +1 + diff --git a/Benchmarks_omp/scalabilite_bench222.dat b/Benchmarks_omp/scalabilite_bench222.dat new file mode 100644 index 000000000..0ab372ed4 --- /dev/null +++ b/Benchmarks_omp/scalabilite_bench222.dat @@ -0,0 +1,39 @@ +1 1; 27.24 +1 2; 27.87 +1 4; 29.81 +1 8; 31.50 +1 16; 32.31 +1 32; 50.39 +1 48; 68.30 +2 1; 28.44 +2 2; 45.93 +2 4; 88.90 +2 8; 179.15 +2 16; 363.14 +2 24; 542.38 +3 16; 88.64 +4 1; 27.69 +4 2; 30.41 +4 4; 31.25 +4 8; 47.59 +4 12; 64.62 +6 8; 64.12 +8 1; 30.03 +8 2; 30.72 +8 4; 47.02 +8 6; 63.10 +12 4; 64.50 +16 1; 30.48 +16 2; 46.82 +16 3; 64.02 +24 2; 64.08 +32 1; 0.10 +48 1; 0.11 +; + + + + + + + diff --git a/Benchmarks_omp/scalabilite_bench422.dat b/Benchmarks_omp/scalabilite_bench422.dat new file mode 100644 index 000000000..9dc6951b6 --- /dev/null +++ b/Benchmarks_omp/scalabilite_bench422.dat @@ -0,0 +1,40 @@ +1 1; 119.23 +1 2; 120.35 +1 4; 127.97 +1 8; 132.13 +1 16; 139.62 +1 32; 233.18 +1 48; 329.06 +2 1; 121.54 +2 2; 206.11 +2 4; 405.85 +2 8; 822.06 +2 16; 1667.00 +2 24; 2504.87 +3 16; 433.31 +4 1; 118.62 +4 2; 129.26 +4 4; 137.09 +4 8; 225.07 +4 12; 315.16 +6 8; 313.06 +8 1; 130.30 +8 2; 135.34 +8 4; 214.02 +8 6; 313.55 +12 4; 312.25 +16 1; 135.76 +16 2; 212.71 +16 3; 313.45 +24 2; 313.00 +32 1; 0.15 +48 1; 0.16 +; + + + + + + + + diff --git a/bench.sh b/bench.sh new file mode 100644 index 000000000..56585ee9d --- /dev/null +++ b/bench.sh @@ -0,0 +1,4 @@ +#!/bin/bash +./exec211.sh 48 +./exec221.sh 48 +./exec422.sh 48 diff --git a/bench_comparaison/plot.gp b/bench_comparaison/plot.gp new file mode 100644 index 000000000..8cd4bd7a3 --- /dev/null +++ b/bench_comparaison/plot.gp @@ -0,0 +1,25 @@ +set term png size 1700,700 enhanced font "Terminal,12" +set output "plot.png" +#set grid + +set datafile separator ";" +set auto x + +set style data histogram +set style fill solid border -1 +set boxwidth 0.9 + +set xtic rotate by -44 scale 0 +set logscale y +set yrange [0.01:10000] +set multiplot layout 1, 1 rowsfirst + +set key left +set ylabel "Time (s)" +set xlabel "Running on 'Nb' mpi process and 'Nb' omp threads" +set title "Scalability analysis of the miniQMC application with differents problems size" +plot "scalabilite_bench422base.dat" using 2:xticlabels(stringcolumn(1)) t "miniQMC+422+base",\ + "scalabilite_bench422omp.dat" using 2:xticlabels(stringcolumn(1)) t "miniQMC+422+optim" + + + diff --git a/bench_comparaison/plot.png b/bench_comparaison/plot.png new file mode 100644 index 000000000..6dec023a3 Binary files /dev/null and b/bench_comparaison/plot.png differ diff --git a/bench_comparaison/scalabilite_bench422base.dat b/bench_comparaison/scalabilite_bench422base.dat new file mode 100644 index 000000000..d6f805206 --- /dev/null +++ b/bench_comparaison/scalabilite_bench422base.dat @@ -0,0 +1,40 @@ +1 1; 101.33 +1 2; 101.38 +1 4; 106.76 +1 8; 112.05 +1 16; 120.38 +1 32; 210.00 +1 48; 312.79 +2 1; 103.69 +2 2; 165.27 +2 4; 336.69 +2 8; 676.17 +2 16; 1395.95 +2 24; 2059.04 +3 16; 385.52 +4 1; 100.67 +4 2; 109.86 +4 4; 116.53 +4 8; 197.72 +4 12; 290.18 +6 8; 289.73 +8 1; 110.60 +8 2; 116.01 +8 4; 189.65 +8 6; 289.75 +12 4; 288.98 +16 1; 114.77 +16 2; 193.47 +16 3; 290.29 +24 2; 290.24 +32 1; 0.15 +48 1; 0.15 + + + + + + + + + diff --git a/bench_comparaison/scalabilite_bench422omp.dat b/bench_comparaison/scalabilite_bench422omp.dat new file mode 100644 index 000000000..9dc6951b6 --- /dev/null +++ b/bench_comparaison/scalabilite_bench422omp.dat @@ -0,0 +1,40 @@ +1 1; 119.23 +1 2; 120.35 +1 4; 127.97 +1 8; 132.13 +1 16; 139.62 +1 32; 233.18 +1 48; 329.06 +2 1; 121.54 +2 2; 206.11 +2 4; 405.85 +2 8; 822.06 +2 16; 1667.00 +2 24; 2504.87 +3 16; 433.31 +4 1; 118.62 +4 2; 129.26 +4 4; 137.09 +4 8; 225.07 +4 12; 315.16 +6 8; 313.06 +8 1; 130.30 +8 2; 135.34 +8 4; 214.02 +8 6; 313.55 +12 4; 312.25 +16 1; 135.76 +16 2; 212.71 +16 3; 313.45 +24 2; 313.00 +32 1; 0.15 +48 1; 0.16 +; + + + + + + + + diff --git a/benchmarks/result_base_2_1_1.txt b/benchmarks/result_base_2_1_1.txt new file mode 100644 index 000000000..4e1cbba87 --- /dev/null +++ b/benchmarks/result_base_2_1_1.txt @@ -0,0 +1,66 @@ +miniqmc git branch: develop +miniqmc git commit: 5ed650c8c390884d6a84f002be2bbfa103b7df3e + +Number of orbitals/splines = 384 +Tile size = 384 +Number of tiles = 1 +Number of electrons = 768 +Rmax = 1.7 +AcceptanceRatio = 0.5 +Iterations = 5 +OpenMP threads = 4 +Number of walkers per rank = 4 + +SPO coefficients size = 196608000 bytes (187.5 MB) +delayed update rank = 32 +Using SoA distance table, Jastrow + einspline, +and determinant update. +================================== +Stack timer profile in seconds +Timer Inclusive_time Exclusive_time Calls Time_per_call +Setup 0.0572 0.0572 1 0.057204007 +Total 2.3655 0.3644 1 2.365510626 + Diffusion 0.9438 0.0031 5 0.188756147 + Accept move 0.0015 0.0015 1913 0.000000782 + Complete Updates 0.0296 0.0000 5 0.005926068 + Determinant::update 0.0296 0.0296 10 0.002961351 + Current Gradient 0.0485 0.0014 3840 0.000012624 + Determinant::ratio 0.0464 0.0464 3840 0.000012076 + OneBodyJastrow 0.0004 0.0004 3840 0.000000102 + TwoBodyJastrow 0.0003 0.0003 3840 0.000000082 + Kinetic Energy 0.0090 0.0090 5 0.001805381 + OneBodyJastrow 0.0000 0.0000 5 0.000009225 + TwoBodyJastrow 0.0000 0.0000 5 0.000005901 + Make move 0.0826 0.0826 3840 0.000021510 + New Gradient 0.2125 0.0021 3840 0.000055339 + Determinant::ratio 0.0024 0.0024 3840 0.000000613 + Determinant::spovgl 0.1814 0.0048 3840 0.000047232 + Single-Particle Orbitals 0.1766 0.1766 3840 0.000045983 + OneBodyJastrow 0.0027 0.0027 3840 0.000000692 + TwoBodyJastrow 0.0240 0.0240 3840 0.000006247 + Set active 0.0836 0.0836 3840 0.000021775 + Update 0.4734 0.0010 1913 0.000247448 + Determinant::update 0.4559 0.4559 1913 0.000238301 + OneBodyJastrow 0.0002 0.0002 1913 0.000000106 + TwoBodyJastrow 0.0163 0.0163 1913 0.000008520 + Initialization 0.1507 0.0716 1 0.150714155 + Determinant::inverse 0.0249 0.0249 2 0.012473029 + Determinant::spovgl 0.0504 0.0019 2 0.025175876 + Single-Particle Orbitals 0.0484 0.0484 768 0.000063085 + OneBodyJastrow 0.0005 0.0005 1 0.000466003 + TwoBodyJastrow 0.0033 0.0033 1 0.003317725 + Pseudopotential 0.9067 0.0046 5 0.181331985 + Make move 0.3530 0.3530 15792 0.000022355 + Value 0.5491 0.0062 15792 0.000034768 + Determinant::ratio 0.0051 0.0051 15792 0.000000323 + Determinant::spoval 0.4879 0.0040 15792 0.000030896 + Single-Particle Orbitals 0.4839 0.4839 15792 0.000030640 + OneBodyJastrow 0.0052 0.0052 15792 0.000000331 + TwoBodyJastrow 0.0446 0.0446 15792 0.000002825 + +========== Throughput ============ + +Total throughput ( N_walkers * N_elec^3 / Total time ) = 7.65982e+08 +Diffusion throughput ( N_walkers * N_elec^3 / Diffusion time ) = 1.91987e+09 +Pseudopotential throughput ( N_walkers * N_elec^2 / Pseudopotential time ) = 2.60218e+06 + diff --git a/exec.sh b/exec.sh new file mode 100644 index 000000000..1ac020069 --- /dev/null +++ b/exec.sh @@ -0,0 +1,62 @@ +#!/bin/bash +rm -rf scalabilite_bench.dat +TIME_FORMAT=%R +max_core=$1 + +printf 'running on 1 process\n\n' +echo -n "1 1; " >> scalabilite_bench.dat +OMP_NUM_THREADS=1 ~/time -p ./build/bin/miniqmc -g "2 2 2" |& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench.dat +echo ";" >> scalabilite_bench.dat +for((i=2; i<=max_core; i=i*2)) +do + printf 'running on %d mpi process and 1 omp threads\n' "$i" + echo -n "$i 1; " >> scalabilite_bench.dat + OMP_NUM_THREADS=1 ~/time -p mpirun -n $i ./build/bin/miniqmc -g "2 2 2" |& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench.dat + echo ";" >> scalabilite_bench.dat + + printf 'running on 1 mpi process and %d omp threads\n' "$i" + echo -n "1 $i; " >> scalabilite_bench.dat + OMP_NUM_THREADS=$i ~/time -p ./build/bin/miniqmc -g "2 2 2" |& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench.dat + echo ";" >> scalabilite_bench.dat + + for((k=2; k> scalabilite_bench.dat + OMP_NUM_THREADS=$div ~/time -p mpirun -n $k ./build/bin/miniqmc -g "2 2 2"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench.dat + echo ";" >> scalabilite_bench.dat + + done + printf "\n" + if((i==max_core)) + then + exit 0 + fi +done + +printf 'running on %d mpi process and 1 omp threads\n' "$max_core" +echo -n "$max_core 1; " >> scalabilite_bench.dat +OMP_NUM_THREADS=1 ~/time -p mpirun -n $max_core ./build/bin/miniqmc -g "2 2 2"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench.dat +echo ";" >> scalabilite_bench.dat + +printf 'running on 1 mpi process and %d omp threads\n' "$max_core" +echo -n "1 $max_core; " >> scalabilite_bench.dat +OMP_NUM_THREADS=$max_core ~/time -p ./build/bin/miniqmc -g "2 2 2"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench.dat +echo ";" >> scalabilite_bench.dat + +for((k=2; k<=max_core; k=k*2)) +do + div=$(($max_core/$k)) + if((div == 1 && k!= max_core)); then break; fi + printf 'running on %d mpi process and %d omp threads\n' "$k" "$div" + echo -n "$k $div; " >> scalabilite_bench.dat + OMP_NUM_THREADS=$div ~/time -p mpirun -n $k ./build/bin/miniqmc -g "2 2 2"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench.dat + echo ";" >> scalabilite_bench.dat + + printf 'running on %d mpi process and %d omp threads\n' "$div" "$k" + echo -n "$div $k; " >> scalabilite_bench.dat + OMP_NUM_THREADS=$k ~/time -p mpirun -n $div ./build/bin/miniqmc -g "2 2 2"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench.dat + echo ";" >> scalabilite_bench.dat +done +printf "\n" diff --git a/exec211.sh b/exec211.sh new file mode 100644 index 000000000..ea4cb2db2 --- /dev/null +++ b/exec211.sh @@ -0,0 +1,62 @@ +#!/bin/bash +rm -rf scalabilite_bench211.dat +TIME_FORMAT=%R +max_core=$1 + +printf 'running on 1 process\n\n' +echo -n "1 1; " >> scalabilite_bench211.dat +OMP_NUM_THREADS=1 ~/time -p ./build/bin/miniqmc -g "2 1 1" |& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench211.dat +echo ";" >> scalabilite_bench211.dat +for((i=2; i<=max_core; i=i*2)) +do + printf 'running on %d mpi process and 1 omp threads\n' "$i" + echo -n "$i 1; " >> scalabilite_bench211.dat + OMP_NUM_THREADS=1 ~/time -p mpirun -n $i ./build/bin/miniqmc -g "2 1 1" |& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench211.dat + echo ";" >> scalabilite_bench211.dat + + printf 'running on 1 mpi process and %d omp threads\n' "$i" + echo -n "1 $i; " >> scalabilite_bench211.dat + OMP_NUM_THREADS=$i ~/time -p ./build/bin/miniqmc -g "2 1 1" |& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench211.dat + echo ";" >> scalabilite_bench211.dat + + for((k=2; k> scalabilite_bench211.dat + OMP_NUM_THREADS=$div ~/time -p mpirun -n $k ./build/bin/miniqmc -g "2 1 1"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench211.dat + echo ";" >> scalabilite_bench211.dat + + done + printf "\n" + if((i==max_core)) + then + exit 0 + fi +done + +printf 'running on %d mpi process and 1 omp threads\n' "$max_core" +echo -n "$max_core 1; " >> scalabilite_bench211.dat +OMP_NUM_THREADS=1 ~/time -p mpirun -n $max_core ./build/bin/miniqmc -g "2 1 1"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench211.dat +echo ";" >> scalabilite_bench211.dat + +printf 'running on 1 mpi process and %d omp threads\n' "$max_core" +echo -n "1 $max_core; " >> scalabilite_bench211.dat +OMP_NUM_THREADS=$max_core ~/time -p ./build/bin/miniqmc -g "2 1 1"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench211.dat +echo ";" >> scalabilite_bench211.dat + +for((k=2; k<=max_core; k=k*2)) +do + div=$(($max_core/$k)) + if((div == 1 && k!= max_core)); then break; fi + printf 'running on %d mpi process and %d omp threads\n' "$k" "$div" + echo -n "$k $div; " >> scalabilite_bench211.dat + OMP_NUM_THREADS=$div ~/time -p mpirun -n $k ./build/bin/miniqmc -g "2 1 1"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench211.dat + echo ";" >> scalabilite_bench211.dat + + printf 'running on %d mpi process and %d omp threads\n' "$div" "$k" + echo -n "$div $k; " >> scalabilite_bench211.dat + OMP_NUM_THREADS=$k ~/time -p mpirun -n $div ./build/bin/miniqmc -g "2 1 1"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench211.dat + echo ";" >> scalabilite_bench211.dat +done +printf "\n" diff --git a/exec221.sh b/exec221.sh new file mode 100644 index 000000000..2d0ddd7a1 --- /dev/null +++ b/exec221.sh @@ -0,0 +1,62 @@ +#!/bin/bash +rm -rf scalabilite_bench221.dat +TIME_FORMAT=%R +max_core=$1 + +printf 'running on 1 process\n\n' +echo -n "1 1; " >> scalabilite_bench221.dat +OMP_NUM_THREADS=1 ~/time -p ./build/bin/miniqmc -g "2 2 1" |& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench221.dat +echo ";" >> scalabilite_bench221.dat +for((i=2; i<=max_core; i=i*2)) +do + printf 'running on %d mpi process and 1 omp threads\n' "$i" + echo -n "$i 1; " >> scalabilite_bench221.dat + OMP_NUM_THREADS=1 ~/time -p mpirun -n $i ./build/bin/miniqmc -g "2 2 1" |& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench221.dat + echo ";" >> scalabilite_bench221.dat + + printf 'running on 1 mpi process and %d omp threads\n' "$i" + echo -n "1 $i; " >> scalabilite_bench221.dat + OMP_NUM_THREADS=$i ~/time -p ./build/bin/miniqmc -g "2 2 1" |& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench221.dat + echo ";" >> scalabilite_bench221.dat + + for((k=2; k> scalabilite_bench221.dat + OMP_NUM_THREADS=$div ~/time -p mpirun -n $k ./build/bin/miniqmc -g "2 2 1"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench221.dat + echo ";" >> scalabilite_bench221.dat + + done + printf "\n" + if((i==max_core)) + then + exit 0 + fi +done + +printf 'running on %d mpi process and 1 omp threads\n' "$max_core" +echo -n "$max_core 1; " >> scalabilite_bench221.dat +OMP_NUM_THREADS=1 ~/time -p mpirun -n $max_core ./build/bin/miniqmc -g "2 2 1"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench221.dat +echo ";" >> scalabilite_bench221.dat + +printf 'running on 1 mpi process and %d omp threads\n' "$max_core" +echo -n "1 $max_core; " >> scalabilite_bench221.dat +OMP_NUM_THREADS=$max_core ~/time -p ./build/bin/miniqmc -g "2 2 1"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench221.dat +echo ";" >> scalabilite_bench221.dat + +for((k=2; k<=max_core; k=k*2)) +do + div=$(($max_core/$k)) + if((div == 1 && k!= max_core)); then break; fi + printf 'running on %d mpi process and %d omp threads\n' "$k" "$div" + echo -n "$k $div; " >> scalabilite_bench221.dat + OMP_NUM_THREADS=$div ~/time -p mpirun -n $k ./build/bin/miniqmc -g "2 2 1"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench221.dat + echo ";" >> scalabilite_bench221.dat + + printf 'running on %d mpi process and %d omp threads\n' "$div" "$k" + echo -n "$div $k; " >> scalabilite_bench221.dat + OMP_NUM_THREADS=$k ~/time -p mpirun -n $div ./build/bin/miniqmc -g "2 2 1"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench221.dat + echo ";" >> scalabilite_bench221.dat +done +printf "\n" diff --git a/exec222.sh b/exec222.sh new file mode 100644 index 000000000..1e772f577 --- /dev/null +++ b/exec222.sh @@ -0,0 +1,62 @@ +#!/bin/bash +rm -rf scalabilite_bench222.dat +TIME_FORMAT=%R +max_core=$1 + +printf 'running on 1 process\n\n' +echo -n "1 1; " >> scalabilite_bench222.dat +OMP_NUM_THREADS=1 ~/time -p ./build/bin/miniqmc -g "2 2 2" |& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench222.dat +echo ";" >> scalabilite_bench222.dat +for((i=2; i<=max_core; i=i*2)) +do + printf 'running on %d mpi process and 1 omp threads\n' "$i" + echo -n "$i 1; " >> scalabilite_bench222.dat + OMP_NUM_THREADS=1 ~/time -p mpirun -n $i ./build/bin/miniqmc -g "2 2 2" |& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench222.dat + echo ";" >> scalabilite_bench222.dat + + printf 'running on 1 mpi process and %d omp threads\n' "$i" + echo -n "1 $i; " >> scalabilite_bench222.dat + OMP_NUM_THREADS=$i ~/time -p ./build/bin/miniqmc -g "2 2 2" |& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench222.dat + echo ";" >> scalabilite_bench222.dat + + for((k=2; k> scalabilite_bench222.dat + OMP_NUM_THREADS=$div ~/time -p mpirun -n $k ./build/bin/miniqmc -g "2 2 2"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench222.dat + echo ";" >> scalabilite_bench222.dat + + done + printf "\n" + if((i==max_core)) + then + exit 0 + fi +done + +printf 'running on %d mpi process and 1 omp threads\n' "$max_core" +echo -n "$max_core 1; " >> scalabilite_bench222.dat +OMP_NUM_THREADS=1 ~/time -p mpirun -n $max_core ./build/bin/miniqmc -g "2 2 2"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench222.dat +echo ";" >> scalabilite_bench222.dat + +printf 'running on 1 mpi process and %d omp threads\n' "$max_core" +echo -n "1 $max_core; " >> scalabilite_bench222.dat +OMP_NUM_THREADS=$max_core ~/time -p ./build/bin/miniqmc -g "2 2 2"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench222.dat +echo ";" >> scalabilite_bench222.dat + +for((k=2; k<=max_core; k=k*2)) +do + div=$(($max_core/$k)) + if((div == 1 && k!= max_core)); then break; fi + printf 'running on %d mpi process and %d omp threads\n' "$k" "$div" + echo -n "$k $div; " >> scalabilite_bench222.dat + OMP_NUM_THREADS=$div ~/time -p mpirun -n $k ./build/bin/miniqmc -g "2 2 2"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench222.dat + echo ";" >> scalabilite_bench222.dat + + printf 'running on %d mpi process and %d omp threads\n' "$div" "$k" + echo -n "$div $k; " >> scalabilite_bench222.dat + OMP_NUM_THREADS=$k ~/time -p mpirun -n $div ./build/bin/miniqmc -g "2 2 2"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench222.dat + echo ";" >> scalabilite_bench222.dat +done +printf "\n" diff --git a/exec422.sh b/exec422.sh new file mode 100644 index 000000000..0d77c5668 --- /dev/null +++ b/exec422.sh @@ -0,0 +1,62 @@ +#!/bin/bash +rm -rf scalabilite_bench422.dat +TIME_FORMAT=%R +max_core=$1 + +printf 'running on 1 process\n\n' +echo -n "1 1; " >> scalabilite_bench422.dat +OMP_NUM_THREADS=1 ~/time -p ./build/bin/miniqmc -g "4 2 2" |& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench422.dat +echo ";" >> scalabilite_bench422.dat +for((i=2; i<=max_core; i=i*2)) +do + printf 'running on %d mpi process and 1 omp threads\n' "$i" + echo -n "$i 1; " >> scalabilite_bench422.dat + OMP_NUM_THREADS=1 ~/time -p mpirun -n $i ./build/bin/miniqmc -g "4 2 2" |& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench422.dat + echo ";" >> scalabilite_bench422.dat + + printf 'running on 1 mpi process and %d omp threads\n' "$i" + echo -n "1 $i; " >> scalabilite_bench422.dat + OMP_NUM_THREADS=$i ~/time -p ./build/bin/miniqmc -g "4 2 2" |& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench422.dat + echo ";" >> scalabilite_bench422.dat + + for((k=2; k> scalabilite_bench422.dat + OMP_NUM_THREADS=$div ~/time -p mpirun -n $k ./build/bin/miniqmc -g "4 2 2"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench422.dat + echo ";" >> scalabilite_bench422.dat + + done + printf "\n" + if((i==max_core)) + then + exit 0 + fi +done + +printf 'running on %d mpi process and 1 omp threads\n' "$max_core" +echo -n "$max_core 1; " >> scalabilite_bench422.dat +OMP_NUM_THREADS=1 ~/time -p mpirun -n $max_core ./build/bin/miniqmc -g "4 2 2"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench422.dat +echo ";" >> scalabilite_bench422.dat + +printf 'running on 1 mpi process and %d omp threads\n' "$max_core" +echo -n "1 $max_core; " >> scalabilite_bench422.dat +OMP_NUM_THREADS=$max_core ~/time -p ./build/bin/miniqmc -g "4 2 2"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench422.dat +echo ";" >> scalabilite_bench422.dat + +for((k=2; k<=max_core; k=k*2)) +do + div=$(($max_core/$k)) + if((div == 1 && k!= max_core)); then break; fi + printf 'running on %d mpi process and %d omp threads\n' "$k" "$div" + echo -n "$k $div; " >> scalabilite_bench422.dat + OMP_NUM_THREADS=$div ~/time -p mpirun -n $k ./build/bin/miniqmc -g "4 2 2"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench422.dat + echo ";" >> scalabilite_bench422.dat + + printf 'running on %d mpi process and %d omp threads\n' "$div" "$k" + echo -n "$div $k; " >> scalabilite_bench422.dat + OMP_NUM_THREADS=$k ~/time -p mpirun -n $div ./build/bin/miniqmc -g "4 2 2"|& grep -E "real [0-9][0-9]*\.[0-9][0-9]*" | sed "s/real //g" >> scalabilite_bench422.dat + echo ";" >> scalabilite_bench422.dat +done +printf "\n" diff --git a/notes_perf.md b/notes_perf.md new file mode 100644 index 000000000..201496b5b --- /dev/null +++ b/notes_perf.md @@ -0,0 +1,7 @@ +# Optimisation +## ajout de flags de compilation +```bash + cmake -DCMAKE_BUILD_TYPE=None -DCMAKE_CXX_COMPILER=mpicxx CMAKE_CXX_FLAGS="-Ofast -march=native -finline-functions -funroll-loops -ftree-loop-vectorize -ftree-vectorize" .. + +``` +Expliquer les flags \ No newline at end of file diff --git a/plot.png b/plot.png new file mode 100644 index 000000000..da0d44a1d Binary files /dev/null and b/plot.png differ diff --git a/src/Numerics/Spline2/MultiBspline.hpp b/src/Numerics/Spline2/MultiBspline.hpp index d687d2fad..25cc5f5cf 100644 --- a/src/Numerics/Spline2/MultiBspline.hpp +++ b/src/Numerics/Spline2/MultiBspline.hpp @@ -48,6 +48,7 @@ inline void evaluate_v(const typename bspline_traits::SplineType* restrict ASSUME_ALIGNED(vals); std::fill(vals, vals + num_splines, zero); + for (size_t i = 0; i < 4; i++) for (size_t j = 0; j < 4; j++) { @@ -55,10 +56,25 @@ inline void evaluate_v(const typename bspline_traits::SplineType* restrict const T* restrict coefs = spline_m->coefs + ((ix + i) * xs + (iy + j) * ys + iz * zs); ASSUME_ALIGNED(coefs); //#pragma omp simd - for (size_t n = 0; n < num_splines; n++) + /*for (size_t n = 0; n < num_splines; n++) vals[n] += pre00 * (c[0] * coefs[n] + c[1] * coefs[n + zs] + c[2] * coefs[n + 2 * zs] + - c[3] * coefs[n + 3 * zs]); + c[3] * coefs[n + 3 * zs]);*/ + // déroulage de boucle par 4 element + + #pragma omp parallel for simd aligned(coefs:64) + for (size_t n = 0; n < num_splines-(num_splines%4); n+=4) + { + vals[n+0] += pre00 * (c[0] * coefs[n] + c[1] * coefs[n + zs] + c[2] * coefs[n + 2 * zs] + c[3] * coefs[n + 3 * zs]); + vals[n+1] += pre00 * (c[0] * coefs[n+1] + c[1] * coefs[(n+1) + zs] + c[2] * coefs[(n+1) + 2 * zs] + c[3] * coefs[(n+1) + 3 * zs]); + vals[n+2] += pre00 * (c[0] * coefs[n+2] + c[1] * coefs[(n+2) + zs] + c[2] * coefs[(n+2) + 2 * zs] + c[3] * coefs[(n+2) + 3 * zs]); + vals[n+3] += pre00 * (c[0] * coefs[n+3] + c[1] * coefs[(n+3) + zs] + c[2] * coefs[(n+3) + 2 * zs] + c[3] * coefs[(n+3) + 3 * zs]); + } + for (size_t n = num_splines-(num_splines%4); n< num_splines; n++) + { + vals[n] += pre00 * (c[0] * coefs[n] + c[1] * coefs[n + zs] + c[2] * coefs[n + 2 * zs] + c[3] * coefs[n + 3 * zs]); + } + } } @@ -121,7 +137,10 @@ evaluate_vgl(const typename bspline_traits::SplineType* restrict spline_m, ASSUME_ALIGNED(coefs3zs); #pragma noprefetch -#pragma omp simd +#pragma omp parallel + +{ + #pragma omp for simd for (int n = 0; n < num_splines; n++) { const T coefsv = coefs[n]; @@ -150,7 +169,7 @@ evaluate_vgl(const typename bspline_traits::SplineType* restrict spline_m, const T dyInv2 = dyInv * dyInv; const T dzInv2 = dzInv * dzInv; -#pragma omp simd +#pragma omp for simd for (int n = 0; n < num_splines; n++) { gx[n] *= dxInv; @@ -160,6 +179,7 @@ evaluate_vgl(const typename bspline_traits::SplineType* restrict spline_m, } } +} template inline void evaluate_vgh(const typename bspline_traits::SplineType* restrict spline_m, T x, T y, T z, @@ -229,7 +249,8 @@ evaluate_vgh(const typename bspline_traits::SplineType* restrict spline_m, const T pre02 = a[i] * d2b[j]; const int iSplitPoint = num_splines; -#pragma omp simd + +#pragma omp parallel for simd for (int n = 0; n < iSplitPoint; n++) { T coefsv = coefs[n]; @@ -264,7 +285,7 @@ evaluate_vgh(const typename bspline_traits::SplineType* restrict spline_m, const T dxz = dxInv * dzInv; const T dyz = dyInv * dzInv; -#pragma omp simd +#pragma omp parallel for simd for (int n = 0; n < num_splines; n++) { gx[n] *= dxInv; diff --git a/src/Particle/Lattice/ParticleBConds.h b/src/Particle/Lattice/ParticleBConds.h index 36961880e..57a21c22d 100644 --- a/src/Particle/Lattice/ParticleBConds.h +++ b/src/Particle/Lattice/ParticleBConds.h @@ -82,6 +82,8 @@ struct DTD_BConds { const int n = dr.size(); const T cone(1); + + #pragma omp simd for (int i = 0; i < n; ++i) { r[i] = std::sqrt(apply_bc(dr[i])); diff --git a/src/QMCWaveFunctions/Jastrow/TwoBodyJastrow.h b/src/QMCWaveFunctions/Jastrow/TwoBodyJastrow.h index 17bd48e9e..03b426238 100644 --- a/src/QMCWaveFunctions/Jastrow/TwoBodyJastrow.h +++ b/src/QMCWaveFunctions/Jastrow/TwoBodyJastrow.h @@ -122,6 +122,7 @@ struct TwoBodyJastrow : public WaveFunctionComponent { valT curUat(0); const int igt = P.GroupID[iat] * NumGroups; + #pragma simd for (int jg = 0; jg < NumGroups; ++jg) { const FuncType& f2(*F[igt + jg]); @@ -148,8 +149,9 @@ struct TwoBodyJastrow : public WaveFunctionComponent for (int idim = 0; idim < OHMMS_DIM; ++idim) { const valT* restrict dX = displ.data(idim); - valT s = valT(); + valT s = valT(); + #pragma omp simd reduction(+ : s) for (int jat = 0; jat < N; ++jat) s += du[jat] * dX[jat]; grad[idim] = s; @@ -263,6 +265,9 @@ inline void TwoBodyJastrow::computeU3(const ParticleSet& P, std::fill_n(d2u, jelmax, czero); const int igt = P.GroupID[iat] * NumGroups; + + #pragma omp simd + for (int jg = 0; jg < NumGroups; ++jg) { const FuncType& f2(*F[igt + jg]); @@ -319,6 +324,8 @@ void TwoBodyJastrow::acceptMove(ParticleSet& P, int iat) const auto& new_dr = d_table->Temp_dr; const auto& old_dr = d_table->Displacements[iat]; constexpr valT lapfac = OHMMS_DIM - RealType(1); + + #pragma omp simd for (int jat = 0; jat < N; jat++) { const valT du = cur_u[jat] - old_u[jat]; @@ -337,6 +344,8 @@ void TwoBodyJastrow::acceptMove(ParticleSet& P, int iat) const valT* restrict old_du_pt = old_du.data(); valT* restrict save_g = dUat.data(idim); valT cur_g = cur_dUat[idim]; + + #pragma omp simd for (int jat = 0; jat < N; jat++) { const valT newg = cur_du_pt[jat] * new_dX[jat]; @@ -376,6 +385,8 @@ void TwoBodyJastrow::recompute(ParticleSet& P) { const valT* restrict dX = displ.data(idim); valT s = valT(); + + #pragma omp simd reduction(+ : s) for (int jat = 0; jat < iat; ++jat) s += du[jat] * dX[jat]; grad[idim] = s; @@ -383,15 +394,20 @@ void TwoBodyJastrow::recompute(ParticleSet& P) dUat(iat) = grad; d2Uat[iat] = -lap; // add the contribution from the upper triangle + + #pragma omp simd for (int jat = 0; jat < iat; jat++) { Uat[jat] += u[jat]; d2Uat[jat] -= d2u[jat] + lapfac * du[jat]; } + for (int idim = 0; idim < OHMMS_DIM; ++idim) { valT* restrict save_g = dUat.data(idim); const valT* restrict dX = displ.data(idim); + + #pragma omp simd for (int jat = 0; jat < iat; jat++) save_g[jat] -= du[jat] * dX[jat]; } @@ -418,6 +434,8 @@ void TwoBodyJastrow::evaluateGL(ParticleSet& P, if (fromscratch) recompute(P); LogValue = valT(0); + + #pragma omp simd reduction(+ : LogValue) for (int iat = 0; iat < N; ++iat) { LogValue += Uat[iat];