From 77f632b42042346b6d972e3d6d4745e9d29ff3f0 Mon Sep 17 00:00:00 2001 From: Daniel J Magee <43071310+dmageeLANL@users.noreply.github.com> Date: Fri, 15 Sep 2023 15:53:46 -0600 Subject: [PATCH 01/12] DGEMM doc complete without results. (#55) --- .../10_Microbenchmarks/M1_STREAM/STREAM.rst | 14 ++--- .../10_Microbenchmarks/M3_DGEMM/DGEMM.rst | 51 +++++++++++++++---- doc/sphinx/3_vibe/vibe.rst | 6 +-- utils/pav_config/tests/parthenon.yaml | 2 +- 4 files changed, 49 insertions(+), 24 deletions(-) diff --git a/doc/sphinx/10_Microbenchmarks/M1_STREAM/STREAM.rst b/doc/sphinx/10_Microbenchmarks/M1_STREAM/STREAM.rst index eb0315df..bccba2e7 100644 --- a/doc/sphinx/10_Microbenchmarks/M1_STREAM/STREAM.rst +++ b/doc/sphinx/10_Microbenchmarks/M1_STREAM/STREAM.rst @@ -37,6 +37,8 @@ These operations stress memory and floating point pipelines.They test memory tra Figure of Merit --------------- +The primary FOM is the Triad rate (MB/s). + Building ======== @@ -44,12 +46,6 @@ Adjustments to GOMP_CPU_AFFINITY may also be necessary. You can modify the STREAM_ARRAY_SIZE value in the compilation step to change the array size used by the benchmark. Adjusting the array size can help accommodate the available memory on your system. -RHEL Systems ------------- - -CrayOS Systems --------------- - Running ======= @@ -66,9 +62,6 @@ Replace `` with the number of MPI processes you want to use. For Input ----- -Independent Variables ---------------------- - Dependent Variable(s) --------------------- @@ -93,4 +86,5 @@ CTS-1 Snow :alt: STREAM microbenchmark bandwidth measurement ATS-3 Rocinante HBM -------------------- \ No newline at end of file +------------------- + diff --git a/doc/sphinx/10_Microbenchmarks/M3_DGEMM/DGEMM.rst b/doc/sphinx/10_Microbenchmarks/M3_DGEMM/DGEMM.rst index e14e613d..850b6e73 100644 --- a/doc/sphinx/10_Microbenchmarks/M3_DGEMM/DGEMM.rst +++ b/doc/sphinx/10_Microbenchmarks/M3_DGEMM/DGEMM.rst @@ -15,29 +15,60 @@ Characteristics Problem ------- +.. 
math:: + \mathbf{C} = \alpha*\mathbf{A}*\mathbf{B} + \beta*\mathbf{C} + +Where :math:`\mathbf{A}`, :math:`\mathbf{B}`, and :math:`\mathbf{C}` are square :math:`N \times N` matrices and :math:`\alpha` and :math:`\beta` are scalars. This operation is repeated :math:`R` times. + Figure of Merit --------------- +The figure of merit is the GFLOP/s rate (billions of floating-point operations per second) reported at the end of the run. + +GFLOP/s rate: <value> GF/s + +Run Rules +--------- + + +* Vendors are permitted to change the source code in the region marked in the source. +* Optimized BLAS/DGEMM routines are permitted (and encouraged) to demonstrate the highest performance. +* Vendors may modify the Makefile(s) as required. + Building ======== -RHEL Systems ------------- +Makefiles are provided for the Intel and GCC compilers. Before building, load the compiler and BLAS libraries into the PATH and LD_LIBRARY_PATH. + +.. code-block:: -CrayOS Systems --------------- + cd src + patch -p1 < ../dgemm_omp_fixes.patch + make + +If using a different compiler, copy and modify the simple makefiles to apply the appropriate flags. + +If using a different BLAS library than MKL or OpenBLAS, modify the C source file to use the correct header and dgemm command. Running ======= -Input ------ +DGEMM uses OpenMP but does not use MPI. + +Set the number of OpenMP threads before running. + +.. code-block:: bash + export OPENBLAS_NUM_THREADS=<nthreads> + export OMP_NUM_THREADS=<nthreads> + +.. code-block:: bash + ./mt-dgemm <N> <R> <alpha> <beta> + +These values default to: :math:`N=256, R=8, \alpha=1.0, \beta=1.0` -Independent Variables ---------------------- +These inputs are subject to the conditions :math:`N>128, R>4`. -Dependent Variable(s) ---------------------- +These are positional arguments, so, for instance, R cannot be set without setting N.
Example Results =============== diff --git a/doc/sphinx/3_vibe/vibe.rst b/doc/sphinx/3_vibe/vibe.rst index 25306743..5fbfb6d0 100644 --- a/doc/sphinx/3_vibe/vibe.rst +++ b/doc/sphinx/3_vibe/vibe.rst @@ -1,6 +1,6 @@ -****** +****************** Parthenon-VIBE -****** +****************** This is the documentation for the ATS-5 Benchmark, Parthenon-VIBE. @@ -25,7 +25,7 @@ and evolves one or more passive scalar quantities :math:`q^i` according to \partial_t q^i + \nabla \cdot \left( q^i \mathbf{u} \right) = 0 -as well as computing an auxiliary quantity :math:`d`` that resemebles a kinetic energy +as well as computing an auxiliary quantity :math:`d` that resemebles a kinetic energy .. math:: d = \frac{1}{2} q^0 \mathbf{u}\cdot\mathbf{u}. diff --git a/utils/pav_config/tests/parthenon.yaml b/utils/pav_config/tests/parthenon.yaml index 03ad190a..a6de79f6 100644 --- a/utils/pav_config/tests/parthenon.yaml +++ b/utils/pav_config/tests/parthenon.yaml @@ -99,7 +99,7 @@ ats5_spr: - partn variables: - nx: [128, 160] + nx: [64, 128, 160] tpm: [8, 32, 56, 88, 112] #[4, 8, 18, 26, 36, 50, 74, 90, 110] # intelversion: "2023.1.0" # crayversion: "15.0.1" From 3230b257bfd43ec88baec1329616dd18c4e50159 Mon Sep 17 00:00:00 2001 From: Anthony Agelastos Date: Mon, 18 Sep 2023 20:06:45 -0600 Subject: [PATCH 02/12] renamed directories so numbering puts things in appropriate order with listing --- .../{0_intro => 00_intro}/introduction.rst | 0 .../{1_branson => 01_branson}/branson.rst | 0 doc/sphinx/{1_branson => 01_branson}/cpu.gp | 0 .../{1_branson => 01_branson}/cpu_133M.csv | 0 .../{1_branson => 01_branson}/cpu_200M.csv | 0 .../{1_branson => 01_branson}/cpu_66M.csv | 0 doc/sphinx/{1_branson => 01_branson}/gpu.csv | 0 doc/sphinx/{1_branson => 01_branson}/gpu.gp | 0 doc/sphinx/{2_amg => 02_amg}/amg.rst | 0 doc/sphinx/{2_amg => 02_amg}/cpu.gp | 0 doc/sphinx/{2_amg => 02_amg}/cpu1_120.csv | 0 doc/sphinx/{2_amg => 02_amg}/cpu1_160.csv | 0 doc/sphinx/{2_amg => 02_amg}/cpu1_200.csv | 0 
doc/sphinx/{2_amg => 02_amg}/cpu2_200.csv | 0 doc/sphinx/{2_amg => 02_amg}/cpu2_256.csv | 0 doc/sphinx/{2_amg => 02_amg}/cpu2_320.csv | 0 doc/sphinx/{2_amg => 02_amg}/gpu.gp | 0 doc/sphinx/{2_amg => 02_amg}/gpu1.csv | 0 doc/sphinx/{2_amg => 02_amg}/gpu2.csv | 0 .../{2_amg => 02_amg}/plots/CPU-FOM-1.png | Bin .../{2_amg => 02_amg}/plots/CPU-FOM-2.png | Bin doc/sphinx/{2_amg => 02_amg}/plots/amg-J1.png | Bin doc/sphinx/{2_amg => 02_amg}/plots/amg-J2.png | Bin doc/sphinx/{2_amg => 02_amg}/plots/mem-J1.png | Bin doc/sphinx/{2_amg => 02_amg}/plots/mem-J2.png | Bin doc/sphinx/{3_vibe => 03_vibe}/cpu.gp | 0 doc/sphinx/{3_vibe => 03_vibe}/cpu_20.csv | 0 doc/sphinx/{3_vibe => 03_vibe}/cpu_40.csv | 0 doc/sphinx/{3_vibe => 03_vibe}/cpu_60.csv | 0 .../{3_vibe => 03_vibe}/do_gpu_throughput.sh | 0 .../do_strong_scaling_cpu.sh | 0 doc/sphinx/{3_vibe => 03_vibe}/gpu.csv | 0 doc/sphinx/{3_vibe => 03_vibe}/gpu.gp | 0 ...arthenon-ats5_spr-hbm128-intel-classic.csv | 0 ...arthenon-ats5_spr-hbm160-intel-classic.csv | 0 .../{3_vibe => 03_vibe}/plots/cpu-strong.png | Bin .../plots/gpu-throughput.png | Bin doc/sphinx/{3_vibe => 03_vibe}/vibe.rst | 0 doc/sphinx/{4_spatter => 04_spatter}/a100.gp | 0 .../a100_throughput_001.csv | 0 .../a100_throughput_001fp.csv | 0 .../a100_throughput_001nonfp.csv | 0 .../a100_throughput_asteroid.csv | 0 doc/sphinx/{4_spatter => 04_spatter}/cts1.gp | 0 .../cts1_weak_average_001.csv | 0 .../cts1_weak_average_001fp.csv | 0 .../cts1_weak_average_001nonfp.csv | 0 .../cts1_weak_average_asteroid.csv | 0 .../cts1_weak_total_001.csv | 0 .../cts1_weak_total_001fp.csv | 0 .../cts1_weak_total_001nonfp.csv | 0 .../cts1_weak_total_asteroid.csv | 0 .../{4_spatter => 04_spatter}/skylake.gp | 0 .../skylake_weak_average_001.csv | 0 .../skylake_weak_average_001fp.csv | 0 .../skylake_weak_average_001nonfp.csv | 0 .../skylake_weak_average_asteroid.csv | 0 .../skylake_weak_total_001.csv | 0 .../skylake_weak_total_001fp.csv | 0 .../skylake_weak_total_001nonfp.csv | 0 
.../skylake_weak_total_asteroid.csv | 0 .../{4_spatter => 04_spatter}/spatter.rst | 0 doc/sphinx/{4_spatter => 04_spatter}/v100.gp | 0 .../v100_throughput_001.csv | 0 .../v100_throughput_001fp.csv | 0 .../v100_throughput_001nonfp.csv | 0 .../v100_throughput_asteroid.csv | 0 doc/sphinx/{5_mlmd => 05_mlmd}/gpu.csv | 0 doc/sphinx/{5_mlmd => 05_mlmd}/gpu.gp | 0 doc/sphinx/{5_mlmd => 05_mlmd}/mlmd.rst | 0 .../plots/StrongSingle-s.png | Bin .../plots/StrongSingle-t.png | Bin .../plots/WeakParallel-s.png | Bin .../plots/WeakParallel-t.png | Bin .../spp1_strong_scaling_cts2.csv | 0 .../spp1_strong_scaling_cts2.gp | 0 .../spp1_strong_scaling_cts2_abridged.csv | 0 .../spp1_throughput_V100.csv | 0 .../{6_umt => 06_umt}/spp1_throughput_V100.gp | 0 doc/sphinx/{6_umt => 06_umt}/umt.rst | 0 .../umtsp2_strong_scaling_cpu.csv | 0 .../umtsp2_throughput_gpu.csv | 0 doc/sphinx/{7_miniem => 07_miniem}/ats2.csv | 0 doc/sphinx/{7_miniem => 07_miniem}/ats2.gp | 0 doc/sphinx/{7_miniem => 07_miniem}/ats2mem.gp | 0 .../{7_miniem => 07_miniem}/cts1-0.25.csv | 0 .../{7_miniem => 07_miniem}/cts1-0.25.gp | 0 .../{7_miniem => 07_miniem}/cts1-0.50.csv | 0 .../{7_miniem => 07_miniem}/cts1-0.50.gp | 0 .../{7_miniem => 07_miniem}/cts1-1.00.csv | 0 .../{7_miniem => 07_miniem}/cts1-1.00.gp | 0 .../{7_miniem => 07_miniem}/cts1-2.00.csv | 0 .../{7_miniem => 07_miniem}/cts1-2.00.gp | 0 doc/sphinx/{7_miniem => 07_miniem}/cts1.csv | 0 doc/sphinx/{7_miniem => 07_miniem}/cts1.gp | 0 .../{7_miniem => 07_miniem}/cts1mem-0.25.gp | 0 .../{7_miniem => 07_miniem}/cts1mem-0.50.gp | 0 .../{7_miniem => 07_miniem}/cts1mem-1.00.gp | 0 .../{7_miniem => 07_miniem}/cts1mem-2.00.gp | 0 doc/sphinx/{7_miniem => 07_miniem}/cts1mem.gp | 0 doc/sphinx/{7_miniem => 07_miniem}/miniem.rst | 0 doc/sphinx/{7_miniem => 07_miniem}/output.log | 0 doc/sphinx/{7_miniem => 07_miniem}/recipe.sh | 0 .../Makefile.manzano_kokkos | 0 doc/sphinx/{8_sparta => 08_sparta}/ats2.csv | 0 doc/sphinx/{8_sparta => 08_sparta}/ats2.gp | 0 .../{8_sparta => 
08_sparta}/build-manzano.sh | 0 .../{8_sparta => 08_sparta}/cts1-0.25.csv | 0 .../{8_sparta => 08_sparta}/cts1-0.25.gp | 0 .../{8_sparta => 08_sparta}/cts1-0.50.csv | 0 .../{8_sparta => 08_sparta}/cts1-0.50.gp | 0 .../{8_sparta => 08_sparta}/cts1-1.00.csv | 0 .../{8_sparta => 08_sparta}/cts1-1.00.gp | 0 .../{8_sparta => 08_sparta}/cts1-2.00.csv | 0 .../{8_sparta => 08_sparta}/cts1-2.00.gp | 0 doc/sphinx/{8_sparta => 08_sparta}/cts1.csv | 0 doc/sphinx/{8_sparta => 08_sparta}/cts1.gp | 0 .../{8_sparta => 08_sparta}/cts1mem-0.25.gp | 0 .../{8_sparta => 08_sparta}/cts1mem-0.50.gp | 0 .../{8_sparta => 08_sparta}/cts1mem-1.00.gp | 0 .../{8_sparta => 08_sparta}/cts1mem-2.00.gp | 0 doc/sphinx/{8_sparta => 08_sparta}/log.sparta | 0 doc/sphinx/{8_sparta => 08_sparta}/sparta.rst | 0 .../{8_sparta => 08_sparta}/sparta_fom.py | 0 .../build_docs.rst | 0 doc/sphinx/index.rst | 27 +++++++++++------- 126 files changed, 17 insertions(+), 10 deletions(-) rename doc/sphinx/{0_intro => 00_intro}/introduction.rst (100%) rename doc/sphinx/{1_branson => 01_branson}/branson.rst (100%) rename doc/sphinx/{1_branson => 01_branson}/cpu.gp (100%) rename doc/sphinx/{1_branson => 01_branson}/cpu_133M.csv (100%) rename doc/sphinx/{1_branson => 01_branson}/cpu_200M.csv (100%) rename doc/sphinx/{1_branson => 01_branson}/cpu_66M.csv (100%) rename doc/sphinx/{1_branson => 01_branson}/gpu.csv (100%) rename doc/sphinx/{1_branson => 01_branson}/gpu.gp (100%) rename doc/sphinx/{2_amg => 02_amg}/amg.rst (100%) rename doc/sphinx/{2_amg => 02_amg}/cpu.gp (100%) rename doc/sphinx/{2_amg => 02_amg}/cpu1_120.csv (100%) rename doc/sphinx/{2_amg => 02_amg}/cpu1_160.csv (100%) rename doc/sphinx/{2_amg => 02_amg}/cpu1_200.csv (100%) rename doc/sphinx/{2_amg => 02_amg}/cpu2_200.csv (100%) rename doc/sphinx/{2_amg => 02_amg}/cpu2_256.csv (100%) rename doc/sphinx/{2_amg => 02_amg}/cpu2_320.csv (100%) rename doc/sphinx/{2_amg => 02_amg}/gpu.gp (100%) rename doc/sphinx/{2_amg => 02_amg}/gpu1.csv (100%) rename 
doc/sphinx/{2_amg => 02_amg}/gpu2.csv (100%) rename doc/sphinx/{2_amg => 02_amg}/plots/CPU-FOM-1.png (100%) rename doc/sphinx/{2_amg => 02_amg}/plots/CPU-FOM-2.png (100%) rename doc/sphinx/{2_amg => 02_amg}/plots/amg-J1.png (100%) rename doc/sphinx/{2_amg => 02_amg}/plots/amg-J2.png (100%) rename doc/sphinx/{2_amg => 02_amg}/plots/mem-J1.png (100%) rename doc/sphinx/{2_amg => 02_amg}/plots/mem-J2.png (100%) rename doc/sphinx/{3_vibe => 03_vibe}/cpu.gp (100%) rename doc/sphinx/{3_vibe => 03_vibe}/cpu_20.csv (100%) rename doc/sphinx/{3_vibe => 03_vibe}/cpu_40.csv (100%) rename doc/sphinx/{3_vibe => 03_vibe}/cpu_60.csv (100%) rename doc/sphinx/{3_vibe => 03_vibe}/do_gpu_throughput.sh (100%) rename doc/sphinx/{3_vibe => 03_vibe}/do_strong_scaling_cpu.sh (100%) rename doc/sphinx/{3_vibe => 03_vibe}/gpu.csv (100%) rename doc/sphinx/{3_vibe => 03_vibe}/gpu.gp (100%) rename doc/sphinx/{3_vibe => 03_vibe}/parthenon-ats5_spr-hbm128-intel-classic.csv (100%) rename doc/sphinx/{3_vibe => 03_vibe}/parthenon-ats5_spr-hbm160-intel-classic.csv (100%) rename doc/sphinx/{3_vibe => 03_vibe}/plots/cpu-strong.png (100%) rename doc/sphinx/{3_vibe => 03_vibe}/plots/gpu-throughput.png (100%) rename doc/sphinx/{3_vibe => 03_vibe}/vibe.rst (100%) rename doc/sphinx/{4_spatter => 04_spatter}/a100.gp (100%) rename doc/sphinx/{4_spatter => 04_spatter}/a100_throughput_001.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/a100_throughput_001fp.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/a100_throughput_001nonfp.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/a100_throughput_asteroid.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/cts1.gp (100%) rename doc/sphinx/{4_spatter => 04_spatter}/cts1_weak_average_001.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/cts1_weak_average_001fp.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/cts1_weak_average_001nonfp.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/cts1_weak_average_asteroid.csv (100%) rename 
doc/sphinx/{4_spatter => 04_spatter}/cts1_weak_total_001.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/cts1_weak_total_001fp.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/cts1_weak_total_001nonfp.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/cts1_weak_total_asteroid.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/skylake.gp (100%) rename doc/sphinx/{4_spatter => 04_spatter}/skylake_weak_average_001.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/skylake_weak_average_001fp.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/skylake_weak_average_001nonfp.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/skylake_weak_average_asteroid.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/skylake_weak_total_001.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/skylake_weak_total_001fp.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/skylake_weak_total_001nonfp.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/skylake_weak_total_asteroid.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/spatter.rst (100%) rename doc/sphinx/{4_spatter => 04_spatter}/v100.gp (100%) rename doc/sphinx/{4_spatter => 04_spatter}/v100_throughput_001.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/v100_throughput_001fp.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/v100_throughput_001nonfp.csv (100%) rename doc/sphinx/{4_spatter => 04_spatter}/v100_throughput_asteroid.csv (100%) rename doc/sphinx/{5_mlmd => 05_mlmd}/gpu.csv (100%) rename doc/sphinx/{5_mlmd => 05_mlmd}/gpu.gp (100%) rename doc/sphinx/{5_mlmd => 05_mlmd}/mlmd.rst (100%) rename doc/sphinx/{5_mlmd => 05_mlmd}/plots/StrongSingle-s.png (100%) rename doc/sphinx/{5_mlmd => 05_mlmd}/plots/StrongSingle-t.png (100%) rename doc/sphinx/{5_mlmd => 05_mlmd}/plots/WeakParallel-s.png (100%) rename doc/sphinx/{5_mlmd => 05_mlmd}/plots/WeakParallel-t.png (100%) rename doc/sphinx/{6_umt => 06_umt}/spp1_strong_scaling_cts2.csv (100%) rename doc/sphinx/{6_umt => 
06_umt}/spp1_strong_scaling_cts2.gp (100%) rename doc/sphinx/{6_umt => 06_umt}/spp1_strong_scaling_cts2_abridged.csv (100%) rename doc/sphinx/{6_umt => 06_umt}/spp1_throughput_V100.csv (100%) rename doc/sphinx/{6_umt => 06_umt}/spp1_throughput_V100.gp (100%) rename doc/sphinx/{6_umt => 06_umt}/umt.rst (100%) rename doc/sphinx/{6_umt => 06_umt}/umtsp2_strong_scaling_cpu.csv (100%) rename doc/sphinx/{6_umt => 06_umt}/umtsp2_throughput_gpu.csv (100%) rename doc/sphinx/{7_miniem => 07_miniem}/ats2.csv (100%) rename doc/sphinx/{7_miniem => 07_miniem}/ats2.gp (100%) rename doc/sphinx/{7_miniem => 07_miniem}/ats2mem.gp (100%) rename doc/sphinx/{7_miniem => 07_miniem}/cts1-0.25.csv (100%) rename doc/sphinx/{7_miniem => 07_miniem}/cts1-0.25.gp (100%) rename doc/sphinx/{7_miniem => 07_miniem}/cts1-0.50.csv (100%) rename doc/sphinx/{7_miniem => 07_miniem}/cts1-0.50.gp (100%) rename doc/sphinx/{7_miniem => 07_miniem}/cts1-1.00.csv (100%) rename doc/sphinx/{7_miniem => 07_miniem}/cts1-1.00.gp (100%) rename doc/sphinx/{7_miniem => 07_miniem}/cts1-2.00.csv (100%) rename doc/sphinx/{7_miniem => 07_miniem}/cts1-2.00.gp (100%) rename doc/sphinx/{7_miniem => 07_miniem}/cts1.csv (100%) rename doc/sphinx/{7_miniem => 07_miniem}/cts1.gp (100%) rename doc/sphinx/{7_miniem => 07_miniem}/cts1mem-0.25.gp (100%) rename doc/sphinx/{7_miniem => 07_miniem}/cts1mem-0.50.gp (100%) rename doc/sphinx/{7_miniem => 07_miniem}/cts1mem-1.00.gp (100%) rename doc/sphinx/{7_miniem => 07_miniem}/cts1mem-2.00.gp (100%) rename doc/sphinx/{7_miniem => 07_miniem}/cts1mem.gp (100%) rename doc/sphinx/{7_miniem => 07_miniem}/miniem.rst (100%) rename doc/sphinx/{7_miniem => 07_miniem}/output.log (100%) rename doc/sphinx/{7_miniem => 07_miniem}/recipe.sh (100%) rename doc/sphinx/{8_sparta => 08_sparta}/Makefile.manzano_kokkos (100%) rename doc/sphinx/{8_sparta => 08_sparta}/ats2.csv (100%) rename doc/sphinx/{8_sparta => 08_sparta}/ats2.gp (100%) rename doc/sphinx/{8_sparta => 08_sparta}/build-manzano.sh (100%) 
rename doc/sphinx/{8_sparta => 08_sparta}/cts1-0.25.csv (100%) rename doc/sphinx/{8_sparta => 08_sparta}/cts1-0.25.gp (100%) rename doc/sphinx/{8_sparta => 08_sparta}/cts1-0.50.csv (100%) rename doc/sphinx/{8_sparta => 08_sparta}/cts1-0.50.gp (100%) rename doc/sphinx/{8_sparta => 08_sparta}/cts1-1.00.csv (100%) rename doc/sphinx/{8_sparta => 08_sparta}/cts1-1.00.gp (100%) rename doc/sphinx/{8_sparta => 08_sparta}/cts1-2.00.csv (100%) rename doc/sphinx/{8_sparta => 08_sparta}/cts1-2.00.gp (100%) rename doc/sphinx/{8_sparta => 08_sparta}/cts1.csv (100%) rename doc/sphinx/{8_sparta => 08_sparta}/cts1.gp (100%) rename doc/sphinx/{8_sparta => 08_sparta}/cts1mem-0.25.gp (100%) rename doc/sphinx/{8_sparta => 08_sparta}/cts1mem-0.50.gp (100%) rename doc/sphinx/{8_sparta => 08_sparta}/cts1mem-1.00.gp (100%) rename doc/sphinx/{8_sparta => 08_sparta}/cts1mem-2.00.gp (100%) rename doc/sphinx/{8_sparta => 08_sparta}/log.sparta (100%) rename doc/sphinx/{8_sparta => 08_sparta}/sparta.rst (100%) rename doc/sphinx/{8_sparta => 08_sparta}/sparta_fom.py (100%) rename doc/sphinx/{9_appendix => 09_appendix}/build_docs.rst (100%) diff --git a/doc/sphinx/0_intro/introduction.rst b/doc/sphinx/00_intro/introduction.rst similarity index 100% rename from doc/sphinx/0_intro/introduction.rst rename to doc/sphinx/00_intro/introduction.rst diff --git a/doc/sphinx/1_branson/branson.rst b/doc/sphinx/01_branson/branson.rst similarity index 100% rename from doc/sphinx/1_branson/branson.rst rename to doc/sphinx/01_branson/branson.rst diff --git a/doc/sphinx/1_branson/cpu.gp b/doc/sphinx/01_branson/cpu.gp similarity index 100% rename from doc/sphinx/1_branson/cpu.gp rename to doc/sphinx/01_branson/cpu.gp diff --git a/doc/sphinx/1_branson/cpu_133M.csv b/doc/sphinx/01_branson/cpu_133M.csv similarity index 100% rename from doc/sphinx/1_branson/cpu_133M.csv rename to doc/sphinx/01_branson/cpu_133M.csv diff --git a/doc/sphinx/1_branson/cpu_200M.csv b/doc/sphinx/01_branson/cpu_200M.csv similarity index 100% 
rename from doc/sphinx/1_branson/cpu_200M.csv rename to doc/sphinx/01_branson/cpu_200M.csv diff --git a/doc/sphinx/1_branson/cpu_66M.csv b/doc/sphinx/01_branson/cpu_66M.csv similarity index 100% rename from doc/sphinx/1_branson/cpu_66M.csv rename to doc/sphinx/01_branson/cpu_66M.csv diff --git a/doc/sphinx/1_branson/gpu.csv b/doc/sphinx/01_branson/gpu.csv similarity index 100% rename from doc/sphinx/1_branson/gpu.csv rename to doc/sphinx/01_branson/gpu.csv diff --git a/doc/sphinx/1_branson/gpu.gp b/doc/sphinx/01_branson/gpu.gp similarity index 100% rename from doc/sphinx/1_branson/gpu.gp rename to doc/sphinx/01_branson/gpu.gp diff --git a/doc/sphinx/2_amg/amg.rst b/doc/sphinx/02_amg/amg.rst similarity index 100% rename from doc/sphinx/2_amg/amg.rst rename to doc/sphinx/02_amg/amg.rst diff --git a/doc/sphinx/2_amg/cpu.gp b/doc/sphinx/02_amg/cpu.gp similarity index 100% rename from doc/sphinx/2_amg/cpu.gp rename to doc/sphinx/02_amg/cpu.gp diff --git a/doc/sphinx/2_amg/cpu1_120.csv b/doc/sphinx/02_amg/cpu1_120.csv similarity index 100% rename from doc/sphinx/2_amg/cpu1_120.csv rename to doc/sphinx/02_amg/cpu1_120.csv diff --git a/doc/sphinx/2_amg/cpu1_160.csv b/doc/sphinx/02_amg/cpu1_160.csv similarity index 100% rename from doc/sphinx/2_amg/cpu1_160.csv rename to doc/sphinx/02_amg/cpu1_160.csv diff --git a/doc/sphinx/2_amg/cpu1_200.csv b/doc/sphinx/02_amg/cpu1_200.csv similarity index 100% rename from doc/sphinx/2_amg/cpu1_200.csv rename to doc/sphinx/02_amg/cpu1_200.csv diff --git a/doc/sphinx/2_amg/cpu2_200.csv b/doc/sphinx/02_amg/cpu2_200.csv similarity index 100% rename from doc/sphinx/2_amg/cpu2_200.csv rename to doc/sphinx/02_amg/cpu2_200.csv diff --git a/doc/sphinx/2_amg/cpu2_256.csv b/doc/sphinx/02_amg/cpu2_256.csv similarity index 100% rename from doc/sphinx/2_amg/cpu2_256.csv rename to doc/sphinx/02_amg/cpu2_256.csv diff --git a/doc/sphinx/2_amg/cpu2_320.csv b/doc/sphinx/02_amg/cpu2_320.csv similarity index 100% rename from doc/sphinx/2_amg/cpu2_320.csv 
rename to doc/sphinx/02_amg/cpu2_320.csv diff --git a/doc/sphinx/2_amg/gpu.gp b/doc/sphinx/02_amg/gpu.gp similarity index 100% rename from doc/sphinx/2_amg/gpu.gp rename to doc/sphinx/02_amg/gpu.gp diff --git a/doc/sphinx/2_amg/gpu1.csv b/doc/sphinx/02_amg/gpu1.csv similarity index 100% rename from doc/sphinx/2_amg/gpu1.csv rename to doc/sphinx/02_amg/gpu1.csv diff --git a/doc/sphinx/2_amg/gpu2.csv b/doc/sphinx/02_amg/gpu2.csv similarity index 100% rename from doc/sphinx/2_amg/gpu2.csv rename to doc/sphinx/02_amg/gpu2.csv diff --git a/doc/sphinx/2_amg/plots/CPU-FOM-1.png b/doc/sphinx/02_amg/plots/CPU-FOM-1.png similarity index 100% rename from doc/sphinx/2_amg/plots/CPU-FOM-1.png rename to doc/sphinx/02_amg/plots/CPU-FOM-1.png diff --git a/doc/sphinx/2_amg/plots/CPU-FOM-2.png b/doc/sphinx/02_amg/plots/CPU-FOM-2.png similarity index 100% rename from doc/sphinx/2_amg/plots/CPU-FOM-2.png rename to doc/sphinx/02_amg/plots/CPU-FOM-2.png diff --git a/doc/sphinx/2_amg/plots/amg-J1.png b/doc/sphinx/02_amg/plots/amg-J1.png similarity index 100% rename from doc/sphinx/2_amg/plots/amg-J1.png rename to doc/sphinx/02_amg/plots/amg-J1.png diff --git a/doc/sphinx/2_amg/plots/amg-J2.png b/doc/sphinx/02_amg/plots/amg-J2.png similarity index 100% rename from doc/sphinx/2_amg/plots/amg-J2.png rename to doc/sphinx/02_amg/plots/amg-J2.png diff --git a/doc/sphinx/2_amg/plots/mem-J1.png b/doc/sphinx/02_amg/plots/mem-J1.png similarity index 100% rename from doc/sphinx/2_amg/plots/mem-J1.png rename to doc/sphinx/02_amg/plots/mem-J1.png diff --git a/doc/sphinx/2_amg/plots/mem-J2.png b/doc/sphinx/02_amg/plots/mem-J2.png similarity index 100% rename from doc/sphinx/2_amg/plots/mem-J2.png rename to doc/sphinx/02_amg/plots/mem-J2.png diff --git a/doc/sphinx/3_vibe/cpu.gp b/doc/sphinx/03_vibe/cpu.gp similarity index 100% rename from doc/sphinx/3_vibe/cpu.gp rename to doc/sphinx/03_vibe/cpu.gp diff --git a/doc/sphinx/3_vibe/cpu_20.csv b/doc/sphinx/03_vibe/cpu_20.csv similarity index 100% rename 
from doc/sphinx/3_vibe/cpu_20.csv rename to doc/sphinx/03_vibe/cpu_20.csv diff --git a/doc/sphinx/3_vibe/cpu_40.csv b/doc/sphinx/03_vibe/cpu_40.csv similarity index 100% rename from doc/sphinx/3_vibe/cpu_40.csv rename to doc/sphinx/03_vibe/cpu_40.csv diff --git a/doc/sphinx/3_vibe/cpu_60.csv b/doc/sphinx/03_vibe/cpu_60.csv similarity index 100% rename from doc/sphinx/3_vibe/cpu_60.csv rename to doc/sphinx/03_vibe/cpu_60.csv diff --git a/doc/sphinx/3_vibe/do_gpu_throughput.sh b/doc/sphinx/03_vibe/do_gpu_throughput.sh similarity index 100% rename from doc/sphinx/3_vibe/do_gpu_throughput.sh rename to doc/sphinx/03_vibe/do_gpu_throughput.sh diff --git a/doc/sphinx/3_vibe/do_strong_scaling_cpu.sh b/doc/sphinx/03_vibe/do_strong_scaling_cpu.sh similarity index 100% rename from doc/sphinx/3_vibe/do_strong_scaling_cpu.sh rename to doc/sphinx/03_vibe/do_strong_scaling_cpu.sh diff --git a/doc/sphinx/3_vibe/gpu.csv b/doc/sphinx/03_vibe/gpu.csv similarity index 100% rename from doc/sphinx/3_vibe/gpu.csv rename to doc/sphinx/03_vibe/gpu.csv diff --git a/doc/sphinx/3_vibe/gpu.gp b/doc/sphinx/03_vibe/gpu.gp similarity index 100% rename from doc/sphinx/3_vibe/gpu.gp rename to doc/sphinx/03_vibe/gpu.gp diff --git a/doc/sphinx/3_vibe/parthenon-ats5_spr-hbm128-intel-classic.csv b/doc/sphinx/03_vibe/parthenon-ats5_spr-hbm128-intel-classic.csv similarity index 100% rename from doc/sphinx/3_vibe/parthenon-ats5_spr-hbm128-intel-classic.csv rename to doc/sphinx/03_vibe/parthenon-ats5_spr-hbm128-intel-classic.csv diff --git a/doc/sphinx/3_vibe/parthenon-ats5_spr-hbm160-intel-classic.csv b/doc/sphinx/03_vibe/parthenon-ats5_spr-hbm160-intel-classic.csv similarity index 100% rename from doc/sphinx/3_vibe/parthenon-ats5_spr-hbm160-intel-classic.csv rename to doc/sphinx/03_vibe/parthenon-ats5_spr-hbm160-intel-classic.csv diff --git a/doc/sphinx/3_vibe/plots/cpu-strong.png b/doc/sphinx/03_vibe/plots/cpu-strong.png similarity index 100% rename from doc/sphinx/3_vibe/plots/cpu-strong.png rename to 
doc/sphinx/03_vibe/plots/cpu-strong.png diff --git a/doc/sphinx/3_vibe/plots/gpu-throughput.png b/doc/sphinx/03_vibe/plots/gpu-throughput.png similarity index 100% rename from doc/sphinx/3_vibe/plots/gpu-throughput.png rename to doc/sphinx/03_vibe/plots/gpu-throughput.png diff --git a/doc/sphinx/3_vibe/vibe.rst b/doc/sphinx/03_vibe/vibe.rst similarity index 100% rename from doc/sphinx/3_vibe/vibe.rst rename to doc/sphinx/03_vibe/vibe.rst diff --git a/doc/sphinx/4_spatter/a100.gp b/doc/sphinx/04_spatter/a100.gp similarity index 100% rename from doc/sphinx/4_spatter/a100.gp rename to doc/sphinx/04_spatter/a100.gp diff --git a/doc/sphinx/4_spatter/a100_throughput_001.csv b/doc/sphinx/04_spatter/a100_throughput_001.csv similarity index 100% rename from doc/sphinx/4_spatter/a100_throughput_001.csv rename to doc/sphinx/04_spatter/a100_throughput_001.csv diff --git a/doc/sphinx/4_spatter/a100_throughput_001fp.csv b/doc/sphinx/04_spatter/a100_throughput_001fp.csv similarity index 100% rename from doc/sphinx/4_spatter/a100_throughput_001fp.csv rename to doc/sphinx/04_spatter/a100_throughput_001fp.csv diff --git a/doc/sphinx/4_spatter/a100_throughput_001nonfp.csv b/doc/sphinx/04_spatter/a100_throughput_001nonfp.csv similarity index 100% rename from doc/sphinx/4_spatter/a100_throughput_001nonfp.csv rename to doc/sphinx/04_spatter/a100_throughput_001nonfp.csv diff --git a/doc/sphinx/4_spatter/a100_throughput_asteroid.csv b/doc/sphinx/04_spatter/a100_throughput_asteroid.csv similarity index 100% rename from doc/sphinx/4_spatter/a100_throughput_asteroid.csv rename to doc/sphinx/04_spatter/a100_throughput_asteroid.csv diff --git a/doc/sphinx/4_spatter/cts1.gp b/doc/sphinx/04_spatter/cts1.gp similarity index 100% rename from doc/sphinx/4_spatter/cts1.gp rename to doc/sphinx/04_spatter/cts1.gp diff --git a/doc/sphinx/4_spatter/cts1_weak_average_001.csv b/doc/sphinx/04_spatter/cts1_weak_average_001.csv similarity index 100% rename from doc/sphinx/4_spatter/cts1_weak_average_001.csv 
rename to doc/sphinx/04_spatter/cts1_weak_average_001.csv diff --git a/doc/sphinx/4_spatter/cts1_weak_average_001fp.csv b/doc/sphinx/04_spatter/cts1_weak_average_001fp.csv similarity index 100% rename from doc/sphinx/4_spatter/cts1_weak_average_001fp.csv rename to doc/sphinx/04_spatter/cts1_weak_average_001fp.csv diff --git a/doc/sphinx/4_spatter/cts1_weak_average_001nonfp.csv b/doc/sphinx/04_spatter/cts1_weak_average_001nonfp.csv similarity index 100% rename from doc/sphinx/4_spatter/cts1_weak_average_001nonfp.csv rename to doc/sphinx/04_spatter/cts1_weak_average_001nonfp.csv diff --git a/doc/sphinx/4_spatter/cts1_weak_average_asteroid.csv b/doc/sphinx/04_spatter/cts1_weak_average_asteroid.csv similarity index 100% rename from doc/sphinx/4_spatter/cts1_weak_average_asteroid.csv rename to doc/sphinx/04_spatter/cts1_weak_average_asteroid.csv diff --git a/doc/sphinx/4_spatter/cts1_weak_total_001.csv b/doc/sphinx/04_spatter/cts1_weak_total_001.csv similarity index 100% rename from doc/sphinx/4_spatter/cts1_weak_total_001.csv rename to doc/sphinx/04_spatter/cts1_weak_total_001.csv diff --git a/doc/sphinx/4_spatter/cts1_weak_total_001fp.csv b/doc/sphinx/04_spatter/cts1_weak_total_001fp.csv similarity index 100% rename from doc/sphinx/4_spatter/cts1_weak_total_001fp.csv rename to doc/sphinx/04_spatter/cts1_weak_total_001fp.csv diff --git a/doc/sphinx/4_spatter/cts1_weak_total_001nonfp.csv b/doc/sphinx/04_spatter/cts1_weak_total_001nonfp.csv similarity index 100% rename from doc/sphinx/4_spatter/cts1_weak_total_001nonfp.csv rename to doc/sphinx/04_spatter/cts1_weak_total_001nonfp.csv diff --git a/doc/sphinx/4_spatter/cts1_weak_total_asteroid.csv b/doc/sphinx/04_spatter/cts1_weak_total_asteroid.csv similarity index 100% rename from doc/sphinx/4_spatter/cts1_weak_total_asteroid.csv rename to doc/sphinx/04_spatter/cts1_weak_total_asteroid.csv diff --git a/doc/sphinx/4_spatter/skylake.gp b/doc/sphinx/04_spatter/skylake.gp similarity index 100% rename from 
doc/sphinx/4_spatter/skylake.gp rename to doc/sphinx/04_spatter/skylake.gp diff --git a/doc/sphinx/4_spatter/skylake_weak_average_001.csv b/doc/sphinx/04_spatter/skylake_weak_average_001.csv similarity index 100% rename from doc/sphinx/4_spatter/skylake_weak_average_001.csv rename to doc/sphinx/04_spatter/skylake_weak_average_001.csv diff --git a/doc/sphinx/4_spatter/skylake_weak_average_001fp.csv b/doc/sphinx/04_spatter/skylake_weak_average_001fp.csv similarity index 100% rename from doc/sphinx/4_spatter/skylake_weak_average_001fp.csv rename to doc/sphinx/04_spatter/skylake_weak_average_001fp.csv diff --git a/doc/sphinx/4_spatter/skylake_weak_average_001nonfp.csv b/doc/sphinx/04_spatter/skylake_weak_average_001nonfp.csv similarity index 100% rename from doc/sphinx/4_spatter/skylake_weak_average_001nonfp.csv rename to doc/sphinx/04_spatter/skylake_weak_average_001nonfp.csv diff --git a/doc/sphinx/4_spatter/skylake_weak_average_asteroid.csv b/doc/sphinx/04_spatter/skylake_weak_average_asteroid.csv similarity index 100% rename from doc/sphinx/4_spatter/skylake_weak_average_asteroid.csv rename to doc/sphinx/04_spatter/skylake_weak_average_asteroid.csv diff --git a/doc/sphinx/4_spatter/skylake_weak_total_001.csv b/doc/sphinx/04_spatter/skylake_weak_total_001.csv similarity index 100% rename from doc/sphinx/4_spatter/skylake_weak_total_001.csv rename to doc/sphinx/04_spatter/skylake_weak_total_001.csv diff --git a/doc/sphinx/4_spatter/skylake_weak_total_001fp.csv b/doc/sphinx/04_spatter/skylake_weak_total_001fp.csv similarity index 100% rename from doc/sphinx/4_spatter/skylake_weak_total_001fp.csv rename to doc/sphinx/04_spatter/skylake_weak_total_001fp.csv diff --git a/doc/sphinx/4_spatter/skylake_weak_total_001nonfp.csv b/doc/sphinx/04_spatter/skylake_weak_total_001nonfp.csv similarity index 100% rename from doc/sphinx/4_spatter/skylake_weak_total_001nonfp.csv rename to doc/sphinx/04_spatter/skylake_weak_total_001nonfp.csv diff --git 
a/doc/sphinx/4_spatter/skylake_weak_total_asteroid.csv b/doc/sphinx/04_spatter/skylake_weak_total_asteroid.csv similarity index 100% rename from doc/sphinx/4_spatter/skylake_weak_total_asteroid.csv rename to doc/sphinx/04_spatter/skylake_weak_total_asteroid.csv diff --git a/doc/sphinx/4_spatter/spatter.rst b/doc/sphinx/04_spatter/spatter.rst similarity index 100% rename from doc/sphinx/4_spatter/spatter.rst rename to doc/sphinx/04_spatter/spatter.rst diff --git a/doc/sphinx/4_spatter/v100.gp b/doc/sphinx/04_spatter/v100.gp similarity index 100% rename from doc/sphinx/4_spatter/v100.gp rename to doc/sphinx/04_spatter/v100.gp diff --git a/doc/sphinx/4_spatter/v100_throughput_001.csv b/doc/sphinx/04_spatter/v100_throughput_001.csv similarity index 100% rename from doc/sphinx/4_spatter/v100_throughput_001.csv rename to doc/sphinx/04_spatter/v100_throughput_001.csv diff --git a/doc/sphinx/4_spatter/v100_throughput_001fp.csv b/doc/sphinx/04_spatter/v100_throughput_001fp.csv similarity index 100% rename from doc/sphinx/4_spatter/v100_throughput_001fp.csv rename to doc/sphinx/04_spatter/v100_throughput_001fp.csv diff --git a/doc/sphinx/4_spatter/v100_throughput_001nonfp.csv b/doc/sphinx/04_spatter/v100_throughput_001nonfp.csv similarity index 100% rename from doc/sphinx/4_spatter/v100_throughput_001nonfp.csv rename to doc/sphinx/04_spatter/v100_throughput_001nonfp.csv diff --git a/doc/sphinx/4_spatter/v100_throughput_asteroid.csv b/doc/sphinx/04_spatter/v100_throughput_asteroid.csv similarity index 100% rename from doc/sphinx/4_spatter/v100_throughput_asteroid.csv rename to doc/sphinx/04_spatter/v100_throughput_asteroid.csv diff --git a/doc/sphinx/5_mlmd/gpu.csv b/doc/sphinx/05_mlmd/gpu.csv similarity index 100% rename from doc/sphinx/5_mlmd/gpu.csv rename to doc/sphinx/05_mlmd/gpu.csv diff --git a/doc/sphinx/5_mlmd/gpu.gp b/doc/sphinx/05_mlmd/gpu.gp similarity index 100% rename from doc/sphinx/5_mlmd/gpu.gp rename to doc/sphinx/05_mlmd/gpu.gp diff --git 
a/doc/sphinx/5_mlmd/mlmd.rst b/doc/sphinx/05_mlmd/mlmd.rst similarity index 100% rename from doc/sphinx/5_mlmd/mlmd.rst rename to doc/sphinx/05_mlmd/mlmd.rst diff --git a/doc/sphinx/5_mlmd/plots/StrongSingle-s.png b/doc/sphinx/05_mlmd/plots/StrongSingle-s.png similarity index 100% rename from doc/sphinx/5_mlmd/plots/StrongSingle-s.png rename to doc/sphinx/05_mlmd/plots/StrongSingle-s.png diff --git a/doc/sphinx/5_mlmd/plots/StrongSingle-t.png b/doc/sphinx/05_mlmd/plots/StrongSingle-t.png similarity index 100% rename from doc/sphinx/5_mlmd/plots/StrongSingle-t.png rename to doc/sphinx/05_mlmd/plots/StrongSingle-t.png diff --git a/doc/sphinx/5_mlmd/plots/WeakParallel-s.png b/doc/sphinx/05_mlmd/plots/WeakParallel-s.png similarity index 100% rename from doc/sphinx/5_mlmd/plots/WeakParallel-s.png rename to doc/sphinx/05_mlmd/plots/WeakParallel-s.png diff --git a/doc/sphinx/5_mlmd/plots/WeakParallel-t.png b/doc/sphinx/05_mlmd/plots/WeakParallel-t.png similarity index 100% rename from doc/sphinx/5_mlmd/plots/WeakParallel-t.png rename to doc/sphinx/05_mlmd/plots/WeakParallel-t.png diff --git a/doc/sphinx/6_umt/spp1_strong_scaling_cts2.csv b/doc/sphinx/06_umt/spp1_strong_scaling_cts2.csv similarity index 100% rename from doc/sphinx/6_umt/spp1_strong_scaling_cts2.csv rename to doc/sphinx/06_umt/spp1_strong_scaling_cts2.csv diff --git a/doc/sphinx/6_umt/spp1_strong_scaling_cts2.gp b/doc/sphinx/06_umt/spp1_strong_scaling_cts2.gp similarity index 100% rename from doc/sphinx/6_umt/spp1_strong_scaling_cts2.gp rename to doc/sphinx/06_umt/spp1_strong_scaling_cts2.gp diff --git a/doc/sphinx/6_umt/spp1_strong_scaling_cts2_abridged.csv b/doc/sphinx/06_umt/spp1_strong_scaling_cts2_abridged.csv similarity index 100% rename from doc/sphinx/6_umt/spp1_strong_scaling_cts2_abridged.csv rename to doc/sphinx/06_umt/spp1_strong_scaling_cts2_abridged.csv diff --git a/doc/sphinx/6_umt/spp1_throughput_V100.csv b/doc/sphinx/06_umt/spp1_throughput_V100.csv similarity index 100% rename from 
doc/sphinx/6_umt/spp1_throughput_V100.csv rename to doc/sphinx/06_umt/spp1_throughput_V100.csv diff --git a/doc/sphinx/6_umt/spp1_throughput_V100.gp b/doc/sphinx/06_umt/spp1_throughput_V100.gp similarity index 100% rename from doc/sphinx/6_umt/spp1_throughput_V100.gp rename to doc/sphinx/06_umt/spp1_throughput_V100.gp diff --git a/doc/sphinx/6_umt/umt.rst b/doc/sphinx/06_umt/umt.rst similarity index 100% rename from doc/sphinx/6_umt/umt.rst rename to doc/sphinx/06_umt/umt.rst diff --git a/doc/sphinx/6_umt/umtsp2_strong_scaling_cpu.csv b/doc/sphinx/06_umt/umtsp2_strong_scaling_cpu.csv similarity index 100% rename from doc/sphinx/6_umt/umtsp2_strong_scaling_cpu.csv rename to doc/sphinx/06_umt/umtsp2_strong_scaling_cpu.csv diff --git a/doc/sphinx/6_umt/umtsp2_throughput_gpu.csv b/doc/sphinx/06_umt/umtsp2_throughput_gpu.csv similarity index 100% rename from doc/sphinx/6_umt/umtsp2_throughput_gpu.csv rename to doc/sphinx/06_umt/umtsp2_throughput_gpu.csv diff --git a/doc/sphinx/7_miniem/ats2.csv b/doc/sphinx/07_miniem/ats2.csv similarity index 100% rename from doc/sphinx/7_miniem/ats2.csv rename to doc/sphinx/07_miniem/ats2.csv diff --git a/doc/sphinx/7_miniem/ats2.gp b/doc/sphinx/07_miniem/ats2.gp similarity index 100% rename from doc/sphinx/7_miniem/ats2.gp rename to doc/sphinx/07_miniem/ats2.gp diff --git a/doc/sphinx/7_miniem/ats2mem.gp b/doc/sphinx/07_miniem/ats2mem.gp similarity index 100% rename from doc/sphinx/7_miniem/ats2mem.gp rename to doc/sphinx/07_miniem/ats2mem.gp diff --git a/doc/sphinx/7_miniem/cts1-0.25.csv b/doc/sphinx/07_miniem/cts1-0.25.csv similarity index 100% rename from doc/sphinx/7_miniem/cts1-0.25.csv rename to doc/sphinx/07_miniem/cts1-0.25.csv diff --git a/doc/sphinx/7_miniem/cts1-0.25.gp b/doc/sphinx/07_miniem/cts1-0.25.gp similarity index 100% rename from doc/sphinx/7_miniem/cts1-0.25.gp rename to doc/sphinx/07_miniem/cts1-0.25.gp diff --git a/doc/sphinx/7_miniem/cts1-0.50.csv b/doc/sphinx/07_miniem/cts1-0.50.csv similarity index 100% 
rename from doc/sphinx/7_miniem/cts1-0.50.csv rename to doc/sphinx/07_miniem/cts1-0.50.csv diff --git a/doc/sphinx/7_miniem/cts1-0.50.gp b/doc/sphinx/07_miniem/cts1-0.50.gp similarity index 100% rename from doc/sphinx/7_miniem/cts1-0.50.gp rename to doc/sphinx/07_miniem/cts1-0.50.gp diff --git a/doc/sphinx/7_miniem/cts1-1.00.csv b/doc/sphinx/07_miniem/cts1-1.00.csv similarity index 100% rename from doc/sphinx/7_miniem/cts1-1.00.csv rename to doc/sphinx/07_miniem/cts1-1.00.csv diff --git a/doc/sphinx/7_miniem/cts1-1.00.gp b/doc/sphinx/07_miniem/cts1-1.00.gp similarity index 100% rename from doc/sphinx/7_miniem/cts1-1.00.gp rename to doc/sphinx/07_miniem/cts1-1.00.gp diff --git a/doc/sphinx/7_miniem/cts1-2.00.csv b/doc/sphinx/07_miniem/cts1-2.00.csv similarity index 100% rename from doc/sphinx/7_miniem/cts1-2.00.csv rename to doc/sphinx/07_miniem/cts1-2.00.csv diff --git a/doc/sphinx/7_miniem/cts1-2.00.gp b/doc/sphinx/07_miniem/cts1-2.00.gp similarity index 100% rename from doc/sphinx/7_miniem/cts1-2.00.gp rename to doc/sphinx/07_miniem/cts1-2.00.gp diff --git a/doc/sphinx/7_miniem/cts1.csv b/doc/sphinx/07_miniem/cts1.csv similarity index 100% rename from doc/sphinx/7_miniem/cts1.csv rename to doc/sphinx/07_miniem/cts1.csv diff --git a/doc/sphinx/7_miniem/cts1.gp b/doc/sphinx/07_miniem/cts1.gp similarity index 100% rename from doc/sphinx/7_miniem/cts1.gp rename to doc/sphinx/07_miniem/cts1.gp diff --git a/doc/sphinx/7_miniem/cts1mem-0.25.gp b/doc/sphinx/07_miniem/cts1mem-0.25.gp similarity index 100% rename from doc/sphinx/7_miniem/cts1mem-0.25.gp rename to doc/sphinx/07_miniem/cts1mem-0.25.gp diff --git a/doc/sphinx/7_miniem/cts1mem-0.50.gp b/doc/sphinx/07_miniem/cts1mem-0.50.gp similarity index 100% rename from doc/sphinx/7_miniem/cts1mem-0.50.gp rename to doc/sphinx/07_miniem/cts1mem-0.50.gp diff --git a/doc/sphinx/7_miniem/cts1mem-1.00.gp b/doc/sphinx/07_miniem/cts1mem-1.00.gp similarity index 100% rename from doc/sphinx/7_miniem/cts1mem-1.00.gp rename to 
doc/sphinx/07_miniem/cts1mem-1.00.gp diff --git a/doc/sphinx/7_miniem/cts1mem-2.00.gp b/doc/sphinx/07_miniem/cts1mem-2.00.gp similarity index 100% rename from doc/sphinx/7_miniem/cts1mem-2.00.gp rename to doc/sphinx/07_miniem/cts1mem-2.00.gp diff --git a/doc/sphinx/7_miniem/cts1mem.gp b/doc/sphinx/07_miniem/cts1mem.gp similarity index 100% rename from doc/sphinx/7_miniem/cts1mem.gp rename to doc/sphinx/07_miniem/cts1mem.gp diff --git a/doc/sphinx/7_miniem/miniem.rst b/doc/sphinx/07_miniem/miniem.rst similarity index 100% rename from doc/sphinx/7_miniem/miniem.rst rename to doc/sphinx/07_miniem/miniem.rst diff --git a/doc/sphinx/7_miniem/output.log b/doc/sphinx/07_miniem/output.log similarity index 100% rename from doc/sphinx/7_miniem/output.log rename to doc/sphinx/07_miniem/output.log diff --git a/doc/sphinx/7_miniem/recipe.sh b/doc/sphinx/07_miniem/recipe.sh similarity index 100% rename from doc/sphinx/7_miniem/recipe.sh rename to doc/sphinx/07_miniem/recipe.sh diff --git a/doc/sphinx/8_sparta/Makefile.manzano_kokkos b/doc/sphinx/08_sparta/Makefile.manzano_kokkos similarity index 100% rename from doc/sphinx/8_sparta/Makefile.manzano_kokkos rename to doc/sphinx/08_sparta/Makefile.manzano_kokkos diff --git a/doc/sphinx/8_sparta/ats2.csv b/doc/sphinx/08_sparta/ats2.csv similarity index 100% rename from doc/sphinx/8_sparta/ats2.csv rename to doc/sphinx/08_sparta/ats2.csv diff --git a/doc/sphinx/8_sparta/ats2.gp b/doc/sphinx/08_sparta/ats2.gp similarity index 100% rename from doc/sphinx/8_sparta/ats2.gp rename to doc/sphinx/08_sparta/ats2.gp diff --git a/doc/sphinx/8_sparta/build-manzano.sh b/doc/sphinx/08_sparta/build-manzano.sh similarity index 100% rename from doc/sphinx/8_sparta/build-manzano.sh rename to doc/sphinx/08_sparta/build-manzano.sh diff --git a/doc/sphinx/8_sparta/cts1-0.25.csv b/doc/sphinx/08_sparta/cts1-0.25.csv similarity index 100% rename from doc/sphinx/8_sparta/cts1-0.25.csv rename to doc/sphinx/08_sparta/cts1-0.25.csv diff --git 
a/doc/sphinx/8_sparta/cts1-0.25.gp b/doc/sphinx/08_sparta/cts1-0.25.gp similarity index 100% rename from doc/sphinx/8_sparta/cts1-0.25.gp rename to doc/sphinx/08_sparta/cts1-0.25.gp diff --git a/doc/sphinx/8_sparta/cts1-0.50.csv b/doc/sphinx/08_sparta/cts1-0.50.csv similarity index 100% rename from doc/sphinx/8_sparta/cts1-0.50.csv rename to doc/sphinx/08_sparta/cts1-0.50.csv diff --git a/doc/sphinx/8_sparta/cts1-0.50.gp b/doc/sphinx/08_sparta/cts1-0.50.gp similarity index 100% rename from doc/sphinx/8_sparta/cts1-0.50.gp rename to doc/sphinx/08_sparta/cts1-0.50.gp diff --git a/doc/sphinx/8_sparta/cts1-1.00.csv b/doc/sphinx/08_sparta/cts1-1.00.csv similarity index 100% rename from doc/sphinx/8_sparta/cts1-1.00.csv rename to doc/sphinx/08_sparta/cts1-1.00.csv diff --git a/doc/sphinx/8_sparta/cts1-1.00.gp b/doc/sphinx/08_sparta/cts1-1.00.gp similarity index 100% rename from doc/sphinx/8_sparta/cts1-1.00.gp rename to doc/sphinx/08_sparta/cts1-1.00.gp diff --git a/doc/sphinx/8_sparta/cts1-2.00.csv b/doc/sphinx/08_sparta/cts1-2.00.csv similarity index 100% rename from doc/sphinx/8_sparta/cts1-2.00.csv rename to doc/sphinx/08_sparta/cts1-2.00.csv diff --git a/doc/sphinx/8_sparta/cts1-2.00.gp b/doc/sphinx/08_sparta/cts1-2.00.gp similarity index 100% rename from doc/sphinx/8_sparta/cts1-2.00.gp rename to doc/sphinx/08_sparta/cts1-2.00.gp diff --git a/doc/sphinx/8_sparta/cts1.csv b/doc/sphinx/08_sparta/cts1.csv similarity index 100% rename from doc/sphinx/8_sparta/cts1.csv rename to doc/sphinx/08_sparta/cts1.csv diff --git a/doc/sphinx/8_sparta/cts1.gp b/doc/sphinx/08_sparta/cts1.gp similarity index 100% rename from doc/sphinx/8_sparta/cts1.gp rename to doc/sphinx/08_sparta/cts1.gp diff --git a/doc/sphinx/8_sparta/cts1mem-0.25.gp b/doc/sphinx/08_sparta/cts1mem-0.25.gp similarity index 100% rename from doc/sphinx/8_sparta/cts1mem-0.25.gp rename to doc/sphinx/08_sparta/cts1mem-0.25.gp diff --git a/doc/sphinx/8_sparta/cts1mem-0.50.gp b/doc/sphinx/08_sparta/cts1mem-0.50.gp 
similarity index 100% rename from doc/sphinx/8_sparta/cts1mem-0.50.gp rename to doc/sphinx/08_sparta/cts1mem-0.50.gp diff --git a/doc/sphinx/8_sparta/cts1mem-1.00.gp b/doc/sphinx/08_sparta/cts1mem-1.00.gp similarity index 100% rename from doc/sphinx/8_sparta/cts1mem-1.00.gp rename to doc/sphinx/08_sparta/cts1mem-1.00.gp diff --git a/doc/sphinx/8_sparta/cts1mem-2.00.gp b/doc/sphinx/08_sparta/cts1mem-2.00.gp similarity index 100% rename from doc/sphinx/8_sparta/cts1mem-2.00.gp rename to doc/sphinx/08_sparta/cts1mem-2.00.gp diff --git a/doc/sphinx/8_sparta/log.sparta b/doc/sphinx/08_sparta/log.sparta similarity index 100% rename from doc/sphinx/8_sparta/log.sparta rename to doc/sphinx/08_sparta/log.sparta diff --git a/doc/sphinx/8_sparta/sparta.rst b/doc/sphinx/08_sparta/sparta.rst similarity index 100% rename from doc/sphinx/8_sparta/sparta.rst rename to doc/sphinx/08_sparta/sparta.rst diff --git a/doc/sphinx/8_sparta/sparta_fom.py b/doc/sphinx/08_sparta/sparta_fom.py similarity index 100% rename from doc/sphinx/8_sparta/sparta_fom.py rename to doc/sphinx/08_sparta/sparta_fom.py diff --git a/doc/sphinx/9_appendix/build_docs.rst b/doc/sphinx/09_appendix/build_docs.rst similarity index 100% rename from doc/sphinx/9_appendix/build_docs.rst rename to doc/sphinx/09_appendix/build_docs.rst diff --git a/doc/sphinx/index.rst b/doc/sphinx/index.rst index ba86c085..6f9224d1 100644 --- a/doc/sphinx/index.rst +++ b/doc/sphinx/index.rst @@ -11,22 +11,22 @@ ATS Benchmarks Project :maxdepth: 3 :caption: Main: - 0_intro/introduction - 1_branson/branson - 2_amg/amg - 3_vibe/vibe - 4_spatter/spatter - 5_mlmd/mlmd - 6_umt/umt - 7_miniem/miniem - 8_sparta/sparta + 00_intro/introduction + 01_branson/branson + 02_amg/amg + 03_vibe/vibe + 04_spatter/spatter + 05_mlmd/mlmd + 06_umt/umt + 07_miniem/miniem + 08_sparta/sparta .. toctree:: :numbered: :maxdepth: 3 :caption: Appendices: - 9_appendix/build_docs + 09_appendix/build_docs .. 
toctree:: :numbered: @@ -41,6 +41,13 @@ ATS Benchmarks Project 10_Microbenchmarks/M5_OSUMB/OSUMB 10_Microbenchmarks/M6_MDTEST/MDTEST +.. toctree:: + :numbered: + :maxdepth: 3 + :caption: Reference Systems: + + 11_ReferenceSystems/crossroads + .. Indices and tables ================== .. * :ref:`genindex` From 6184f4706c8a6c9af21d5c7282b09efa9d14ac14 Mon Sep 17 00:00:00 2001 From: Anthony Agelastos Date: Mon, 18 Sep 2023 20:08:52 -0600 Subject: [PATCH 03/12] changed microbenchmark folder to be named more appropriately --- .../M0_Intro/introduction.rst | 0 .../M1_STREAM/STREAM.rst | 0 .../M1_STREAM/cpu.gp | 0 .../stream-cts1_ats5intel-oneapi-openmpi.csv | 0 .../M2_DAXPY/DAXPY.rst | 0 .../M2_DAXPY/cpu.gp | 0 .../M3_DGEMM/DGEMM.rst | 0 .../M3_DGEMM/cpu.gp | 0 .../M4_IOR/IOR.rst | 0 .../M4_IOR/cpu.gp | 0 .../M5_OSUMB/OSUMB.rst | 0 .../M5_OSUMB/cpu.gp | 0 .../M6_MDTEST/MDTEST.rst | 0 .../M6_MDTEST/cpu.gp | 0 doc/sphinx/index.rst | 16 ++++++++-------- 15 files changed, 8 insertions(+), 8 deletions(-) rename doc/sphinx/{10_Microbenchmarks => 10_microbenchmarks}/M0_Intro/introduction.rst (100%) rename doc/sphinx/{10_Microbenchmarks => 10_microbenchmarks}/M1_STREAM/STREAM.rst (100%) rename doc/sphinx/{10_Microbenchmarks => 10_microbenchmarks}/M1_STREAM/cpu.gp (100%) rename doc/sphinx/{10_Microbenchmarks => 10_microbenchmarks}/M1_STREAM/stream-cts1_ats5intel-oneapi-openmpi.csv (100%) rename doc/sphinx/{10_Microbenchmarks => 10_microbenchmarks}/M2_DAXPY/DAXPY.rst (100%) rename doc/sphinx/{10_Microbenchmarks => 10_microbenchmarks}/M2_DAXPY/cpu.gp (100%) rename doc/sphinx/{10_Microbenchmarks => 10_microbenchmarks}/M3_DGEMM/DGEMM.rst (100%) rename doc/sphinx/{10_Microbenchmarks => 10_microbenchmarks}/M3_DGEMM/cpu.gp (100%) rename doc/sphinx/{10_Microbenchmarks => 10_microbenchmarks}/M4_IOR/IOR.rst (100%) rename doc/sphinx/{10_Microbenchmarks => 10_microbenchmarks}/M4_IOR/cpu.gp (100%) rename doc/sphinx/{10_Microbenchmarks => 10_microbenchmarks}/M5_OSUMB/OSUMB.rst (100%) rename 
doc/sphinx/{10_Microbenchmarks => 10_microbenchmarks}/M5_OSUMB/cpu.gp (100%) rename doc/sphinx/{10_Microbenchmarks => 10_microbenchmarks}/M6_MDTEST/MDTEST.rst (100%) rename doc/sphinx/{10_Microbenchmarks => 10_microbenchmarks}/M6_MDTEST/cpu.gp (100%) diff --git a/doc/sphinx/10_Microbenchmarks/M0_Intro/introduction.rst b/doc/sphinx/10_microbenchmarks/M0_Intro/introduction.rst similarity index 100% rename from doc/sphinx/10_Microbenchmarks/M0_Intro/introduction.rst rename to doc/sphinx/10_microbenchmarks/M0_Intro/introduction.rst diff --git a/doc/sphinx/10_Microbenchmarks/M1_STREAM/STREAM.rst b/doc/sphinx/10_microbenchmarks/M1_STREAM/STREAM.rst similarity index 100% rename from doc/sphinx/10_Microbenchmarks/M1_STREAM/STREAM.rst rename to doc/sphinx/10_microbenchmarks/M1_STREAM/STREAM.rst diff --git a/doc/sphinx/10_Microbenchmarks/M1_STREAM/cpu.gp b/doc/sphinx/10_microbenchmarks/M1_STREAM/cpu.gp similarity index 100% rename from doc/sphinx/10_Microbenchmarks/M1_STREAM/cpu.gp rename to doc/sphinx/10_microbenchmarks/M1_STREAM/cpu.gp diff --git a/doc/sphinx/10_Microbenchmarks/M1_STREAM/stream-cts1_ats5intel-oneapi-openmpi.csv b/doc/sphinx/10_microbenchmarks/M1_STREAM/stream-cts1_ats5intel-oneapi-openmpi.csv similarity index 100% rename from doc/sphinx/10_Microbenchmarks/M1_STREAM/stream-cts1_ats5intel-oneapi-openmpi.csv rename to doc/sphinx/10_microbenchmarks/M1_STREAM/stream-cts1_ats5intel-oneapi-openmpi.csv diff --git a/doc/sphinx/10_Microbenchmarks/M2_DAXPY/DAXPY.rst b/doc/sphinx/10_microbenchmarks/M2_DAXPY/DAXPY.rst similarity index 100% rename from doc/sphinx/10_Microbenchmarks/M2_DAXPY/DAXPY.rst rename to doc/sphinx/10_microbenchmarks/M2_DAXPY/DAXPY.rst diff --git a/doc/sphinx/10_Microbenchmarks/M2_DAXPY/cpu.gp b/doc/sphinx/10_microbenchmarks/M2_DAXPY/cpu.gp similarity index 100% rename from doc/sphinx/10_Microbenchmarks/M2_DAXPY/cpu.gp rename to doc/sphinx/10_microbenchmarks/M2_DAXPY/cpu.gp diff --git a/doc/sphinx/10_Microbenchmarks/M3_DGEMM/DGEMM.rst 
b/doc/sphinx/10_microbenchmarks/M3_DGEMM/DGEMM.rst similarity index 100% rename from doc/sphinx/10_Microbenchmarks/M3_DGEMM/DGEMM.rst rename to doc/sphinx/10_microbenchmarks/M3_DGEMM/DGEMM.rst diff --git a/doc/sphinx/10_Microbenchmarks/M3_DGEMM/cpu.gp b/doc/sphinx/10_microbenchmarks/M3_DGEMM/cpu.gp similarity index 100% rename from doc/sphinx/10_Microbenchmarks/M3_DGEMM/cpu.gp rename to doc/sphinx/10_microbenchmarks/M3_DGEMM/cpu.gp diff --git a/doc/sphinx/10_Microbenchmarks/M4_IOR/IOR.rst b/doc/sphinx/10_microbenchmarks/M4_IOR/IOR.rst similarity index 100% rename from doc/sphinx/10_Microbenchmarks/M4_IOR/IOR.rst rename to doc/sphinx/10_microbenchmarks/M4_IOR/IOR.rst diff --git a/doc/sphinx/10_Microbenchmarks/M4_IOR/cpu.gp b/doc/sphinx/10_microbenchmarks/M4_IOR/cpu.gp similarity index 100% rename from doc/sphinx/10_Microbenchmarks/M4_IOR/cpu.gp rename to doc/sphinx/10_microbenchmarks/M4_IOR/cpu.gp diff --git a/doc/sphinx/10_Microbenchmarks/M5_OSUMB/OSUMB.rst b/doc/sphinx/10_microbenchmarks/M5_OSUMB/OSUMB.rst similarity index 100% rename from doc/sphinx/10_Microbenchmarks/M5_OSUMB/OSUMB.rst rename to doc/sphinx/10_microbenchmarks/M5_OSUMB/OSUMB.rst diff --git a/doc/sphinx/10_Microbenchmarks/M5_OSUMB/cpu.gp b/doc/sphinx/10_microbenchmarks/M5_OSUMB/cpu.gp similarity index 100% rename from doc/sphinx/10_Microbenchmarks/M5_OSUMB/cpu.gp rename to doc/sphinx/10_microbenchmarks/M5_OSUMB/cpu.gp diff --git a/doc/sphinx/10_Microbenchmarks/M6_MDTEST/MDTEST.rst b/doc/sphinx/10_microbenchmarks/M6_MDTEST/MDTEST.rst similarity index 100% rename from doc/sphinx/10_Microbenchmarks/M6_MDTEST/MDTEST.rst rename to doc/sphinx/10_microbenchmarks/M6_MDTEST/MDTEST.rst diff --git a/doc/sphinx/10_Microbenchmarks/M6_MDTEST/cpu.gp b/doc/sphinx/10_microbenchmarks/M6_MDTEST/cpu.gp similarity index 100% rename from doc/sphinx/10_Microbenchmarks/M6_MDTEST/cpu.gp rename to doc/sphinx/10_microbenchmarks/M6_MDTEST/cpu.gp diff --git a/doc/sphinx/index.rst b/doc/sphinx/index.rst index 6f9224d1..7ba44618 
100644 --- a/doc/sphinx/index.rst +++ b/doc/sphinx/index.rst @@ -33,20 +33,20 @@ ATS Benchmarks Project :maxdepth: 3 :caption: Microbenchmarks: - 10_Microbenchmarks/M0_INTRO/introduction - 10_Microbenchmarks/M1_STREAM/STREAM - 10_Microbenchmarks/M2_DAXPY/DAXPY - 10_Microbenchmarks/M3_DGEMM/DGEMM - 10_Microbenchmarks/M4_IOR/IOR - 10_Microbenchmarks/M5_OSUMB/OSUMB - 10_Microbenchmarks/M6_MDTEST/MDTEST + 10_microbenchmarks/M0_Intro/introduction + 10_microbenchmarks/M1_STREAM/STREAM + 10_microbenchmarks/M2_DAXPY/DAXPY + 10_microbenchmarks/M3_DGEMM/DGEMM + 10_microbenchmarks/M4_IOR/IOR + 10_microbenchmarks/M5_OSUMB/OSUMB + 10_microbenchmarks/M6_MDTEST/MDTEST .. toctree:: :numbered: :maxdepth: 3 :caption: Reference Systems: - 11_ReferenceSystems/crossroads + 11_reference_systems/crossroads .. Indices and tables ================== From 497303359ae29ba0edfcae768eccbfb8c036292d Mon Sep 17 00:00:00 2001 From: Anthony Agelastos Date: Mon, 18 Sep 2023 20:16:07 -0600 Subject: [PATCH 04/12] added stub for crossroads ref --- .../11_reference_systems/.#crossroads.rst | 1 + .../11_reference_systems/crossroads.rst | 248 ++++++++++++++++++ 2 files changed, 249 insertions(+) create mode 120000 doc/sphinx/11_reference_systems/.#crossroads.rst create mode 100644 doc/sphinx/11_reference_systems/crossroads.rst diff --git a/doc/sphinx/11_reference_systems/.#crossroads.rst b/doc/sphinx/11_reference_systems/.#crossroads.rst new file mode 120000 index 00000000..4a13ab3d --- /dev/null +++ b/doc/sphinx/11_reference_systems/.#crossroads.rst @@ -0,0 +1 @@ +amagela@s1091389.7166 \ No newline at end of file diff --git a/doc/sphinx/11_reference_systems/crossroads.rst b/doc/sphinx/11_reference_systems/crossroads.rst new file mode 100644 index 00000000..4840094c --- /dev/null +++ b/doc/sphinx/11_reference_systems/crossroads.rst @@ -0,0 +1,248 @@ +************ +Introduction +************ + +This is the documentation for the **ATS-5 Benchmarks**. 
+ +Assuring that real applications perform efficiently on ATS-5 is key to their success. +A suite of benchmarks have been developed for Request For Proposal (RFP) response evaluation and system acceptance. +These codes are representative of the workloads of the NNSA laboratories. + +The benchmarks contained within this site represent a pre-RFP draft state. Over the next few months the +benchmarks will change somewhat. While we expect most of the changes will be additions and modifications it is possible that we will remove +benchmarks prior to RFP. + +To use these benchmarks please refer to the ATS-5 benchmarks repository `ATS-5 repo `_ + +Benchmark changes from Crossroads +================================= + +The key differences from Crossroads benchmarks and ATS-5 benchmarks are as summarized below: + +.. list-table:: + + * - **Crossroads** + - **ATS-5** + - **Notes** + * - Few GPU-ready benchmarks + - | All proxy benchmarks have + | GPU implementations. + - + * - | System level performance metric: + | Scalable System Improvement + | geometric mean of app FOMs. + | Use of single node benchmarks + | for RFP. + - | Multi-node benchmarking for + | system acceptance based on + | RFP benchmarks, negotiated + | with vendor as part of SOW. + - | Attempting to limit multi-node + | benchmarking for RFP + | to communication (MPI), and + | IO (IOR). Expect responses to + | include multiple node + | configurations and ability to + | compose them to meet our needs + | in a codesign partnership. + | Will use scaled single node + | improvement to assess proposals + | (along with other factors) and + | SSI for acceptance. + * - | Mini-Apps + full scale apps + | some of which were export + | controlled. + - | Mini-apps only - all open + | source. + - + * - No Machine Learning. + - | ML training and inference + | included. + - | Focuses on material science + | workloads of relevance. + + + +Benchmark Overview +================== + +.. 
list-table:: + + * - **Benchmark** + - **Description** + - **Language** + - **Parallelism** + * - Branson + - Implicit Monte Carlo transport + - C++ + - MPI + Cuda/HIP + * - AMG2023 + - | AMG solver of sparse matrices + | using Hypre + - C + - | MPI+CUDA/HIP/SYCL + | OpenMP on CPU + * - MiniEM + - Electro-Magnetics solver + - C++ + - MPI+Kokkos + * - MLMD + - | ML Training of interatomic + | potential model using HIPYNN + | on VASP Simulation data. + | ML inference using LAMMPS, + | Kokkos, and HIPYNN trained + | interatomic potential model. + - Python, C++, C + - MPI+Cuda/HIP + * - Parthenon-VIBE + - | Block structured AMR proxy using + | the Parthenon framework. + - C++ + - MPI+Kokkos + * - Sparta + - Direct Simulation Monte Carlo + - C++ + - MPI+Kokkos + * - UMT + - Deterministic (Sn) transport + - Fortran + - | MPI+OpenMP and + | OpenMP Offload + + + +Microbenchmark Overview +======================= + +.. list-table:: + + * - **Benchmark** + - **Description** + - **Language** + - **Parallelism** + - **Multi-node** + * - Stream + - Streaming memory bandwidth test + - C/Fortran + - OpenMP + - No + * - Spatter + - | Sparse memory bandwidth test + | driven by application memory + | access patterns. + - C++ + - | MPI+OpenMP/ + | CUDA/OpenCL + - No + * - | OSU MPI + + | Sandia SMB + | message rate + - MPI Performance Benchmarks + - C++ + - MPI + - Yes + * - DGEMM + - | Single node floating-point + | performance on matrix multiply. + - C/Fortran + - Various + - No + * - DAXPY + - | Single node floating-point + | performance of a scaled vector + | plus a vector. + - C/Fortran + - Various + - No + * - IOR + - | Performance testing of parallel + | file system using various + | interfaces and access patterns. + - C + - MPI + - No + * - mdtest + - | Metadata benchmark that performs + | open/stat/close operations on + | files and directories. 
+ - C + - MPI + - Yes + + +Run Rules Synopsis +================== + +Single node benchmarks will require respondent to provide estimates on + +* strong scaling for CPU architectures. + +* throughput curves for GPU architectures. + +* estimates must be provided for each compute node type (including options). + +* Problem size must be changed to meet % of memory requirements. + +Source code modification categories: + +* Baseline: “out-of-the-box” performance + + * Code modifications not permitted + + * Compiler options can be modified, library substitutions permitted, problem decomposition may be changed + +* Ported: “alternative baseline for new architectures” + + * Limited source-code modifications are permitted to port and tune for the target architecture using directives or commonly used interfaces. + +* Optimized: "speed of light" + + * Aggressive code changes that enhance performance are permitted. + + * Algorithms fundamental to the program may not be replaced. + + * The modified code must still pass validation tests. + +Required results: + + * A **baseline** or **ported** result is required for each benchmark. If baseline cannot be obtained, ported results may be provided. + +Optional results: + + * **Ported** results may be provided in addition to the baseline if minor code changes enable substantial performance gain. + + * **Optimized** results to showcase system capabilities. + +Scaled Single Node Improvement +============================== +One element of evaluation will focus on scaled single node improvement (SSNI). SSNI is defined as follows: + +Given two platforms using one as a reference, SSNI is defined as a weighted geometric mean using the following equation. + +.. 
math:: + + SSNI = N(\prod_{i=1}^{M}(S_i)^{w_i})^\frac{1}{\sum_{i=1}^{M}{W_i}} + + +Where: + +* N = Number of nodes on ATS-5 system / Number of nodes on reference system, + +* M = total number of Benchmarks, + +* S = application speedup; Figure of Merit on ATS-5 system / Figure of Merit on reference system (Crossroads); S must be greater than 1, + +* w = weighting factor. + + + +Approvals +========= + +- LA-UR-23-22084 Approved for public release; distribution is unlimited. +- Content from Sandia National Laboratories considered unclassified with + unlimited distribution under SAND2023-12176O, SAND2023-01069O, and + SAND2023-01070O. + + From b86f4ab3c482b5bcf2b39aec240fa4fc3ef71d51 Mon Sep 17 00:00:00 2001 From: Anthony Agelastos Date: Mon, 18 Sep 2023 20:21:38 -0600 Subject: [PATCH 05/12] removed temporary file --- doc/sphinx/11_reference_systems/.#crossroads.rst | 1 - 1 file changed, 1 deletion(-) delete mode 120000 doc/sphinx/11_reference_systems/.#crossroads.rst diff --git a/doc/sphinx/11_reference_systems/.#crossroads.rst b/doc/sphinx/11_reference_systems/.#crossroads.rst deleted file mode 120000 index 4a13ab3d..00000000 --- a/doc/sphinx/11_reference_systems/.#crossroads.rst +++ /dev/null @@ -1 +0,0 @@ -amagela@s1091389.7166 \ No newline at end of file From bf76f4477db46c164602d7ea43cf51ff8c6c79b1 Mon Sep 17 00:00:00 2001 From: Anthony Agelastos Date: Mon, 18 Sep 2023 22:18:22 -0600 Subject: [PATCH 06/12] added crossroads reference system to documentation --- doc/sphinx/00_intro/introduction.rst | 5 +- .../11_reference_systems/crossroads.rst | 286 +++--------------- 2 files changed, 42 insertions(+), 249 deletions(-) diff --git a/doc/sphinx/00_intro/introduction.rst b/doc/sphinx/00_intro/introduction.rst index 4840094c..f878ee65 100644 --- a/doc/sphinx/00_intro/introduction.rst +++ b/doc/sphinx/00_intro/introduction.rst @@ -14,7 +14,10 @@ benchmarks prior to RFP. 
To use these benchmarks please refer to the ATS-5 benchmarks repository `ATS-5 repo `_ -Benchmark changes from Crossroads +The benchmarks will, eventually, be generated atop Crossroads as the reference +system (see :ref:`ReferenceCrossroads` for more information). + +Benchmark Changes from Crossroads ================================= The key differences from Crossroads benchmarks and ATS-5 benchmarks are as summarized below: diff --git a/doc/sphinx/11_reference_systems/crossroads.rst b/doc/sphinx/11_reference_systems/crossroads.rst index 4840094c..a3756fba 100644 --- a/doc/sphinx/11_reference_systems/crossroads.rst +++ b/doc/sphinx/11_reference_systems/crossroads.rst @@ -1,248 +1,38 @@ -************ -Introduction -************ - -This is the documentation for the **ATS-5 Benchmarks**. - -Assuring that real applications perform efficiently on ATS-5 is key to their success. -A suite of benchmarks have been developed for Request For Proposal (RFP) response evaluation and system acceptance. -These codes are representative of the workloads of the NNSA laboratories. - -The benchmarks contained within this site represent a pre-RFP draft state. Over the next few months the -benchmarks will change somewhat. While we expect most of the changes will be additions and modifications it is possible that we will remove -benchmarks prior to RFP. - -To use these benchmarks please refer to the ATS-5 benchmarks repository `ATS-5 repo `_ - -Benchmark changes from Crossroads -================================= - -The key differences from Crossroads benchmarks and ATS-5 benchmarks are as summarized below: - -.. list-table:: - - * - **Crossroads** - - **ATS-5** - - **Notes** - * - Few GPU-ready benchmarks - - | All proxy benchmarks have - | GPU implementations. - - - * - | System level performance metric: - | Scalable System Improvement - | geometric mean of app FOMs. - | Use of single node benchmarks - | for RFP. 
- - | Multi-node benchmarking for - | system acceptance based on - | RFP benchmarks, negotiated - | with vendor as part of SOW. - - | Attempting to limit multi-node - | benchmarking for RFP - | to communication (MPI), and - | IO (IOR). Expect responses to - | include multiple node - | configurations and ability to - | compose them to meet our needs - | in a codesign partnership. - | Will use scaled single node - | improvement to assess proposals - | (along with other factors) and - | SSI for acceptance. - * - | Mini-Apps + full scale apps - | some of which were export - | controlled. - - | Mini-apps only - all open - | source. - - - * - No Machine Learning. - - | ML training and inference - | included. - - | Focuses on material science - | workloads of relevance. - - - -Benchmark Overview -================== - -.. list-table:: - - * - **Benchmark** - - **Description** - - **Language** - - **Parallelism** - * - Branson - - Implicit Monte Carlo transport - - C++ - - MPI + Cuda/HIP - * - AMG2023 - - | AMG solver of sparse matrices - | using Hypre - - C - - | MPI+CUDA/HIP/SYCL - | OpenMP on CPU - * - MiniEM - - Electro-Magnetics solver - - C++ - - MPI+Kokkos - * - MLMD - - | ML Training of interatomic - | potential model using HIPYNN - | on VASP Simulation data. - | ML inference using LAMMPS, - | Kokkos, and HIPYNN trained - | interatomic potential model. - - Python, C++, C - - MPI+Cuda/HIP - * - Parthenon-VIBE - - | Block structured AMR proxy using - | the Parthenon framework. - - C++ - - MPI+Kokkos - * - Sparta - - Direct Simulation Monte Carlo - - C++ - - MPI+Kokkos - * - UMT - - Deterministic (Sn) transport - - Fortran - - | MPI+OpenMP and - | OpenMP Offload - - - -Microbenchmark Overview -======================= - -.. 
list-table:: - - * - **Benchmark** - - **Description** - - **Language** - - **Parallelism** - - **Multi-node** - * - Stream - - Streaming memory bandwidth test - - C/Fortran - - OpenMP - - No - * - Spatter - - | Sparse memory bandwidth test - | driven by application memory - | access patterns. - - C++ - - | MPI+OpenMP/ - | CUDA/OpenCL - - No - * - | OSU MPI + - | Sandia SMB - | message rate - - MPI Performance Benchmarks - - C++ - - MPI - - Yes - * - DGEMM - - | Single node floating-point - | performance on matrix multiply. - - C/Fortran - - Various - - No - * - DAXPY - - | Single node floating-point - | performance of a scaled vector - | plus a vector. - - C/Fortran - - Various - - No - * - IOR - - | Performance testing of parallel - | file system using various - | interfaces and access patterns. - - C - - MPI - - No - * - mdtest - - | Metadata benchmark that performs - | open/stat/close operations on - | files and directories. - - C - - MPI - - Yes - - -Run Rules Synopsis -================== - -Single node benchmarks will require respondent to provide estimates on - -* strong scaling for CPU architectures. - -* throughput curves for GPU architectures. - -* estimates must be provided for each compute node type (including options). - -* Problem size must be changed to meet % of memory requirements. - -Source code modification categories: - -* Baseline: “out-of-the-box” performance - - * Code modifications not permitted - - * Compiler options can be modified, library substitutions permitted, problem decomposition may be changed - -* Ported: “alternative baseline for new architectures” - - * Limited source-code modifications are permitted to port and tune for the target architecture using directives or commonly used interfaces. - -* Optimized: "speed of light" - - * Aggressive code changes that enhance performance are permitted. - - * Algorithms fundamental to the program may not be replaced. - - * The modified code must still pass validation tests. 
- -Required results: - - * A **baseline** or **ported** result is required for each benchmark. If baseline cannot be obtained, ported results may be provided. - -Optional results: - - * **Ported** results may be provided in addition to the baseline if minor code changes enable substantial performance gain. - - * **Optimized** results to showcase system capabilities. - -Scaled Single Node Improvement -============================== -One element of evaluation will focus on scaled single node improvement (SSNI). SSNI is defined as follows: - -Given two platforms using one as a reference, SSNI is defined as a weighted geometric mean using the following equation. - -.. math:: - - SSNI = N(\prod_{i=1}^{M}(S_i)^{w_i})^\frac{1}{\sum_{i=1}^{M}{W_i}} - - -Where: - -* N = Number of nodes on ATS-5 system / Number of nodes on reference system, - -* M = total number of Benchmarks, - -* S = application speedup; Figure of Merit on ATS-5 system / Figure of Merit on reference system (Crossroads); S must be greater than 1, - -* w = weighting factor. - - - -Approvals -========= - -- LA-UR-23-22084 Approved for public release; distribution is unlimited. -- Content from Sandia National Laboratories considered unclassified with - unlimited distribution under SAND2023-12176O, SAND2023-01069O, and - SAND2023-01070O. - - +.. _ReferenceCrossroads: + +********** +Crossroads +********** + +The Crossroads (see [ACESCrossroads]_) reference system is the third Advanced +Technology System (ATS-3) in the Advanced Simulation and Computing (ASC) +Program. Each compute node has dual sockets with each sporting an Intel Xeon +Sapphire Rapids (SPR) CPU Max 9480 processor configured with Sub-NUMA Clustering +4 (SNC-4) affinity. This provides 8 NUMA Domains across the node (4 per socket). +Each NUMA Domain has 14 physical cores and 28 virtual cores, which totals 112 +physical and 224 virtual cores across the compute node. 
Each processor has a +base clock frequency of 1.9 GHz with a Max Turbo Frequency of 3.50 GHz. SPR +delivers ultra-wide (512-bit) vector operations capabilities with up to 2 Fused +Multiply Add (FMA) instructions with Intel Advanced Vector Extensions 512 +(AVX-512). The total node-level memory, including cache, quantities are listed +below. + +- **High-Bandwidth Memory**: 128 GiB +- **L1d cache**: 5.3 MiB (112 instances) +- **L1i cache**: 3.5 MiB (112 instances) +- **L2 cache**: 224 MiB (112 instances) +- **L3 cache**: 225 MiB (2 instances) + +Refer to Intel's Ark page (see [IntelArk]_) for more information. + + +References +========== + +.. [ACESCrossroads] ACES, 'Crossroads', 2023. [Online]. Available: + https://www.lanl.gov/projects/crossroads/. [Accessed: 18- + Sep- 2023] +.. [IntelArk] Intel, 'Intel Xeon CPU Max 9480 Processor', 2023. [Online]. + Available: + https://www.intel.com/content/www/us/en/products/sku/232592/intel-xeon-cpu-max-9480-processor-112-5m-cache-1-90-ghz/specifications.html. + [Accessed: 18- Sep- 2023] From 04b8b3bbf8248b9230f39b1ac31f9377048dd070 Mon Sep 17 00:00:00 2001 From: Anthony Agelastos Date: Tue, 19 Sep 2023 08:41:08 -0600 Subject: [PATCH 07/12] added NUMA configuration information to Crossroads section --- .../11_reference_systems/crossroads.rst | 32 +++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/doc/sphinx/11_reference_systems/crossroads.rst b/doc/sphinx/11_reference_systems/crossroads.rst index a3756fba..943afc0a 100644 --- a/doc/sphinx/11_reference_systems/crossroads.rst +++ b/doc/sphinx/11_reference_systems/crossroads.rst @@ -8,8 +8,8 @@ The Crossroads (see [ACESCrossroads]_) reference system is the third Advanced Technology System (ATS-3) in the Advanced Simulation and Computing (ASC) Program. Each compute node has dual sockets with each sporting an Intel Xeon Sapphire Rapids (SPR) CPU Max 9480 processor configured with Sub-NUMA Clustering -4 (SNC-4) affinity. 
This provides 8 NUMA Domains across the node (4 per socket). -Each NUMA Domain has 14 physical cores and 28 virtual cores, which totals 112 +4 (SNC-4) affinity. This provides 8 NUMA domains across the node (4 per socket). +Each NUMA domain has 14 physical cores and 28 virtual cores, which totals 112 physical and 224 virtual cores across the compute node. Each processor has a base clock frequency of 1.9 GHz with a Max Turbo Frequency of 3.50 GHz. SPR delivers ultra-wide (512-bit) vector operations capabilities with up to 2 Fused @@ -26,6 +26,34 @@ below. Refer to Intel's Ark page (see [IntelArk]_) for more information. +Single-Node Strong Scaling +========================== + +Single-node hardware configurations are becoming increasingly complex. As an +example, the Crossroads compute node has some resources shared at the socket and +some at the NUMA domain levels. Typically, NUMA domains capture these at the +smallest level (not going as far as individual cores, though) which is why it is +desired to leverage them for generating and comparing strong scaling results +across hardware configurations. This procurement would like to generate and +compare single-node strong scaling data of its benchmarks as they strong scale +on ~1%, ~25%, ~50%, ~75%, and 100% of NUMA domain utilization across all +domains. Crossroads has 8 NUMA domains each with 14 physical cores. The targets +above would map to the following configurations on Crossroads. + + +.. 
table:: Crossroads Single-Node Strong Scaling Configurations + + ===== ====================== =========================== + % # Cores on NUMA Domain # Cores Across Compute Node + ===== ====================== =========================== + ~1% 1 8 + ~25% 4 32 + ~50% 7 56 + ~75% 11 88 + 100% 14 112 + ===== ====================== =========================== + + References ========== From cef86c7a3db7b7ff90b5f443002b187b519bbd47 Mon Sep 17 00:00:00 2001 From: Anthony Agelastos Date: Tue, 19 Sep 2023 08:46:09 -0600 Subject: [PATCH 08/12] clarification around the crossroads numa domain utilization --- doc/sphinx/11_reference_systems/crossroads.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/sphinx/11_reference_systems/crossroads.rst b/doc/sphinx/11_reference_systems/crossroads.rst index 943afc0a..c7de9f8c 100644 --- a/doc/sphinx/11_reference_systems/crossroads.rst +++ b/doc/sphinx/11_reference_systems/crossroads.rst @@ -41,7 +41,7 @@ domains. Crossroads has 8 NUMA domains each with 14 physical cores. The targets above would map to the following configurations on Crossroads. -.. table:: Crossroads Single-Node Strong Scaling Configurations +.. 
table:: Crossroads Single-Node Strong Scaling Configurations Regarding NUMA Domain Utilization ===== ====================== =========================== % # Cores on NUMA Domain # Cores Across Compute Node From 5bfa8ff48cdfd710be4f15759b7cee3da71eec44 Mon Sep 17 00:00:00 2001 From: Anthony Agelastos Date: Tue, 19 Sep 2023 08:49:22 -0600 Subject: [PATCH 09/12] preserving SPARTA documentation in the Manzano era --- .../manzano_era/Makefile.manzano_kokkos | 104 ++++ doc/sphinx/08_sparta/manzano_era/ats2.csv | 8 + .../08_sparta/manzano_era/ats2.gp-noauto | 25 + .../08_sparta/manzano_era/build-manzano.sh | 30 ++ .../08_sparta/manzano_era/cts1-0.25.csv | 8 + .../08_sparta/manzano_era/cts1-0.25.gp-noauto | 24 + .../08_sparta/manzano_era/cts1-0.50.csv | 8 + .../08_sparta/manzano_era/cts1-0.50.gp-noauto | 24 + .../08_sparta/manzano_era/cts1-1.00.csv | 8 + .../08_sparta/manzano_era/cts1-1.00.gp-noauto | 24 + .../08_sparta/manzano_era/cts1-2.00.csv | 8 + .../08_sparta/manzano_era/cts1-2.00.gp-noauto | 24 + doc/sphinx/08_sparta/manzano_era/cts1.csv | 8 + .../08_sparta/manzano_era/cts1.gp-noauto | 24 + .../manzano_era/cts1mem-0.25.gp-noauto | 24 + .../manzano_era/cts1mem-0.50.gp-noauto | 24 + .../manzano_era/cts1mem-1.00.gp-noauto | 24 + .../manzano_era/cts1mem-2.00.gp-noauto | 24 + doc/sphinx/08_sparta/manzano_era/log.sparta | 465 ++++++++++++++++ doc/sphinx/08_sparta/manzano_era/sparta.rst | 499 ++++++++++++++++++ .../08_sparta/manzano_era/sparta_fom.py | 236 +++++++++ 21 files changed, 1623 insertions(+) create mode 100644 doc/sphinx/08_sparta/manzano_era/Makefile.manzano_kokkos create mode 100644 doc/sphinx/08_sparta/manzano_era/ats2.csv create mode 100644 doc/sphinx/08_sparta/manzano_era/ats2.gp-noauto create mode 100755 doc/sphinx/08_sparta/manzano_era/build-manzano.sh create mode 100644 doc/sphinx/08_sparta/manzano_era/cts1-0.25.csv create mode 100644 doc/sphinx/08_sparta/manzano_era/cts1-0.25.gp-noauto create mode 100644 doc/sphinx/08_sparta/manzano_era/cts1-0.50.csv 
create mode 100644 doc/sphinx/08_sparta/manzano_era/cts1-0.50.gp-noauto create mode 100644 doc/sphinx/08_sparta/manzano_era/cts1-1.00.csv create mode 100644 doc/sphinx/08_sparta/manzano_era/cts1-1.00.gp-noauto create mode 100644 doc/sphinx/08_sparta/manzano_era/cts1-2.00.csv create mode 100644 doc/sphinx/08_sparta/manzano_era/cts1-2.00.gp-noauto create mode 100644 doc/sphinx/08_sparta/manzano_era/cts1.csv create mode 100644 doc/sphinx/08_sparta/manzano_era/cts1.gp-noauto create mode 100644 doc/sphinx/08_sparta/manzano_era/cts1mem-0.25.gp-noauto create mode 100644 doc/sphinx/08_sparta/manzano_era/cts1mem-0.50.gp-noauto create mode 100644 doc/sphinx/08_sparta/manzano_era/cts1mem-1.00.gp-noauto create mode 100644 doc/sphinx/08_sparta/manzano_era/cts1mem-2.00.gp-noauto create mode 100644 doc/sphinx/08_sparta/manzano_era/log.sparta create mode 100644 doc/sphinx/08_sparta/manzano_era/sparta.rst create mode 100755 doc/sphinx/08_sparta/manzano_era/sparta_fom.py diff --git a/doc/sphinx/08_sparta/manzano_era/Makefile.manzano_kokkos b/doc/sphinx/08_sparta/manzano_era/Makefile.manzano_kokkos new file mode 100644 index 00000000..a857e028 --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/Makefile.manzano_kokkos @@ -0,0 +1,104 @@ +# manzano_kokkos = KOKKOS package with Serial backend (no OpenMP support), MPI compiler, default MPI + +SHELL = /bin/sh + +# --------------------------------------------------------------------- +# compiler/linker settings +# specify flags and libraries needed for your compiler + +CC = mpicxx +CCFLAGS = -g -O3 -DSPARTA_BIGBIG +SHFLAGS = -fPIC +DEPFLAGS = -M + +LINK = mpicxx +LINKFLAGS = -g -O3 +LIB = +SIZE = size + +ARCHIVE = ar +ARFLAGS = -rc +SHLIBFLAGS = -shared +KOKKOS_DEVICES = Serial +KOKKOS_ARCH = SKX + +# --------------------------------------------------------------------- +# SPARTA-specific settings +# specify settings for SPARTA features you will use +# if you change any -D setting, do full re-compile after "make clean" + +# SPARTA ifdef
settings, OPTIONAL +# see possible settings in doc/Section_start.html#2_2 (step 4) + +SPARTA_INC = -DSPARTA_GZIP + +# MPI library, REQUIRED +# see discussion in doc/Section_start.html#2_2 (step 5) +# can point to dummy MPI library in src/STUBS as in Makefile.serial +# INC = path for mpi.h, MPI compiler settings +# PATH = path for MPI library +# LIB = name of MPI library + +MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 +MPI_PATH = +MPI_LIB = + +# JPEG library, OPTIONAL +# see discussion in doc/Section_start.html#2_2 (step 7) +# only needed if -DSPARTA_JPEG listed with SPARTA_INC +# INC = path for jpeglib.h +# PATH = path for JPEG library +# LIB = name of JPEG library + +JPG_INC = +JPG_PATH = +JPG_LIB = + +# --------------------------------------------------------------------- +# build rules and dependencies +# no need to edit this section + +include Makefile.package.settings +include Makefile.package + +EXTRA_INC = $(SPARTA_INC) $(PKG_INC) $(MPI_INC) $(JPG_INC) $(PKG_SYSINC) +EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(JPG_PATH) $(PKG_SYSPATH) +EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(JPG_LIB) $(PKG_SYSLIB) +EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS) +EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS) + +# Path to src files + +vpath %.cpp .. +vpath %.h .. 
+ +# Link target + +$(EXE): $(OBJ) $(EXTRA_LINK_DEPENDS) + $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) + $(SIZE) $(EXE) + +# Library targets + +lib: $(OBJ) $(EXTRA_LINK_DEPENDS) + $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) + +shlib: $(OBJ) $(EXTRA_LINK_DEPENDS) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + +# Compilation rules + +%.o:%.cpp $(EXTRA_CPP_DEPENDS) + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< + +%.d:%.cpp $(EXTRA_CPP_DEPENDS) + $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ + +%.o:%.cu $(EXTRA_CPP_DEPENDS) + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< + +# Individual dependencies + +DEPENDS = $(OBJ:.o=.d) +sinclude $(DEPENDS) diff --git a/doc/sphinx/08_sparta/manzano_era/ats2.csv b/doc/sphinx/08_sparta/manzano_era/ats2.csv new file mode 100644 index 00000000..1a3d1503 --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/ats2.csv @@ -0,0 +1,8 @@ +No. Particles,Actual +7142385,436.3179 +14287466,575.0493 +28571551,751.5044 +35712209,840.5981 +42855549,873.2238 +49999941,894.6309 +57143091,921.2964 diff --git a/doc/sphinx/08_sparta/manzano_era/ats2.gp-noauto b/doc/sphinx/08_sparta/manzano_era/ats2.gp-noauto new file mode 100644 index 00000000..c537de6a --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/ats2.gp-noauto @@ -0,0 +1,25 @@ +#!/usr/bin/gnuplot +set terminal pngcairo enhanced size 1024, 768 dashed font 'Helvetica,18' +set output "ats2.png" + +set title "SPARTA Throughput Performance on ATS-2/Vortex" font "serif,22" +set xlabel "No. 
Particles" +set ylabel "Figure of Merit (M-particle-steps/sec)" + +# set xrange [1:64] +set key left top + +# set logscale x 2 +# set logscale y 2 + +set grid +show grid + +set datafile separator comma +set key autotitle columnheader + +set style line 1 linetype 6 dashtype 1 linecolor rgb "#FF0000" linewidth 2 pointtype 6 pointsize 3 +set style line 2 linetype 1 dashtype 2 linecolor rgb "#FF0000" linewidth 2 +set style line 3 linetype 6 dashtype 1 linecolor rgb "#0000FF" linewidth 2 pointtype 6 pointsize 3 + +plot "ats2.csv" using 1:2 with linespoints linestyle 3 diff --git a/doc/sphinx/08_sparta/manzano_era/build-manzano.sh b/doc/sphinx/08_sparta/manzano_era/build-manzano.sh new file mode 100755 index 00000000..a3193378 --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/build-manzano.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +umask 022 +set -e +set -x + +dir_root=`git rev-parse --show-toplevel` +dir_src="${dir_root}/sparta" + +module unload intel +module unload openmpi-intel +module use /apps/modules/modulefiles-apps/cde/v3/ +module load cde/v3/devpack/intel-ompi +module list + +pushd "${dir_src}" +git clean -fdx +git reset --hard +popd +cp -a Makefile.manzano_kokkos "${dir_src}/src/MAKE" + +pushd "${dir_src}/src" +make yes-kokkos +make -j 16 manzano_kokkos +echo "Resultant build info:" +ls -lh `pwd -P`/spa_manzano_kokkos +popd + + +exit 0 diff --git a/doc/sphinx/08_sparta/manzano_era/cts1-0.25.csv b/doc/sphinx/08_sparta/manzano_era/cts1-0.25.csv new file mode 100644 index 00000000..655dfdd6 --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/cts1-0.25.csv @@ -0,0 +1,8 @@ +No.
Cores,Actual,Ideal,Memory (GiB) +1,27.083,27.083,11.23 +2,48.984,54.166,11.23 +4,86.519,108.332,11.37 +8,147.291,216.664,11.47 +16,245.349,433.328,11.71 +32,347.010,866.656,12.34 +48,406.431,1299.984,12.76 diff --git a/doc/sphinx/08_sparta/manzano_era/cts1-0.25.gp-noauto b/doc/sphinx/08_sparta/manzano_era/cts1-0.25.gp-noauto new file mode 100644 index 00000000..7f4f17ff --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/cts1-0.25.gp-noauto @@ -0,0 +1,24 @@ +#!/usr/bin/gnuplot +set terminal pngcairo enhanced size 1024, 768 dashed font 'Helvetica,18' +set output "cts1-0.25.png" + +set title "SPARTA Strong Scaling Performance on CTS-1/Manzano (0.25 GiB/PE)" font "serif,22" +set xlabel "No. Processing Elements" +set ylabel "Figure of Merit (time steps/sec)" + +set xrange [1:64] +set key left top + +set logscale x 2 +set logscale y 2 + +set grid +show grid + +set datafile separator comma +set key autotitle columnheader + +set style line 1 linetype 6 dashtype 1 linecolor rgb "#FF0000" linewidth 2 pointtype 6 pointsize 3 +set style line 2 linetype 1 dashtype 2 linecolor rgb "#FF0000" linewidth 2 + +plot "cts1-0.25.csv" using 1:2 with linespoints linestyle 1, "" using 1:3 with line linestyle 2 diff --git a/doc/sphinx/08_sparta/manzano_era/cts1-0.50.csv b/doc/sphinx/08_sparta/manzano_era/cts1-0.50.csv new file mode 100644 index 00000000..db5c3a9d --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/cts1-0.50.csv @@ -0,0 +1,8 @@ +No. 
Cores,Actual,Ideal,Memory (GiB) +1,29.078,29.078,18.42 +2,55.678,58.156,18.41 +4,100.193,116.312,18.55 +8,173.462,232.624,18.65 +16,287.659,465.248,18.89 +32,395.545,930.496,19.52 +48,454.229,1395.744,20.01 diff --git a/doc/sphinx/08_sparta/manzano_era/cts1-0.50.gp-noauto b/doc/sphinx/08_sparta/manzano_era/cts1-0.50.gp-noauto new file mode 100644 index 00000000..3303df1f --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/cts1-0.50.gp-noauto @@ -0,0 +1,24 @@ +#!/usr/bin/gnuplot +set terminal pngcairo enhanced size 1024, 768 dashed font 'Helvetica,18' +set output "cts1-0.50.png" + +set title "SPARTA Strong Scaling Performance on CTS-1/Manzano (0.50 GiB/PE)" font "serif,22" +set xlabel "No. Processing Elements" +set ylabel "Figure of Merit (time steps/sec)" + +set xrange [1:64] +set key left top + +set logscale x 2 +set logscale y 2 + +set grid +show grid + +set datafile separator comma +set key autotitle columnheader + +set style line 1 linetype 6 dashtype 1 linecolor rgb "#FF0000" linewidth 2 pointtype 6 pointsize 3 +set style line 2 linetype 1 dashtype 2 linecolor rgb "#FF0000" linewidth 2 + +plot "cts1-0.50.csv" using 1:2 with linespoints linestyle 1, "" using 1:3 with line linestyle 2 diff --git a/doc/sphinx/08_sparta/manzano_era/cts1-1.00.csv b/doc/sphinx/08_sparta/manzano_era/cts1-1.00.csv new file mode 100644 index 00000000..7c64095f --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/cts1-1.00.csv @@ -0,0 +1,8 @@ +No. 
Cores,Actual,Ideal,Memory (GiB) +1,32.306,32.306,33.51 +2,60.208,64.612,33.50 +4,111.753,129.224,33.63 +8,200.736,258.448,33.73 +16,328.496,516.896,33.98 +32,440.688,1033.792,34.55 +48,504.080,1550.688,35.03 diff --git a/doc/sphinx/08_sparta/manzano_era/cts1-1.00.gp-noauto b/doc/sphinx/08_sparta/manzano_era/cts1-1.00.gp-noauto new file mode 100644 index 00000000..8661226a --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/cts1-1.00.gp-noauto @@ -0,0 +1,24 @@ +#!/usr/bin/gnuplot +set terminal pngcairo enhanced size 1024, 768 dashed font 'Helvetica,18' +set output "cts1-1.00.png" + +set title "SPARTA Strong Scaling Performance on CTS-1/Manzano (1.00 GiB/PE)" font "serif,22" +set xlabel "No. Processing Elements" +set ylabel "Figure of Merit (time steps/sec)" + +set xrange [1:64] +set key left top + +set logscale x 2 +set logscale y 2 + +set grid +show grid + +set datafile separator comma +set key autotitle columnheader + +set style line 1 linetype 6 dashtype 1 linecolor rgb "#FF0000" linewidth 2 pointtype 6 pointsize 3 +set style line 2 linetype 1 dashtype 2 linecolor rgb "#FF0000" linewidth 2 + +plot "cts1-1.00.csv" using 1:2 with linespoints linestyle 1, "" using 1:3 with line linestyle 2 diff --git a/doc/sphinx/08_sparta/manzano_era/cts1-2.00.csv b/doc/sphinx/08_sparta/manzano_era/cts1-2.00.csv new file mode 100644 index 00000000..cdac037e --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/cts1-2.00.csv @@ -0,0 +1,8 @@ +No. 
Cores,Actual,Ideal,Memory (GiB) +1,33.259,33.259,93.86 +2,65.506,66.519,93.85 +4,119.683,133.038,93.98 +8,230.232,266.075,94.08 +16,363.815,532.151,94.29 +32,477.281,1064.302,94.81 +48,535.983,1596.453,95.23 diff --git a/doc/sphinx/08_sparta/manzano_era/cts1-2.00.gp-noauto b/doc/sphinx/08_sparta/manzano_era/cts1-2.00.gp-noauto new file mode 100644 index 00000000..a52fe09d --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/cts1-2.00.gp-noauto @@ -0,0 +1,24 @@ +#!/usr/bin/gnuplot +set terminal pngcairo enhanced size 1024, 768 dashed font 'Helvetica,18' +set output "cts1-2.00.png" + +set title "SPARTA Strong Scaling Performance on CTS-1/Manzano (2.00 GiB/PE)" font "serif,22" +set xlabel "No. Processing Elements" +set ylabel "Figure of Merit (time steps/sec)" + +set xrange [1:64] +set key left top + +set logscale x 2 +set logscale y 2 + +set grid +show grid + +set datafile separator comma +set key autotitle columnheader + +set style line 1 linetype 6 dashtype 1 linecolor rgb "#FF0000" linewidth 2 pointtype 6 pointsize 3 +set style line 2 linetype 1 dashtype 2 linecolor rgb "#FF0000" linewidth 2 + +plot "cts1-2.00.csv" using 1:2 with linespoints linestyle 1, "" using 1:3 with line linestyle 2 diff --git a/doc/sphinx/08_sparta/manzano_era/cts1.csv b/doc/sphinx/08_sparta/manzano_era/cts1.csv new file mode 100644 index 00000000..4975392f --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/cts1.csv @@ -0,0 +1,8 @@ +No. 
Cores,Actual,Ideal +1,33.56245,33.56245 +2,61.10145,67.1249 +4,121.2445833,134.2498 +8,224.82665,268.4996 +16,362.8721333,536.9992 +32,501.6367667,1073.9984 +48,551.21695,1610.9976 diff --git a/doc/sphinx/08_sparta/manzano_era/cts1.gp-noauto b/doc/sphinx/08_sparta/manzano_era/cts1.gp-noauto new file mode 100644 index 00000000..72c9df27 --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/cts1.gp-noauto @@ -0,0 +1,24 @@ +#!/usr/bin/gnuplot +set terminal pngcairo enhanced size 1024, 768 dashed font 'Helvetica,18' +set output "cts1.png" + +set title "SPARTA Strong Scaling Performance on CTS-1/Manzano" font "serif,22" +set xlabel "No. Processing Elements" +set ylabel "Figure of Merit (M-particle-steps/sec)" + +set xrange [1:64] +set key left top + +set logscale x 2 +set logscale y 2 + +set grid +show grid + +set datafile separator comma +set key autotitle columnheader + +set style line 1 linetype 6 dashtype 1 linecolor rgb "#FF0000" linewidth 2 pointtype 6 pointsize 3 +set style line 2 linetype 1 dashtype 2 linecolor rgb "#FF0000" linewidth 2 + +plot "cts1.csv" using 1:2 with linespoints linestyle 1, "" using 1:3 with line linestyle 2 diff --git a/doc/sphinx/08_sparta/manzano_era/cts1mem-0.25.gp-noauto b/doc/sphinx/08_sparta/manzano_era/cts1mem-0.25.gp-noauto new file mode 100644 index 00000000..fcd5743d --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/cts1mem-0.25.gp-noauto @@ -0,0 +1,24 @@ +#!/usr/bin/gnuplot +set terminal pngcairo enhanced size 1024, 768 dashed font 'Helvetica,18' +set output "cts1mem-0.25.png" + +set title "SPARTA Strong Scaling High-water Memory on CTS-1/Manzano (0.25 GiB/PE)" font "serif,22" +set xlabel "No.
Processing Elements" +set ylabel "Maximum Resident Set Size (GiB)" + +set xrange [1:64] +set key left top + +set logscale x 2 +# set logscale y 2 + +set grid +show grid + +set datafile separator comma +set key autotitle columnheader + +set style line 1 linetype 6 dashtype 1 linecolor rgb "#FF0000" linewidth 2 pointtype 6 pointsize 3 +set style line 2 linetype 1 dashtype 2 linecolor rgb "#FF0000" linewidth 2 + +plot "cts1-0.25.csv" using 1:4 with linespoints linestyle 1 diff --git a/doc/sphinx/08_sparta/manzano_era/cts1mem-0.50.gp-noauto b/doc/sphinx/08_sparta/manzano_era/cts1mem-0.50.gp-noauto new file mode 100644 index 00000000..be19f28a --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/cts1mem-0.50.gp-noauto @@ -0,0 +1,24 @@ +#!/usr/bin/gnuplot +set terminal pngcairo enhanced size 1024, 768 dashed font 'Helvetica,18' +set output "cts1mem-0.50.png" + +set title "SPARTA Strong Scaling High-water Memory on CTS-1/Manzano (0.50 GiB/PE)" font "serif,22" +set xlabel "No. Processing Elements" +set ylabel "Maximum Resident Set Size (GiB)" + +set xrange [1:64] +set key left top + +set logscale x 2 +# set logscale y 2 + +set grid +show grid + +set datafile separator comma +set key autotitle columnheader + +set style line 1 linetype 6 dashtype 1 linecolor rgb "#FF0000" linewidth 2 pointtype 6 pointsize 3 +set style line 2 linetype 1 dashtype 2 linecolor rgb "#FF0000" linewidth 2 + +plot "cts1-0.50.csv" using 1:4 with linespoints linestyle 1 diff --git a/doc/sphinx/08_sparta/manzano_era/cts1mem-1.00.gp-noauto b/doc/sphinx/08_sparta/manzano_era/cts1mem-1.00.gp-noauto new file mode 100644 index 00000000..979a8104 --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/cts1mem-1.00.gp-noauto @@ -0,0 +1,24 @@ +#!/usr/bin/gnuplot +set terminal pngcairo enhanced size 1024, 768 dashed font 'Helvetica,18' +set output "cts1mem-1.00.png" + +set title "SPARTA Strong Scaling High-water Memory on CTS-1/Manzano (1.00 GiB/PE)" font "serif,22" +set xlabel "No.
Processing Elements" +set ylabel "Maximum Resident Set Size (GiB)" + +set xrange [1:64] +set key left top + +set logscale x 2 +# set logscale y 2 + +set grid +show grid + +set datafile separator comma +set key autotitle columnheader + +set style line 1 linetype 6 dashtype 1 linecolor rgb "#FF0000" linewidth 2 pointtype 6 pointsize 3 +set style line 2 linetype 1 dashtype 2 linecolor rgb "#FF0000" linewidth 2 + +plot "cts1-1.00.csv" using 1:4 with linespoints linestyle 1 diff --git a/doc/sphinx/08_sparta/manzano_era/cts1mem-2.00.gp-noauto b/doc/sphinx/08_sparta/manzano_era/cts1mem-2.00.gp-noauto new file mode 100644 index 00000000..72b2d910 --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/cts1mem-2.00.gp-noauto @@ -0,0 +1,24 @@ +#!/usr/bin/gnuplot +set terminal pngcairo enhanced size 1024, 768 dashed font 'Helvetica,18' +set output "cts1mem-2.00.png" + +set title "SPARTA Strong Scaling High-water Memory on CTS-1/Manzano (2.00 GiB/PE)" font "serif,22" +set xlabel "No. Processing Elements" +set ylabel "Maximum Resident Set Size (GiB)" + +set xrange [1:64] +set key left top + +set logscale x 2 +# set logscale y 2 + +set grid +show grid + +set datafile separator comma +set key autotitle columnheader + +set style line 1 linetype 6 dashtype 1 linecolor rgb "#FF0000" linewidth 2 pointtype 6 pointsize 3 +set style line 2 linetype 1 dashtype 2 linecolor rgb "#FF0000" linewidth 2 + +plot "cts1-2.00.csv" using 1:4 with linespoints linestyle 1 diff --git a/doc/sphinx/08_sparta/manzano_era/log.sparta b/doc/sphinx/08_sparta/manzano_era/log.sparta new file mode 100644 index 00000000..a470927f --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/log.sparta @@ -0,0 +1,465 @@ +SPARTA +KOKKOS mode is enabled (../kokkos.cpp:40) + requested 1 GPU(s) per node + requested 1 thread(s) per MPI task +Running on 1 MPI task(s) +package kokkos +package kokkos reduction atomic +# advect particles on uniform Cartesian grid +# single-step moves that cross grid cell boundaries are detected +#
particle effectively moves from cell to cell +# particles reflect off global box boundaries +# +################################### +# Constants +################################### +variable boltz equal 1.380658E-23 + +################################### +# Gas parameters (Ar) +################################### +variable mue equal 1.656E-5 +variable mass equal 46.5E-27 +variable visc equal 1.656E-5 +variable gamma equal 1.400 +variable To equal 293. +variable pi equal 3.14159 + +variable cbar equal sqrt(8.*${boltz}*${To}/${mass}/${pi}) +variable cbar equal sqrt(8.*1.380658e-23*${To}/${mass}/${pi}) +variable cbar equal sqrt(8.*1.380658e-23*293/${mass}/${pi}) +variable cbar equal sqrt(8.*1.380658e-23*293/4.65e-26/${pi}) +variable cbar equal sqrt(8.*1.380658e-23*293/4.65e-26/3.14159) +variable uspeed equal sqrt(${gamma}*${boltz}*${To}/${mass}) +variable uspeed equal sqrt(1.4*${boltz}*${To}/${mass}) +variable uspeed equal sqrt(1.4*1.380658e-23*${To}/${mass}) +variable uspeed equal sqrt(1.4*1.380658e-23*293/${mass}) +variable uspeed equal sqrt(1.4*1.380658e-23*293/4.65e-26) + +################################### +# Trajectory inputs +################################### +variable mach equal 1.71 +variable L equal 1. +variable Vo equal ${mach}*${uspeed} +variable Vo equal 1.71*${uspeed} +variable Vo equal 1.71*348.991145588143 +variable nden equal 1.E20 +variable Vo equal ${mach}*${uspeed} +variable Vo equal 1.71*${uspeed} +variable Vo equal 1.71*348.991145588143 + +variable surftemp equal 293. 
+variable temp equal 293.00 +variable beta equal 0.000 + +################################### +# Simulation initialization standards +################################### +variable ppc equal 8 +#variable nmfp equal 200 +variable cpmfp equal 4 + +################################### +# Parameter calculations +################################### +variable Vx equal ${Vo}*cos(${beta}*2*PI/360) +variable Vx equal 596.774858955725*cos(${beta}*2*PI/360) +variable Vx equal 596.774858955725*cos(0*2*PI/360) +variable Vy equal ${Vo}*sin(${beta}*2*PI/360) +variable Vy equal 596.774858955725*sin(${beta}*2*PI/360) +variable Vy equal 596.774858955725*sin(0*2*PI/360) + + +variable mfp equal 2*${mue}/(${nden}*${mass}*${cbar}) +variable mfp equal 2*1.656e-05/(${nden}*${mass}*${cbar}) +variable mfp equal 2*1.656e-05/(1e+20*${mass}*${cbar}) +variable mfp equal 2*1.656e-05/(1e+20*4.65e-26*${cbar}) +variable mfp equal 2*1.656e-05/(1e+20*4.65e-26*470.674457970473) + +variable xmin equal -5.0*${L} +variable xmin equal -5.0*1 +variable xmax equal 5.1*${L} +variable xmax equal 5.1*1 +variable ymin equal -5.1*${L} +variable ymin equal -5.1*1 +variable ymax equal 5.1*${L} +variable ymax equal 5.1*1 + +variable xncells equal (${xmax}-${xmin})/${mfp}*${cpmfp} +variable xncells equal (5.1-${xmin})/${mfp}*${cpmfp} +variable xncells equal (5.1--5)/${mfp}*${cpmfp} +variable xncells equal (5.1--5)/0.0151327112073885*${cpmfp} +variable xncells equal (5.1--5)/0.0151327112073885*4 +variable yncells equal (${ymax}-${ymin})/${mfp}*${cpmfp} +variable yncells equal (5.1-${ymin})/${mfp}*${cpmfp} +variable yncells equal (5.1--5.1)/${mfp}*${cpmfp} +variable yncells equal (5.1--5.1)/0.0151327112073885*${cpmfp} +variable yncells equal (5.1--5.1)/0.0151327112073885*4 + +variable Fnum equal ${nden}*(${xmax}-${xmin})*(${ymax}-${ymin})/${ppc}/${xncells}/${yncells} +variable Fnum equal 1e+20*(${xmax}-${xmin})*(${ymax}-${ymin})/${ppc}/${xncells}/${yncells} +variable Fnum equal 
1e+20*(5.1-${xmin})*(${ymax}-${ymin})/${ppc}/${xncells}/${yncells} +variable Fnum equal 1e+20*(5.1--5)*(${ymax}-${ymin})/${ppc}/${xncells}/${yncells} +variable Fnum equal 1e+20*(5.1--5)*(5.1-${ymin})/${ppc}/${xncells}/${yncells} +variable Fnum equal 1e+20*(5.1--5)*(5.1--5.1)/${ppc}/${xncells}/${yncells} +variable Fnum equal 1e+20*(5.1--5)*(5.1--5.1)/8/${xncells}/${yncells} +variable Fnum equal 1e+20*(5.1--5)*(5.1--5.1)/8/2669.71327519122/${yncells} +variable Fnum equal 1e+20*(5.1--5)*(5.1--5.1)/8/2669.71327519122/2696.14607989608 + +variable tstep equal (-${xmin}+${xmax})/${Vx}/${xncells}/10/4 +variable tstep equal (--5+${xmax})/${Vx}/${xncells}/10/4 +variable tstep equal (--5+5.1)/${Vx}/${xncells}/10/4 +variable tstep equal (--5+5.1)/596.774858955725/${xncells}/10/4 +variable tstep equal (--5+5.1)/596.774858955725/2669.71327519122/10/4 + +################################### +# Print variable values to log file +################################### +print " Velocity = ${Vo}" + Velocity = 596.774858955725 +print " Density = ${nden}" + Density = 1e+20 +print " X-Velocity = ${Vx}" + X-Velocity = 596.774858955725 +print " Y-Velocity = ${Vy}" + Y-Velocity = 0 +print " Temp = ${temp}" + Temp = 293 +print " cbar = ${cbar}" + cbar = 470.674457970473 +print " mean free path = ${mfp}" + mean free path = 0.0151327112073885 +print " cells per free stream mean free path = ${cpmfp}" + cells per free stream mean free path = 4 +print " sound speed = ${uspeed}" + sound speed = 348.991145588143 +#print " number of mean free paths = ${nmfp}" +print " x-min = ${xmin}" + x-min = -5 +print " x-max = ${xmax}" + x-max = 5.1 +print " y-min = ${ymin}" + y-min = -5.1 +print " y-max = ${ymax}" + y-max = 5.1 +print " x-cells = ${xncells}" + x-cells = 2669.71327519122 +print " y-cells = ${yncells}" + y-cells = 2696.14607989608 +print " Simulation Ratio = ${Fnum}" + Simulation Ratio = 178905428504860 +print " Timestep = ${tstep}" + Timestep = 1.584842987717e-07 + 
+################################### +# Simulation parameters +################################### +seed 847384 +dimension 2 +global nrho ${nden} +global nrho 1e+20 +global fnum ${Fnum} +global fnum 178905428504860 + +timestep ${tstep} +timestep 1.584842987717e-07 +global gridcut 1.E-1 +#global surfmax 10000 +#global surfpush yes +#global comm/sort yes +#global particle/reorder 10 + +################################### +# Grid generation +################################### +boundary o o p +create_box ${xmin} ${xmax} ${ymin} ${ymax} -0.5 0.5 +create_box -5 ${xmax} ${ymin} ${ymax} -0.5 0.5 +create_box -5 5.1 ${ymin} ${ymax} -0.5 0.5 +create_box -5 5.1 -5.1 ${ymax} -0.5 0.5 +create_box -5 5.1 -5.1 5.1 -0.5 0.5 +Created orthogonal box = (-5 -5.1 -0.5) to (5.1 5.1 0.5) +create_grid ${xncells} ${yncells} 1 block * * * +create_grid 2669.71327519122 ${yncells} 1 block * * * +create_grid 2669.71327519122 2696.14607989608 1 block * * * +Created 7195624 child grid cells + CPU time = 4.52432 secs + create/ghost percent = 66.8847 33.1153 +#read_restart restart.%.100000 + +#balance_grid rcb cell +#write_grid parent grid.out + +##################################### +# Gas/Collision Model Specification # +##################################### +species air.species N2 +mixture air vstream ${Vx} ${Vy} 0.0 temp ${temp} +mixture air vstream 596.774858955725 ${Vy} 0.0 temp ${temp} +mixture air vstream 596.774858955725 0 0.0 temp ${temp} +mixture air vstream 596.774858955725 0 0.0 temp 293 +mixture air N2 frac 1.0 + +mixture air vstream ${Vx} ${Vy} 0.0 temp ${temp} +mixture air vstream 596.774858955725 ${Vy} 0.0 temp ${temp} +mixture air vstream 596.774858955725 0 0.0 temp ${temp} +mixture air vstream 596.774858955725 0 0.0 temp 293 + +collide vss all air.vss relax variable +collide_modify vremax 10000 yes vibrate no rotate smooth nearcp yes 10 + +##################################################### +# Surface generation and collision specification 
+##################################################### +read_surf circle_R0.5_P10000.surf group 1 invert + 10000 points + 10000 lines + -0.5 0.5 xlo xhi + -0.5 0.5 ylo yhi + 0 0 zlo zhi + 0.000314159 min line length + 1058 0 = cells overlapping surfs, overlap cells with unmarked corner pts + 7140234 54332 1058 = cells outside/inside/overlapping surfs + 1058 = surf cells with 1,2,etc splits + 102.235 102.235 = cell-wise and global flow volume + CPU time = 4.70811 secs + read/check/sort/surf2grid/ghost/inout/particle percent = 0.319644 0.0735068 3.47566 65.2149 30.9163 25.9965 6.2233e-06 + surf2grid time = 3.07039 secs + map/comm1/comm2/comm3/comm4/split percent = 30.2458 0.0792146 44.2833 1.61381 5.0372 3.48686 +#surf_collide 1 specular noslip +surf_collide 1 diffuse ${surftemp} 1.0 +surf_collide 1 diffuse 293 1.0 +surf_modify 1 collide 1 +#surf_react 1 prob air.surf +#surf_modify 1 collide 1 +#surf_modify 1 collide 1 react 1 + +################################### +# Boundary conditions +################################### +fix in emit/face air xlo xhi ylo yhi + +# adapt the grid around the surface before running the simulation +adapt_grid all refine surf all 0.00001 iterate 5 +Adapting grid ... 
+WARNING: One or more fix inflow faces oppose streaming velocity (../fix_emit_face.cpp:195) + 7147718 61648 8458 = cells outside/inside/overlapping surfs + 8458 = surf cells with 1,2,etc splits + 102.235 102.235 = cell-wise and global flow volume + 7400 cells refined, 0 cells coarsened + adapted to 7217824 grid cells + CPU time = 11.9306 secs + adapt/redo percent = 81.9966 18.0034 + +################################### +# Initialize simulation +################################### +create_particles air n 0 +Created 57144494 particles + CPU time = 24.6205 secs +#fix check grid/check 1 error + +################################### +# Unsteady Output +################################### +stats_style step cpu np nattempt ncoll + +compute 2 grid all all nrho +compute 5 thermal/grid all all temp +compute 3 grid all all trot + +fix 5 ave/grid all 1 1000 10000 c_5[*] ave one +fix 2 ave/grid all 1 1000 10000 c_2[*] ave one +fix 3 ave/grid all 1 1000 10000 c_3[*] ave one + +dump dgrid1 grid all 10000 tmp_grid.* id f_2[*] f_5[*] f_3[*] + +compute 1b lambda/grid c_2[1] NULL N2 kall + +#fix 10 adapt 1000 all refine coarsen value c_1b[2] 0.5 2.0 # combine min thresh less more maxlevel 10 cells 2 2 1 file grid.* + +fix load balance 1000 1.1 rcb part + +stats_style step cpu np nattempt ncoll maxlevel + +stats 50 +run 5800 +WARNING: One or more fix inflow faces oppose streaming velocity (../fix_emit_face.cpp:195) +Memory usage per proc in Mbytes: + particles (ave,min,max) = 7224.3 7224.3 7224.3 + grid (ave,min,max) = 1323.11 1323.11 1323.11 + surf (ave,min,max) = 1.02997 1.02997 1.02997 + total (ave,min,max) = 9870.06 9870.06 9870.06 +Step CPU Np Natt Ncoll Maxlevel + 0 0 57144494 0 0 4 + 50 2.058492 57144353 202798 161581 4 + 100 3.8934437 57144165 194559 151949 4 + 150 5.9264821 57144277 198187 152510 4 + 200 7.8741561 57144501 201549 153420 4 + 250 10.032195 57144624 203458 152778 4 + 300 12.061168 57144456 205469 153049 4 + 350 14.190343 57144900 207345 153059 4 + 400 16.439252 
57144623 209558 153299 4 + 450 18.708537 57144477 211065 153490 4 + 500 21.039468 57144509 212701 153993 4 + 550 23.384597 57144361 214613 154199 4 + 600 25.728705 57143966 215891 154226 4 + 650 28.143147 57143817 216934 154032 4 + 700 30.525966 57143733 218282 154220 4 + 750 32.863796 57143665 218738 153527 4 + 800 35.31154 57143764 220506 154561 4 + 850 37.780522 57143900 220210 153766 4 + 900 40.252289 57143662 222260 154931 4 + 950 42.799034 57143331 222427 154383 4 + 1000 46.784784 57143434 222924 153828 4 + 1050 49.320878 57143942 224776 154388 4 + 1100 51.880107 57143933 225323 154800 4 + 1150 54.461474 57143730 225924 154748 4 + 1200 57.054725 57143876 226694 154798 4 + 1250 59.660279 57143976 226824 154500 4 + 1300 62.393098 57143087 227364 154503 4 + 1350 64.924131 57143314 227876 154537 4 + 1400 67.429157 57143344 229780 155655 4 + 1450 70.104509 57142918 228488 154507 4 + 1500 72.740532 57142183 229053 154551 4 + 1550 75.442812 57142280 230351 155093 4 + 1600 78.197287 57142138 231021 155301 4 + 1650 80.994013 57142211 230483 154476 4 + 1700 83.761101 57142448 231495 155165 4 + 1750 86.65143 57142188 231914 155327 4 + 1800 89.493893 57142476 232105 155152 4 + 1850 92.307896 57142532 232156 154803 4 + 1900 95.235481 57142581 232132 154246 4 + 1950 98.112455 57142456 233300 154753 4 + 2000 102.4587 57142235 234294 155191 4 + 2050 105.32818 57142290 234420 155515 4 + 2100 108.16064 57142479 234743 155216 4 + 2150 110.98096 57142326 234892 155137 4 + 2200 113.9036 57142344 234756 155028 4 + 2250 116.90427 57142152 234546 154570 4 + 2300 119.89731 57142410 235102 155120 4 + 2350 122.82959 57143159 235124 154849 4 + 2400 125.91148 57143258 235922 155123 4 + 2450 128.99203 57143532 236249 155125 4 + 2500 131.94321 57143525 236689 155499 4 + 2550 135.06612 57143769 236501 154799 4 + 2600 138.148 57143406 237184 155281 4 + 2650 141.24436 57143281 237065 155046 4 + 2700 144.33909 57143402 238181 155275 4 + 2750 147.43203 57143217 237535 155210 4 + 2800 150.57207 
57143066 238571 155327 4 + 2850 153.71315 57142836 239021 155527 4 + 2900 156.84212 57143081 238178 155200 4 + 2950 159.93988 57142986 239445 155779 4 + 3000 164.58071 57143191 238934 155384 4 + 3050 167.73452 57143336 239788 155558 4 + 3100 170.90471 57143217 238930 154740 4 + 3150 174.09899 57143569 239882 155315 4 + 3200 177.29528 57143549 239742 154925 4 + 3250 180.50476 57143787 239765 154987 4 + 3300 183.70915 57143638 240448 155209 4 + 3350 187.00156 57143808 240348 155198 4 + 3400 190.2722 57143992 241083 155702 4 + 3450 193.49754 57143869 242247 156229 4 + 3500 196.74938 57143981 241246 155561 4 + 3550 199.99076 57144250 241679 155689 4 + 3600 203.29537 57144267 241782 155402 4 + 3650 206.64881 57143820 241640 155925 4 + 3700 209.97137 57143663 242206 155207 4 + 3750 213.2846 57143992 240968 154943 4 + 3800 216.62289 57143834 242340 155880 4 + 3850 219.91016 57143645 242296 155488 4 + 3900 223.25863 57143393 242695 155694 4 + 3950 226.58709 57143785 243128 155667 4 + 4000 231.38424 57143428 242862 155683 4 + 4050 234.78426 57143347 244074 156069 4 + 4100 238.16172 57143567 242428 155394 4 + 4150 241.58793 57143465 243571 155595 4 + 4200 244.97722 57143641 244152 156100 4 + 4250 248.39276 57143404 243866 155456 4 + 4300 251.86078 57143622 244075 155953 4 + 4350 255.29979 57143672 244470 156122 4 + 4400 258.68111 57143584 244237 155649 4 + 4450 262.11313 57143677 243505 155114 4 + 4500 265.5987 57143619 245017 156070 4 + 4550 269.11518 57143354 244043 155607 4 + 4600 272.58861 57143266 244995 155585 4 + 4650 276.09737 57143487 244923 155679 4 + 4700 279.59686 57143374 245508 156481 4 + 4750 283.11732 57143220 245794 156174 4 + 4800 286.62826 57143543 245858 155985 4 + 4850 290.16258 57143537 246163 156154 4 + 4900 293.70587 57143158 246396 155852 4 + 4950 297.24365 57143156 245881 156296 4 + 5000 302.22578 57143134 246183 156006 4 + 5050 305.82038 57143224 245437 155265 4 + 5100 309.37403 57143263 246728 156438 4 + 5150 312.91463 57143336 246483 156143 4 + 
5200 316.49343 57143277 246507 155968 4 + 5250 320.04961 57143424 247266 156740 4 + 5300 323.6269 57143536 247152 156027 4 + 5350 327.22681 57143678 246797 156411 4 + 5400 330.8471 57143565 246420 155714 4 + 5450 334.43881 57143589 248494 156634 4 + 5500 338.05504 57143522 247892 156006 4 + 5550 341.61585 57143299 247864 156202 4 + 5600 345.22658 57143326 247396 155602 4 + 5650 348.86079 57143101 247539 156261 4 + 5700 352.46277 57143321 248061 156337 4 + 5750 356.1278 57143326 247768 156090 4 + 5800 359.74302 57143091 248584 156399 4 +Loop time of 359.743 on 1 procs for 5800 steps with 57143091 particles + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Move | 203.67 | 203.67 | 203.67 | 0.0 | 56.62 +Coll | 34.597 | 34.597 | 34.597 | 0.0 | 9.62 +Sort | 85.919 | 85.919 | 85.919 | 0.0 | 23.88 +Comm | 5.8387 | 5.8387 | 5.8387 | 0.0 | 1.62 +Modify | 29.706 | 29.706 | 29.706 | 0.0 | 8.26 +Output | 0.0028535 | 0.0028535 | 0.0028535 | 0.0 | 0.00 +Other | | 0.01026 | | | 0.00 + +Particle moves = 331436303918 (331B) +Cells touched = 343041342060 (343B) +Particle comms = 0 (0K) +Boundary collides = 0 (0K) +Boundary exits = 4373746 (4.37M) +SurfColl checks = 14635081 (14.6M) +SurfColl occurs = 351623 (0.352M) +Surf reactions = 0 (0K) +Collide attempts = 1355727883 (1.36B) +Collide occurs = 896228959 (896M) +Reactions = 0 (0K) +Particles stuck = 0 +Axisymm bad moves = 0 + +Particle-moves/CPUsec/proc: 9.21314e+08 +Particle-moves/step: 5.71442e+07 +Cell-touches/particle/step: 1.03501 +Particle comm iterations/step: 1 +Particle fraction communicated: 0 +Particle fraction colliding with boundary: 0 +Particle fraction exiting boundary: 1.31963e-05 +Surface-checks/particle/step: 4.41565e-05 +Surface-collisions/particle/step: 1.06091e-06 +Surf-reactions/particle/step: 0 +Collision-attempts/particle/step: 0.00409046 +Collisions/particle/step: 0.00270408 +Reactions/particle/step: 0 
+ +Particles: 5.71431e+07 ave 5.71431e+07 max 5.71431e+07 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Cells: 7.21782e+06 ave 7.21782e+06 max 7.21782e+06 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +GhostCell: 0 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +EmptyCell: 0 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Surfs: 10000 ave 10000 max 10000 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +GhostSurf: 0 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + diff --git a/doc/sphinx/08_sparta/manzano_era/sparta.rst b/doc/sphinx/08_sparta/manzano_era/sparta.rst new file mode 100644 index 00000000..3639ec4a --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/sparta.rst @@ -0,0 +1,499 @@ +****** +SPARTA +****** + +This is the documentation for the ATS-5 Benchmark [SPARTA]_. The content herein +was created by the following authors (in alphabetical order). + +- `Anthony M. Agelastos `_ +- `Michael A. Gallis `_ +- `Stan Moore `_ +- `Douglas M. Pase `_ +- `Joel O. Stevenson `_ + +This material is based upon work supported by the Sandia National Laboratories +(SNL), a multimission laboratory managed and operated by National Technology and +Engineering Solutions of Sandia under the U.S. Department of Energy's National +Nuclear Security Administration under contract DE-NA0003525. Content herein +considered unclassified with unlimited distribution under SAND2023-01070O. + + +Purpose +======= + +Heavily pulled from their [site]_: + + SPARTA is an acronym for **S**\ tochastic **PA**\ rallel **R**\ arefied-gas + **T**\ ime-accurate **A**\ nalyzer. SPARTA is a parallel Direct Simulation + Monte Carlo (DSMC) code for performing simulations of low-density gases in + 2d or 3d. Particles advect through a hierarchical Cartesian grid that + overlays the simulation box. The grid is used to group particles by grid + cell for purposes of performing collisions and chemistry. Physical objects + with triangulated surfaces can be embedded in the grid, creating cut and + split grid cells. 
The grid is also used to efficiently find particle/surface + collisions. SPARTA runs on single processors or in parallel using + message-passing techniques and a spatial-decomposition of the simulation + domain. The code is designed to be easy to modify or extend with new + functionality. Running SPARTA and the input command syntax is very similar + to the LAMMPS molecular dynamics code (but SPARTA and LAMMPS use different + underlying algorithms). + + +Characteristics +=============== + + +Application Version +------------------- + +The target application version corresponds to the Git SHA that the SPARTA git +submodule at the root of this repository is set to, i.e., within ``sparta``. + + +Problem +------- + +This problem models 2D hypersonic flow of nitrogen over a circle with periodic +boundary conditions in the z dimension, which physically translates to 3D flow +over a cylinder of infinite length. Particles are continuously emitted from the +4 faces of the simulation box during the simulation, bounce off the circle, and +then exit. The hierarchical cartesian grid is statically adapted to 6 levels +around the circle. The memory array used to hold particles is reordered by grid +cell every 100 timesteps to improve data locality and cache access patterns. + +This problem is present within the upstream SPARTA repository. The components of +this problem are listed below (paths given are within SPARTA repository). Each +of these files will need to be copied into a run directory for the simulation. + +``examples/cylinder/in.cylinder`` + This is the primary input file that controls the simulation. Some parameters + within this file may need to be changed depending upon what is being run + (i.e., these parameters control how long this simulation runs for and how + much memory it uses). + +``examples/cylinder/circle_R0.5_P10000.surf`` + This is the mesh file and will remain unchanged. 
+ +``examples/cylinder/air.*`` + These three files (i.e., ``air.species``, ``air.tce``, and ``air.vss``) + contain the composition and reactions inherent with the air. These files, + like the mesh file, are not to be edited. + +An excerpt from this input file that has its key parameters is +provided below. + +.. code-block:: + :emphasize-lines: 5,11 + + + 37 ################################### + 38 # Simulation initialization standards + 39 ################################### + 40 variable ppc equal 34 + + 149 ################################### + 150 # Unsteady Output + 151 ################################### + + 174 run 1000 + +These parameters are described below. + +``ppc`` + This sets the **p**\ articles **p**\ er **c**\ ell variable. This variable + controls the size of the problem and, accordingly, the amount of memory it + uses. + +``run`` + This sets how many iterations it will run for, which also controls the wall + time required for termination. + + +Figure of Merit +--------------- + +Each SPARTA simulation writes out a file named "log.sparta". At the end of this +simulation is a block that resembles the following example (this is from the +ATS-2/Sierra case discussed below with 57,143,091 particles whose full output is +within :download:`log.sparta `). + +.. 
code-block:: + :emphasize-lines: 8-14 + + Step CPU Np Natt Ncoll Maxlevel + 0 0 57144494 0 0 4 + 50 2.058492 57144353 202798 161581 4 + 100 3.8934437 57144165 194559 151949 4 + 150 5.9264821 57144277 198187 152510 4 + 200 7.8741561 57144501 201549 153420 4 + 250 10.032195 57144624 203458 152778 4 + 300 12.061168 57144456 205469 153049 4 + 350 14.190343 57144900 207345 153059 4 + 400 16.439252 57144623 209558 153299 4 + 450 18.708537 57144477 211065 153490 4 + 500 21.039468 57144509 212701 153993 4 + 550 23.384597 57144361 214613 154199 4 + 600 25.728705 57143966 215891 154226 4 + 650 28.143147 57143817 216934 154032 4 + 700 30.525966 57143733 218282 154220 4 + 750 32.863796 57143665 218738 153527 4 + 800 35.31154 57143764 220506 154561 4 + 850 37.780522 57143900 220210 153766 4 + 900 40.252289 57143662 222260 154931 4 + 950 42.799034 57143331 222427 154383 4 + 1000 46.784784 57143434 222924 153828 4 + ... + 5800 359.74302 57143091 248584 156399 4 + Loop time of 359.743 on 1 procs for 5800 steps with 57143091 particles + +The quantity of interest (QOI) is "mega particle steps per second," which can be +computed from the above table by multiplying the third column (no. of particles) by +the first (no. of steps), dividing the result by the second column (elapsed time +in seconds), and finally dividing by 1,000,000 (normalize). + +The number of steps must be large enough so the times mentioned in the second +column exceed 600 (i.e., so it runs for at least 10 minutes). The figure of +merit (FOM) is the harmonic mean of the QOI computed from the times between 300 +and 600 seconds. + +A Python script (:download:`sparta_fom.py `) is included within +the repository to aid in computing this quantity. Pass it the ``-h`` command +line argument to view its help page for additional information. + + +System Information +================== + +The platforms utilized for benchmarking activities are listed and described below. 
+ +* Commodity Technology System 1 (CTS-1) with Intel Cascade Lake processors, + known as Manzano at SNL (see :ref:`SystemCTS1`) +* Advanced Technology System 3 (ATS-3), also known as Crossroads (see + :ref:`SystemATS3`) +* Advanced Technology System 2 (ATS-2), also known as Sierra (see + :ref:`SystemATS2`) + + +.. _SystemCTS1: + +CTS-1/Manzano +------------- + +.. note:: + The CTS-1/Manzano system is used as a placeholder for when ATS-3/Crossroads + is available. + +The Manzano HPC cluster has 1,488 compute nodes connected together by a +high-bandwidth, low-latency Intel OmniPath network where each compute node uses +two Intel Xeon Platinum 8268 (Cascade Lake) processors. Each processor has 24 +cores, and each node has 48 physical cores and 96 virtual cores. Each core has a +base frequency of 2.9 GHz and a max frequency of 3.9 GHz. Cores support two +AVX512 SIMD units each, with peak floating-point performance (RPEAK) of 2.9 GHz +x 32 FLOP/clock x 48 cores = 4.45 TF/s. Measured DGEMM performance is just under +3.5 TF/s per node (78.5% efficiency). + +Compute nodes are a Non-Uniform Memory Access (NUMA) design, with each processor +representing a separate NUMA domain. Each processor (domain) supports six +channels of 2,933 MT/s DDR4 memory. Total memory capacity is 4 GB/core, or 192 +GB/node. Memory bandwidth for the node is 12 channels x 8 bytes / channel x +2.933 GT/s = 281.568 GB/s, and measured STREAM TRIAD throughput for local memory +access is approximately 215 GB/s (76% efficiency). Cache design uses three +levels of cache, with L1 using separate instruction and data caches, L2 unifying +instruction and data, and L3 being shared across all cores in the processor. The +cache size is 1.5 MB/core, 35.75 MB/processor, or 71.5 MB/node. + + +.. _SystemATS3: + +ATS-3/Crossroads +---------------- + +This system is not available yet but is slated to be the reference platform. + + +.. 
_SystemATS2: + +ATS-2/Sierra +------------ + +This system has a plethora of compute nodes that are made up of Power9 +processors with four NVIDIA V100 GPUs. Please refer to [Sierra-LLNL]_ for more +detailed information. + +A Sierra application and regression testbed system named Vortex, housed at SNL, +was used for benchmarking for convenience. Vortex has the same compute node +hardware as Sierra. + + +Building +======== + +Instructions are provided on how to build SPARTA for the following systems: + +* Generic (see :ref:`BuildGeneric`) +* Commodity Technology System 1 (CTS-1) with Intel Cascade Lake processors, + known as Manzano at SNL (see :ref:`BuildCTS1`) +* Advanced Technology System 2 (ATS-2), also known as Sierra (see + :ref:`BuildATS2`) + +If submodules were cloned within this repository, then the source code to build +SPARTA is already present at the top level within the "sparta" folder. + + +.. _BuildGeneric: + +Generic +------- + +Refer to SPARTA's [build]_ documentation for generic instructions. + + +.. _BuildCTS1: + +CTS-1/Manzano +------------- + +.. note:: + The CTS-1/Manzano system is used as a placeholder for when ATS-3/Crossroads + is available. + +Instructions for building on Manzano are provided below. These instructions +assume this repository has been cloned and that the current working directory is +at the top level of this repository. + +.. code-block:: bash + + cd doc/sphinx/8_sparta + ./build-manzano.sh + + +.. _BuildATS2: + +ATS-2/Vortex +------------ + +Instructions for building on Sierra are provided below. + +.. 
code-block:: bash + + module load cuda/11.2.0 + module load gcc/8.3.1 + git clone https://github.com/sparta/sparta.git sparta + pushd "sparta/src" + make yes-kokkos + make -j 64 vortex_kokkos + ls -lh `pwd -P`/spa_vortex_kokkos + popd + + +Running +======= + +Instructions are provided on how to run SPARTA for the following systems: + +* Commodity Technology System 1 (CTS-1) with Intel Cascade Lake processors, + known as Manzano at SNL (see :ref:`RunCTS1`) +* Advanced Technology System 2 (ATS-2), also known as Sierra (see + :ref:`RunATS2`) + + +.. _RunCTS1: + +CTS-1/Manzano +------------- + +.. note:: + The CTS-1/Manzano system is used as a placeholder for when ATS-3/Crossroads + is available. + +An example of how to run the test case on Manzano is provided below. + +.. code-block:: bash + + module unload intel + module unload openmpi-intel + module use /apps/modules/modulefiles-apps/cde/v3/ + module load cde/v3/devpack/intel-ompi + mpiexec \ + --np ${num_procs} \ + --bind-to socket \ + --map-by socket:span \ + "sparta/src/spa_manzano_kokkos" -in "in.cylinder" \ + >"sparta.out" 2>&1 + + +.. _RunATS2: + +ATS-2/Vortex +------------ + +An example of how to run the test case with a single GPU on Sierra is provided +below. + +.. code-block:: bash + + module load gcc/8.3.1 + module load cuda/11.2.0 + jsrun \ + -M "-gpu -disable_gdr" \ + -n 1 -a 1 -c 1 -g 1 -d packed \ + "sparta/src/spa_vortex_kokkos" -in "in.cylinder" \ + -k on g 1 -sf kk -pk kokkos reduction atomic \ + >"sparta.out" 2>&1 + + + +Verification of Results +======================= + +Results from SPARTA are provided on the following systems: + +* Commodity Technology System 1 (CTS-1) with Intel Cascade Lake processors, + known as Manzano at SNL (see :ref:`ResultsCTS1`) +* Advanced Technology System 2 (ATS-2), also known as Sierra (see + :ref:`ResultsATS2`) + + +.. _ResultsCTS1: + +CTS-1/Manzano +------------- + +.. 
note:: + The CTS-1/Manzano system is used as a placeholder for when ATS-3/Crossroads + is available. + +Strong scaling performance (i.e., fixed problem size being run on different MPI +rank counts) plots of SPARTA on CTS-1/Manzano are provided within the following +subsections. + +``ppc`` 11 (0.25 GiB/PE) +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. csv-table:: SPARTA Strong Scaling Performance and Memory on Manzano with ppc=11 (0.25 GiB/PE) + :file: cts1-0.25.csv + :align: center + :widths: 10, 10, 10, 10 + :header-rows: 1 + +.. figure:: cts1-0.25.png + :align: center + :scale: 50% + :alt: SPARTA Strong Scaling Performance on Manzano with ppc=11 (0.25 GiB/PE) + + SPARTA Strong Scaling Performance on Manzano with ppc=11 (0.25 GiB/PE) + +.. figure:: cts1mem-0.25.png + :align: center + :scale: 50% + :alt: SPARTA Strong Scaling Memory on Manzano with ppc=11 (0.25 GiB/PE) + + SPARTA Strong Scaling Memory on Manzano with ppc=11 elements (0.25 GiB/PE) + +``ppc`` 21 (0.50 GiB/PE) +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. csv-table:: SPARTA Strong Scaling Performance and Memory on Manzano with ppc=21 (0.50 GiB/PE) + :file: cts1-0.50.csv + :align: center + :widths: 10, 10, 10, 10 + :header-rows: 1 + +.. figure:: cts1-0.50.png + :align: center + :scale: 50% + :alt: SPARTA Strong Scaling Performance on Manzano with ppc=21 (0.50 GiB/PE) + + SPARTA Strong Scaling Performance on Manzano with ppc=21 (0.50 GiB/PE) + +.. figure:: cts1mem-0.50.png + :align: center + :scale: 50% + :alt: SPARTA Strong Scaling Memory on Manzano with ppc=21 (0.50 GiB/PE) + + SPARTA Strong Scaling Memory on Manzano with ppc=21 elements (0.50 GiB/PE) + +``ppc`` 42 (1.00 GiB/PE) +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. csv-table:: SPARTA Strong Scaling Performance and Memory on Manzano with ppc=42 (1.00 GiB/PE) + :file: cts1-1.00.csv + :align: center + :widths: 10, 10, 10, 10 + :header-rows: 1 + +.. 
figure:: cts1-1.00.png + :align: center + :scale: 50% + :alt: SPARTA Strong Scaling Performance on Manzano with ppc=42 (1.00 GiB/PE) + + SPARTA Strong Scaling Performance on Manzano with ppc=42 (1.00 GiB/PE) + +.. figure:: cts1mem-1.00.png + :align: center + :scale: 50% + :alt: SPARTA Strong Scaling Memory on Manzano with ppc=42 (1.00 GiB/PE) + + SPARTA Strong Scaling Memory on Manzano with ppc=42 elements (1.00 GiB/PE) + +``ppc`` 126 (2.00 GiB/PE) +^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. csv-table:: SPARTA Strong Scaling Performance and Memory on Manzano with ppc=126 (2.00 GiB/PE) + :file: cts1-2.00.csv + :align: center + :widths: 10, 10, 10, 10 + :header-rows: 1 + +.. figure:: cts1-2.00.png + :align: center + :scale: 50% + :alt: SPARTA Strong Scaling Performance on Manzano with ppc=126 (2.00 GiB/PE) + + SPARTA Strong Scaling Performance on Manzano with ppc=126 (2.00 GiB/PE) + +.. figure:: cts1mem-2.00.png + :align: center + :scale: 50% + :alt: SPARTA Strong Scaling Memory on Manzano with ppc=126 (2.00 GiB/PE) + + SPARTA Strong Scaling Memory on Manzano with ppc=126 elements (2.00 GiB/PE) + + +.. _ResultsATS2: + +ATS-2/Vortex +------------ + +Throughput performance of SPARTA on ATS-2/Vortex is provided within the +following table and figure. + +.. csv-table:: SPARTA Throughput Performance on ATS-2/Vortex + :file: ats2.csv + :align: center + :widths: 10, 10 + :header-rows: 1 + +.. figure:: ats2.png + :align: center + :scale: 50% + :alt: SPARTA Throughput Performance on ATS-2/Vortex + + SPARTA Throughput Performance on ATS-2/Vortex + +Output from the largest case is within :download:`log.sparta `. + +References +========== + +.. [SPARTA] S. J. Plimpton and S. G. Moore and A. Borner and A. K. Stagg + and T. P. Koehler and J. R. Torczynski and M. A. Gallis, 'Direct + Simulation Monte Carlo on petaflop supercomputers and beyond', + 2019, Physics of Fluids, 31, 086101. +.. [site] M. Gallis and S. Plimpton and S. Moore, 'SPARTA Direct Simulation + Monte Carlo Simulator', 2023. 
[Online]. Available: + https://sparta.github.io. [Accessed: 22- Feb- 2023] +.. [build] M. Gallis and S. Plimpton and S. Moore, 'SPARTA Documentation Getting + Started', 2023. [Online]. Available: + https://sparta.github.io/doc/Section_start.html#start_2. [Accessed: + 26- Mar- 2023] diff --git a/doc/sphinx/08_sparta/manzano_era/sparta_fom.py b/doc/sphinx/08_sparta/manzano_era/sparta_fom.py new file mode 100755 index 00000000..af1561af --- /dev/null +++ b/doc/sphinx/08_sparta/manzano_era/sparta_fom.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python3 + +""" +This is a self-contained script that extracts the SPARTA FOM for ATS-5. + +This self-contained script extracts the figure of merit (FOM) from SPARTA (ca. +early 2023) log.sparta output files. The FOM is the harmonic mean of the +computed Mega-particle-steps-per-second from the Loop timer block between 5 and 10 +minutes of wall time. +Author: Anthony M. Agelastos +""" + + +# import Python functions +import sys +import argparse +import os +import logging + +assert sys.version_info >= (3, 5), "Please use Python version 3.5 or later." 
# NOTE(review): this span is the payload of a whitespace-collapsed patch hunk.
# It is written out here as plain, conventionally formatted Python (without
# the leading "+" diff markers). The modules used below (sys, argparse, os,
# logging) are imported by the script header that precedes this span.


# define GLOBAL vars
VERSION = "2.71"
TIMEOUT = 30
IS_ALL = True
EXIT_CODES = {"success": 0, "no file": 1, "bad loop time block": 2}


# define global functions
def print_exit_codes():
    """Return a one-line, human-readable summary of the script's exit codes.

    Despite the name, nothing is printed; the string is used as the argparse
    epilog.
    """
    super_str = "exit codes = {"
    for key, value in EXIT_CODES.items():
        super_str += '"{}": {}, '.format(key, value)
    # drop the trailing ", " left by the loop
    super_str = super_str[:-2]
    super_str += "}"
    return super_str


def is_file(file_name):
    """Check if the file exists and can be read."""
    return os.access(file_name, os.R_OK)


# define classes
class BuildDocHelp(object):
    """Define and parse the command line arguments of this script."""

    def __init__(self):
        """Initialize object and create argparse entities."""
        my_epilog = print_exit_codes()
        self.parser = argparse.ArgumentParser(
            description="This Python program will extract the figure of merit (FOM) for SPARTA.",
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
            epilog=my_epilog,
        )

        # NOTE(review): with action="store_true" and default=IS_ALL (True) this
        # flag cannot change the value; kept as-is for interface compatibility.
        self.parser.add_argument(
            "-a",
            "--all",
            action="store_true",
            default=IS_ALL,
            help="Generate ALL FOM information",
        )

        self.parser.add_argument(
            "-f",
            "--file",
            type=str,
            default="log.sparta",
            help="file name to read",
        )

        self.parser.add_argument(
            "-l",
            "--logLevel",
            type=str,
            default="info",
            choices=("info", "debug", "warning"),
            help="logging level",
        )

        self.parser.add_argument(
            "-v", "--version", action="version", version="%(prog)s {}".format(VERSION)
        )

        self.args = self.parser.parse_args()

    def get_args(self):
        """Return argparse-parsed arguments for checking workflow state."""
        return self.args


class SpartaFom(object):
    """Extract the SPARTA figure of merit (FOM) from a log.sparta file.

    Required keyword arguments: ``logger`` (the root logger), ``file_name``
    (path of the log to read) and ``is_all`` (bool).
    """

    def __init__(self, **kwargs):
        """Store the keyword arguments as attributes and validate them.

        Raises AssertionError when a required attribute is missing or the
        logger is not the root logger; exits with EXIT_CODES["no file"] when
        the log file cannot be read.
        """
        # set parameters from object instantiation
        for key, value in kwargs.items():
            setattr(self, key, value)

        # check for required attributes
        required_attr = [
            "logger",
            "file_name",
            "is_all",
        ]
        needed_attr = [item for item in required_attr if not hasattr(self, item)]
        if needed_attr:
            # The original formatted this message with the misspelled
            # attribute ``__class____name__``, which raised AttributeError
            # instead of reporting the missing attributes.
            raise AssertionError(
                "Please ensure object {} has the following required "
                "attributes: {}!".format(self.__class__.__name__, required_attr)
            )

        # check attributes
        self._check_attr()

    def _check_attr(self):
        """Check object attributes."""
        # check inputs
        assert isinstance(
            self.logger, logging.RootLogger
        ), "Pass appropriate logging object to {}!".format(self.__class__.__name__)
        if not isinstance(self.is_all, bool):
            tmp = bool(self.is_all)
            self.logger.critical(
                "Type issue with is_all within {} (should be bool, is {}); converted to bool and is now {}.".format(
                    self.__class__.__name__, type(self.is_all), tmp
                )
            )
            self.is_all = tmp

        if not is_file(self.file_name):
            self.logger.critical('Cannot read "{}"'.format(self.file_name))
            sys.exit(EXIT_CODES["no file"])

    def _check_start(self, line):
        """Check if this is the header row that opens the Loop time block."""
        # Compare on whitespace-normalized text so that column padding in the
        # log header cannot defeat the match.
        return "Step CPU Np Natt Ncoll Maxlevel" in " ".join(line.split())

    def _check_end(self, line):
        """Check if this is the end of the Loop time block."""
        return "Loop time of " in line and "steps with" in line

    def _extract_line(self, line):
        """Parse one statistics row into [step, cpu, np, natt, ncoll, maxlevel].

        Exits with EXIT_CODES["bad loop time block"] when the row does not
        have exactly six whitespace-separated columns.
        """
        l_line = line.split()
        if len(l_line) != 6:
            self.logger.critical("Loop time block not sized appropriately!")
            sys.exit(EXIT_CODES["bad loop time block"])
        return [
            int(l_line[0]),
            float(l_line[1]),
            int(l_line[2]),
            int(l_line[3]),
            int(l_line[4]),
            int(l_line[5]),
        ]

    def _compute_fom(self, block):
        """Compute the FOM from parsed statistics rows.

        For every row whose CPU time lies strictly between 300 and 600 seconds
        the quantity of interest is ``Np * step / cpu / 1e6`` (mega particle
        steps per second). The FOM is the harmonic mean of those values, or 0
        when no row falls inside the window.
        """
        start = 300.0
        finish = 600.0
        reciprocals = []
        for row in block:
            if row[1] >= finish:
                break
            if row[1] > start:
                qoi = row[2] * row[0] / row[1] / 1000000
                if qoi == 0:
                    # a zero sample makes the harmonic mean zero; avoid 1/0
                    return 0.0
                reciprocals.append(1 / qoi)

        if not reciprocals:
            return 0
        return len(reciprocals) / sum(reciprocals)

    def run(self):
        """Extract the FOM from the log file, log it and return it."""
        self.logger.debug("Extracting the FOM...")

        loop_info = []
        is_extract = False
        found_end = False
        with open(self.file_name) as fp:
            # Iterate over every line, including the first one, which the
            # original readline()-based loop skipped.
            for cnt, line in enumerate(fp, start=1):
                if self._check_end(line):
                    self.logger.debug("Found end at line {}.".format(cnt))
                    found_end = True
                    break
                if is_extract:
                    loop_info.append(self._extract_line(line))
                elif self._check_start(line):
                    self.logger.debug("Found start at line {}.".format(cnt))
                    is_extract = True

        if is_extract and not found_end:
            # A truncated log used to exit only as a side effect of parsing
            # the empty end-of-file string; make that outcome explicit.
            self.logger.critical("Loop time block not terminated!")
            sys.exit(EXIT_CODES["bad loop time block"])

        fom = self._compute_fom(loop_info)
        self.logger.info("FOM = {}".format(fom))
        return fom


# do work
if __name__ == "__main__":
    # manage command line arguments
    build_doc_help = BuildDocHelp()
    cl_args = build_doc_help.get_args()

    # manage logging
    int_logging_level = getattr(logging, cl_args.logLevel.upper(), None)
    if not isinstance(int_logging_level, int):
        raise ValueError("Invalid log level: {}!".format(cl_args.logLevel))
    logging.basicConfig(
        format="%(levelname)s - %(asctime)s - %(message)s", level=int_logging_level
    )
    logging.debug("Set logging level to {}.".format(cl_args.logLevel))
    logger = logging.getLogger()

    # manage worker object
    sparta_fom = SpartaFom(
        logger=logger,
        file_name=cl_args.file,
        is_all=cl_args.all,
    )

    # do work
    sparta_fom.run()

    # exit gracefully
    sys.exit(EXIT_CODES["success"])

# NOTE(review): the collapsed source line that ended this script also carried
# the mbox header of the next patch in the series. It is preserved verbatim
# below so that no patch metadata is lost.
# From 6b066ddb1e46fb33fe958d9b236bba09c8e04818 Mon Sep 17 00:00:00 2001
# From: "Daniel J. Magee"
# Date: Fri, 15 Sep 2023 16:28:32 -0600
# Subject: [PATCH 10/12] Update parser and include dgemm source in repo.
--- dgemm/README.ACES | 60 ++++++++++++ dgemm/scripts/loop_dgemm | 32 +++++++ dgemm/src/Makefile | 11 +++ dgemm/src/Makefile.intel | 11 +++ dgemm/src/mt-dgemm.c | 200 +++++++++++++++++++++++++++++++++++++++ utils/pavparse | 13 +-- 6 files changed, 321 insertions(+), 6 deletions(-) create mode 100644 dgemm/README.ACES create mode 100755 dgemm/scripts/loop_dgemm create mode 100644 dgemm/src/Makefile create mode 100644 dgemm/src/Makefile.intel create mode 100644 dgemm/src/mt-dgemm.c diff --git a/dgemm/README.ACES b/dgemm/README.ACES new file mode 100644 index 00000000..03f14e3f --- /dev/null +++ b/dgemm/README.ACES @@ -0,0 +1,60 @@ +=================================================================== +ACES DGEMM Benchmark +=================================================================== + +The purpose of the DGEMM benchmark is to provide an evaluation +mechanism for running numerically intensive applications on +hardware systems with active thermal throttling and dynamic clock +frequencies. + +The benchmark runs on a single node (i.e. there is no distributed +MPI) but is threaded. + +Source code is contained in the "src" directory. + +=================================================================== + +Modifications Permitted: + +- Vendors are permitted to change the source code in the region +marked in the source. +- Optimized BLAS/DGEMM routines are permitted (and encouraged) to +demonstrate the highest performance. +- Vendors may modify the Makefile(s) as required + +=================================================================== + +Running the Benchmark: + +Example: + +export OMP_NUM_THREADS=32 +export OMP_PLACES=cores +export OMP_PROC_BIND=close + +./mt-dgemm 5004 100 + + +- This runs the benchmark with a matrix input size of 5004 and +100 repetitions. 
+- The vendor is free to select the matrix size but smaller matrices +will often produce lower performance results +- ACES will run the benchmark with a minimum repetition +of 500 to ensure consistent processor performance that is unaffected +by thermal throttling. + +=================================================================== + +Example Output of Interest: + +Final Sum is: 5004.010000 +Memory for Matrices: 573.120483 MB +Multiply time: 26.925897 seconds +FLOPs computed: 25065056016000.000000 +GFLOP/s rate: 930.890292 GF/s + +- The GFLOP/s rate is the FOM of interest for this benchmark +- The *entire* output should be provided by the Offeror + +=================================================================== + diff --git a/dgemm/scripts/loop_dgemm b/dgemm/scripts/loop_dgemm new file mode 100755 index 00000000..8169430d --- /dev/null +++ b/dgemm/scripts/loop_dgemm @@ -0,0 +1,32 @@ +#!/bin/sh +CNT=${1:-10} #Number of iterations at each Affinity; default is 10 +EXE_PATH="." #Directory path to dgemm executable +KEY=GFLOP #Search key +CSV=dgemm-summary.csv #Name of file that can be imported into Excel +LOG=dgemm.log #Full output; TODO: search for errors or reasonableness +rm -f $LOG +VENDOR=intel +# Search loaded modules for active compiler version +COMP=`module --terse list 2>&1 | grep $VENDOR | awk -F/ '{print $NF}'` +# Header in csv file +# Comment out next line if you want to append to an existing file +echo "Date, Command, compiler, node, $KEY" > $CSV +for AFF in "0-15" "16-31" +do +# Would love to use this in command line. Tried alias and back ticks. No luck. 
+CMD="OMP_NUM_THREADS=16 GOMP_CPU_AFFINITY=$AFF $EXE_PATH/mt-dgemm-icc 2048"
+i=1
+while [ "$i" -le $CNT ]
+do
+# Put static stuff in csv file with no new line
+echo -n `date` ", " $CMD ", " $VENDOR $COMP ", " `hostname` ", " >> $CSV
+# Do it
+OMP_NUM_THREADS=16 GOMP_CPU_AFFINITY=$AFF $EXE_PATH/mt-dgemm-icc 2048 | tee -a $LOG | grep $KEY | awk '{print $3}' >> $CSV
+# Still alive feedback
+echo -n AFF=$AFF " "
+grep $KEY $LOG | tail -1
+i=`expr $i + 1`
+done #while
+done #for
+#cat $CSV
+echo Comma delimited file $CSV is ready
diff --git a/dgemm/src/Makefile b/dgemm/src/Makefile
new file mode 100644
index 00000000..a448b919
--- /dev/null
+++ b/dgemm/src/Makefile
@@ -0,0 +1,11 @@
+# gcc + OpenBLAS build of mt-dgemm; OPENBLAS_ROOT must point at the OpenBLAS install prefix.
+CC=gcc
+CFLAGS=-ffast-math -mavx2 -ftree-vectorizer-verbose=3 -O3 -fopenmp -DUSE_CBLAS
+LDFLAGS=-L${OPENBLAS_ROOT}/lib -lopenblas
+
+mt-dgemm: mt-dgemm.c
+	$(CC) $(CFLAGS) -o mt-dgemm mt-dgemm.c $(LDFLAGS)
+
+clean:
+	rm -f mt-dgemm *.o
+
diff --git a/dgemm/src/Makefile.intel b/dgemm/src/Makefile.intel
new file mode 100644
index 00000000..e9765c58
--- /dev/null
+++ b/dgemm/src/Makefile.intel
@@ -0,0 +1,11 @@
+# Intel (icc + MKL) build; produces mt-dgemm-icc.
+CC=icc
+CFLAGS=-O3 -fopenmp -mkl -DUSE_MKL
+LDFLAGS=
+
+mt-dgemm-icc: mt-dgemm.c
+	$(CC) $(CFLAGS) -o mt-dgemm-icc mt-dgemm.c $(LDFLAGS)
+
+clean:
+	rm -f mt-dgemm-icc *.o
+
diff --git a/dgemm/src/mt-dgemm.c b/dgemm/src/mt-dgemm.c
new file mode 100644
index 00000000..046a8086
--- /dev/null
+++ b/dgemm/src/mt-dgemm.c
@@ -0,0 +1,200 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+
+#ifdef USE_MKL
+#include "mkl.h"
+#endif
+
+#ifdef USE_CBLAS
+#include "cblas.h"
+#endif
+
+#ifdef USE_ESSL
+#include "essl.h"
+#endif
+
+#define DGEMM_RESTRICT __restrict__
+
+// ------------------------------------------------------- //
+// Function: get_seconds
+//
+// Vendor may modify this call to provide higher resolution
+// timing if required
+// ------------------------------------------------------- //
+double get_seconds() {
+	struct timeval now;
+	gettimeofday(&now, NULL);
+
+	const double seconds = (double) now.tv_sec;
+	const double usec = (double) now.tv_usec;
+
+	return seconds + (usec * 1.0e-6);
+}
+
+// ------------------------------------------------------- //
+// Function: main
+//
+// Modify only in permitted regions (see comments in the
+// function)
+// ------------------------------------------------------- //
+int main(int argc, char* argv[]) {
+
+	// ------------------------------------------------------- //
+	// DO NOT CHANGE CODE BELOW
+	// ------------------------------------------------------- //
+
+	int N = 256;
+	int repeats = 8;
+
+	double alpha = 1.0;
+	double beta = 1.0;
+
+	if(argc > 1) {
+		N = atoi(argv[1]);
+		printf("Matrix size input by command line: %d\n", N);
+
+		if(argc > 2) {
+			repeats = atoi(argv[2]);
+
+			if(repeats < 4) {
+				fprintf(stderr, "Error: repeats must be at least 4, setting is: %d\n", repeats);
+				exit(-1);
+			}
+
+			printf("Repeat multiply %d times.\n", repeats);
+
+			if(argc > 3) {
+				alpha = (double) atof(argv[3]);
+
+				if(argc > 4) {
+					beta = (double) atof(argv[4]);
+				}
+			}
+		} else {
+			printf("Repeat multiply defaulted to %d\n", repeats);
+		}
+	} else {
+		printf("Matrix size defaulted to %d\n", N);
+	}
+
+	printf("Alpha = %f\n", alpha);
+	printf("Beta = %f\n", beta);
+
+	if(N < 128) {
+		printf("Error: N (%d) is less than 128, the matrix is too small.\n", N);
+		exit(-1);
+	}
+
+	printf("Allocating Matrices...\n");
+
+	double* DGEMM_RESTRICT matrixA = (double*) malloc(sizeof(double) * N * N);
+	double* DGEMM_RESTRICT matrixB = (double*) malloc(sizeof(double) * N * N);
+	double* DGEMM_RESTRICT matrixC = (double*) malloc(sizeof(double) * N * N);
+	if(matrixA == NULL || matrixB == NULL || matrixC == NULL) { fprintf(stderr, "Error: unable to allocate matrices for N = %d.\n", N); exit(-1); }
+	printf("Allocation complete, populating with values...\n");
+	// NOTE(review): if a build-time patch (e.g. dgemm_omp_fixes.patch) is applied to this file, confirm it still applies.
+	int i, j, k, r;
+	// j and k are declared outside the parallel regions, so each pragma below lists them as private.
+	#pragma omp parallel for private(j)
+	for(i = 0; i < N; i++) {
+		for(j = 0; j < N; j++) {
+			matrixA[i*N + j] = 2.0;
+			matrixB[i*N + j] = 0.5;
+			matrixC[i*N + j] = 1.0;
+		}
+	}
+
+	printf("Performing multiplication...\n");
+
+	const double start = get_seconds();
+
+	// ------------------------------------------------------- //
+	// VENDOR NOTIFICATION: START MODIFIABLE REGION
+	//
+	// Vendor is able to change the lines below to call optimized
+	// DGEMM or other matrix multiplication routines. Do *NOT*
+	// change any lines above this statement.
+	// ------------------------------------------------------- //
+
+	double sum = 0;
+
+	// Repeat multiple times
+	for(r = 0; r < repeats; r++) {
+#if defined(USE_MKL) || defined(USE_CBLAS)
+		cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+			N, N, N, alpha, matrixA, N, matrixB, N, beta, matrixC, N);
+#elif defined(USE_ESSL)
+		dgemm("N", "N",
+			N, N, N, alpha, matrixA, N, matrixB, N, beta, matrixC, N);
+#else
+		#pragma omp parallel for private(j, k, sum)
+		for(i = 0; i < N; i++) {
+			for(j = 0; j < N; j++) {
+				sum = 0;
+
+				for(k = 0; k < N; k++) {
+					sum += matrixA[i*N + k] * matrixB[k*N + j];
+				}
+
+				matrixC[i*N + j] = (alpha * sum) + (beta * matrixC[i*N + j]);
+			}
+		}
+#endif
+	}
+
+	// ------------------------------------------------------- //
+	// VENDOR NOTIFICATION: END MODIFIABLE REGION
+	// ------------------------------------------------------- //
+
+	// ------------------------------------------------------- //
+	// DO NOT CHANGE CODE BELOW
+	// ------------------------------------------------------- //
+
+	const double end = get_seconds();
+
+	printf("Calculating matrix check...\n");
+
+	double final_sum = 0;
+	double count = 0;
+
+	#pragma omp parallel for private(j) reduction(+:final_sum, count)
+	for(i = 0; i < N; i++) {
+		for(j = 0; j < N; j++) {
+			final_sum += matrixC[i*N + j];
+			count += 1.0;
+		}
+	}
+
+	double N_dbl = (double) N;
+	double matrix_memory = (3 * N_dbl * N_dbl) * ((double) sizeof(double));
+
+	printf("\n");
+	printf("===============================================================\n");
+
+	printf("Final Sum is: %f\n", (final_sum / (count * repeats)));
+	printf("Memory for Matrices: %f MB\n",
+		(matrix_memory / (1024 * 1024)));
+
+	const double time_taken = (end - start);
+
+	printf("Multiply time: %f seconds\n", time_taken);
+
+	// Per repeat: 2*N flops (multiply + add) for each of the N*N inner products,
+	// plus 2 flops per element of C counted for the alpha/beta update.
+	const double flops_computed = (N_dbl * N_dbl * N_dbl * 2.0 * (double)(repeats)) +
+		(N_dbl * N_dbl * 2 * (double)(repeats));
+
+	printf("FLOPs computed: %f\n", flops_computed);
+	printf("GFLOP/s rate: %f GF/s\n", (flops_computed / time_taken) / 1000000000.0);
+
+	printf("===============================================================\n");
+	printf("\n");
+
+	free(matrixA);
+	free(matrixB);
+	free(matrixC);
+
+	return 0;
+}
diff --git a/utils/pavparse b/utils/pavparse index 859fbe36..58f82a53 100755 --- a/utils/pavparse +++ b/utils/pavparse @@ -177,15 +177,16 @@ if __name__ == '__main__': gtitle = k if isinstance(k, tuple): gtitle = "-".join(k) - grouped_df = convert_df_numeric(v).drop(groupers,axis=1).set_index(index).sort_index() - grouped_df.plot(ax=ax, grid=True) + grouped_df = convert_df_numeric(v).drop(groupers,axis=1) + gdf = grouped_df.set_index(index).sort_index() + gdf.plot(ax=ax, grid=True) plt_legend.append(gtitle) if index_title: - grouped_df.index.name = index_title + gdf.index.name = index_title if value_title: - grouped_df.columns = value_title - grouped_dfs[gtitle] = grouped_df.squeeze().map(fmt.format) - df_mean[gtitle] = grouped_df.mean().values[0] + gdf.columns = value_title + grouped_dfs[gtitle] = gdf.squeeze().map(fmt.format) + df_mean[gtitle] = gdf.mean().values[0] plt.legend(plt_legend) mean_series = pd.Series(df_mean) From b38b6b5b3d8266d92b4280e642c5ad3356da76b3 Mon Sep 17 00:00:00 2001 From: "Daniel J. Magee" Date: Tue, 19 Sep 2023 12:30:07 -0600 Subject: [PATCH 11/12] The commit for STREAM and DGEMM docs was lost. Recovered it and recommitting.
--- doc/sphinx/03_vibe/vibe.rst | 54 ++++++++--------- .../10_microbenchmarks/M1_STREAM/STREAM.rst | 58 +++++++++++++------ .../10_microbenchmarks/M1_STREAM/cpu.gp | 17 +++--- .../M1_STREAM/stream_cts1.csv | 8 +++ .../10_microbenchmarks/M3_DGEMM/DGEMM.rst | 12 +++- doc/sphinx/10_microbenchmarks/M3_DGEMM/cpu.gp | 16 ++--- 6 files changed, 97 insertions(+), 68 deletions(-) create mode 100644 doc/sphinx/10_microbenchmarks/M1_STREAM/stream_cts1.csv diff --git a/doc/sphinx/03_vibe/vibe.rst b/doc/sphinx/03_vibe/vibe.rst index 5fbfb6d0..314df660 100644 --- a/doc/sphinx/03_vibe/vibe.rst +++ b/doc/sphinx/03_vibe/vibe.rst @@ -124,8 +124,33 @@ Results from Parthenon are provided on the following systems: * Commodity Technology System 1 (CTS-1) (Snow) with Intel Broadwell processors, * An Nvidia A100 GPU hosted on an [Nvidia Arm HPC Developer Kit](https://developer.nvidia.com/arm-hpc-devkit) -CTS-1 --------- +ATS-3 Rocinante HBM +------------------- + +.. csv-table:: VIBE Throughput Performance on ATS-3 Rocinante HBM nodes 40% Memory + :file: parthenon-ats5_spr-hbm128-intel-classic.csv + :align: center + :widths: 10, 10 + :header-rows: 1 + +.. figure:: ats3_40.png + :align: center + :scale: 50% + :alt: VIBE Throughput Performance on ATS-3 Rocinante HBM nodes + +.. csv-table:: VIBE Throughput Performance on ATS-3 Rocinante HBM nodes 60% Memory + :file: parthenon-ats5_spr-hbm160-intel-classic.csv + :align: center + :widths: 10, 10 + :header-rows: 1 + +.. figure:: ats3_60.png + :align: center + :scale: 50% + :alt: VIBE Throughput Performance on ATS-3 Rocinante HBM nodes + +CTS-1 Snow +----------- The mesh and meshblock size parameters are chosen to balance realism/performance with memory footprint. For the following tests we @@ -196,31 +221,6 @@ Throughput performance of Parthenon-VIBE on a 40GB A100 is provided within the f :scale: 50% :alt: VIBE Throughput Performance on A100 -ATS-3 ------- - -.. 
csv-table:: VIBE Throughput Performance on ATS-3 Rocinante HBM nodes 40% Memory - :file: parthenon-ats5_spr-hbm128-intel-classic.csv - :align: center - :widths: 10, 10 - :header-rows: 1 - -.. figure:: ats3_40.png - :align: center - :scale: 50% - :alt: VIBE Throughput Performance on ATS-3 Rocinante HBM nodes - -.. csv-table:: VIBE Throughput Performance on ATS-3 Rocinante HBM nodes 60% Memory - :file: parthenon-ats5_spr-hbm160-intel-classic.csv - :align: center - :widths: 10, 10 - :header-rows: 1 - -.. figure:: ats3_60.png - :align: center - :scale: 50% - :alt: VIBE Throughput Performance on ATS-3 Rocinante HBM nodes - Verification of Results ======================= diff --git a/doc/sphinx/10_microbenchmarks/M1_STREAM/STREAM.rst b/doc/sphinx/10_microbenchmarks/M1_STREAM/STREAM.rst index bccba2e7..d3dab238 100644 --- a/doc/sphinx/10_microbenchmarks/M1_STREAM/STREAM.rst +++ b/doc/sphinx/10_microbenchmarks/M1_STREAM/STREAM.rst @@ -42,42 +42,67 @@ The primary FOM is the Triad rate (MB/s). Building ======== -Adjustments to GOMP_CPU_AFFINITY may also be necessary. +Adjustments to GOMP_CPU_AFFINITY may be necessary. -You can modify the STREAM_ARRAY_SIZE value in the compilation step to change the array size used by the benchmark. Adjusting the array size can help accommodate the available memory on your system. +The STREAM_ARRAY_SIZE value is a critical parameter set at compile time and controls the size of the array used to measure bandwidth. STREAM requires different amounts of memory to run on different systems, depending on both the system cache size(s) and the granularity of the system timer. + +You should adjust the value of 'STREAM_ARRAY_SIZE' (below) to meet BOTH of the following criteria: + +1) Each array must be at least 4 times the size of the available cache memory. I don't worry about the difference between 10^6 and 2^20, so in practice the minimum array size is about 3.8 times the cache size. 
+ (a) Example 1: One Xeon E3 with 8 MB L3 cache STREAM_ARRAY_SIZE should be >= 4 million, giving an array size of 30.5 MB and a total memory requirement of 91.5 MB. + (b) Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP) STREAM_ARRAY_SIZE should be >= 20 million, giving an array size of 153 MB and a total memory requirement of 458 MB. +2) The size should be large enough so that the 'timing calibration' output by the program is at least 20 clock-ticks. +For example, most versions of Windows have a 10 millisecond timer granularity. 20 "ticks" at 10 ms/tick is 200 milliseconds. If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec. This means that each array must be at least 1 GB, or 128M elements. + +Set STREAM_ARRAY_SIZE using the -D flag on your compile line. + +Example calculations for results presented here: + +STREAM ARRAY SIZE CALCULATIONS: + +ARRAY_SIZE ~= 4 x (45 MiB cache / processor) x (2 processors) / (3 arrays) / (8 bytes / element) = 15 Mi elements = 15000000 + +HASWELL: Intel(R) Xeon(R) CPU E5-2698 v3 @ 2.30GHz +CACHE: 40M +SOCKETS: 2 +4 * ( 40M * 2 ) / 3 ARRAYS / 8 Bytes/element = 13.4 Mi elements = 13400000 + +BROADWELL: Intel(R) Xeon(R) CPU E5-2695 v4 @ 2.10GHz +CACHE: 45M +SOCKETS: 2 +4 * ( 45M * 2 ) / 3 ARRAYS / 8 BYTES/ELEMENT = 15.0 Mi elements = 15000000 + +SAPPHIRE RAPIDS: Intel(R) Xeon(R) Platinum 8480+ +CACHE: 105 +SOCKETS: 2 +4 x (105M * 2 ) / 3 ARRAYS / 8 BYTES/ELEMENT = 35 Mi elements = 35000000 Running ======= .. code-block:: bash - mpirun -np ./stream + srun -n ./stream Replace `` with the number of MPI processes you want to use. For example, if you want to use 4 MPI processes, the command will be: .. code-block:: bash - mpirun -np 4 ./stream - -Input ------ - -Dependent Variable(s) ---------------------- - -1. Maximum bandwidth while utilizing all hardware cores and threads. MAX_BW -2. A minimum number of cores and threads that achieves MAX_BW. 
MIN_CT + srun -n 4 ./stream Example Results =============== +ATS-3 Rocinante HBM +------------------- + CTS-1 Snow ----------- .. csv-table:: STREAM microbenchmark bandwidth measurement - :file: stream-cts1_ats5intel-oneapi-openmpi.csv + :file: stream_cts1.csv :align: center - :widths: 10, 10 + :widths: 10, 10, 10 :header-rows: 1 .. figure:: cpu_cts1.png @@ -85,6 +110,3 @@ CTS-1 Snow :scale: 50% :alt: STREAM microbenchmark bandwidth measurement -ATS-3 Rocinante HBM -------------------- - diff --git a/doc/sphinx/10_microbenchmarks/M1_STREAM/cpu.gp b/doc/sphinx/10_microbenchmarks/M1_STREAM/cpu.gp index 36f35a67..6af002ff 100644 --- a/doc/sphinx/10_microbenchmarks/M1_STREAM/cpu.gp +++ b/doc/sphinx/10_microbenchmarks/M1_STREAM/cpu.gp @@ -4,13 +4,14 @@ set terminal pngcairo enhanced size 1024, 768 dashed font 'Helvetica,18' set output "cpu_cts1.png" set title "STREAM Single node bandwidth" font "serif,22" -set xlabel "No. Processing Elements" -set ylabel "Figure of Merit Triad (MB/s)" +set ylabel "Per core triad (MB/s)" +set y2label "FOM: Total Triad (MB/s)" -set xrange [1:64] +set xrange [1:40] +set yrange [3000:15000] -set logscale x 2 -set logscale y 2 +# set logscale x 2 +# set logscale y 2 set grid show grid @@ -21,9 +22,7 @@ set key autotitle columnheader set style line 1 linetype 6 dashtype 1 linecolor rgb "#FF0000" linewidth 2 pointtype 6 pointsize 3 set style line 2 linetype 1 dashtype 2 linecolor rgb "#FF0000" linewidth 2 -plot "stream-cts1_ats5intel-oneapi-openmpi.csv" using 1:2 with linespoints linestyle 1 +plot "stream_cts1.csv" using 1:2 with linespoints linestyle 1 axis x1y1, "" using 1:3 with line linestyle 2 axis x1y2 + -# set output "cpu_133M.png" -# set title "Branson Strong Scaling Performance on CTS-1, 133M particles" font "serif,22" -# plot "cpu_133M.csv" using 1:2 with linespoints linestyle 1, "" using 1:3 with line linestyle 2 diff --git a/doc/sphinx/10_microbenchmarks/M1_STREAM/stream_cts1.csv 
b/doc/sphinx/10_microbenchmarks/M1_STREAM/stream_cts1.csv new file mode 100644 index 00000000..dd40b0ba --- /dev/null +++ b/doc/sphinx/10_microbenchmarks/M1_STREAM/stream_cts1.csv @@ -0,0 +1,8 @@ +No. Cores,Bandwidth (MB/s),Total Bandwidth (MB/s) +1,10690.1,10690.1 +2,10701.3,21402.6 +4,9316.5,37266.0 +8,7884.5,63076.0 +16,7747.5,123960.0 +32,5510.3,176329.6 +36,3189.2,114811.2 \ No newline at end of file diff --git a/doc/sphinx/10_microbenchmarks/M3_DGEMM/DGEMM.rst b/doc/sphinx/10_microbenchmarks/M3_DGEMM/DGEMM.rst index 850b6e73..6acfecab 100644 --- a/doc/sphinx/10_microbenchmarks/M3_DGEMM/DGEMM.rst +++ b/doc/sphinx/10_microbenchmarks/M3_DGEMM/DGEMM.rst @@ -16,6 +16,7 @@ Problem ------- .. math:: + \mathbf{C} = \alpha*\mathbf{A}*\mathbf{B} + \beta*\mathbf{C} Where :math:`A B C` are square :math:`NxN` vectors and :math:`\alpha` and :math:`\beta` are scalars. This operation is repeated :math:`R` times. @@ -30,7 +31,6 @@ GFLOP/s rate: GF/s Run Rules --------- - * Vendors are permitted to change the source code in the region marked in the source. * Optimized BLAS/DGEMM routines are permitted (and encouraged) to demonstrate the highest performance. * Vendors may modify the Makefile(s) as required @@ -40,12 +40,14 @@ Building Makefiles are provided for the intel and gcc compilers. Before building, load the compiler and blas libraries into the PATH and LD_LIBRARY_PATH. -.. code-block:: +.. code-block:: bash cd src patch -p1 < ../dgemm_omp_fixes.patch make +.. + If using a different compiler, copy and modify the simple makefiles to apply the appropriate flags. If using a different blas library than mkl or openblas, modify the C source file to use the correct header and dgemm command. @@ -58,12 +60,18 @@ DGEMM uses OpenMP but does not use MPI. Set the number of OpenMP threads before running. .. code-block:: bash + export OPENBLAS_NUM_THREADS = export OMP_NUM_THREADS = +.. + .. code-block:: bash + ./mt-dgemm +.. 
+ These values default to: :math:`N=256, R=8, \alpha=1.0, \beta=1.0` These inputs are subject to the conditions :math:`N>128, R>4`. diff --git a/doc/sphinx/10_microbenchmarks/M3_DGEMM/cpu.gp b/doc/sphinx/10_microbenchmarks/M3_DGEMM/cpu.gp index e4bb7155..14c4ac35 100644 --- a/doc/sphinx/10_microbenchmarks/M3_DGEMM/cpu.gp +++ b/doc/sphinx/10_microbenchmarks/M3_DGEMM/cpu.gp @@ -1,10 +1,10 @@ #!/usr/bin/gnuplot set terminal pngcairo enhanced size 1024, 768 dashed font 'Helvetica,18' -set output "cpu_66M.png" +set output "dgemm_cts1.png" -set title "Branson Strong Scaling Performance on CTS-1, 66M particles" font "serif,22" +set title " Single node Dgemm" font "serif,22" set xlabel "No. Processing Elements" -set ylabel "Figure of Merit (particles/sec)" +set ylabel "Figure of Merit (GFlops)" set xrange [1:64] set key left top @@ -21,15 +21,7 @@ set key autotitle columnheader set style line 1 linetype 6 dashtype 1 linecolor rgb "#FF0000" linewidth 2 pointtype 6 pointsize 3 set style line 2 linetype 1 dashtype 2 linecolor rgb "#FF0000" linewidth 2 -plot "cpu_66M.csv" using 1:2 with linespoints linestyle 1, "" using 1:3 with line linestyle 2 +#plot "cpu_66M.csv" using 1:2 with linespoints linestyle 1, "" using 1:3 with line linestyle 2 -set output "cpu_133M.png" -set title "Branson Strong Scaling Performance on CTS-1, 133M particles" font "serif,22" -plot "cpu_133M.csv" using 1:2 with linespoints linestyle 1, "" using 1:3 with line linestyle 2 - - -set output "cpu_200M.png" -set title "Branson Strong Scaling Performance on CTS-1, 200M particles" font "serif,22" -plot "cpu_200M.csv" using 1:2 with linespoints linestyle 1, "" using 1:3 with line linestyle 2 From de0a026693e8befba9babe3b3101092bb3a1c85e Mon Sep 17 00:00:00 2001 From: "Daniel J. Magee" Date: Wed, 20 Sep 2023 11:07:32 -0600 Subject: [PATCH 12/12] Fixed stream doc format. MD Test explanations. Added dgemm, mdtest to microbench dir. 
--- .../10_microbenchmarks/M1_STREAM/STREAM.rst | 50 +- .../10_microbenchmarks/M6_MDTEST/MDTEST.rst | 12 - {dgemm => microbenchmarks/dgemm}/README.ACES | 0 .../dgemm}/scripts/loop_dgemm | 0 {dgemm => microbenchmarks/dgemm}/src/Makefile | 0 .../dgemm}/src/Makefile.intel | 0 .../dgemm}/src/mt-dgemm.c | 0 microbenchmarks/mdtest/COPYRIGHT | 256 +++ microbenchmarks/mdtest/Makefile | 40 + microbenchmarks/mdtest/Makefile.XROADS | 12 + microbenchmarks/mdtest/README | 136 ++ microbenchmarks/mdtest/README.XROADS | 171 ++ microbenchmarks/mdtest/RELEASE_LOG | 103 + microbenchmarks/mdtest/mdtest.1 | 188 ++ microbenchmarks/mdtest/mdtest.c | 1942 +++++++++++++++++ microbenchmarks/mdtest/scripts/WRAPPER_README | 49 + microbenchmarks/mdtest/scripts/env_to_db.tcsh | 127 ++ .../mdtest/scripts/mdtest_wrapper.py | 533 +++++ microbenchmarks/mdtest/scripts/paramCatch.py | 46 + microbenchmarks/mdtest/scripts/tester.py | 223 ++ utils/pav_config/tests/stream.yaml | 67 +- 21 files changed, 3919 insertions(+), 36 deletions(-) rename {dgemm => microbenchmarks/dgemm}/README.ACES (100%) rename {dgemm => microbenchmarks/dgemm}/scripts/loop_dgemm (100%) rename {dgemm => microbenchmarks/dgemm}/src/Makefile (100%) rename {dgemm => microbenchmarks/dgemm}/src/Makefile.intel (100%) rename {dgemm => microbenchmarks/dgemm}/src/mt-dgemm.c (100%) create mode 100644 microbenchmarks/mdtest/COPYRIGHT create mode 100644 microbenchmarks/mdtest/Makefile create mode 100644 microbenchmarks/mdtest/Makefile.XROADS create mode 100644 microbenchmarks/mdtest/README create mode 100644 microbenchmarks/mdtest/README.XROADS create mode 100644 microbenchmarks/mdtest/RELEASE_LOG create mode 100644 microbenchmarks/mdtest/mdtest.1 create mode 100644 microbenchmarks/mdtest/mdtest.c create mode 100644 microbenchmarks/mdtest/scripts/WRAPPER_README create mode 100755 microbenchmarks/mdtest/scripts/env_to_db.tcsh create mode 100755 microbenchmarks/mdtest/scripts/mdtest_wrapper.py create mode 100644 
microbenchmarks/mdtest/scripts/paramCatch.py create mode 100755 microbenchmarks/mdtest/scripts/tester.py diff --git a/doc/sphinx/10_microbenchmarks/M1_STREAM/STREAM.rst b/doc/sphinx/10_microbenchmarks/M1_STREAM/STREAM.rst index d3dab238..df20b3c6 100644 --- a/doc/sphinx/10_microbenchmarks/M1_STREAM/STREAM.rst +++ b/doc/sphinx/10_microbenchmarks/M1_STREAM/STREAM.rst @@ -37,7 +37,21 @@ These operations stress memory and floating point pipelines.They test memory tra Figure of Merit --------------- -The primary FOM is the Triad rate (MB/s). +The primary FOM is the max Triad rate (MB/s). + +Run Rules +--------- + +The program must synchronize between each operation. For instance: + +On a heterogeneous system, run stream for all computational devices. Where there is unified or heterogeneously addressable memory, also provide performance numbers for each device's access to available memory types. + + +For instance: +On a heterogeneous node architecture with a multi-core CPU with HBM2 memory and a GPU with HBM3 memory, STREAM performance should be reported for: CPU <-> HBM2, GPU <-> HBM3, CPU <-> HBM3, GPU <-> HBM2 + +On CPUs, present the bandwidth scaling as a function of the number of cores. On GPUs, present the maximum bandwidth. 
+ Building ======== @@ -60,22 +74,30 @@ Example calculations for results presented here: STREAM ARRAY SIZE CALCULATIONS: -ARRAY_SIZE ~= 4 x (45 MiB cache / processor) x (2 processors) / (3 arrays) / (8 bytes / element) = 15 Mi elements = 15000000 +:: + + ARRAY_SIZE ~= 4 x (45 MiB cache / processor) x (2 processors) / (3 arrays) / (8 bytes / element) = 15 Mi elements = 15000000 + +:: + + HASWELL: Intel(R) Xeon(R) CPU E5-2698 v3 @ 2.30GHz + CACHE: 40M + SOCKETS: 2 + 4 * ( 40M * 2 ) / 3 ARRAYS / 8 Bytes/element = 13.4 Mi elements = 13400000 + +:: -HASWELL: Intel(R) Xeon(R) CPU E5-2698 v3 @ 2.30GHz -CACHE: 40M -SOCKETS: 2 -4 * ( 40M * 2 ) / 3 ARRAYS / 8 Bytes/element = 13.4 Mi elements = 13400000 + BROADWELL: Intel(R) Xeon(R) CPU E5-2695 v4 @ 2.10GHz + CACHE: 45M + SOCKETS: 2 + 4 * ( 45M * 2 ) / 3 ARRAYS / 8 BYTES/ELEMENT = 15.0 Mi elements = 15000000 -BROADWELL: Intel(R) Xeon(R) CPU E5-2695 v4 @ 2.10GHz -CACHE: 45M -SOCKETS: 2 -4 * ( 45M * 2 ) / 3 ARRAYS / 8 BYTES/ELEMENT = 15.0 Mi elements = 15000000 +:: -SAPPHIRE RAPIDS: Intel(R) Xeon(R) Platinum 8480+ -CACHE: 105 -SOCKETS: 2 -4 x (105M * 2 ) / 3 ARRAYS / 8 BYTES/ELEMENT = 35 Mi elements = 35000000 + SAPPHIRE RAPIDS: Intel(R) Xeon(R) Platinum 8480+ + CACHE: 105 + SOCKETS: 2 + 4 x (105M * 2 ) / 3 ARRAYS / 8 BYTES/ELEMENT = 35 Mi elements = 35000000 Running ======= diff --git a/doc/sphinx/10_microbenchmarks/M6_MDTEST/MDTEST.rst b/doc/sphinx/10_microbenchmarks/M6_MDTEST/MDTEST.rst index cf97adb4..ffc1477a 100644 --- a/doc/sphinx/10_microbenchmarks/M6_MDTEST/MDTEST.rst +++ b/doc/sphinx/10_microbenchmarks/M6_MDTEST/MDTEST.rst @@ -21,24 +21,12 @@ Figure of Merit Building ======== -RHEL Systems ------------- - -CrayOS Systems --------------- - Running ======= Input ----- -Independent Variables ---------------------- - -Dependent Variable(s) ---------------------- - Example Results =============== diff --git a/dgemm/README.ACES b/microbenchmarks/dgemm/README.ACES similarity index 100% rename from dgemm/README.ACES rename to 
microbenchmarks/dgemm/README.ACES diff --git a/dgemm/scripts/loop_dgemm b/microbenchmarks/dgemm/scripts/loop_dgemm similarity index 100% rename from dgemm/scripts/loop_dgemm rename to microbenchmarks/dgemm/scripts/loop_dgemm diff --git a/dgemm/src/Makefile b/microbenchmarks/dgemm/src/Makefile similarity index 100% rename from dgemm/src/Makefile rename to microbenchmarks/dgemm/src/Makefile diff --git a/dgemm/src/Makefile.intel b/microbenchmarks/dgemm/src/Makefile.intel similarity index 100% rename from dgemm/src/Makefile.intel rename to microbenchmarks/dgemm/src/Makefile.intel diff --git a/dgemm/src/mt-dgemm.c b/microbenchmarks/dgemm/src/mt-dgemm.c similarity index 100% rename from dgemm/src/mt-dgemm.c rename to microbenchmarks/dgemm/src/mt-dgemm.c diff --git a/microbenchmarks/mdtest/COPYRIGHT b/microbenchmarks/mdtest/COPYRIGHT new file mode 100644 index 00000000..ef8fc360 --- /dev/null +++ b/microbenchmarks/mdtest/COPYRIGHT @@ -0,0 +1,256 @@ +Copyright (c) 2003, The Regents of the University of California. +Produced at the Lawrence Livermore National Laboratory. +Written by Christopher Morrone , Bill Loewe , +and Tyce McLarty . +UCRL-CODE-155800 +All rights reserved. + +This file is part of mdtest. + +Please also read Our Notice and GNU General Public License. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License (as published by the Free Software +Foundation) version 2, dated June 1991. + +This program is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A +PARTICULAR PURPOSE. See the terms and conditions of the GNU General Public +License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + + +OUR NOTICE AND TERMS AND CONDITIONS OF THE GNU GENERAL PUBLIC LICENSE + +Our Preamble Notice + +A. This notice is required to be provided under our contract with the U.S. +Department of Energy (DOE). This work was produced at the University of +California, Lawrence Livermore National Laboratory under Contract No. +W-7405-ENG-48 with the DOE. + +B. Neither the United States Government nor the University of California nor +any of their employees, makes any warranty, express or implied, or assumes any +liability or responsibility for the accuracy, completeness, or usefulness of +any information, apparatus, product, or process disclosed, or represents that +its use would not infringe privately-owned rights. + +C. Also, reference herein to any specific commercial products, process, or +services by trade name, trademark, manufacturer or otherwise does not +necessarily constitute or imply its endorsement, recommendation, or favoring +by the United States Government or the University of California. The views and +opinions of authors expressed herein do not necessarily state or reflect those +of the United States Government or the University of California, and shall not +be used for advertising or product endorsement purposes. + +The precise terms and conditions for copying, distribution and modification +follows. + +GNU Terms and Conditions for Copying, Distribution, and Modification + +0. This License applies to any program or other work which contains a notice +placed by the copyright holder saying it may be distributed under the terms of +this General Public License. 
The "Program," below, refers to any such program +or work, and a "work based on the Program" means either the Program or any +derivative work under copyright law: that is to say, a work containing the +Program or a portion of it, either verbatim or with modifications and/or +translated into another language. (Hereinafter, translation is included +without limitation in the term "modification".) Each licensee is addressed as +"you." + +Activities other than copying, distribution and modification are not covered by +this License; they are outside its scope. The act of running the Program is +not restricted, and the output from the Program is covered only if its contents +constitute a work based on the Program (independent of having been made by +running the Program). Whether that is true depends on what the Program does. + +1. You may copy and distribute verbatim copies of the Program's source code as +you receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice and +disclaimer of warranty; keep intact all the notices that refer to this License +and to the absence of any warranty; and give any other recipients of the +Program a copy of this License along with the Program. + +You may charge a fee for the physical act of transferring a copy, and you may +at your option offer warranty protection in exchange for a fee. + +2. You may modify your copy or copies of the Program or any portion of it, +thus forming a work based on the Program, and copy and distribute such +modifications or work under the terms of Section 1 above, provided that you +also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices stating + that you changed the files and the date of any change. 
+ + b) You must cause any work that you distribute or publish, that in whole + or in part contains or is derived from the Program or any part thereof, + to be licensed as a whole at no charge to all third parties under the terms + of this License. + + c) If the modified program normally reads commands interactively when run, + you must cause it, when started running for such interactive use in the + most ordinary way, to print or display an announcement including an + appropriate copyright notice and a notice that there is no warranty (or + else, saying that you provide a warranty) and that users may redistribute + the program under these conditions, and telling the user how to view a copy + of this License. (Exception: if the Program itself is interactive but does + not normally print such an announcement, your work based on the Program is + not required to print an announcement.) + +These requirements apply to the modified work as a whole. If identifiable +sections of that work are not derived from the Program, and can be reasonably +considered independent and separate works in themselves, then this License, and +its terms, do not apply to those sections when you distribute them as separate +work. But when you distribute the same section as part of a whole which is a +work based on the Program, the distribution of the whole must be on the terms +of this License, whose permissions for other licensees extend to the entire +whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest your +rights to work written entirely by you; rather, the intent is to exercise the +right to control the distribution of derivative or collective works based on +the Program. 
+ +In addition, mere aggregation of another work not based on the Program with the +Program (or with a work based on the Program) on a volume of a storage or +distribution medium does not bring the other work under the scope of this +License. + +3. You may copy and distribute the Program (or a work based on it, under +Section 2) in object code or executable form under the terms of Sections 1 and +2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable source + code, which must be distributed under the terms of Sections 1 and 2 above + on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three years, to + give any third party, for a charge no more than your cost of physically + performing source distribution, a complete machine-readable copy of the + corresponding source code, to be distributed under the terms of Sections 1 + and 2 above on a medium customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer to + distribute corresponding source code. (This alternative is allowed only + for noncommercial distribution and only if you received the program in + object code or executable form with such an offer, in accord with + Subsection b above.) + +The source code for a work means the preferred form the work for making +modifications to it. For an executable work, complete source code means all +the source code for all modules it contains, plus any associated interface +definition files, plus the scripts used to control compilation and installation +of the executable. 
However, as a special exception, the source code +distributed need not include anything that is normally distributed (in either +source or binary form) with the major components (compiler, kernel, and so on) +of the operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the source +code from the same place counts as distribution of the source code, even though +third parties are not compelled to copy the source along with the object code. + +4. You may not copy, modify, sublicense, or distribute the Program except as +expressly provided under this License. Any attempt otherwise to copy, modify, +sublicense or distribute the Program is void, and will automatically terminate +your rights under this License. However, parties who have received copies, or +rights, from you under this License will not have their licenses terminated so +long as such parties remain in full compliance. + +5. You are not required to accept this License, since you have not signed it. +However, nothing else grants you permission to modify or distribute the Program +or its derivative works. These actions are prohibited by law if you do not +accept this License. Therefore, by modifying or distributing the Program (or +any work based on the Program), you indicate your acceptance of this License to +do so, and all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + +6. Each time you redistribute the Program (or any work based on the Program), +the recipient automatically receives a license from the original licensor to +copy, distribute or modify the Program subject to these terms and conditions. +You may not impose any further restrictions on the recipients' exercise of the +rights granted herein. 
You are not responsible for enforcing compliance by +third parties to this License. + +7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), conditions +are imposed on you (whether by court order, agreement or otherwise) that +contradict the conditions of this License, they do not excuse you from the +conditions of this License. If you cannot distribute so as to satisfy +simultaneously your obligations under this License and any other pertinent +obligations, then as a consequence you may not distribute the Program at all. +For example, if a patent license would not permit royalty-free redistribution +of the Program by all those who receive copies directly or indirectly through +you, then the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply and +the section as a whole is intended to apply in other circumstances. + +It is not the purpose to this section to induce you to infringe any patents or +other property right claims or to contest validity of any such claims; this +section has the sole purpose of protecting the integrity of the free software +distribution system, which is implemented by public license practices. Many +people have made generous contributions to the wide range of software +distributed through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing to +distribute software through any other system and a licensee cannot impose that +choice. + +This section is intended to make thoroughly clear what is believed to be a +consequence of the rest of this License. + +8. 
If the distribution and/or use of the Program is restricted in certain +countries either by patents or by copyrighted interfaces, the original +copyright holder who places the Program under this License may add an explicit +geographical distribution limitation excluding those countries, so that +distribution is permitted only in or among countries not thus excluded. In +such case, this License incorporates the limitation as if written in the body +of this License. + +9. The Free Software Foundation may publish revised and/or new versions of the +General Public License from time to time. Such new versions will be similar in +spirit to the present version, but may differ in detail to address new problems +or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any later +version," you have the option of following the terms and conditions either of +that version of any later version published by the Free Software Foundation. +If the Program does not specify a version number of this License, you may +choose any version ever published by the Free Software Foundation. + +10. If you wish to incorporate parts of the Program into other free programs +whose distribution conditions are different, write to the author to ask for +permission. For software which is copyrighted by the Free Software Foundation, +write to the Free Software Foundation; we sometimes make exceptions for this. +Our decision to grant permission will be guided by the two goals of preserving +the free status of all derivatives of our free software and or promoting the +sharing and reuse of software generally. + +NO WARRANTY + +11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR +THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN OTHERWISE +STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE +PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND +PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, +YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + +12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL +ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE +PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR +INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA +BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER +OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +END OF TERMS AND CONDITIONS diff --git a/microbenchmarks/mdtest/Makefile b/microbenchmarks/mdtest/Makefile new file mode 100644 index 00000000..31514cf1 --- /dev/null +++ b/microbenchmarks/mdtest/Makefile @@ -0,0 +1,40 @@ +#/*****************************************************************************\ +#* * +#* Copyright (c) 2003, The Regents of the University of California * +#* See the file COPYRIGHT for a complete copyright notice and license. * +#* * +#******************************************************************************* +#* +#* CVS info: +#* $RCSfile: Makefile,v $ +#* $Revision: 1.1.1.1.2.1 $ +#* $Date: 2010/05/11 21:25:16 $ +#* $Author: loewe6 $ +#* +#* Purpose: +#* Make mdtest executable. 
+#* +#* make [mdtest] -- mdtest +#* make clean -- remove executable +#* +#\*****************************************************************************/ + +CC.AIX = mpcc_r -bmaxdata:0x80000000 +CC.Linux = cc -Wall +CC.Darwin = mpicc -Wall + +# Requires GNU Make +OS=$(shell uname) + +# Flags for compiling on 64-bit machines +LARGE_FILE = -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE=1 -D__USE_LARGEFILE64=1 + +CC = $(CC.$(OS)) + +all: mdtest + +mdtest: mdtest.c + $(CC) -D$(OS) $(LARGE_FILE) -g -o mdtest mdtest.c -lm + +clean: + rm -f mdtest mdtest.o diff --git a/microbenchmarks/mdtest/Makefile.XROADS b/microbenchmarks/mdtest/Makefile.XROADS new file mode 100644 index 00000000..0a94dcbe --- /dev/null +++ b/microbenchmarks/mdtest/Makefile.XROADS @@ -0,0 +1,12 @@ +# +# You may wish to specify CC, CFLAGS, and/or LDFLAGS when compiling, e.g., +# +# make CC=mpicc CFLAGS=-g +# + +.PHONY: clean + +mdtest: mdtest.o + +clean: + @rm mdtest *.o diff --git a/microbenchmarks/mdtest/README b/microbenchmarks/mdtest/README new file mode 100644 index 00000000..c5d1299d --- /dev/null +++ b/microbenchmarks/mdtest/README @@ -0,0 +1,136 @@ +/******************************************************************************\ +* * +* Copyright (c) 2003, The Regents of the University of California * +* See the file COPYRIGHT for a complete copyright notice and license. 
* +* * +\******************************************************************************/ + +Usage: mdtest [-b #] [-B] [-c] [-C] [-d testdir] [-D] [-e] [-E] [-f first] [-F] + [-h] [-i iterations] [-I #] [-l last] [-L] [-n #] [-N #] [-p seconds] + [-r] [-R[#]] [-s #] [-S] [-t] [-T] [-u] [-v] [-V #] [-w #] [-y] + [-z #] + + -b: branching factor of hierarchical directory structure + -B: no barriers between phases (create/stat/remove) + -c: collective creates: task 0 does all creates and deletes + -C: only create files/dirs + -d: the directory in which the tests will run + -D: perform test on directories only (no files) + -e: number of bytes to read from each file + -E: only read files + -f: first number of tasks on which the test will run + -F: perform test on files only (no directories) + -h: prints help message + -i: number of iterations the test will run + -I: number of items per tree node + -l: last number of tasks on which the test will run + -L: files/dirs created only at leaf level + -n: every task will create/stat/remove # files/dirs per tree + -N: stride # between neighbor tasks for file/dir stat (local=0) + -p: pre-iteration delay (in seconds) + -r: only remove files/dirs + -R: randomly stat files/dirs (optional seed can be provided) + -s: stride between the number of tasks for each test + -S: shared file access (file only, no directories) + -t: time unique working directory overhead + -T: only stat files/dirs + -u: unique working directory for each task + -v: verbosity (each instance of option increments by one) + -V: verbosity value + -w: number of bytes to write to each file + -y: sync file after write completion + -z: depth of hierarchical directory structure + +NOTES: + * -N allows a "read-your-neighbor" approach by setting stride to + tasks-per-node + * -d allows multiple paths for the form '-d fullpath1@fullpath2@fullpath3' + * -B allows each task to time itself. The aggregate results reflect this + change. + * -n and -I cannot be used together. 
-I specifies the number of files/dirs + created per tree node, whereas the -n specifies the total number of + files/dirs created over an entire tree. When using -n, integer division is + used to determine the number of files/dirs per tree node. (E.g. if -n is + 10 and there are 4 tree nodes (z=1 and b=3), there will be 2 files/dirs per + tree node.) + * -R and -T can be used separately. -R merely indicates that if files/dirs + are going to be stat'ed, then they will be stat'ed randomly. + + +Illustration of terminology: + + Hierarchical directory structure (tree) + + ======= + | | (tree node) + ======= + / | \ + ------ | ------ + / | \ + ======= ======= ======= + | | | | | | (leaf level) + ======= ======= ======= + + In this example, the tree has a depth of one (z=1) and branching factor of + three (b=3). The node at the top of the tree is the root node. The level + of nodes furthest from the root is the leaf level. All trees created by + mdtest are balanced. + + To see how mdtest operates, do a simple run like the following: + + mdtest -z 1 -b 3 -I 10 -C -i 3 + + This command will create a tree like the one above, then each task will + create 10 files/dirs per tree node. Three of these trees will be created + (one for each iteration). + + +Example usages: + +mdtest -I 10 -z 5 -b 2 + + A directory tree is created in the current working directory that has a + depth of 5 and a branching factor of 2. Each task operates on 10 + files/dirs in each tree node. + +mdtest -I 10 -z 5 -b 2 -R + + This example is the same as the previous one except that the files/dirs are + stat'ed randomly. + +mdtest -I 10 -z 5 -b 2 -R4 + + Again, this example is the same as the previous except a seed of 4 is + passed to the random number generator. + +mdtest -I 10 -z 5 -b 2 -L + + A directory tree is created as described above, but in this example + files/dirs exist only at the leaf level of the tree. 
+ +mdtest -n 100 -i 3 -d /users/me/testing + + Each task creates 100 files/dirs in a root node (there are no branches + out of the root node) within the path /users/me/testing. This is done + three times. Aggregate values are calculated over the iterations. + +mdtest -n 100 -F -C + + Each task only creates 100 files in the current directory. + Directories are not created. The files are neither stat'ed nor + removed. + +mdtest -I 5 -z 3 -b 5 -u -d /users/me/testing + + Each task creates a directory tree in the /users/me/testing + directory. Each tree has a depth of 3 and a branching factor of + 5. Five files/dirs are operated upon in each node of each tree. + +mdtest -I 5 -z 3 -b 5 -u -d /users/me/testing@/some/other/location + + This run is the same as the previous except that each task creates + its tree in a different directory. Task 0 will create a tree in + /users/me/testing. Task 1 will create a tree in /some/other/location. + After all of the directories are used, the remaining tasks round- + robin over the directories supplied. (I.e. Task 2 will create a + tree in /users/me/testing, etc.) diff --git a/microbenchmarks/mdtest/README.XROADS b/microbenchmarks/mdtest/README.XROADS new file mode 100644 index 00000000..2783b547 --- /dev/null +++ b/microbenchmarks/mdtest/README.XROADS @@ -0,0 +1,171 @@ +Crossroads/NERSC-9 mdtest Benchmark +================================================================================ + +I. Benchmark Description +-------------------------------------------------------------------------------- +mdtest is designed to measure the performance of various metadata operations and +uses MPI to coordinate the operations and to collect the results. All of the +general run rules for XRoads benchmarking apply. + + +II. Build Instructions +-------------------------------------------------------------------------------- +MPI is required in order to build and run the code. 
The source code used for +this benchmark is derived from mdtest 1.8.4 and it is included with this +benchmark specification. More information about mdtest is available on +http://mdtest.sourceforge.net. + +After extracting the tar file, ensure that the MPI compiler wrappers (e.g., +`mpicc`) are in `$PATH` and then + + cd mdtest-1.8.4-xroads + make + +This will build the mdtest executable, called `mdtest`. It may be necessary to +specify the `CC`, `CFLAGS`, and `LDFLAGS` variables to ensure correct +compilation of `mdtest`. A simplified Makefile, `Makefile.XROADS`, is also +provided to this end, e.g., + + make -f Makefile.XROADS CC=mpicc CFLAGS=-g + +Either `make` or `make -f Makefile.XROADS` can be used to build the binary used +for this benchmark, but any additional `CFLAGS` or `LDFLAGS` required for +compilation must be reported with the benchmark results. + + +III. Run Rules +-------------------------------------------------------------------------------- +The intent of this benchmark is to measure the performance of file metadata +operations on the platform storage. + +Observed benchmark performance shall be obtained from a storage system +configured as closely as possible to the proposed platform storage. If the +proposed solution includes multiple file access protocols (e.g., pNFS and NFS) +or multiple tiers accessible by applications, benchmark results for mdtest +shall be provided for each protocol and/or tier. + +Performance projections are permissible if they are derived from a similar +system that is considered an earlier generation of the proposed system. + +### Required Runs + +This benchmark is intended to measure the capability of the storage subsystem +to create and delete files, and it contains features that minimize +caching/buffering effects. As such, the Offeror should not utilize +optimizations that cache/buffer file metadata or metadata operations in compute +node memory.
+ +The Offeror shall run the following tests: + +* creating, statting, and removing at least 1,048,576 files in a single + directory +* creating, statting, and removing at least 1,048,576 files in separate + directories (one directory per MPI process) +* creating, statting, and removing one file by multiple MPI processes + +Each of these tests must be run at the following levels of concurrency: + +1. a single MPI process +2. the optimal number of MPI processes on a single compute node +3. the minimal number of MPI processes on multiple compute nodes that achieves + the peak results for the proposed system +4. the maximum possible MPI-level concurrency on the proposed system. This + could mean + * using one MPI process per CPU core across the entire system + * using the maximum number of MPI processes possible if one MPI process per + core will not be possible on the proposed architecture + * using more than 1,048,576 files if the system is capable of launching + more than 1,048,576 MPI processes + +These tests are configured via command-line arguments, and the following +section provides guidance on passing the correct options to `mdtest` for each +test. + +### Running mdtest + +mdtest is executed as any other standard MPI application would be on the +proposed system (e.g., with `mpirun` or `srun`). For the sake of the +following examples, `mpirun` is used. + +**To run create, stat, and delete tests on files in a shared directory**, an +appropriate `mdtest` command-line invocation may look like + + mpirun -np 64 ./mdtest -F -C -T -r -n 16384 -d /scratch -N 16 + +The following command-line flags MUST be changed: + +* `-n` - the number of files **each MPI process** should manipulate. For a + test run with 64 MPI processes, specifying `-n 16384` will produce the + required 1048576 files (64 MPI processes x 16384). This parameter must + be changed for each level of concurrency. +* `-d /scratch` - the directory in which this test should be run. 
**This + must be an absolute path.** +* `-N` - MPI rank offset for each separate phase of the test. This parameter + must be equal to the number of MPI processes per node in use (e.g., `-N 16` + for a test with 16 processes per node) to ensure that each test phase (create, + stat, and delete) is performed on a different node. + +The following command-line flags MUST NOT be changed or omitted: + +* `-F` - only operate on files, not directories +* `-C` - perform file creation test +* `-T` - perform file stat test +* `-r` - perform file remove test + +**To have each MPI process write files into a unique directory,** add the `-u` +option: + + mpirun -np 64 ./mdtest -F -C -T -r -n 16384 -d /scratch -N 16 -u + +**To create, stat, and remove one file by multiple MPI processes,** add the `-S` +option: + + mpirun -np 64 ./mdtest -F -C -T -r -n 16384 -d /scratch -N 16 -S + + +IV. Permitted Modifications +-------------------------------------------------------------------------------- + +Modifications to the benchmark application code are only permissible to enable +correct compilation and execution on the target platform. Any modifications +must be fully documented (e.g., as a diff or patch file) and reported with the +benchmark results. + + +V. Reporting Results +-------------------------------------------------------------------------------- + +mdtest will execute file creation, file statting, and file deletion tests for +each run. The rates of file creation/statting/deletion are reported to stdout +at the conclusion of each test, and the following rates should be reported: + +* `File creation` +* `File stat` +* `File removal` + +The maximum values for these rates must be reported for all tests. Reporting +the maximum creation rates from one run and the maximum deletion rates from a +different run is NOT valid.
File creation rate has slightly higher importance +for this test, so if the highest observed file creation rate came from a +different run than the highest observed deletion rate, report the results from +the run with the highest file creation rate. + +### Benchmark Platform Description + +The Offeror must provide a comprehensive description of the environment in which +each benchmark was run. This must include: + +* Client and server system configurations, including node and processor counts, + processor models, memory size and speed, and OS (names and versions) +* Storage media and their configurations used for each tier of the storage + subsystem +* Network fabric used to connect servers, clients, and storage, including + network configuration settings and topology +* Client and server configuration settings including + * Client and server sysctl settings + * Driver options + * Network interface options + * File system configuration and mount options +* Compiler name and version, compiler options, and libraries used to build + benchmarks + diff --git a/microbenchmarks/mdtest/RELEASE_LOG b/microbenchmarks/mdtest/RELEASE_LOG new file mode 100644 index 00000000..d2e9abbf --- /dev/null +++ b/microbenchmarks/mdtest/RELEASE_LOG @@ -0,0 +1,103 @@ +Changes in mdtest-1.8.4 + * Added read option to extend create (write) capability. New feature will: + -E: Only perform the read phase of the tests. + -e #: Set the number of Bytes to read from each file. + +Fixes in mdtest-1.8.3 + * Prepared for release on sourceforge.net + +Fixes in mdtest-1.8.2 + * With the new changes issued in mdtest-1.8.0, all files and directories + were operated upon by using the full path to each file/dir. Full paths + are no longer used. Now a relative path is used from the root dir of + each directory tree. + * fixed bug in collective creates and unique directory per task mode + +Fixes in mdtest-1.8.1 + * A new test directory is created for each iteration. 
Then for each + iteration the directory structure is created/removed. This allowed + multiple iterations of the create-only mode. The name of the test + directories has changed as a result of this fix. Also, aggregate + creation/removal times are computed now over the number of iterations. + +Changes in mdtest-1.8.0 + * added option to create files/dirs in tree-like directory structure: + Previously, all files/dirs were created in one test directory. Now the + root directories of the tree(s) are created in that test directory. + Files/dirs are then created within those root directories or their children. + If the -u flag is specified, then unique trees are created per proc. + Otherwise, one tree is created. This coincides with the previous + functionality. The following flags were added/changed to incorporate this + new feature: + -z #: Indicates the depth of the leaves of the tree. If this flag is not + specified, the depth defaults to 0 (i.e. files/dirs are created in + the top-level directories). + -b #: Indicates the branching factor of the tree. If this flag is not + specified, the branching factor defaults to 1. Branching factor + indicates the number of children that each non-leaf node has. + -L: Indicates that files/dirs should only be created at the leaf level + of the tree. + -I #: Indicates the number of files/dirs that should be created within + each directory of the tree. + -n #: This flag still indicates the total number of files/dirs that should + be created. However, with the new tree structure some calculations + are done to determine the number of files that should be created per + directory in the tree. Due to rounding the actual total number of + files may differ slightly from what is specified. + + * added option to choose which phases to run: + The create, stat, and remove phases of mdtest have been separated. There + are flags now that allow the user to choose which phases they want to + perform. 
If none of these flags is specified, then the default usage is + to do all of the phases. The user is trusted to be intelligent about their + choice of phases. As a result of the separation of the phases, the naming + convention of the files/dirs had to be altered slightly. + + * added option to not barrier between each phase (create/stat/remove): + A major change in mdtest is the ability to time each proc that is running + the different phases of mdtest. The default functionality is the same as + the previous version - barriers are taken between phases (create/stat/ + remove). Also, in the default case, the resultant times reflect the + slowest rates for each phase. If the -B flag is specified, then no barriers + are taken between the phases. There is a race condition when specifying + this flag, but it is rarely met. The race condition is that one proc might + be trying to remove a file in the shared file case before someone else has + a chance to stat the file. Also, when the -B flag is specified, the + resultant rates are aggregates over the number of iterations and the number + of procs used. The default case, as mentioned above, calculates aggregates + only over the number of iterations where the time for each phase of an + iteration is the time of the slowest proc for that particular phase. + + * added option to stat files/dirs in a random order: + The default usage of mdtest will stat files in sequential order. Now, + however, items can be stat'ed in a random order by specifying the -R flag. + Even though the stat order is random with this usage, items are still only + stat'ed once each. This is achieved by randomly sorting a list of unique + item IDs before running the different tests. A seed for the random number + generator can optionally be provided with the following syntax: -R#. 
+ +Fixes in mdtest-1.7.5 + * changed bug in how test directory was created (race condition) + * added multipath option for test directories ('-d path1@path2@path3') + * added man page and correct malloc error-checking (patches from Jim Garlick) + +Fixes in mdtest-1.7.4: + * folded b_remove_0 branch into main HEAD branch + +Fixes in mdtest-b_remove_0: + * added remove option to only remove files from previous run + +Fixes in mdtest-pre_b_remove_0: + * simple clean up for preparing for branch + +Fixes in mdtest-1.7.3: + * added statfs() to get file system data block and inode usage, replacing + system() call + +Fixes in mdtest-1.7.2: + * initialized declared variables + * modified df disk usage call + * added error-checking for chdir() + +Fixes in mdtest-1.7.1: + * added '-y' option to sync file after write diff --git a/microbenchmarks/mdtest/mdtest.1 b/microbenchmarks/mdtest/mdtest.1 new file mode 100644 index 00000000..ba82d88a --- /dev/null +++ b/microbenchmarks/mdtest/mdtest.1 @@ -0,0 +1,188 @@ +.TH mdtest 1 "2010-05-05" "mdtest-1.8.3" "mdtest" +.SH NAME +mdtest \- test file system metadata performance +.SH SYNOPSIS +.B mdtest +.I "[-options]" +.SH DESCRIPTION +.B mdtest +is a file system metadata performance test designed to run +in a cluster MPI environment against parallel file systems. +.PP +In each iteration of the test, each MPI task creates, stats, and removes +the specified number of directories and/or files and measures the performance +in ops/second. After all the iterations complete, the maximum, minimum, +mean ops/sec and the std. deviation are reported for each operation. +.SH OPTIONS +.TP +.I "-b" branching_factor +The branching factor of the hierarchical directory structure [default: 1]. +.TP +.I "-B" +No barriers will be taken between the phases (create/stat/remove) of the tests. +.TP +.I "-c" +Use ``collective creates'', meaning task 0 does all the creates. +.TP +.I "-C" +Only perform the create phase of the tests. 
+.TP +.I "-d" testdir[@testdir2] +The directory in which the tests will run. For multiple paths, must use fully-qualified pathnames. +[default: working directory of mdtest]. +.TP +.I "-D" +Perform test on directories only (no files). +.TP +.I "-e" bytes +Set the number of Bytes to read from each file [default: 0]. +.TP +.I "-E" +Only perform the read phase of the tests. +.TP +.I "-f" first +The first number of tasks on which the test will run +[default: 0]. +.TP +.I "-F" +Perform test on files only (no directories). +.TP +.I "-h" +Display help message. +.TP +.I "-i" iterations +The number of iterations the test will run +[default: 1]. +.TP +.I "-I" items_per_directory +The number of items per directory in the tree [default: 0]. +.TP +.I "-l" last +The last number of tasks on which the test will run +[default: 0]. +.TP +.I "-L" +Files/directories only created at the leaf level of the tree. +.TP +.I "-n" number_of_items +Every process will create/stat/remove # directories and files +[default: 0]. +.TP +.I "-N" stride +Stride # between neighbor tasks for file/dir stat, 0 = local +[default: 0]. +.TP +.I "-p" seconds +Pre-iteration delay (in seconds). +.TP +.I "-r" +Only perform the remove phase of the tests. +.TP +.I "-R[seed]" +Randomly stat files. There is an optional argument that provides a seed +to the random number generator. (Note: There is no space between the +.I "-R" + and +the seed if one is provided.) +.TP +.I "-s" stride +Stride between the number of tasks for each test +[default: 1]. +.TP +.I "-S" +Shared file access (file only, no directories). +.TP +.I "-t" +Include unique working directory management overhead in the results +(presumes +.I "-u" +option). +.TP +.I "-T" +Only perform the stat phase of the tests. +.TP +.I "-u" +Create a unique working directory for each task +(presumes +.I "-d" +option). +.TP +.I "-v" +Increase verbosity (each instance of option increments by one). +.TP +.I "-V" value +Set verbosity value +[default: 0].
+.TP +.I "-w" bytes +Set the number of Bytes to write to each file after it is created +[default: 0]. +.TP +.I "-z" tree_depth +The depth of the hierarchical directory tree [default: 0]. +.SH EXAMPLES +.SS "Example 1" +.nf +$ mpirun -n 2 ./mdtest -d /tmp/z -n 100 -i 2 + +-- started at 11/23/2009 09:05:29 -- + +mdtest-1.8.1 was launched with 2 total task(s) on 1 nodes +Command line used: ./mdtest -d /tmp/z -n 100 -i 2 +Path: /tmp +FS: 28.8 GiB Used FS: 8.6% 8.6%Inodes: 1.8 Mi Used Inodes: 5.1% + +time to create tree: 0.000078 sec +tree creation rate: 12826.617737 ops/sec + +2 tasks, 200 files/directories + +SUMMARY: (of 2 iterations) + Operation Max Min Mean Std Dev + --------- --- --- ---- ------- + Directory creation: 21489.415 17447.551 19468.483 2020.932 + Directory stat : 154657.227 28731.061 91694.144 62963.083 + Directory removal : 146756.613 21489.415 84123.014 62633.599 + File creation : 42024.989 28731.061 35378.025 6646.964 + File stat : 146756.613 17447.551 82102.082 64654.531 + File removal : 156884.384 42024.989 99454.686 57429.698 + +time to remove tree: 0.001031 sec +tree removal rate: 970.005550 ops/sec + +-- finished at 11/23/2009 09:05:29 -- +.fi +.SS "Example 2" +.nf +$ mpirun -np 2 -H pc6 ./mdtest -d /tmp/z -b 2 -z 3 -I 10 + +-- started at 11/23/2009 09:09:23 -- + +mdtest-1.8.1 was launched with 2 total task(s) on 1 nodes +Command line used: ./mdtest -d /tmp/z -b 2 -z 3 -I 10 +Path: /tmp +FS: 28.8 GiB Used FS: 8.6% 8.6%Inodes: 1.8 Mi Used Inodes: 5.1% + +time to create tree: 0.000765 sec +tree creation rate: 19605.659084 ops/sec + +2 tasks, 300 files/directories + +SUMMARY: (of 1 iterations) + Operation Max Min Mean Std Dev + --------- --- --- ---- ------- + Directory creation: 29365.707 29365.707 29365.707 0.000 + Directory stat : 123701.455 123701.455 123701.455 0.000 + Directory removal : 25623.459 25623.459 25623.459 0.000 + File creation : 38704.743 38704.743 38704.743 0.000 + File stat : 125477.782 125477.782 125477.782 0.000 + File 
removal : 51911.845 51911.845 51911.845 0.000 + +time to remove tree: 0.000940 sec +tree removal rate: 15960.060883 ops/sec + +-- finished at 11/23/2009 09:09:23 -- +.fi + +.SH "SEE ALSO" +\fBhttp://sourceforge.net/projects/mdtest\fR diff --git a/microbenchmarks/mdtest/mdtest.c b/microbenchmarks/mdtest/mdtest.c new file mode 100644 index 00000000..da2d2605 --- /dev/null +++ b/microbenchmarks/mdtest/mdtest.c @@ -0,0 +1,1942 @@ +/* + * Copyright (C) 2003, The Regents of the University of California. + * Produced at the Lawrence Livermore National Laboratory. + * Written by Christopher J. Morrone , + * Bill Loewe , Tyce McLarty , + * and Ryan Kroiss . + * All rights reserved. + * UCRL-CODE-155800 + * + * Please read the COPYRIGHT file. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (as published by + * the Free Software Foundation) version 2, dated June 1991. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * terms and conditions of the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * CVS info: + * $RCSfile: mdtest.c,v $ + * $Revision: 1.1.1.1.2.1 $ + * $Date: 2010/05/11 21:25:16 $ + * $Author: loewe6 $ + */ + +#include "mpi.h" +#include +#include +#include +#include +#include +#ifdef __APPLE__ +#include +#include +#else +#include +#endif +#include +#include +#include +#include +#include +#include +#include + +#define FILEMODE S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH +#define DIRMODE S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IWGRP|S_IXGRP|S_IROTH|S_IXOTH +#define MAX_LEN 1024 +#define RELEASE_VERS "1.8.3" +#define TEST_DIR "#test-dir" +#define ITEM_COUNT 25000 + +typedef struct +{ + double entry[10]; +} table_t; + +int rank; +int size; +int* rand_array; +char testdir[MAX_LEN]; +char testdirpath[MAX_LEN]; +char top_dir[MAX_LEN]; +char base_tree_name[MAX_LEN]; +char ** filenames = NULL; +char hostname[MAX_LEN]; +char unique_dir[MAX_LEN]; +char mk_name[MAX_LEN]; +char stat_name[MAX_LEN]; +char read_name[MAX_LEN]; +char rm_name[MAX_LEN]; +char unique_mk_dir[MAX_LEN]; +char unique_chdir_dir[MAX_LEN]; +char unique_stat_dir[MAX_LEN]; +char unique_read_dir[MAX_LEN]; +char unique_rm_dir[MAX_LEN]; +char unique_rm_uni_dir[MAX_LEN]; +char * write_buffer = NULL; +char * read_buffer = NULL; +int barriers = 1; +int create_only = 0; +int stat_only = 0; +int read_only = 0; +int remove_only = 0; +int leaf_only = 0; +int branch_factor = 1; +int depth = 0; +int num_dirs_in_tree = 0; +int items_per_dir = 0; +int random_seed = 0; +int shared_file = 0; +int files_only = 0; +int dirs_only = 0; +int pre_delay = 0; +int unique_dir_per_task = 0; +int time_unique_dir_overhead = 0; +int verbose = 0; +int throttle = 1; +int items = 0; +int collective_creates = 0; +int write_bytes = 0; +int read_bytes = 0; +int sync_file = 0; +int path_count = 0; +int nstride = 0; /* 
neighbor stride */ +MPI_Comm testcomm; +table_t * summary_table; + +/* for making/removing unique directory && stating/deleting subdirectory */ +enum {MK_UNI_DIR, STAT_SUB_DIR, READ_SUB_DIR, RM_SUB_DIR, RM_UNI_DIR}; + +#ifdef __linux__ +#define FAIL(msg) do { \ + fprintf(stdout, "%s: Process %d(%s): FAILED in %s, %s: %s\n",\ + timestamp(), rank, hostname, __func__, \ + msg, strerror(errno)); \ + fflush(stdout);\ + MPI_Abort(MPI_COMM_WORLD, 1); \ +} while(0) +#else +#define FAIL(msg) do { \ + fprintf(stdout, "%s: Process %d(%s): FAILED at %d, %s: %s\n",\ + timestamp(), rank, hostname, __LINE__, \ + msg, strerror(errno)); \ + fflush(stdout);\ + MPI_Abort(MPI_COMM_WORLD, 1); \ +} while(0) +#endif + +char *timestamp() { + static char datestring[80]; + time_t timestamp; + + fflush(stdout); + timestamp = time(NULL); + strftime(datestring, 80, "%m/%d/%Y %T", localtime(×tamp)); + + return datestring; +} + +int count_tasks_per_node(void) { + char localhost[MAX_LEN], + hostname[MAX_LEN]; + int count = 1, + i; + MPI_Status status; + + if (gethostname(localhost, MAX_LEN) != 0) { + FAIL("gethostname()"); + } + if (rank == 0) { + /* MPI_receive all hostnames, and compare to local hostname */ + for (i = 0; i < size-1; i++) { + MPI_Recv(hostname, MAX_LEN, MPI_CHAR, MPI_ANY_SOURCE, + MPI_ANY_TAG, MPI_COMM_WORLD, &status); + if (strcmp(hostname, localhost) == 0) { + count++; + } + } + } else { + /* MPI_send hostname to root node */ + MPI_Send(localhost, MAX_LEN, MPI_CHAR, 0, 0, MPI_COMM_WORLD); + } + MPI_Bcast(&count, 1, MPI_INT, 0, MPI_COMM_WORLD); + + return(count); +} + +void delay_secs(int delay) { + if (rank == 0 && delay > 0) { + if (verbose >= 1) { + fprintf(stdout, "delaying %d seconds . . 
.\n", delay); + fflush(stdout); + } + sleep(delay); + } + MPI_Barrier(testcomm); +} + +void offset_timers(double * t, int tcount) { + double toffset; + int i; + + toffset = MPI_Wtime() - t[tcount]; + for (i = 0; i < tcount+1; i++) { + t[i] += toffset; + } +} + +void parse_dirpath(char *dirpath_arg) { + char * tmp, * token; + char delimiter_string[3] = { '@', '\n', '\0' }; + int i = 0; + + tmp = dirpath_arg; + + if (* tmp != '\0') path_count++; + while (* tmp != '\0') { + if (* tmp == '@') { + path_count++; + } + tmp++; + } + filenames = (char **)malloc(path_count * sizeof(char **)); + if (filenames == NULL) { + FAIL("out of memory"); + } + + token = strtok(dirpath_arg, delimiter_string); + while (token != NULL) { + filenames[i] = token; + token = strtok(NULL, delimiter_string); + i++; + } +} + +void unique_dir_access(int opt) { + if (opt == MK_UNI_DIR) { + MPI_Barrier(testcomm); + if (chdir(unique_chdir_dir) == -1) { + FAIL("Unable to chdir to unique test directory"); + } + } else if (opt == STAT_SUB_DIR) { + if (chdir(unique_stat_dir) == -1) { + FAIL("Unable to chdir to test directory"); + } + } else if (opt == READ_SUB_DIR) { + if (chdir(unique_read_dir) == -1) { + FAIL("Unable to chdir to test directory"); + } + } else if (opt == RM_SUB_DIR) { + if (chdir(unique_rm_dir) == -1) { + FAIL("Unable to chdir to test directory"); + } + } else if (opt == RM_UNI_DIR) { + if (chdir(unique_rm_uni_dir) == -1) { + FAIL("Unable to chdir to test directory"); + } + } +} + +/* helper for creating/removing items */ +void create_remove_items_helper(int dirs, + int create, char* path, int itemNum) { + + int i; + char curr_item[MAX_LEN]; + + for (i=0; i= 3 + && (itemNum+i) % ITEM_COUNT==0 && (itemNum+i != 0)) { + printf("create dir: %d\n", itemNum+i); + fflush(stdout); + } + + //create dirs + sprintf(curr_item, "%sdir.%s%d", path, mk_name, itemNum+i); + if (rank == 0 && verbose >= 2) { + printf("create dir : %s\n", curr_item); + fflush(stdout); + } + if (mkdir(curr_item, DIRMODE) == 
-1) { + FAIL("unable to create directory"); + } + + } else { + + if (rank == 0 && verbose >= 3 + && (itemNum+i) % ITEM_COUNT==0 && (itemNum+i != 0)) { + printf("remove dir: %d\n", itemNum+i); + fflush(stdout); + } + + //remove dirs + sprintf(curr_item, "%sdir.%s%d", path, rm_name, itemNum+i); + if (rank == 0 && verbose >= 2) { + printf("remove dir : %s\n", curr_item); + fflush(stdout); + } + if (rmdir(curr_item) == -1) { + FAIL("unable to remove directory"); + } + } + + } else { + + int fd; + if (create) { + + if (rank == 0 && verbose >= 3 + && (itemNum+i) % ITEM_COUNT==0 && (itemNum+i != 0)) { + printf("create file: %d\n", itemNum+i); + fflush(stdout); + } + + //create files + sprintf(curr_item, "%sfile.%s%d", path, mk_name, itemNum+i); + if (rank == 0 && verbose >= 2) { + printf("create file: %s\n", curr_item); + fflush(stdout); + } + if (collective_creates) { + if ((fd = open(curr_item, O_RDWR)) == -1) { + FAIL("unable to open file"); + } + } else { + if (shared_file) { + if ((fd = open(curr_item, + O_CREAT|O_RDWR, FILEMODE)) == -1) { + FAIL("unable to create file"); + } + } else { + if ((fd = creat(curr_item, FILEMODE)) == -1) { + FAIL("unable to create file"); + } + } + } + + if (write_bytes > 0) { + if (write(fd, write_buffer, write_bytes) != write_bytes) + FAIL("unable to write file"); + } + + if (sync_file && fsync(fd) == -1) { + FAIL("unable to sync file"); + } + + if (close(fd) == -1) { + FAIL("unable to close file"); + } + + } else { + + if (rank == 0 && verbose >= 3 + && (itemNum+i) % ITEM_COUNT==0 && (itemNum+i != 0)) { + printf("remove file: %d\n", itemNum+i); + fflush(stdout); + } + + //remove files + sprintf(curr_item, "%sfile.%s%d", path, rm_name, itemNum+i); + if (rank == 0 && verbose >= 2) { + printf("remove file: %s\n", curr_item); + fflush(stdout); + } + if (!(shared_file && rank != 0)) { + if (unlink(curr_item) == -1) { + FAIL("unable to unlink file"); + } + } + } + } + } +} + +/* helper function to do collective operations */ +void 
collective_helper(int dirs, int create, char* path, int itemNum) { + + int i; + char curr_item[MAX_LEN]; + for (i=0; i= 2) { + printf("create dir : %s\n", curr_item); + fflush(stdout); + } + if (mkdir(curr_item, DIRMODE) == -1) { + FAIL("unable to create directory"); + } + + } else { + + //remove dirs + sprintf(curr_item, "%sdir.%s%d", path, rm_name, itemNum+i); + if (rank == 0 && verbose >= 2) { + printf("remove dir : %s\n", curr_item); + fflush(stdout); + } + if (rmdir(curr_item) == -1) { + FAIL("unable to remove directory"); + } + } + + } else { + + int fd; + if (create) { + + //create files + sprintf(curr_item, "%sfile.%s%d", path, mk_name, itemNum+i); + if (rank == 0 && verbose >= 2) { + printf("create file: %s\n", curr_item); + fflush(stdout); + } + if ((fd = creat(curr_item, FILEMODE)) == -1) { + FAIL("unable to create file"); + } + if (close(fd) == -1) { + FAIL("unable to close file"); + } + + } else { + + //remove files + sprintf(curr_item, "%sfile.%s%d", path, rm_name, itemNum+i); + if (rank == 0 && verbose >= 2) { + printf("remove file: %s\n", curr_item); + fflush(stdout); + } + if (!(shared_file && rank != 0)) { + if (unlink(curr_item) == -1) { + FAIL("unable to unlink file"); + } + } + } + } + } +} + +/* recusive function to create and remove files/directories from the + directory tree */ +void create_remove_items(int currDepth, int dirs, int create, int collective, + char *path, int dirNum) { + + int i; + char dir[MAX_LEN]; + memset(dir, 0, MAX_LEN); + + if (currDepth == 0) { + + /* create items at this depth */ + if (!leaf_only || (depth == 0 && leaf_only)) { + if (collective) { + collective_helper(dirs, create, dir, 0); + } else { + create_remove_items_helper(dirs, create, dir, 0); + } + } + + if (depth > 0) { + create_remove_items(++currDepth, dirs, create, + collective, dir, ++dirNum); + } + + } else if (currDepth <= depth) { + + char temp_path[MAX_LEN]; + strcpy(temp_path, path); + int currDir = dirNum; + + /* iterate through the branches */ + 
for (i=0; i= 3 && (i%ITEM_COUNT == 0) && (i != 0)) { + printf("stat dir: %d\n", i); + fflush(stdout); + } + sprintf(item, "dir.%s%d", stat_name, item_num); + } else { + if (rank == 0 && verbose >= 3 && (i%ITEM_COUNT == 0) && (i != 0)) { + printf("stat file: %d\n", i); + fflush(stdout); + } + sprintf(item, "file.%s%d", stat_name, item_num); + } + + /* determine the path to the file/dir to be stat'ed */ + parent_dir = item_num / items_per_dir; + + if (parent_dir > 0) { //item is not in tree's root directory + + /* prepend parent directory to item's path */ + sprintf(temp, "%s.%d/%s", base_tree_name, parent_dir, item); + strcpy(item, temp); + + //still not at the tree's root dir + while (parent_dir > branch_factor) { + parent_dir = (int) ((parent_dir-1) / branch_factor); + sprintf(temp, "%s.%d/%s", base_tree_name, parent_dir, item); + strcpy(item, temp); + } + } + + /* below temp used to be hiername */ + if (rank == 0 && verbose >= 2) { + if (dirs) { + printf("stat dir : %s\n", item); + } else { + printf("stat file: %s\n", item); + } + fflush(stdout); + } + if (stat(item, &buf) == -1) { + if (dirs) { + FAIL("unable to stat directory"); + } else { + FAIL("unable to stat file"); + } + } + } +} + + +/* reads all of the items created as specified by the input parameters */ +void mdtest_read(int random, int dirs) { + + int i, parent_dir, item_num = 0; + int fd; + char item[MAX_LEN], temp[MAX_LEN]; + + /* allocate read buffer */ + if (read_bytes > 0) { + read_buffer = (char *)malloc(read_bytes); + if (read_buffer == NULL) { + FAIL("out of memory"); + } + } + + /* determine the number of items to read */ + int stop = 0; + if (leaf_only) { + stop = items_per_dir * pow(branch_factor, depth); + } else { + stop = items; + } + + /* iterate over all of the item IDs */ + for (i = 0; i < stop; i++) { + + memset(&item, 0, MAX_LEN); + memset(temp, 0, MAX_LEN); + + /* determine the item number to read */ + if (random) { + item_num = rand_array[i]; + } else { + item_num = i; + } + + /* 
make adjustments if in leaf only mode*/ + if (leaf_only) { + item_num += items_per_dir * + (num_dirs_in_tree - pow(branch_factor,depth)); + } + + /* create name of file to read */ + if (dirs) { + ; /* N/A */ + } else { + if (rank == 0 && verbose >= 3 && (i%ITEM_COUNT == 0) && (i != 0)) { + printf("read file: %d\n", i); + fflush(stdout); + } + sprintf(item, "file.%s%d", read_name, item_num); + } + + /* determine the path to the file/dir to be read'ed */ + parent_dir = item_num / items_per_dir; + + if (parent_dir > 0) { //item is not in tree's root directory + + /* prepend parent directory to item's path */ + sprintf(temp, "%s.%d/%s", base_tree_name, parent_dir, item); + strcpy(item, temp); + + //still not at the tree's root dir + while (parent_dir > branch_factor) { + parent_dir = (int) ((parent_dir-1) / branch_factor); + sprintf(temp, "%s.%d/%s", base_tree_name, parent_dir, item); + strcpy(item, temp); + } + } + + /* below temp used to be hiername */ + if (rank == 0 && verbose >= 2) { + if (dirs) { + ; + } else { + printf("read file: %s\n", item); + } + fflush(stdout); + } + + /* open file for reading */ + if ((fd = open(item, O_RDWR, FILEMODE)) == -1) { + FAIL("unable to open file"); + } + + /* read file */ + if (read_bytes > 0) { + if (read(fd, read_buffer, read_bytes) != read_bytes) + FAIL("unable to read file"); + } + + /* close file */ + if (close(fd) == -1) { + FAIL("unable to close file"); + } + } +} + +/* This method should be called by rank 0. 
It subsequently does all of + the creates and removes for the other ranks */ +void collective_create_remove(int create, int dirs, int ntasks) { + + int i; + char temp[MAX_LEN]; + + /* rank 0 does all of the creates and removes for all of the ranks */ + for (i=0; i 0) { + mdtest_stat(1, 1); + } else { + mdtest_stat(0, 1); + } + + } + + if (barriers) { + MPI_Barrier(testcomm); + } + t[2] = MPI_Wtime(); + + /* read phase */ + if (read_only) { + + if (unique_dir_per_task) { + unique_dir_access(READ_SUB_DIR); + if (!time_unique_dir_overhead) { + offset_timers(t, 2); + } + } + + /* read directories */ + if (random_seed > 0) { + ; /* N/A */ + } else { + ; /* N/A */ + } + + } + + if (barriers) { + MPI_Barrier(testcomm); + } + t[3] = MPI_Wtime(); + + if (remove_only) { + if (unique_dir_per_task) { + unique_dir_access(RM_SUB_DIR); + if (!time_unique_dir_overhead) { + offset_timers(t, 3); + } + } + } + + /* remove phase */ + if (remove_only) { + + /* remove directories */ + if (collective_creates) { + if (rank == 0) { + collective_create_remove(0, 1, ntasks); + } + } else { + create_remove_items(0, 1, 0, 0, NULL, 0); + } + } + + if (barriers) { + MPI_Barrier(testcomm); + } + t[4] = MPI_Wtime(); + + if (remove_only) { + if (unique_dir_per_task) { + unique_dir_access(RM_UNI_DIR); + } + } + if (unique_dir_per_task && !time_unique_dir_overhead) { + offset_timers(t, 4); + } + + MPI_Comm_size(testcomm, &size); + + /* calculate times */ + if (create_only) { + summary_table[iteration].entry[0] = items*size/(t[1] - t[0]); + } else { + summary_table[iteration].entry[0] = 0; + } + if (stat_only) { + summary_table[iteration].entry[1] = items*size/(t[2] - t[1]); + } else { + summary_table[iteration].entry[1] = 0; + } + if (read_only) { + summary_table[iteration].entry[2] = items*size/(t[3] - t[2]); + } else { + summary_table[iteration].entry[2] = 0; + } + if (remove_only) { + summary_table[iteration].entry[3] = items*size/(t[4] - t[3]); + } else { + summary_table[iteration].entry[3] = 0; 
+ } + + if (verbose >= 1 && rank == 0) { + printf(" Directory creation: %10.3f sec, %10.3f ops/sec\n", + t[1] - t[0], summary_table[iteration].entry[0]); + printf(" Directory stat : %10.3f sec, %10.3f ops/sec\n", + t[2] - t[1], summary_table[iteration].entry[1]); +/* N/A + printf(" Directory read : %10.3f sec, %10.3f ops/sec\n", + t[3] - t[2], summary_table[iteration].entry[2]); +*/ + printf(" Directory removal : %10.3f sec, %10.3f ops/sec\n", + t[4] - t[3], summary_table[iteration].entry[3]); + fflush(stdout); + } +} + +void file_test(int iteration, int ntasks) { + int size; + double t[5] = {0}; + + MPI_Barrier(testcomm); + t[0] = MPI_Wtime(); + + /* create phase */ + if (create_only) { + if (unique_dir_per_task) { + unique_dir_access(MK_UNI_DIR); + if (!time_unique_dir_overhead) { + offset_timers(t, 0); + } + } + + /* "touch" the files */ + if (collective_creates) { + if (rank == 0) { + collective_create_remove(1, 0, ntasks); + } + MPI_Barrier(testcomm); + } + + /* create files */ + create_remove_items(0, 0, 1, 0, NULL, 0); + + } + + if (barriers) { + MPI_Barrier(testcomm); + } + t[1] = MPI_Wtime(); + + /* stat phase */ + if (stat_only) { + + if (unique_dir_per_task) { + unique_dir_access(STAT_SUB_DIR); + if (!time_unique_dir_overhead) { + offset_timers(t, 1); + } + } + + /* stat files */ + if (random_seed > 0) { + mdtest_stat(1,0); + } else { + mdtest_stat(0,0); + } + } + + if (barriers) { + MPI_Barrier(testcomm); + } + t[2] = MPI_Wtime(); + + /* read phase */ + if (read_only) { + + if (unique_dir_per_task) { + unique_dir_access(READ_SUB_DIR); + if (!time_unique_dir_overhead) { + offset_timers(t, 2); + } + } + + /* read files */ + if (random_seed > 0) { + mdtest_read(1,0); + } else { + mdtest_read(0,0); + } + } + + if (barriers) { + MPI_Barrier(testcomm); + } + t[3] = MPI_Wtime(); + + if (remove_only) { + if (unique_dir_per_task) { + unique_dir_access(RM_SUB_DIR); + if (!time_unique_dir_overhead) { + offset_timers(t, 3); + } + } + } + + /* remove phase */ + if 
(remove_only) { + if (collective_creates) { + if (rank == 0) { + collective_create_remove(0, 0, ntasks); + } + } else { + create_remove_items(0, 0, 0, 0, NULL, 0); + } + } + + if (barriers) { + MPI_Barrier(testcomm); + } + t[4] = MPI_Wtime(); + + if (remove_only) { + if (unique_dir_per_task) { + unique_dir_access(RM_UNI_DIR); + } + } + if (unique_dir_per_task && !time_unique_dir_overhead) { + offset_timers(t, 4); + } + + MPI_Comm_size(testcomm, &size); + + /* calculate times */ + if (create_only) { + summary_table[iteration].entry[4] = items*size/(t[1] - t[0]); + } else { + summary_table[iteration].entry[4] = 0; + } + if (stat_only) { + summary_table[iteration].entry[5] = items*size/(t[2] - t[1]); + } else { + summary_table[iteration].entry[5] = 0; + } + if (read_only) { + summary_table[iteration].entry[6] = items*size/(t[3] - t[2]); + } else { + summary_table[iteration].entry[6] = 0; + } + if (remove_only) { + summary_table[iteration].entry[7] = items*size/(t[4] - t[3]); + } else { + summary_table[iteration].entry[7] = 0; + } + + if (verbose >= 1 && rank == 0) { + printf(" File creation : %10.3f sec, %10.3f ops/sec\n", + t[1] - t[0], summary_table[iteration].entry[4]); + printf(" File stat : %10.3f sec, %10.3f ops/sec\n", + t[2] - t[1], summary_table[iteration].entry[5]); + printf(" File read : %10.3f sec, %10.3f ops/sec\n", + t[3] - t[2], summary_table[iteration].entry[6]); + printf(" File removal : %10.3f sec, %10.3f ops/sec\n", + t[4] - t[3], summary_table[iteration].entry[7]); + fflush(stdout); + } +} + +void print_help() { + char * opts[] = { +"Usage: mdtest [-b branching_factor] [-B] [-c] [-C] [-d testdir] [-D] [-e number_of_bytes_to_read]", +" [-E] [-f first] [-F] [-h] [-i iterations] [-I items_per_dir] [-l last] [-L]", +" [-n number_of_items] [-N stride_length] [-p seconds] [-r]", +" [-R[seed]] [-s stride] [-S] [-t] [-T] [-u] [-v]", +" [-V verbosity_value] [-w number_of_bytes_to_write] [-y] [-z depth]", +"\t-b: branching factor of hierarchical directory 
structure", +"\t-B: no barriers between phases", +"\t-c: collective creates: task 0 does all creates", +"\t-C: only create files/dirs", +"\t-d: the directory in which the tests will run", +"\t-D: perform test on directories only (no files)", +"\t-e: bytes to read from each file", +"\t-E: only read files/dir", +"\t-f: first number of tasks on which the test will run", +"\t-F: perform test on files only (no directories)", +"\t-h: prints this help message", +"\t-i: number of iterations the test will run", +"\t-I: number of items per directory in tree", +"\t-l: last number of tasks on which the test will run", +"\t-L: files only at leaf level of tree", +"\t-n: every process will creat/stat/read/remove # directories and files", +"\t-N: stride # between neighbor tasks for file/dir operation (local=0)", +"\t-p: pre-iteration delay (in seconds)", +"\t-r: only remove files or directories left behind by previous runs", +"\t-R: randomly stat files (optional argument for random seed)", +"\t-s: stride between the number of tasks for each test", +"\t-S: shared file access (file only, no directories)", +"\t-t: time unique working directory overhead", +"\t-T: only stat files/dirs", +"\t-u: unique working directory for each task", +"\t-v: verbosity (each instance of option increments by one)", +"\t-V: verbosity value", +"\t-w: bytes to write to each file after it is created", +"\t-y: sync file after writing", +"\t-z: depth of hierarchical directory structure", +"" +}; + int i, j; + + for (i = 0; strlen(opts[i]) > 0; i++) + printf("%s\n", opts[i]); + fflush(stdout); + + MPI_Initialized(&j); + if (j) { + MPI_Finalize(); + } + exit(0); +} + +void summarize_results(int iterations) { + char access[MAX_LEN]; + int i, j, k; + int start, stop, tableSize = 10; + double min, max, mean, sd, sum = 0, var = 0, curr = 0; + + double all[iterations * size * tableSize]; + MPI_Barrier(MPI_COMM_WORLD); + MPI_Gather(&summary_table->entry[0], tableSize*iterations, + MPI_DOUBLE, all, 
tableSize*iterations, MPI_DOUBLE, + 0, MPI_COMM_WORLD); + + if (rank == 0) { + + printf("\nSUMMARY: (of %d iterations)\n", iterations); + printf( + " Operation Max Min Mean Std Dev\n"); + printf( + " --------- --- --- ---- -------\n"); + fflush(stdout); + + /* if files only access, skip entries 0-3 (the dir tests) */ + if (files_only && !dirs_only) { + start = 4; + } else { + start = 0; + } + + /* if directories only access, skip entries 4-7 (the file tests) */ + if (dirs_only && !files_only) { + stop = 4; + } else { + stop = 8; + } + + /* special case: if no directory or file tests, skip all */ + if (!dirs_only && !files_only) { + start = stop = 0; + } + + /* calculate aggregates */ + if (barriers) { + double maxes[iterations]; + + + /* Because each proc times itself, in the case of barriers we + * have to backwards calculate the time to simulate the use + * of barriers. + */ + for (i = start; i < stop; i++) { + for (j=0; j maxes[j]) { + min = maxes[j]; + } + if (max < maxes[j]) { + max = maxes[j]; + } + sum += maxes[j]; + } + mean = sum / iterations; + for (j=0; j curr) { + min = curr; + } + if (max < curr) { + max = curr; + } + sum += curr; + } + } + mean = sum / (iterations * size); + for (k=0; k curr) { + min = curr; + } + if (max < curr) { + max = curr; + } + sum += curr; + } + mean = sum / (iterations); + for (j = 0; j < iterations; j++) { + var += pow((mean - summary_table[j].entry[i]), 2); + } + var = var / (iterations); + sd = sqrt(var); + switch (i) { + case 8: strcpy(access, "Tree creation :"); break; + case 9: strcpy(access, "Tree removal :"); break; + default: strcpy(access, "ERR"); break; + } + printf(" %s ", access); + printf("%10.3f ", max); + printf("%10.3f ", min); + printf("%10.3f ", mean); + printf("%10.3f\n", sd); + fflush(stdout); + sum = var = 0; + } + } +} + +/* Checks to see if the test setup is valid. If it isn't, fail. 
*/ +void valid_tests() { + + /* if dirs_only and files_only were both left unset, set both now */ + if (!dirs_only && !files_only) { + dirs_only = files_only = 1; + } + + /* if shared file 'S' access, no directory tests */ + if (shared_file) { + dirs_only = 0; + } + + /* check for collective_creates incompatibilities */ + if (shared_file && collective_creates && rank == 0) { + FAIL("-c not compatible with -S"); + } + if (path_count > 1 && collective_creates && rank == 0) { + FAIL("-c not compatible with multiple test directories"); + } + if (collective_creates && !barriers) { + FAIL("-c not compatible with -B"); + } + + /* check for shared file incompatibilities */ + if (unique_dir_per_task && shared_file && rank == 0) { + FAIL("-u not compatible with -S"); + } + + /* check multiple directory paths and strided option */ + if (path_count > 1 && nstride > 0) { + FAIL("cannot have multiple directory paths with -N strides between neighbor tasks"); + } + + /* check for shared directory and multiple directories incompatibility */ + if (path_count > 1 && unique_dir_per_task != 1) { + FAIL("shared directory mode is not compatible with multiple directory paths"); + } + + /* check if more directory paths than ranks */ + if (path_count > size) { + FAIL("cannot have more directory paths than MPI tasks"); + } + + /* check depth */ + if (depth < 0) { + FAIL("depth must be greater than or equal to zero"); + } + /* check branch_factor */ + if (branch_factor < 1 && depth > 0) { + FAIL("branch factor must be greater than or equal to zero"); + } + /* check for valid number of items */ + if ((items > 0) && (items_per_dir > 0)) { + FAIL("only specify the number of items or the number of items per directory"); + } + +} + +void show_file_system_size(char *file_system) { + char real_path[MAX_LEN]; + char file_system_unit_str[MAX_LEN] = "GiB"; + char inode_unit_str[MAX_LEN] = "Mi"; + long long int file_system_unit_val = 1024 * 1024 * 1024; + long long int inode_unit_val = 1024 * 1024; + 
long long int total_file_system_size, + free_file_system_size, + total_inodes, + free_inodes; + double total_file_system_size_hr, + used_file_system_percentage, + used_inode_percentage; + struct statfs status_buffer; + + if (statfs(file_system, &status_buffer) != 0) { + FAIL("unable to statfs() file system"); + } + + /* data blocks */ + total_file_system_size = status_buffer.f_blocks * status_buffer.f_bsize; + free_file_system_size = status_buffer.f_bfree * status_buffer.f_bsize; + used_file_system_percentage = (1 - ((double)free_file_system_size + / (double)total_file_system_size)) * 100; + total_file_system_size_hr = (double)total_file_system_size + / (double)file_system_unit_val; + if (total_file_system_size_hr > 1024) { + total_file_system_size_hr = total_file_system_size_hr / 1024; + strcpy(file_system_unit_str, "TiB"); + } + + /* inodes */ + total_inodes = status_buffer.f_files; + free_inodes = status_buffer.f_ffree; + used_inode_percentage = (1 - ((double)free_inodes/(double)total_inodes)) + * 100; + + /* show results */ + if (realpath(file_system, real_path) == NULL) { + FAIL("unable to use realpath()"); + } + fprintf(stdout, "Path: %s\n", real_path); + fprintf(stdout, "FS: %.1f %s Used FS: %2.1f%% ", + total_file_system_size_hr, file_system_unit_str, + used_file_system_percentage); + fprintf(stdout, "Inodes: %.1f %s Used Inodes: %2.1f%%\n", + (double)total_inodes / (double)inode_unit_val, + inode_unit_str, used_inode_percentage); + fflush(stdout); + + return; +} + +void display_freespace(char *testdirpath) +{ + char dirpath[MAX_LEN] = {0}; + int i; + int directoryFound = 0; + + strcpy(dirpath, testdirpath); + + /* get directory for outfile */ + i = strlen(dirpath); + while (i-- > 0) { + if (dirpath[i] == '/') { + dirpath[i] = '\0'; + directoryFound = 1; + break; + } + } + + /* if no directory/, use '.' 
*/ + if (directoryFound == 0) { + strcpy(dirpath, "."); + } + + show_file_system_size(dirpath); + + return; +} + +void create_remove_directory_tree(int create, + int currDepth, char* path, int dirNum) { + + int i; + char dir[MAX_LEN]; + + if (currDepth == 0) { + + sprintf(dir, "%s.%d/", base_tree_name, dirNum); + + if (create) { + if (rank == 0 && verbose >= 2) { + printf("making: %s\n", dir); + fflush(stdout); + } + if (mkdir(dir, DIRMODE) == -1) { + FAIL("Unable to create directory"); + } + } + + create_remove_directory_tree(create, ++currDepth, dir, ++dirNum); + + if (!create) { + if (rank == 0 && verbose >= 2) { + printf("remove: %s\n", dir); + fflush(stdout); + } + if (rmdir(dir) == -1) { + FAIL("Unable to remove directory"); + } + } + + } else if (currDepth <= depth) { + + char temp_path[MAX_LEN]; + strcpy(temp_path, path); + int currDir = dirNum; + + for (i=0; i= 2) { + printf("making: %s\n", temp_path); + fflush(stdout); + } + if (mkdir(temp_path, DIRMODE) == -1) { + FAIL("Unable to create directory"); + } + } + + create_remove_directory_tree(create, ++currDepth, + temp_path, (branch_factor*currDir)+1); + currDepth--; + + if (!create) { + if (rank == 0 && verbose >= 2) { + printf("remove: %s\n", temp_path); + fflush(stdout); + } + if (rmdir(temp_path) == -1) { + FAIL("Unable to remove directory"); + } + } + + strcpy(temp_path, path); + currDir++; + } + } +} + +int main(int argc, char **argv) { + int i, j, c; + int nodeCount; + MPI_Group worldgroup, testgroup; + struct { + int first; + int last; + int stride; + } range = {0, 0, 1}; + int first = 1; + int last = 0; + int stride = 1; + int iterations = 1; + + /* Check for -h parameter before MPI_Init so the mdtest binary can be + called directly, without, for instance, mpirun. 
*/ + for (i = 1; i < argc; i++) { + if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help")) { + print_help(); + } + } + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + nodeCount = size / count_tasks_per_node(); + + if (rank == 0) { + printf("-- started at %s --\n\n", timestamp()); + printf("mdtest-%s was launched with %d total task(s) on %d nodes\n", + RELEASE_VERS, size, nodeCount); + fflush(stdout); + } + + if (rank == 0) { + fprintf(stdout, "Command line used:"); + for (i = 0; i < argc; i++) { + fprintf(stdout, " %s", argv[i]); + } + fprintf(stdout, "\n"); + fflush(stdout); + } + + /* Parse command line options */ + while (1) { + c = getopt(argc, argv, "b:BcCd:De:Ef:Fhi:I:l:Ln:N:p:rR::s:StTuvV:w:yz:"); + if (c == -1) { + break; + } + + switch (c) { + case 'b': + branch_factor = atoi(optarg); break; + case 'B': + barriers = 0; break; + case 'c': + collective_creates = 1; break; + case 'C': + create_only = 1; break; + case 'd': + parse_dirpath(optarg); break; + case 'D': + dirs_only = 1; break; + case 'e': + read_bytes = atoi(optarg); break; + case 'E': + read_only = 1; break; + case 'f': + first = atoi(optarg); break; + case 'F': + files_only = 1; break; + case 'h': + print_help(); break; + case 'i': + iterations = atoi(optarg); break; + case 'I': + items_per_dir = atoi(optarg); break; + case 'l': + last = atoi(optarg); break; + case 'L': + leaf_only = 1; break; + case 'n': + items = atoi(optarg); break; + case 'N': + nstride = atoi(optarg); break; + case 'p': + pre_delay = atoi(optarg); break; + case 'r': + remove_only = 1; break; + case 'R': + if (optarg == NULL) { + random_seed = time(NULL); + MPI_Barrier(MPI_COMM_WORLD); + MPI_Bcast(&random_seed, 1, MPI_INT, 0, MPI_COMM_WORLD); + random_seed += rank; + } else { + random_seed = atoi(optarg)+rank; + } + break; + case 's': + stride = atoi(optarg); break; + case 'S': + shared_file = 1; break; + case 't': + time_unique_dir_overhead = 1; break; + case 
'T': + stat_only = 1; break; + case 'u': + unique_dir_per_task = 1; break; + case 'v': + verbose += 1; break; + case 'V': + verbose = atoi(optarg); break; + case 'w': + write_bytes = atoi(optarg); break; + case 'y': + sync_file = 1; break; + case 'z': + depth = atoi(optarg); break; + } + } + + if (!create_only && !stat_only && !read_only && !remove_only) { + create_only = stat_only = read_only = remove_only = 1; + } + + valid_tests(); + + /* setup total number of items and number of items per dir */ + if (depth <= 0) { + num_dirs_in_tree = 1; + } else { + if (branch_factor < 1) { + num_dirs_in_tree = 1; + } else if (branch_factor == 1) { + num_dirs_in_tree = depth + 1; + } else { + num_dirs_in_tree = + (1 - pow(branch_factor, depth+1)) / (1 - branch_factor); + } + } + if (items_per_dir > 0) { + items = items_per_dir * num_dirs_in_tree; + } else { + if (leaf_only) { + if (branch_factor <= 1) { + items_per_dir = items; + } else { + items_per_dir = items / pow(branch_factor, depth); + items = items_per_dir * pow(branch_factor, depth); + } + } else { + items_per_dir = items / num_dirs_in_tree; + items = items_per_dir * num_dirs_in_tree; + } + } + + /* initialize rand_array */ + if (random_seed > 0) { + srand(random_seed); + + int stop = 0; + if (leaf_only) { + stop = items_per_dir * pow(branch_factor, depth); + } else { + stop = items; + } + rand_array = (int*) malloc(stop * sizeof(int)); + + for (i=0; i1) { + n--; + int k = rand() % (n+1); + int tmp = rand_array[k]; + rand_array[k] = rand_array[n]; + rand_array[n] = tmp; + } + } + + /* allocate and initialize write buffer with # */ + if (write_bytes > 0) { + write_buffer = (char *)malloc(write_bytes); + if (write_buffer == NULL) { + FAIL("out of memory"); + } + memset(write_buffer, 0x23, write_bytes); + } + + /* setup directory path to work in */ + if (path_count == 0) { /* special case where no directory path provided with '-d' option */ + getcwd(testdirpath, MAX_LEN); + path_count = 1; + } else { + 
strcpy(testdirpath, filenames[rank%path_count]); + } + + /* display disk usage */ + if (rank == 0) display_freespace(testdirpath); + + if (rank == 0) { + if (random_seed > 0) { + printf("random seed: %d\n", random_seed); + } + } + + /* if directory does not exist, create it */ + if ((rank < path_count) && chdir(testdirpath) == -1) { + if (mkdir(testdirpath, DIRMODE) == - 1) { + FAIL("Unable to create test directory path"); + } + } + + if (gethostname(hostname, MAX_LEN) == -1) { + perror("gethostname"); + MPI_Abort(MPI_COMM_WORLD, 2); + } + if (last == 0) { + first = size; + last = size; + } + + /* setup summary table for recording results */ + summary_table = (table_t *)malloc(iterations * sizeof(table_t)); + if (summary_table == NULL) { + FAIL("out of memory"); + } + + if (unique_dir_per_task) { + sprintf(base_tree_name, "mdtest_tree.%d", rank); + } else { + sprintf(base_tree_name, "mdtest_tree"); + } + + /* start and end times of directory tree create/remove */ + double startCreate, endCreate; + + /* default use shared directory */ + strcpy(mk_name, "mdtest.shared."); + strcpy(stat_name, "mdtest.shared."); + strcpy(read_name, "mdtest.shared."); + strcpy(rm_name, "mdtest.shared."); + + MPI_Comm_group(MPI_COMM_WORLD, &worldgroup); + /* Run the tests */ + for (i = first; i <= last && i <= size; i += stride) { + range.last = i - 1; + MPI_Group_range_incl(worldgroup, 1, (void *)&range, &testgroup); + MPI_Comm_create(MPI_COMM_WORLD, testgroup, &testcomm); + if (rank == 0) { + if (files_only && dirs_only) { + printf("\n%d tasks, %d files/directories\n", i, i * items); + } else if (files_only) { + printf("\n%d tasks, %d files\n", i, i * items); + } else if (dirs_only) { + printf("\n%d tasks, %d directories\n", i, i * items); + } + } + if (rank == 0 && verbose >= 1) { + printf("\n"); + printf(" Operation Duration Rate\n"); + printf(" --------- -------- ----\n"); + } + for (j = 0; j < iterations; j++) { + if (rank == 0 && verbose >= 1) { + printf(" * iteration %d *\n", 
j+1); + fflush(stdout); + } + + strcpy(testdir, testdirpath); + strcat(testdir, "/"); + strcat(testdir, TEST_DIR); + sprintf(testdir, "%s.%d", testdir, j); + if ((rank < path_count) && chdir(testdir) == -1) { + if (mkdir(testdir, DIRMODE) == - 1) { + FAIL("Unable to create test directory"); + } + } + MPI_Barrier(MPI_COMM_WORLD); + if (chdir(testdir) == -1) { + FAIL("Unable to change to test directory"); + } + /* create hierarchical directory structure */ + MPI_Barrier(MPI_COMM_WORLD); + if (create_only) { + startCreate = MPI_Wtime(); + if (unique_dir_per_task) { + if (collective_creates && (rank == 0)) { + for (i=0; i= 1 && rank == 0) { + printf(" Tree creation : %10.3f sec, %10.3f ops/sec\n", + (endCreate - startCreate), summary_table[j].entry[8]); + fflush(stdout); + } + } else { + summary_table[j].entry[8] = 0; + } + sprintf(unique_mk_dir, "%s/%s.0", testdir, base_tree_name); + sprintf(unique_chdir_dir, "%s/%s.0", testdir, base_tree_name); + sprintf(unique_stat_dir, "%s/%s.0", testdir, base_tree_name); + sprintf(unique_read_dir, "%s/%s.0", testdir, base_tree_name); + sprintf(unique_rm_dir, "%s/%s.0", testdir, base_tree_name); + sprintf(unique_rm_uni_dir, "%s", testdir); + + if (!unique_dir_per_task) { + if (chdir(unique_mk_dir) == -1) { + FAIL("unable to change to shared tree directory"); + } + } + + if (rank < i) { + if (!shared_file) { + sprintf(mk_name, "mdtest.%d.", (rank+(0*nstride))%i); + sprintf(stat_name, "mdtest.%d.", (rank+(1*nstride))%i); + sprintf(read_name, "mdtest.%d.", (rank+(2*nstride))%i); + sprintf(rm_name, "mdtest.%d.", (rank+(3*nstride))%i); + } + if (unique_dir_per_task) { + sprintf(unique_mk_dir, "%s/mdtest_tree.%d.0", testdir, + (rank+(0*nstride))%i); + sprintf(unique_chdir_dir, "%s/mdtest_tree.%d.0", testdir, + (rank+(1*nstride))%i); + sprintf(unique_stat_dir, "%s/mdtest_tree.%d.0", testdir, + (rank+(2*nstride))%i); + sprintf(unique_read_dir, "%s/mdtest_tree.%d.0", testdir, + (rank+(3*nstride))%i); + sprintf(unique_rm_dir, 
"%s/mdtest_tree.%d.0", testdir, + (rank+(4*nstride))%i); + sprintf(unique_rm_uni_dir, "%s", testdir); + } + strcpy(top_dir, unique_mk_dir); + if (dirs_only && !shared_file) { + if (pre_delay) { + delay_secs(pre_delay); + } + directory_test(j, i); + } + if (files_only) { + if (pre_delay) { + delay_secs(pre_delay); + } + file_test(j, i); + } + } + + /* remove directory structure */ + if (!unique_dir_per_task) { + if (chdir(testdir) == -1) { + FAIL("unable to change to tree directory"); + } + } + MPI_Barrier(MPI_COMM_WORLD); + if (remove_only) { + startCreate = MPI_Wtime(); + if (unique_dir_per_task) { + if (collective_creates && (rank == 0)) { + for (i=0; i= 1 && rank == 0) { + printf(" Tree removal : %10.3f sec, %10.3f ops/sec\n", + (endCreate - startCreate), summary_table[j].entry[9]); + fflush(stdout); + } + } else { + summary_table[j].entry[9] = 0; + } + } + summarize_results(iterations); + if (i == 1 && stride > 1) { + i = 0; + } + } + + if (rank == 0) { + printf("\n-- finished at %s --\n", timestamp()); + fflush(stdout); + } + + if (random_seed > 0) { + free(rand_array); + } + + MPI_Finalize(); + exit(0); +} diff --git a/microbenchmarks/mdtest/scripts/WRAPPER_README b/microbenchmarks/mdtest/scripts/WRAPPER_README new file mode 100644 index 00000000..ec7e72fa --- /dev/null +++ b/microbenchmarks/mdtest/scripts/WRAPPER_README @@ -0,0 +1,49 @@ +======================== +mdtest_wrapper.py README +======================== + +mdtest_wrapper.py is a wrapper for mdtest that inserts the results into +a database. + +-------------- +Prerequisites: +-------------- + +Python 2.3.4 or higher +setuptools (Python module installer) +MySQLdb (a Python module) + + +------------- +Installation: +------------- + +---setuptools--- + +This is available http://pypi.python.org/pypi/setuptools. Installing +from source is probably the least hassle. 
+ +Unzip and untar the package +Change directories to the top level directory for setuptools +Run: python setup.py build +Run: python setup.py install --prefix=/some/install/directory + NOTE: --prefix arg is only necessary if you do not have + root permissions or you want to install the module into + some non-default directory +If you designated some non-default install location, then add that + directory to PYTHONPATH + + +---MySQLdb--- + +This is available at http://sourceforge.net/projects/mysql-python/. +Installing this module proceeds in the same manner as setuptools. + + +------ +Usage: +------ + +python mdtest_wrapper.py mpirun [mpirun args] ./mdtest [mdtest args] [--desc descriptionOfTest] + +NOTE: mdtest needs to be compiled before running mdtest_wrapper. diff --git a/microbenchmarks/mdtest/scripts/env_to_db.tcsh b/microbenchmarks/mdtest/scripts/env_to_db.tcsh new file mode 100755 index 00000000..eccafebb --- /dev/null +++ b/microbenchmarks/mdtest/scripts/env_to_db.tcsh @@ -0,0 +1,127 @@ +#! /bin/tcsh +# whatever this spits out in the form of # key val +# can be parsed by the fs_test and will be +# injected into the DB (so long as the key exists in the schema) +# the format is key [space] val +# currently the val can't have spaces in it... 
+# to pull this into the DB through the fs_test, set the +# FS_TEST_EXTRA environment variable to point to this file + + +# set up +set target = $1 + +# if the user specified an fs:/ notation for MPI-IO, then strip it +set target = `echo $target | sed 's/.*://g'` +set target_dir = $target:h +set tpf = $HOME/Testing/tpf/src/tpf_panfs.x + +# mpihome +echo "mpihome $MPIHOME" + +# segment +echo "segment $HOSTNAME" + +# user +echo "user $USER" + +# system +echo "system $HOSTNAME" + +# date_ts +set date_ts = `date +%s` +echo "date_ts $date_ts" + +# mpihost +if ( $?MY_MPI_HOST ) then + echo "mpihost $MY_MPI_HOST" +endif + +# os_version +set os_version = `uname -r` +echo "os_version $os_version" + +# yyyymmdd +set yyyymmdd = `date +%F` +echo "yyyymmdd $yyyymmdd" + +# jobid +if ( $?PBS_JOBID ) then + echo "jobid $PBS_JOBID" +else if ( $?LFS_JOBID ) then + echo "jobid $LFS_JOBID" +endif + +# mpi_version +echo "mpi_version $MPI_VERSION" + +# host list +#env | grep -i node +if ( $?PBS_NODEFILE ) then + set host_list = `cat $PBS_NODEFILE | tr '\n' ','` + echo "host_list $host_list" +endif + +# procs_per_node +if ( $?PBS_NODEFILE ) then + set shortname = `hostname -s` + set procs_per_node = `cat $PBS_NODEFILE | grep $shortname | wc -l` + echo "procs_per_node $procs_per_node" +endif + +# grab the ionode list +set ionodes = `/sbin/ip route | awk '/nexthop/ {print $3}' | sort | uniq` +set num_ionodes = `echo $ionodes | wc -w` +set ionodes = `echo "$ionodes" | tr ' ' ','` +echo "ionodes $ionodes" +echo "num_ionodes $num_ionodes" + +# grab the panfs mount options +# if panfs has multiple mounts, this might get the wrong one...
+set panfs_mnt = `mount -t panfs | tr '\n' '|' | tr ' ' '_'` +echo "panfs_mnt $panfs_mnt" + +# get panfs client version +set panfs_trace1 = /usr/sbin/panfs_trace +set panfs_trace2 = /usr/local/sbin/panfs_trace +if ( -x $panfs_trace1 ) then + set client_version = `$panfs_trace1 --version $target_dir | awk '{print $4$5}' | head -1` + echo "panfs $client_version" +else if ( -x $panfs_trace2 ) then + set client_version = `$panfs_trace2 --version $target_dir | awk '{print $4$5}' | head -1` + echo "panfs $client_version" +else + echo "error couldnt_discover_panfs_version" +endif + +# get thread count +set thread_count = `ps auxw | grep kpanfs_thpool | grep -v grep | wc -l` +echo "panfs_threads $thread_count" + +# get df numbers +set df_perc = `df $target_dir -t panfs -P | tail -1 | awk '{print $5}' | sed s/%//` +set df_tot = `df $target_dir -t panfs -P | tail -1 | awk '{print $2}'` +echo "df_perc_before $df_perc" +echo "df_tot_before $df_tot" + +# grab tpf info +if ( "X$target" != "X" ) then + if ( -d $target_dir ) then + if ( -x $tpf ) then + $tpf default $target_dir |& awk \ + '/Components/ {print "panfs_comps "$5} \ + /RAID width/ {print "panfs_width "$3} \ + /Depth/ {print "panfs_depth "$2} \ + /Stride/ {print "panfs_stripe "$3} \ + /Layout Policy/ {print "panfs_visit "$3} \ + /Layout Type/ {print "panfs_type "$3} \ + ' + else + echo "error no_valid_tpf_executable" + endif + else + echo "error no_valid_target_dir_$target_dir" + endif +else + echo "error no_valid_target" +endif diff --git a/microbenchmarks/mdtest/scripts/mdtest_wrapper.py b/microbenchmarks/mdtest/scripts/mdtest_wrapper.py new file mode 100755 index 00000000..457da3df --- /dev/null +++ b/microbenchmarks/mdtest/scripts/mdtest_wrapper.py @@ -0,0 +1,533 @@ +#! /usr/bin/env python + +########################### mdtest_wrapper.py ############################## +# +#This program is a wrapper for mdtest. It will execute mdtest and parse the +#output. The result will then be inserted into a database. 
If the database +#doesn't exist, then the query is written to a file. +# +#To run this program, run the following command: +#python mdtest_wrapper.py mpirun [mpirun args] /path/to/mdtest [mdtest args] +# +#Written by: Ryan Kroiss +#Last modified: 07/24/2009 +# +############################################################################ + +import getopt,sys,os,array,string,time,user +import MySQLdb as db + +import sys + + + +def fail(message): + print message + sys.exit() + +### customized parsing method for mdtest ### +def parseArgs(args, db_dict): + + for i in range(0, len(args)): + if (args[i].startswith('-')): + set = False + o = args[i] + if (i+1 <= (len(args)-1)): + if (not args[i+1].startswith('-')): + a = args[i+1] + set = True + + if o == "-b": + if (not set): + fail("Improperly formatted arguments") + db_dict['branch_factor'] = a + elif o == "-B": + db_dict['no_barriers'] = 1 + elif o == "-c": + db_dict['collective_creates'] = 1 + elif o == "-C": + db_dict['create_only'] = 1 + elif o == "-d": + if (not set): + fail("Improperly formatted arguments") + db_dict['working_directory'] = a + elif o == '--desc': + continue + elif o == "-D": + db_dict['directories_only'] = 1 + elif o == "-f": + if (not set): + fail("Improperly formatted arguments") + db_dict['first_task'] = a + elif o == "-F": + db_dict['files_only'] = 1 + elif o == "-h": + continue + elif o == "-i": + if (not set): + fail("Improperly formatted arguments") + db_dict['iterations'] = a + elif o == "-I": + if (not set): + fail("Improperly formatted arguments") + db_dict['items_per_dir'] = a + elif o == "-l": + if (not set): + fail("Improperly formatted arguments") + db_dict['last_task'] = a + elif o == "-L": + db_dict['leaf_only'] = 1 + elif o == "-n": + if (not set): + fail("Improperly formatted arguments") + db_dict['items'] = a + elif o == "-N": + if (not set): + fail("Improperly formatted arguments") + db_dict['nstride'] = a + elif o == "-p": + if (not set): + fail("Improperly formatted 
arguments") + db_dict['pre_delay'] = a + elif o == "-r": + db_dict['remove_only'] = 1 + #elif o.startswith('-R'): + #don't do anything here because the random seed is caught in the output of the test + elif o == "-s": + if (not set): + fail("Improperly formatted arguments") + db_dict['stride'] = a + elif o == "-S": + db_dict['shared_file'] = 1 + elif o == "-t": + db_dict['time_unique_dir_overhead'] = 1 + elif o == "-T": + db_dict['stat_only'] = 1 + elif o == "-u": + db_dict['unique_dir_per_task'] = 1 + elif o == "-v": + continue + elif o == "-V": + continue + elif o == "-w": + if (not set): + fail("Improperly formatted arguments") + db_dict['write_bytes'] = a + elif o == "-y": + db_dict['sync_file'] = 1 + elif o == "-z": + if (not set): + fail("Improperly formatted arguments") + db_dict['depth'] = a + else: + if (not o.startswith('-R')): + print o + fail("Incorrect flag - check mdtest usage") + + return db_dict + + +###### creates db insert query from db_data dictionary +###### then executes query +def db_insert(dbconn, db_data): + + ###### create insert query ###### + query = "INSERT INTO mdtest (" + + count = 0 + + ### append column names to query ### + for key in db_data.keys(): + if (db_data.get(key) != None): + if (count == 1): + query += ',' + count = 1 + query += key + + query += ") VALUES ('" + count = 0 + + ### append values to query ### + for value in db_data.values(): + if (value != None): + if (count == 1): + query += "','" + count = 1 + query += str(value) + + query += "')" + + db_success=False + try: + ### connect to the database ### + raise SystemError # don't even bother, just dump to file + conn = db.connect(host="phpmyadmin",db="mpi_io_test_pro",user="cron", + passwd="hpciopwd") + cursor = conn.cursor() + + ### execute the query ### + cursor.execute(query) + + ### close connection ### + cursor.close() + conn.close() + + print "Query inserted into database" + db_success=True + + except: + + sql_file = os.getenv('HOME') + '/mdtest.sql_query' + + ### 
if unable to connect to db, print query to file sql_query ### + try: + f = open(sql_file,'a') + except: + f = open(sql_file,'w') + try: + f.write(query + ';\n') + f.close() + print "Appended query to file: %s" % sql_file + db_success=True + except: + print "Unable to append query to file: %s" % sql_file + + #finally: + + ### when all else fails print query to standard out ### + if db_success is False: print query + + + +def main(): + + ### check for minimum number of arguments ### + if (len(sys.argv) < 3): + print "Your command needs to have more that three arguments." + print "It should look something like this:" + print "python mdtest_wrapper.py mpirun..." + sys.exit() + + command_line =" ".join(sys.argv) + + ### find index of first arg of mdtest command ### + last = len(sys.argv) + description = None + env_to_db = None + last = len(sys.argv) + for a in sys.argv: + if (a == '--desc'): + index = sys.argv.index(a) + 1 + if (index < len(sys.argv)): + description = sys.argv[index] + last = last - 2 + if (a == '--env_to_db'): + index = sys.argv.index(a) + 1 + if (index < len(sys.argv)): + env_to_db = sys.argv[index] + last = last - 2 + + ### get command to execute ### + command = sys.argv[1] + for s in sys.argv[2:last]: + command += " " + s + + + ### run command and print db_data to standard out ### + walltime = int(time.time()) + p = os.popen(command) + mdtest_output = p.read() + walltime = int(time.time()) - walltime + print mdtest_output + + ###### set up dictionary of values ###### + db_data = dict() + + ###### keys for output ####### + db_data['user'] = None + db_data['system'] = None + db_data['date_ts'] = None + db_data['description'] = description + + ####### initialize mdtest parameters output ######## + db_data['collective_creates'] = None + db_data['working_directory'] = None + db_data['directories_only'] = None + db_data['files_only'] = None + db_data['first_task'] = None + db_data['last_task'] = None + db_data['iterations'] = None + db_data['items'] = 
None + db_data['items_per_dir'] = None + db_data['nstride'] = None + db_data['stride'] = None + db_data['pre_delay'] = None + db_data['remove_only'] = None + db_data['shared_file'] = None + db_data['time_unique_dir_overhead'] = None + db_data['unique_dir_per_task'] = None + db_data['write_bytes'] = None + db_data['sync_file'] = None + db_data['branch_factor'] = None + db_data['depth'] = None + db_data['random_stat'] = None + db_data['no_barriers'] = None + db_data['create_only'] = None + db_data['leaf_level'] = None + db_data['stat_only'] = None + + + ####### initialize mdtest environment output ####### + db_data['mdtest_version'] = None + db_data['num_tasks'] = None + db_data['num_nodes'] = None + db_data['command_line'] = command_line + db_data['path'] = None + db_data['fs_size'] = None + db_data['fs_used_pct'] = None + db_data['inodes_size'] = None + db_data['inodes_used_pct'] = None + db_data['walltime'] = str(walltime) + + ####### initialize mdtest operations output ######## + db_data['dir_create_max'] = None + db_data['dir_create_min'] = None + db_data['dir_create_mean'] = None + db_data['dir_create_stddev'] = None + db_data['dir_stat_max'] = None + db_data['dir_stat_min'] = None + db_data['dir_stat_mean'] = None + db_data['dir_stat_stddev'] = None + db_data['dir_remove_max'] = None + db_data['dir_remove_min'] = None + db_data['dir_remove_mean'] = None + db_data['dir_remove_stddev'] = None + db_data['file_create_max'] = None + db_data['file_create_min'] = None + db_data['file_create_mean'] = None + db_data['file_create_stddev'] = None + db_data['file_stat_max'] = None + db_data['file_stat_min'] = None + db_data['file_stat_mean'] = None + db_data['file_stat_stddev'] = None + db_data['file_remove_max'] = None + db_data['file_remove_min'] = None + db_data['file_remove_mean'] = None + db_data['file_remove_stddev'] = None + db_data['tree_create'] = None + db_data['tree_remove'] = None + + ######## initialize system output ######### + db_data['mpihome'] = None + 
db_data['mpihost'] = None + db_data['mpi_version'] = None + db_data['segment'] = None + db_data['os_version'] = None + db_data['yyyymmdd'] = None + db_data['jobid'] = None + db_data['host_list'] = None + db_data['panfs'] = None + db_data['panfs_srv'] = None + db_data['panfs_type'] = None + db_data['panfs_stripe'] = None + db_data['panfs_width'] = None + db_data['panfs_depth'] = None + db_data['panfs_comps'] = None + db_data['panfs_visit'] = None + db_data['panfs_mnt'] = None + db_data['panfs_threads'] = None + db_data['ionodes'] = None + db_data['num_ionodes'] = None + db_data['procs_per_node'] = None + + + ### set working_directory to cwd if user didn't specify one + if (db_data['working_directory'] == None): + db_data['working_directory'] = os.getcwd() + + ####### run env_to_db and parse output ###### + if (env_to_db is not None and os.path.exists(env_to_db)): + command = "%s %s" % (env_to_db, db_data['working_directory']) + p = os.popen(command) + env_result = p.read() + lines = env_result.splitlines() + for line in lines: + tokens = line.split() + if (len(tokens) >= 2): + if (tokens[0] == 'ionodes'): + db_data['ionodes'] = tokens[1] + elif (tokens[0] == 'num_ionodes'): + db_data['num_ionodes'] = tokens[1] + elif (tokens[0] == 'panfs_mnt'): + db_data['panfs_mnt'] = tokens[1] + elif (tokens[0] == 'panfs_type'): + db_data['panfs_type'] = tokens[1] + elif (tokens[0] == 'panfs_comps'): + db_data['panfs_comps'] = tokens[1] + elif (tokens[0] == 'panfs_stripe'): + db_data['panfs_stripe'] = tokens[1] + elif (tokens[0] == 'panfs_width'): + db_data['panfs_width'] = tokens[1] + elif (tokens[0] == 'panfs_depth'): + db_data['panfs_depth'] = tokens[1] + elif (tokens[0] == 'panfs_visit'): + db_data['panfs_visit'] = tokens[1] + elif (tokens[0] == 'mpihome'): + db_data['mpihome'] = tokens[1] + elif (tokens[0] == 'segment'): + db_data['segment'] = tokens[1] + elif (tokens[0] == 'user'): + db_data['user'] = tokens[1] + elif (tokens[0] == 'system'): + db_data['system'] = tokens[1] 
+ elif (tokens[0] == 'date_ts'): + db_data['date_ts'] = tokens[1] + elif (tokens[0] == 'mpihost'): + db_data['mpihost'] = tokens[1] + elif (tokens[0] == 'os_version'): + db_data['os_version'] = tokens[1] + elif (tokens[0] == 'yyyymmdd'): + db_data['yyyymmdd'] = tokens[1] + elif (tokens[0] == 'jobid'): + db_data['jobid'] = tokens[1] + elif (tokens[0] == 'mpi_version'): + db_data['mpi_version'] = tokens[1] + elif (tokens[0] == 'host_list'): + db_data['host_list'] = tokens[1] + elif (tokens[0] == 'procs_per_node'): + db_data['procs_per_node'] = tokens[1] + elif (tokens[0] == 'panfs_threads'): + db_data['panfs_threads'] = tokens[1] + elif (tokens[0] == 'panfs'): + db_data['panfs'] = tokens[1] + for i in range(len(tokens)-2): + db_data['panfs'] += " " + tokens[i+2] + + ###### get fs stats ###### + ### NOTE: this info could obtained by parsing output from mdtest + ### but it's both easier and more accurate to do it here + stats = os.statvfs(db_data['working_directory']) + + ### data blocks + total_fs_size = stats.f_blocks * stats.f_bsize + free_fs_size = stats.f_bfree * stats.f_bsize + used_fs_pct = (1 - (float(free_fs_size)/float(total_fs_size))) * 100 + db_data['fs_size'] = total_fs_size + db_data['fs_used_pct'] = used_fs_pct + + ### inodes + total_inodes = stats.f_files + free_inodes = stats.f_ffree + used_inodes_pct = (1 - (float(free_inodes)/float(total_inodes))) * 100 + db_data['inodes_size'] = total_inodes + db_data['inodes_used_pct'] = used_inodes_pct + + ###### parse output from mdtest and put in db_data dictionary ###### + lines = mdtest_output.splitlines() + for line in lines: + if (line.startswith('mdtest')): + line_toks = line.split(' ') + db_data['mdtest_version'] = line_toks[0] + first = True + for l in line_toks: + if (l.isdigit() and first): + db_data['num_tasks'] = l + first = False + elif (l.isdigit()): + db_data['num_nodes'] = l + elif (line.startswith('Path:')): + line_toks = line.split(':') + db_data['path'] = line_toks[1].strip() + elif 
(line.startswith('random')): + line_toks = line.split(':') + db_data['random_stat'] = line_toks[1].strip() + elif (line.startswith('tree creation rate')): + line_toks = line.split(':') + db_data['tree_create'] = line_toks[1].strip() + elif (line.startswith(" Directory creation:")): + line_toks = line.split() + length = len(line_toks) + for i in range(length): + if (i==(length-4)): + db_data['dir_create_max'] = line_toks[i] + elif (i==(length-3)): + db_data['dir_create_min'] = line_toks[i] + elif (i==(length-2)): + db_data['dir_create_mean'] = line_toks[i] + elif (i==(length-1)): + db_data['dir_create_stddev'] = line_toks[i] + elif (line.startswith(" Directory stat")): + line_toks = line.split() + length = len(line_toks) + for i in range(length): + if (i==(length-4)): + db_data['dir_stat_max'] = line_toks[i] + elif (i==(length-3)): + db_data['dir_stat_min'] = line_toks[i] + elif (i==(length-2)): + db_data['dir_stat_mean'] = line_toks[i] + elif (i==(length-1)): + db_data['dir_stat_stddev'] = line_toks[i] + elif (line.startswith(" Directory removal")): + line_toks = line.split() + length = len(line_toks) + for i in range(length): + if (i==(length-4)): + db_data['dir_remove_max'] = line_toks[i] + elif (i==(length-3)): + db_data['dir_remove_min'] = line_toks[i] + elif (i==(length-2)): + db_data['dir_remove_mean'] = line_toks[i] + elif (i==(length-1)): + db_data['dir_remove_stddev'] = line_toks[i] + elif (line.startswith(" File creation")): + line_toks = line.split() + length = len(line_toks) + for i in range(length): + if (i==(length-4)): + db_data['file_create_max'] = line_toks[i] + elif (i==(length-3)): + db_data['file_create_min'] = line_toks[i] + elif (i==(length-2)): + db_data['file_create_mean'] = line_toks[i] + elif (i==(length-1)): + db_data['file_create_stddev'] = line_toks[i] + elif (line.startswith(" File stat")): + line_toks = line.split() + length = len(line_toks) + for i in range(length): + if (i==(length-4)): + db_data['file_stat_max'] = line_toks[i] + 
elif (i==(length-3)): + db_data['file_stat_min'] = line_toks[i] + elif (i==(length-2)): + db_data['file_stat_mean'] = line_toks[i] + elif (i==(length-1)): + db_data['file_stat_stddev'] = line_toks[i] + elif (line.startswith(" File removal")): + line_toks = line.split() + length = len(line_toks) + for i in range(length): + if (i==(length-4)): + db_data['file_remove_max'] = line_toks[i] + elif (i==(length-3)): + db_data['file_remove_min'] = line_toks[i] + elif (i==(length-2)): + db_data['file_remove_mean'] = line_toks[i] + elif (i==(length-1)): + db_data['file_remove_stddev'] = line_toks[i] + elif (line.startswith('tree removal rate')): + line_toks = line.split(':') + db_data['tree_remove'] = line_toks[1].strip() + + + + db_insert(db,db_data) + + + + +if __name__ == "__main__": + main() + + diff --git a/microbenchmarks/mdtest/scripts/paramCatch.py b/microbenchmarks/mdtest/scripts/paramCatch.py new file mode 100644 index 00000000..dc146c0c --- /dev/null +++ b/microbenchmarks/mdtest/scripts/paramCatch.py @@ -0,0 +1,46 @@ +import MySQLdb as db,string + +def main(): + + ### change these variables to update the desired field ### + param = "-I" + field = "items_per_dir" + + print "Initiating database connection..." + d = db.connect(host="tangerine.lanl.gov", db="mpi_io_test") + print "Connected to database!" + cursor = d.cursor() + + print "Querying database..." + sql = "SELECT command_line,user,system,date_ts FROM mdtest WHERE !isnull(command_line)" + cursor.execute(sql) + print "Completed SELECT query!" + + data = cursor.fetchone() + + f = open("temp_query","w") + + print "Parsing query results..." 
+ while (data != None): + cmd = data[0] + + if (cmd.find(param) != -1): + list = cmd.split() + index = list.index(param) + sql = "UPDATE mdtest SET "+field+"="+list.pop(index+1) + sql += " WHERE user like '"+data[1]+"'" + sql += " && system like '"+data[2]+"'" + sql += " && date_ts="+str(data[3])+"; " + f.write(sql) + + data = cursor.fetchone() + + d.close() + f.close() + + print "Done parsing query results! Update queries located in sql_query." + + + +if __name__ == "__main__": + main() diff --git a/microbenchmarks/mdtest/scripts/tester.py b/microbenchmarks/mdtest/scripts/tester.py new file mode 100755 index 00000000..fd850d5b --- /dev/null +++ b/microbenchmarks/mdtest/scripts/tester.py @@ -0,0 +1,223 @@ +#!/usr/bin/env python +# +# Tester for mdtest +# +#/*****************************************************************************\ +#* * +#* Copyright (c) 2003, The Regents of the University of California * +#* See the file COPYRIGHT for a complete copyright notice and license. * +#* * +#\*****************************************************************************/ +# +# CVS info: +# $RCSfile: tester.py,v $ +# $Revision: 1.1.2.1 $ +# $Date: 2010/05/11 21:25:16 $ +# $Author: loewe6 $ + +import sys +import os.path +import string +import time + +debug = 0 + +# definitions +RMPOOL = 'systest' +NODES = 1 +TPN = 4 +PROCS = NODES * TPN +EXECUTABLE = '/fs/home/bloewe/benchmarks/mdtest/mdtest' +TEST_DIR_LOC1 = '/panfs/REALM226/home/V1' +TEST_DIR_LOC3 = '/panfs/REALM226/home/V1@/panfs/REALM226/home/V2@/panfs/REALM226/home/V3' +TEST_DIRS = '/panfs/REALM226/home/V1 /panfs/REALM226/home/V2 /panfs/REALM226/home/V3' + +# tests +tests = [ + + # default + "", + + # test directory + "-d " + TEST_DIR_LOC1, + + # number of files per processor + "-d " + TEST_DIR_LOC1 + " -n 3", + + # number of iterations of test + "-d " + TEST_DIR_LOC1 + " -n 3 -i 2", + + # serially create before parallel access + "-d " + TEST_DIR_LOC1 + " -n 3 -i 2 -c", + + # pre-test delay + "-d " + TEST_DIR_LOC1 
+ " -n 3 -i 2 -p 1", + + # verbosity=1 + "-d " + TEST_DIR_LOC1 + " -n 3 -i 2 -v", + + # verbosity=2 + "-d " + TEST_DIR_LOC1 + " -n 3 -i 2 -v -v", + + # verbosity=3 + "-d " + TEST_DIR_LOC1 + " -n 3 -i 2 -V 3", + + # shared file + "-d " + TEST_DIR_LOC1 + " -n 3 -i 2 -S", + + # read-your-neighbor + "-d " + TEST_DIR_LOC1 + " -n 3 -i 2 -S -N " + str(TPN), + + # unique subdirectory + "-d " + TEST_DIR_LOC1 + " -n 3 -i 2 -u", + + # time unique subdirectory creation/deletion + "-d " + TEST_DIR_LOC1 + " -n 3 -i 2 -u -t", + + # directories only + "-d " + TEST_DIR_LOC1 + " -n 3 -i 2 -u -t -D", + + # files only + "-d " + TEST_DIR_LOC1 + " -n 3 -i 2 -u -t -F", + + # write 0 bytes + "-d " + TEST_DIR_LOC1 + " -n 3 -i 2 -u -t -F -w 0", + + # write 1 byte + "-d " + TEST_DIR_LOC1 + " -n 3 -i 2 -u -t -F -w 1", + + # write 0 bytes w/fsync + "-d " + TEST_DIR_LOC1 + " -n 3 -i 2 -u -t -F -w 0 -y", + + # write 1 byte w/fsync + "-d " + TEST_DIR_LOC1 + " -n 3 -i 2 -u -t -F -w 1 -y", + + # read-your-neighbor w/unique subdirectory + "-d " + TEST_DIR_LOC1 + " -n 3 -i 2 -u -t -N " + str(TPN), + + # number of tasks to run + "-d " + TEST_DIR_LOC1 + " -n 3 -i 2 -N " + str(TPN) + " -f 1 -l " \ + + str(PROCS-1) + " -s " + str(PROCS/3), + + # remove any remaining tests from previous run + "-d " + TEST_DIR_LOC1 + " -n 3 -i 2 -N " + str(TPN) + " -f 1 -l " \ + + str(PROCS-1) + " -s " + str(PROCS/3) + " -r ", + + # test directories + "-d " + TEST_DIR_LOC3, + + # number of files per processor + "-d " + TEST_DIR_LOC3 + " -n 3", + + # number of iterations of test + "-d " + TEST_DIR_LOC3 + " -n 3 -i 2", + + # pre-test delay + "-d " + TEST_DIR_LOC3 + " -n 3 -i 2 -p 1", + + # verbosity=1 + "-d " + TEST_DIR_LOC3 + " -n 3 -i 2 -v", + + # verbosity=2 + "-d " + TEST_DIR_LOC3 + " -n 3 -i 2 -v -v", + + # verbosity=3 + "-d " + TEST_DIR_LOC3 + " -n 3 -i 2 -V 3", + + # shared file + "-d " + TEST_DIR_LOC3 + " -n 3 -i 2 -S", + + # unique subdirectory + "-d " + TEST_DIR_LOC3 + " -n 3 -i 2 -u", + + # time unique 
subdirectory creation/deletion + "-d " + TEST_DIR_LOC3 + " -n 3 -i 2 -u -t", + + # directories only + "-d " + TEST_DIR_LOC3 + " -n 3 -i 2 -u -t -D", + + # files only + "-d " + TEST_DIR_LOC3 + " -n 3 -i 2 -u -t -F", + + # write 0 bytes + "-d " + TEST_DIR_LOC3 + " -n 3 -i 2 -u -t -F -w 0", + + # write 1 byte + "-d " + TEST_DIR_LOC3 + " -n 3 -i 2 -u -t -F -w 1", + + # write 0 bytes w/fsync + "-d " + TEST_DIR_LOC3 + " -n 3 -i 2 -u -t -F -w 0 -y", + + # write 1 byte w/fsync + "-d " + TEST_DIR_LOC3 + " -n 3 -i 2 -u -t -F -w 1 -y", + + # number of tasks to run + "-d " + TEST_DIR_LOC1 + " -n 3 -i 2 -f 1 -l " \ + + str(PROCS-1) + " -s " + str(PROCS/3) + +] + + +############################# +# set environment variables # +############################# +def SetEnvironment(rmpool, nodes, procs): + os.environ['MP_RMPOOL'] = str(rmpool) + os.environ['MP_NODES'] = str(nodes) + os.environ['MP_PROCS'] = str(procs) + return + + +################# +# flush to file # +################# +def Flush2File(resultsFile, string): + resultsFile.write(string + '\n') + resultsFile.flush() + + +################### +# run test script # +################### +def RunScript(resultsFile, test): + # -- for poe -- command = "poe " + EXECUTABLE + " " + test + command = "mpiexec -n " + str(PROCS) + " " + EXECUTABLE + " " + test + if debug == 1: + Flush2File(resultsFile, command) + else: + childIn, childOut = os.popen4(command) + childIn.close() + while 1: + line = childOut.readline() + if line == '': break + Flush2File(resultsFile, line[:-1]) + childOut.close() + return + + +######## +# main # +######## +def main(): + resultsFile = open("./results.txt-" + \ + os.popen("date +%m.%d.%y").read()[:-1], "w") + + Flush2File(resultsFile, "Testing mdtest") + + # test -h option on one task + SetEnvironment(RMPOOL, 1, 1) + RunScript(resultsFile, '-h') + + # set environ and run tests + SetEnvironment(RMPOOL, NODES, PROCS) + for i in range(0, len(tests)): + time.sleep(0) # delay any cleanup for previous test + 
#os.system("rm -rf " + TEST_DIRS) # cleanup TEST_DIRS between tests + RunScript(resultsFile, tests[i]) + + Flush2File(resultsFile, "\nFinished testing mdtest") + resultsFile.close() + +if __name__ == "__main__": + main() + diff --git a/utils/pav_config/tests/stream.yaml b/utils/pav_config/tests/stream.yaml index ef250616..59ddf0e6 100644 --- a/utils/pav_config/tests/stream.yaml +++ b/utils/pav_config/tests/stream.yaml @@ -88,15 +88,10 @@ _base: SOCKETS: 2 4 * ( 45M * 2 ) / 3 ARRAYS / 8 BYTES/ELEMENT = 15.0 Mi elements = 15000000 ***************************************************************************************************** - SAPPHIRE RAPIDS (DDR5): Intel(R) Xeon(R) Platinum 8480+ - CACHE: 107.52M + SAPPHIRE RAPIDS: Intel(R) Xeon(R) Platinum 8480+ + CACHE: 105 SOCKETS: 2 - 4 x (107.52M * 2 ) / 3 ARRAYS / 8 BYTES/ELEMENT = 35.84 Mi elements = 35840000 - ***************************************************************************************************** - SAPPHIRE RAPIDS (HBM): ?? - CACHE: 115.2M - SOCKETS: 2 - 4 x (115.2M * 2 ) / 3 ARRAYS / 8 BYTES/ELEMENT = 38.4 Mi elements = 38400000 + 4 x (105M * 2 ) / 3 ARRAYS / 8 BYTES/ELEMENT = 35 Mi elements = 35000000 scheduler: slurm schedule: @@ -296,7 +291,7 @@ spr_ddr5_xrds: "{{sys_name}}": [ darwin ] variables: arch: "spr" - stream_array_size: 35840000 + stream_array_size: 35e6 target: "xrds-stream.exe" omp_num_threads: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512] omp_places: [cores, sockets] @@ -325,7 +320,7 @@ spr_hbm_xrds: "{{sys_name}}": [ darwin ] variables: arch: "spr" - stream_array_size: 38400000 + stream_array_size: 35e6 target: "xrds-stream.exe" omp_num_threads: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512] omp_places: [cores, sockets] @@ -348,3 +343,55 @@ spr_hbm_xrds: preamble: - 'module load {{compilers.name}}/{{compilers.version}}' - 'module load {{mpis.modulefile}}' + +cts1_ats5: + inherits_from: cts1_xrds + subtitle: '{{compilers.name}}-{{compilers.version}}_{{tpn}}_{{mpis.name}}-{{mpis.version}}' + + 
permute_on: + - compilers + - mpis + - tpn + + variables: + numnodes: '1' + tpn: [1, 2, 4, 8, 16, 32, 36] + omp_num_threads: '1' + + run: + env: + GOMP_CPU_AFFINITY: '' + +xrds_ats5: + inherits_from: cts1_ats5 + + only_if: + "{{sys_name}}": ['crossroads', 'rocinante'] + + variables: + tpn: [8, 32, 56, 88, 112] + arch: "spr" + stream_array_size: 35e6 + omp_places: [cores, sockets] + omp_proc_bind: [true] + + schedule: + partition: 'hbm' + + build: + preamble: + #- 'module load friendly-testing' #'module rm craype-hugepages2M' + - 'module swap PrgEnv-${PE_ENV,,} PrgEnv-{{compilers.pe_env}}' + - 'module load {{compilers.name}}/{{compilers.version}}' + - 'module load {{mpis.name}}/{{mpis.version}}' + + run: + + preamble: + #- 'module load friendly-testing' #'module rm craype-hugepages2M' + - 'module swap PrgEnv-${PE_ENV,,} PrgEnv-{{compilers.pe_env}}' + - 'module load {{compilers.name}}/{{compilers.version}}' + - 'module load {{mpis.name}}/{{mpis.version}}' + + env: + GOMP_CPU_AFFINITY: '' \ No newline at end of file