From 39a9e2a6fe99682110e5c684ef3eb39c2de8cacf Mon Sep 17 00:00:00 2001 From: Jered Dominguez-Trujillo Date: Mon, 20 Nov 2023 16:27:32 -0700 Subject: [PATCH] Microbenchmark proofread suggestions (#68) * STREAM suggestions * Added purpose for OSUMB * Run Rules for OSUMB updated to N/A * DGEMM typo in Example Results section * Updated Makefile for DGEMM and make command in documentation --- doc/sphinx/09_Microbenchmarks/M1_STREAM/STREAM.rst | 14 +++++++++++--- doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSUMB.rst | 6 +++++- doc/sphinx/09_Microbenchmarks/M5_DGEMM/DGEMM.rst | 6 +++--- microbenchmarks/dgemm/src/Makefile | 5 ++--- 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/doc/sphinx/09_Microbenchmarks/M1_STREAM/STREAM.rst b/doc/sphinx/09_Microbenchmarks/M1_STREAM/STREAM.rst index 136bb2bc..1af00b4d 100644 --- a/doc/sphinx/09_Microbenchmarks/M1_STREAM/STREAM.rst +++ b/doc/sphinx/09_Microbenchmarks/M1_STREAM/STREAM.rst @@ -95,18 +95,26 @@ This is the minimum size unless other system attributes constrain it. The array size only influences the capacity of STREAM to fully load the memory bus. At capacity, the measured values should reach a steady state where increasing the value of ``STREAM_ARRAY_SIZE`` doesn't influence the measurement for a certain number of processors. +For Crossroads, the benchmark was built with ``STREAM_ARRAY_SIZE=40000000`` and ``NTIMES=20`` with optimizations and OpenMP enabled. + +.. code-block:: bash + make CC=`which mpicc` FF=`which mpifort` CFLAGS="-O2 -fopenmp -DSTREAM_ARRAY_SIZE=40000000 -DNTIMES=20" FFLAGS="-O2 -fopenmp -DSTREAM_ARRAY_SIZE=40000000 -DNTIMES=20" + + Running ======= .. code-block:: bash - srun -n ./stream + export OMP_NUM_THREADS=1 + srun -n --cpu-bind=core ./stream-mpi.exe Replace `` with the number of MPI processes you want to use. For example, if you want to use 4 MPI processes, the command will be: .. 
code-block:: bash - srun -n 4 ./stream + export OMP_NUM_THREADS=1 + srun -n 4 --cpu-bind=core ./stream-mpi.exe Example Results =============== @@ -121,7 +129,7 @@ Crossroads These results were obtained using the cce v15.0.1 compiler and cray-mpich v 8.1.25. Results using the intel-oneapi and intel-classic v2023.1.0 and the same cray-mpich were also collected; cce performed the best. -``STREAM_ARRAY_SIZE=40 NTIMES=20`` +``STREAM_ARRAY_SIZE=40000000 NTIMES=20`` .. csv-table:: STREAM microbenchmark bandwidth measurement :file: stream-xrds_ats5cce-cray-mpich.csv diff --git a/doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSUMB.rst b/doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSUMB.rst index e6cfa7e6..37f309b9 100644 --- a/doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSUMB.rst +++ b/doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSUMB.rst @@ -5,6 +5,8 @@ OSU Microbenchmarks Purpose ======= +The OSU Microbenchmarks (OMB) are widely used to measure and evaluate the performance of MPI operations for point-to-point, multi-pair, collective, and one-sided communications. + Characteristics =============== @@ -18,6 +20,8 @@ The OSU benchmarks are a suite of microbenchmarks designed to measure network ch Run Rules --------- +N/A + Building ======== @@ -76,4 +80,4 @@ Crossroads :file: OSU_ats3_results.csv :align: center :widths: 10, 10, 10, 10, 10 - :header-rows: 1 \ No newline at end of file + :header-rows: 1 diff --git a/doc/sphinx/09_Microbenchmarks/M5_DGEMM/DGEMM.rst b/doc/sphinx/09_Microbenchmarks/M5_DGEMM/DGEMM.rst index 83a46b63..9f0c2a43 100644 --- a/doc/sphinx/09_Microbenchmarks/M5_DGEMM/DGEMM.rst +++ b/doc/sphinx/09_Microbenchmarks/M5_DGEMM/DGEMM.rst @@ -46,7 +46,7 @@ Makefiles are provided for the intel and gcc compilers. Before building, load th cd src patch -p1 < ../dgemm_omp_fixes.patch - make + make CFLAGS=-I .. 
@@ -84,7 +84,7 @@ These are positional arguments, so, for instance, R cannot be set without settin Example Results =============== -Results from Branson are provided on the following systems: +Results from DGEMM are provided on the following systems: * Crossroads (see :ref:`GlobalSystemATS3`) @@ -102,4 +102,4 @@ This test was built with the intel 2023.1.0 compiler using the crayOS compiler w .. figure:: dgemm_ats3.png :align: center :scale: 50% - :alt: DGEMM microbenchmark FLOPs measurement \ No newline at end of file + :alt: DGEMM microbenchmark FLOPs measurement diff --git a/microbenchmarks/dgemm/src/Makefile b/microbenchmarks/dgemm/src/Makefile index a448b919..7a6f0eaf 100644 --- a/microbenchmarks/dgemm/src/Makefile +++ b/microbenchmarks/dgemm/src/Makefile @@ -1,11 +1,10 @@ CC=gcc -CFLAGS=-ffast-math -mavx2 -ftree-vectorizer-verbose=3 -O3 -fopenmp -DUSE_CBLAS +CFLAGS+=-ffast-math -mavx2 -ftree-vectorizer-verbose=3 -O3 -fopenmp -DUSE_CBLAS LDFLAGS=-L${OPENBLAS_ROOT}/lib -lopenblas mt-dgemm: mt-dgemm.c - $(CC) $(CFLAGS) -o mt-dgemm mt-dgemm.c + $(CC) $(CFLAGS) $(LDFLAGS) -o mt-dgemm mt-dgemm.c clean: rm mt-dgemm *.o -