From 39a9e2a6fe99682110e5c684ef3eb39c2de8cacf Mon Sep 17 00:00:00 2001
From: Jered Dominguez-Trujillo <jereddt@lanl.gov>
Date: Mon, 20 Nov 2023 16:27:32 -0700
Subject: [PATCH] Microbenchmark proofread suggestions (#68)

* STREAM suggestions

* Added purpose for OSUMB

* Run Rules for OSUMB updated to N/A

* DGEMM typo in Example Results section

* Updated Makefile for DGEMM and make command in documentation
---
 doc/sphinx/09_Microbenchmarks/M1_STREAM/STREAM.rst | 14 +++++++++++---
 doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSUMB.rst   |  6 +++++-
 doc/sphinx/09_Microbenchmarks/M5_DGEMM/DGEMM.rst   |  6 +++---
 microbenchmarks/dgemm/src/Makefile                 |  5 ++---
 4 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/doc/sphinx/09_Microbenchmarks/M1_STREAM/STREAM.rst b/doc/sphinx/09_Microbenchmarks/M1_STREAM/STREAM.rst
index 136bb2bc..1af00b4d 100644
--- a/doc/sphinx/09_Microbenchmarks/M1_STREAM/STREAM.rst
+++ b/doc/sphinx/09_Microbenchmarks/M1_STREAM/STREAM.rst
@@ -95,18 +95,27 @@ This is the minimum size unless other system attributes constrain it.
 The array size only influences the capacity of STREAM to fully load the memory bus.
 At capacity, the measured values should reach a steady state where increasing the value of ``STREAM_ARRAY_SIZE`` doesn't influence the measurement for a certain number of processors.
 
+For Crossroads, the benchmark was built with ``STREAM_ARRAY_SIZE=40000000`` and ``NTIMES=20`` with optimizations and OpenMP enabled.
+
+.. code-block:: bash
+
+   make CC=`which mpicc` FF=`which mpifort` CFLAGS="-O2 -fopenmp -DSTREAM_ARRAY_SIZE=40000000 -DNTIMES=20" FFLAGS="-O2 -fopenmp -DSTREAM_ARRAY_SIZE=40000000 -DNTIMES=20"
+
+
 Running
 =======
 
 .. code-block:: bash
 
-  srun -n <num_processes> ./stream
+  export OMP_NUM_THREADS=1
+  srun -n <num_processes> --cpu-bind=core ./stream-mpi.exe
 
 Replace `<num_processes>` with the number of MPI processes you want to use. For example, if you want to use 4 MPI processes, the command will be:
 
 .. code-block:: bash
 
-  srun -n 4 ./stream
+  export OMP_NUM_THREADS=1
+  srun -n 4 --cpu-bind=core ./stream-mpi.exe
 
 Example Results
 ===============
@@ -121,7 +129,7 @@ Crossroads
 These results were obtained using the cce v15.0.1 compiler and cray-mpich v 8.1.25. 
 Results using the intel-oneapi and intel-classic v2023.1.0 and the same cray-mpich were also collected; cce performed the best.
 
-``STREAM_ARRAY_SIZE=40 NTIMES=20``
+``STREAM_ARRAY_SIZE=40000000 NTIMES=20``
 
 .. csv-table:: STREAM microbenchmark bandwidth measurement
    :file: stream-xrds_ats5cce-cray-mpich.csv
diff --git a/doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSUMB.rst b/doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSUMB.rst
index e6cfa7e6..37f309b9 100644
--- a/doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSUMB.rst
+++ b/doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSUMB.rst
@@ -5,6 +5,8 @@ OSU Microbenchmarks
 Purpose
 =======
 
+The OSU Microbenchmarks (OMB) are widely used to measure and evaluate the performance of MPI operations for point-to-point, multi-pair, collective, and one-sided communications.
+
 Characteristics
 ===============
 
@@ -18,6 +20,8 @@ The OSU benchmarks are a suite of microbenchmarks designed to measure network ch
 Run Rules
 ---------
 
+N/A
+
 Building
 ========
 
@@ -76,4 +80,4 @@ Crossroads
    :file: OSU_ats3_results.csv
    :align: center
    :widths: 10, 10, 10, 10, 10
-   :header-rows: 1
\ No newline at end of file
+   :header-rows: 1
diff --git a/doc/sphinx/09_Microbenchmarks/M5_DGEMM/DGEMM.rst b/doc/sphinx/09_Microbenchmarks/M5_DGEMM/DGEMM.rst
index 83a46b63..9f0c2a43 100644
--- a/doc/sphinx/09_Microbenchmarks/M5_DGEMM/DGEMM.rst
+++ b/doc/sphinx/09_Microbenchmarks/M5_DGEMM/DGEMM.rst
@@ -46,7 +46,7 @@ Makefiles are provided for the intel and gcc compilers. Before building, load th
 
     cd src
     patch -p1 < ../dgemm_omp_fixes.patch
-    make
+    make CFLAGS=-I<openblas_include_dir>
 
 ..
 
@@ -84,7 +84,7 @@ These are positional arguments, so, for instance, R cannot be set without settin
 Example Results
 ===============
 
-Results from Branson are provided on the following systems:
+Results from DGEMM are provided on the following systems:
 
 * Crossroads (see :ref:`GlobalSystemATS3`)
 
@@ -102,4 +102,4 @@ This test was built with the intel 2023.1.0 compiler using the crayOS compiler w
 .. figure:: dgemm_ats3.png
    :align: center
    :scale: 50%
-   :alt: DGEMM microbenchmark FLOPs measurement
\ No newline at end of file
+   :alt: DGEMM microbenchmark FLOPs measurement
diff --git a/microbenchmarks/dgemm/src/Makefile b/microbenchmarks/dgemm/src/Makefile
index a448b919..7a6f0eaf 100644
--- a/microbenchmarks/dgemm/src/Makefile
+++ b/microbenchmarks/dgemm/src/Makefile
@@ -1,11 +1,15 @@
 
 CC=gcc
-CFLAGS=-ffast-math -mavx2 -ftree-vectorizer-verbose=3 -O3 -fopenmp -DUSE_CBLAS
+# `override` keeps these mandatory flags appended even when CFLAGS is given on
+# the command line (e.g. `make CFLAGS=-I<openblas_include_dir>`); a plain `+=`
+# is ignored by make for command-line variables, which would drop them all.
+override CFLAGS += -ffast-math -mavx2 -ftree-vectorizer-verbose=3 -O3 -fopenmp -DUSE_CBLAS
 LDFLAGS=-L${OPENBLAS_ROOT}/lib -lopenblas
 
+# LDFLAGS carries -lopenblas, so it must come after the source file: the linker
+# resolves libraries left to right and may discard one named before its user.
 mt-dgemm: mt-dgemm.c
-	$(CC) $(CFLAGS) -o mt-dgemm mt-dgemm.c
+	$(CC) $(CFLAGS) -o mt-dgemm mt-dgemm.c $(LDFLAGS)
 
 clean:
 	rm mt-dgemm *.o
-