
Commit

Merge remote-tracking branch 'origin/main' into spatter/strong-scaling
JDTruj2018 committed Feb 15, 2024
2 parents b840ad9 + a38756a commit 0c3a6d3
Showing 17 changed files with 192 additions and 105 deletions.
12 changes: 6 additions & 6 deletions doc/sphinx/01_branson/cpu_10M_new.csv
@@ -1,6 +1,6 @@
Nodes,Actual,Ideal
8,3.9e+05,3.9e+05
32,1.12e+06,1.56e+06
56,1.57e+06,2.73e+06
88,2.47e+06,4.28e+06
112,3.2e+06,5.45e+06
Nodes,Actual,Ideal,Memory GB,Memory %
8,3.9e+05,3.9e+05,3,2.9
32,1.12e+06,1.56e+06,5,4.08
56,1.57e+06,2.73e+06,6,5.16
88,2.47e+06,4.28e+06,8,6.47
112,3.2e+06,5.45e+06,9,7.69
12 changes: 6 additions & 6 deletions doc/sphinx/01_branson/cpu_200M_new.csv
@@ -1,6 +1,6 @@
Nodes,Actual,Ideal
8,5.56e+05,5.56e+05
32,1.12e+06,2.22e+06
56,1.51e+06,3.89e+06
88,2.4e+06,6.11e+06
112,3.19e+06,7.78e+06
Nodes,Actual,Ideal,Memory GB,Memory %
8,5.56e+05,5.56e+05,46,37.07
32,1.12e+06,2.22e+06,48,39.03
56,1.51e+06,3.89e+06,50,40.23
88,2.4e+06,6.11e+06,51,41.54
112,3.19e+06,7.78e+06,53,42.75
12 changes: 6 additions & 6 deletions doc/sphinx/01_branson/cpu_66M_new.csv
@@ -1,6 +1,6 @@
Nodes,Actual,Ideal
8,5.4e+05,5.4e+05
32,1.12e+06,2.16e+06
56,1.52e+06,3.78e+06
88,2.4e+06,5.94e+06
112,3.16e+06,7.56e+06
Nodes,Actual,Ideal,Memory GB,Memory %
8,5.4e+05,5.4e+05,16,13.05
32,1.12e+06,2.16e+06,18,14.5
56,1.52e+06,3.78e+06,19,15.42
88,2.4e+06,5.94e+06,20,16.72
112,3.16e+06,7.56e+06,22,17.72
6 changes: 3 additions & 3 deletions doc/sphinx/02_amg/amg.rst
@@ -397,17 +397,17 @@ Multi-node scaling on Crossroads
The results of the scaling runs performed on the Rocinante HBM partition are presented below.
AMG and hypre were built with Intel oneAPI 2023.1.0 and cray-mpich 8.1.25.
These runs used 32, 64, and 96 nodes with 108 tasks per node.
Problems 1 and 2 were run with problem sizes per MPI process, `-n`, of 25,25,125 and 40,40,200 respectively to use 15% of available memory.
Problems 1 and 2 were run with problem sizes per MPI process, `-n`, of 38,38,38 and 60,60,60 respectively to use roughly 15% of available memory while maintaining a cubic grid.
The product of the x,y,z process topology must equal the number of processors.
In this case, x=y=24 for all node counts and z was set to 6, 12, and 18 for 32, 64, and 96 nodes respectively.
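For reference, a minimal sketch (illustration only, not part of the AMG sources or this repository) of the process-topology arithmetic above, using the 108 tasks per node and the fixed x = y = 24 quoted in this section; the loop simply confirms that the z dimension divides out evenly for each node count:

#include <cstdio>

int main() {
    const int tasks_per_node = 108, x = 24, y = 24;   // values quoted above
    const int node_counts[] = {32, 64, 96};
    for (int nodes : node_counts) {
        int ranks = nodes * tasks_per_node;           // total MPI processes
        if (ranks % (x * y) == 0)                     // x*y*z must equal the rank count
            std::printf("%d nodes: x=%d y=%d z=%d (%d ranks)\n",
                        nodes, x, y, ranks / (x * y), ranks);
    }
    return 0;
}

This prints z = 6, 12, and 18 for 32, 64, and 96 nodes, matching the values above.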

.. figure:: cpu_scale_roci.png
.. figure:: cpu_scale_roci_cubes.png
:align: center
:scale: 50%
:alt:

.. csv-table:: Multi Node Scaling AMG problem 1 and 2
:file: amg_scale_roci_header.csv
:file: amg_scale_roci_cubes_pernode.csv
:align: center
:widths: 10, 10, 10, 10, 10
:header-rows: 1
4 changes: 4 additions & 0 deletions doc/sphinx/02_amg/amg_scale_roci_cubes.csv
@@ -0,0 +1,4 @@
Nodes,Problem1,Problem2
32,6.636799e+09,2.000133e+09
64,2.034274e+09,3.288118e+08
96,2.840158e+09,4.669072e+08
4 changes: 4 additions & 0 deletions doc/sphinx/02_amg/amg_scale_roci_cubes_pernode.csv
@@ -0,0 +1,4 @@
Nodes,Problem1,Problem2,Problem1/Node,Problem2/Node
32,6.64e+09,2e+09,2.07e+08,6.25e+07
64,2.03e+09,3.29e+08,3.18e+07,5.14e+06
96,2.84e+09,4.67e+08,2.96e+07,4.86e+06
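The */Node columns in this file are simply the figures of merit from amg_scale_roci_cubes.csv divided by the node count; a minimal sketch of that derivation (illustration only, not a script in the repository):

#include <cstdio>

int main() {
    // Rows of amg_scale_roci_cubes.csv: Nodes, Problem1, Problem2
    const struct { int nodes; double p1, p2; } rows[] = {
        {32, 6.636799e+09, 2.000133e+09},
        {64, 2.034274e+09, 3.288118e+08},
        {96, 2.840158e+09, 4.669072e+08},
    };
    std::printf("Nodes,Problem1,Problem2,Problem1/Node,Problem2/Node\n");
    for (const auto &r : rows) {
        // Divide each FOM by the node count to get the per-node figures
        std::printf("%d,%.2e,%.2e,%.2e,%.2e\n",
                    r.nodes, r.p1, r.p2, r.p1 / r.nodes, r.p2 / r.nodes);
    }
    return 0;
}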
9 changes: 5 additions & 4 deletions doc/sphinx/02_amg/cpu.gp
@@ -45,12 +45,13 @@ set title "AMG2023 Strong Scaling for Problem 2, 320 x 320 x 320" font "serif,22"
plot "roci_2_320.csv" using 1:2 with linespoints linestyle 1, "" using 1:3 with line linestyle 2

# SCALING PLOTS, Y IS FOM PER NODE
unset logscale xy
set xrange [32:96]
set xlabel "Number of Nodes"
set yrange [1e5:3e8]
set xlabel "Nodes"
set format y "%.1e"
set ylabel "FOM/node"
unset logscale xy
set output "cpu_scale_roci.png"
set output "cpu_scale_roci_cubes.png"
set title "AMG Multi Node Scaling" font "serif,22"
plot "amg_scale_roci.csv" using 1:4 with linespoints linestyle 1, "" using 1:5 with line linestyle 2
plot "amg_scale_roci_cubes_pernode.csv" using 1:4 with linespoints linestyle 1, "" using 1:5 with line linestyle 2

16 changes: 9 additions & 7 deletions doc/sphinx/03_vibe/cpu.gp
@@ -48,10 +48,12 @@ plot "parthenon_roci_scale_nxrange.csv" using 1:2 with linespoints linestyle 1,

# SCALING PLOTS, Y IS FOM PER NODE

# set xrange [32:96]
# set yrange [2.5e6:3.5e6]
# set xlabel "Nodes"
# set ylabel "FOM/node"
# # set title "Branson Multi Node Scaling" font "serif,22"
# set output "parthenon_roci_scale.png"
# plot "parthenon_roci_scale.csv" using 3:5 with linespoints linestyle 1
set xrange [32:96]
set yrange [7e6:1.5e7]
set xlabel "Nodes"
set ylabel "FOM/node"
unset title
unset key
# set title "Branson Multi Node Scaling" font "serif,22"
set output "parthenon_roci_scale_pernode.png"
plot "parthenon_roci_scale_pernode.csv" using 1:5 with linespoints linestyle 1
4 changes: 4 additions & 0 deletions doc/sphinx/03_vibe/parthenon_roci_scale_pernode.csv
@@ -0,0 +1,4 @@
Nodes,NX,Zcycles/sec,NX/Node,Zcycles/sec/Node
32,528,2.6e+08,16.5,8.12e+06
64,672,7.34e+08,10.5,1.15e+07
96,768,1e+09,8,1.04e+07
18 changes: 3 additions & 15 deletions doc/sphinx/03_vibe/parthenon_scale.csv
@@ -1,16 +1,4 @@
Iteration,NX,Nodes,Zcycles/sec
1,384,32,4.88e+08
1,384,64,8.36e+08
1,384,96,1.20e+09
1,448,32,5.13e+08
1,448,64,9.17e+08
1,448,96,9.64e+08
1,512,32,4.63e+08
1,512,64,8.05e+08
1,512,96,1.10e+09
1,576,32,3.86e+08
1,576,64,7.48e+08
1,576,96,1.07e+09
1,640,32,4.12e+08
1,640,64,7.67e+08
1,640,96,1.04e+09
1,528,32,2.60e+08
1,672,64,7.34e+08
1,768,96,1.00e+09
6 changes: 6 additions & 0 deletions doc/sphinx/03_vibe/parthenon_scale_Zcyclessec_pivot.csv
@@ -0,0 +1,6 @@
NX,32,64,96
384,4.88e+08,8.36e+08,1.2e+09
448,5.13e+08,9.17e+08,9.64e+08
512,4.63e+08,8.05e+08,1.1e+09
576,3.86e+08,7.48e+08,1.07e+09
640,4.12e+08,7.67e+08,1.04e+09
4 changes: 4 additions & 0 deletions doc/sphinx/03_vibe/parthenon_scale_pivot.csv
@@ -0,0 +1,4 @@
Nodes,384,448,512,576,640
32,4.88e+08,5.13e+08,4.63e+08,3.86e+08,4.12e+08
64,8.36e+08,9.17e+08,8.05e+08,7.48e+08,7.67e+08
96,1.2e+09,9.64e+08,1.1e+09,1.07e+09,1.04e+09
16 changes: 16 additions & 0 deletions doc/sphinx/03_vibe/parthenon_scale_raw_nxrange.csv
@@ -0,0 +1,16 @@
Iteration,NX,Nodes,Zcycles/sec
1,384,32,4.88e+08
1,384,64,8.36e+08
1,384,96,1.20e+09
1,448,32,5.13e+08
1,448,64,9.17e+08
1,448,96,9.64e+08
1,512,32,4.63e+08
1,512,64,8.05e+08
1,512,96,1.10e+09
1,576,32,3.86e+08
1,576,64,7.48e+08
1,576,96,1.07e+09
1,640,32,4.12e+08
1,640,64,7.67e+08
1,640,96,1.04e+09
38 changes: 23 additions & 15 deletions doc/sphinx/03_vibe/vibe.rst
@@ -206,21 +206,29 @@ Multi-node scaling on Crossroads
The results of the scaling runs performed on the Rocinante HBM partition are presented below.
Parthenon was built with Intel oneAPI 2023.1.0 and cray-mpich 8.1.25.
.. These runs used 32, 64, and 96 nodes with 96 tasks per node.
.. Problems 1 and 2 were run with problem sizes per MPI process, `-n`, of 25,25,125 and 40,40,200 respectively to use 15% of available memory.
.. The product of the x,y,z process topology must equal the number of processors.
.. In this case, x=y=24 for all node counts and z was set to 6, 12, and 18 for 32, 64, and 96 nodes respectively.
.. .. figure:: cpu_scale_roci.png
.. :align: center
.. :scale: 50%
.. :alt:
.. .. csv-table:: Multi Node Scaling Parthenon
.. :file: parthenon_scale_roci_header.csv
.. :align: center
.. :widths: 10, 10, 10, 10, 10
.. :header-rows: 1
These runs used 32, 64, and 96 nodes with 96 tasks per node.
These runs used approximately 1122 mesh blocks per node for a problem size using 50% of the total available memory across nodes.
The problem size for Parthenon-VIBE is determined by parthenon/mesh/nx{1,2,3}, which should all be equal to produce a cubic grid.
To find the appropriate nx value, use:
.. math::
\begin{align}
\mathbf{blocks\_per\_side} &= \mathbf{int}((\mathbf{number\_of\_nodes}\times\mathbf{blocks\_per\_node})^\frac{1}{3}) \\
\mathbf{nx} &= \mathbf{blocks\_per\_side}\times\mathbf{block\_size\_side}
\end{align}
Where :math:`block\_size\_side=parthenon/meshblock/nx1=16`.
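A minimal sketch of the arithmetic above (illustration only, not part of Parthenon or this repository), using the approximate 1122 blocks per node and block_size_side = 16 quoted in this section. Note that rounding the cube root to the nearest integer, rather than truncating, reproduces the NX values tabulated below (528, 672, 768):

#include <cmath>
#include <cstdio>

int main() {
    const double blocks_per_node = 1122.0;   // approximate value quoted above
    const int block_size_side = 16;          // parthenon/meshblock/nx1
    const int node_counts[] = {32, 64, 96};
    for (int nodes : node_counts) {
        // blocks_per_side: cube root of the total mesh-block count, rounded to nearest
        long blocks_per_side = std::lround(std::cbrt(nodes * blocks_per_node));
        long nx = blocks_per_side * block_size_side;
        std::printf("%d nodes: nx1 = nx2 = nx3 = %ld\n", nodes, nx);
    }
    return 0;
}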
.. figure:: parthenon_roci_scale_pernode.png
:align: center
:scale: 50%
:alt: VIBE Weak scaling per node.
.. csv-table:: Multi Node Scaling Parthenon
:file: parthenon_roci_scale_pernode.csv
:align: center
:widths: 10, 10, 10, 10, 10
:header-rows: 1
Validation
==========
90 changes: 63 additions & 27 deletions utils/memory_recorder/memory_recorder.cpp
@@ -3,22 +3,26 @@

MemoryRecorder::MemoryRecorder() {

int k, namelen;
int k;
struct stat st = {0};
local_maxrss=0;
local_maxrss.val=0;
global_maxrss=0;
getrss_summary=0;
getmeminfo=0;

MPI_Comm_rank(MPI_COMM_WORLD, &globalrank);
MPI_Comm_size(MPI_COMM_WORLD, &globalsize);
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
MPI_INFO_NULL, &shmcomm);
MPI_INFO_NULL, &eachnode);

MPI_Comm_rank(shmcomm, &localrank);
MPI_Comm_size(shmcomm, &localsize);
MPI_Comm_rank(eachnode, &localrank);
MPI_Comm_size(eachnode, &localsize);
MPI_Get_processor_name(hostname, &namelen);
MPI_Comm_split(MPI_COMM_WORLD, localrank, globalrank, &bosscomm);
MPI_Comm_rank(bosscomm, &bossrank);
MPI_Comm_size(bosscomm, &bosssize);

local_maxrss.index = bossrank;
nodenum = globalrank/localsize;
numnodes = globalsize/localsize;
pid = getpid();
@@ -28,9 +32,12 @@ MemoryRecorder::MemoryRecorder() {
// Make the hostname a string and add relative host number.
std::string strhost, strhostnum;
for (k=0; k<namelen; k++) {
if (hostname[k] == '.') break;
strhost += hostname[k];
}

this->gethostlist();

// Get meminfo files to read from.
if (localrank == 0) {
k=0;
@@ -75,18 +82,39 @@ void MemoryRecorder::summarizeMaxRSS() {
// Sum all Max Rss to get global, sum maxrss on node comm to get node MaxRss
// Collect all Max Rss to print each out individually.
MPI_Reduce(&maxrss, &global_maxrss, 1, MPI_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
MPI_Gather(&maxrss, 1, MPI_LONG, rss_collect, 1, MPI_LONG, 0, shmcomm);
// MPI_Reduce(&maxrss, &local_maxrss, 1, MPI_LONG, MPI_SUM, 0, shmcomm);
MPI_Gather(&maxrss, 1, MPI_LONG, rss_collect, 1, MPI_LONG, 0, eachnode);
// MPI_Reduce(&maxrss, &local_maxrss, 1, MPI_LONG, MPI_SUM, 0, eachnode);

if (localrank == 0) {
for (i=0; i<localsize; i++) {
local_maxrss += rss_collect[i];
local_maxrss.val += rss_collect[i];
}
}

MPI_Reduce(&local_maxrss, &min_maxrss, 1, MPI_LONG_INT, MPI_MINLOC, 0, bosscomm);
MPI_Reduce(&local_maxrss, &max_maxrss, 1, MPI_LONG_INT, MPI_MAXLOC, 0, bosscomm);
}

void MemoryRecorder::gethostlist() {
std::string strhost;
char hostnames[bosssize][MPI_MAX_PROCESSOR_NAME];
MPI_Gather(hostname, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, hostnames,
MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0, bosscomm);

if (globalrank == 0) {
for (int k=0; k<bosssize; k++) {
for (int b=0; b<namelen; b++) {
if (hostnames[k][b] == '.') break;
strhost+=hostnames[k][b];
}
hostlist.push_back(strhost);
strhost="";
}
}
}

void MemoryRecorder::read_meminfo(std::string const &loc) {
MPI_Barrier(shmcomm);
MPI_Barrier(eachnode);

if (localrank == 0) {
int i;
@@ -155,7 +183,7 @@ void MemoryRecorder::read_meminfo(std::string const &loc) {
freemem_pct.insert({loc, mempctnow});
}
getmeminfo++;
MPI_Barrier(shmcomm);
MPI_Barrier(eachnode);
}

void MemoryRecorder::write_meminfo() {
@@ -219,44 +247,52 @@

outfile.close();
}
MPI_Barrier(shmcomm);
MPI_Barrier(eachnode);
}

void MemoryRecorder::write_rss() {
void MemoryRecorder::write_rss(int filewrite) {

// Get the RSS MAX and summarize it if you haven't already done so.
if (getrss_summary == 0) {
this->summarizeMaxRSS();
}

std::cout << std::fixed << std::dec;
if (localrank == 0) {
unsigned long noderam = this->getRamSize()/kb;
unsigned long totalram = noderam * numnodes;
double pctnoderam = (double)local_maxrss/noderam;
double pctnoderam = (double)local_maxrss.val/noderam;

// Write out Node rss max sum and per rank value.
std::ofstream outfile;
outfile.open(rssfileOut, std::ios::out);

outfile << "Node Number " << nodenum << "/" << numnodes <<std::endl;
outfile << "Mem Used: " << local_maxrss/mb << " (GiB)" << std::endl;
outfile << "Total Ram: " << noderam/mb << " (GiB)" << std::endl;
outfile << "Fraction Ram Used: " << pctnoderam << std::endl;
outfile << "Percent Ram Used: " << round_pct(pctnoderam) << "%" << std::endl;

// Write out Node MaxRSS for each process.
for (int i=0; i<localsize; i++) {
outfile << "Rank: " << i << " MaxRSS: " << rss_collect[i]/kb << " (MiB)" << std::endl;
if (filewrite) {
std::ofstream outfile;
outfile.open(rssfileOut, std::ios::out);

outfile << "Node Number " << nodenum << "/" << numnodes << " " << local_maxrss.index << std::endl;
outfile << "Mem Used: " << local_maxrss.val << " - " << local_maxrss.val/mb << " (GiB)" << std::endl;
outfile << "Total Ram: " << noderam/mb << " (GiB)" << std::endl;
outfile << "Fraction Ram Used: " << pctnoderam << std::endl;
outfile << "Percent Ram Used: " << round_pct(pctnoderam) << "%" << std::endl;

// Write out Node MaxRSS for each process.
for (int i=0; i<localsize; i++) {
outfile << "Rank: " << i << " MaxRSS: " << rss_collect[i]/kb << " (MiB)" << std::endl;
}
outfile.close();
}
outfile.close();

// Write out total program rss max.
if (globalrank == 0) {
double pcttotalram = (double)global_maxrss/totalram;
double pctminram = (double)min_maxrss.val/noderam;
double pctmaxram = (double)max_maxrss.val/noderam;
std::cout << "Mem Used: " << global_maxrss << " Total Ram: " << totalram << " Fraction Ram: " << round_pct(pcttotalram) << "%" << std::endl;
std::cout << "TOTAL RSS MAX: " << global_maxrss/mb << " (GiB) - " << round_pct(pcttotalram) << "%" << std::endl;
std::cout << "MIN RSS MAX: " << min_maxrss.val << " " << min_maxrss.val/mb << " (GiB) - " << round_pct(pctminram) << "%";
std::cout << " -- On NODE: " << min_maxrss.index << " - " << hostlist.at(min_maxrss.index) << std::endl;
std::cout << "MAX RSS MAX: " << max_maxrss.val << " " << max_maxrss.val/mb << " (GiB) - " << round_pct(pctmaxram) << "%";
std::cout << " -- On NODE: " << max_maxrss.index << " - " << hostlist.at(max_maxrss.index) << std::endl;
}

}
}

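As context for the MPI_MINLOC/MPI_MAXLOC reductions added in summarizeMaxRSS() above: the standard idiom pairs each contribution with an owner index in a struct matching MPI_LONG_INT, so the root receives both the extreme value and the rank that produced it (here, the node whose leader reported it). A self-contained sketch, independent of the MemoryRecorder class and illustrative only:

#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    struct { long val; int index; } mine, lo, hi;   // layout expected by MPI_LONG_INT
    mine.val   = 1000 + 37L * rank;                 // stand-in for a per-node MaxRSS sum
    mine.index = rank;                              // owner of the value

    MPI_Reduce(&mine, &lo, 1, MPI_LONG_INT, MPI_MINLOC, 0, MPI_COMM_WORLD);
    MPI_Reduce(&mine, &hi, 1, MPI_LONG_INT, MPI_MAXLOC, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        std::printf("min %ld on rank %d, max %ld on rank %d\n",
                    lo.val, lo.index, hi.val, hi.index);
    }
    MPI_Finalize();
    return 0;
}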
