
Commit

Merge remote-tracking branch 'origin/main' into spatter/strong-scaling
JDTruj2018 committed Feb 15, 2024
2 parents b840ad9 + a38756a commit 0c3a6d3
Showing 17 changed files with 192 additions and 105 deletions.
12 changes: 6 additions & 6 deletions doc/sphinx/01_branson/cpu_10M_new.csv
@@ -1,6 +1,6 @@
Nodes,Actual,Ideal
8,3.9e+05,3.9e+05
32,1.12e+06,1.56e+06
56,1.57e+06,2.73e+06
88,2.47e+06,4.28e+06
112,3.2e+06,5.45e+06
Nodes,Actual,Ideal,Memory GB,Memory %
8,3.9e+05,3.9e+05,3,2.9
32,1.12e+06,1.56e+06,5,4.08
56,1.57e+06,2.73e+06,6,5.16
88,2.47e+06,4.28e+06,8,6.47
112,3.2e+06,5.45e+06,9,7.69
12 changes: 6 additions & 6 deletions doc/sphinx/01_branson/cpu_200M_new.csv
@@ -1,6 +1,6 @@
Nodes,Actual,Ideal
8,5.56e+05,5.56e+05
32,1.12e+06,2.22e+06
56,1.51e+06,3.89e+06
88,2.4e+06,6.11e+06
112,3.19e+06,7.78e+06
Nodes,Actual,Ideal,Memory GB,Memory %
8,5.56e+05,5.56e+05,46,37.07
32,1.12e+06,2.22e+06,48,39.03
56,1.51e+06,3.89e+06,50,40.23
88,2.4e+06,6.11e+06,51,41.54
112,3.19e+06,7.78e+06,53,42.75
12 changes: 6 additions & 6 deletions doc/sphinx/01_branson/cpu_66M_new.csv
@@ -1,6 +1,6 @@
Nodes,Actual,Ideal
8,5.4e+05,5.4e+05
32,1.12e+06,2.16e+06
56,1.52e+06,3.78e+06
88,2.4e+06,5.94e+06
112,3.16e+06,7.56e+06
Nodes,Actual,Ideal,Memory GB,Memory %
8,5.4e+05,5.4e+05,16,13.05
32,1.12e+06,2.16e+06,18,14.5
56,1.52e+06,3.78e+06,19,15.42
88,2.4e+06,5.94e+06,20,16.72
112,3.16e+06,7.56e+06,22,17.72
6 changes: 3 additions & 3 deletions doc/sphinx/02_amg/amg.rst
@@ -397,17 +397,17 @@ Multi-node scaling on Crossroads
The results of the scaling runs performed on the Rocinante HBM partition are presented below.
AMG and hypre were built with Intel oneAPI 2023.1.0 and cray-mpich 8.1.25.
These runs used 32, 64, and 96 nodes with 108 tasks per node.
Problems 1 and 2 were run with problem sizes per MPI process, `-n`, of 25,25,125 and 40,40,200 respectively to use 15% of available memory.
Problems 1 and 2 were run with problem sizes per MPI process, `-n`, of 38,38,38 and 60,60,60 respectively to use roughly 15% of available memory while maintaining a cubic grid.
The product of the x,y,z process topology must equal the number of processors.
In this case, x=y=24 for all node counts and z was set to 6, 12, and 18 for 32, 64, and 96 nodes respectively.
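For reference, a minimal sketch (illustration only, not part of the AMG sources or this repository) of the process-topology arithmetic above, using the 108 tasks per node and the fixed x = y = 24 quoted in this section; the loop simply confirms that the z dimension divides out evenly for each node count:

#include <cstdio>

int main() {
    const int tasks_per_node = 108, x = 24, y = 24;   // values quoted above
    const int node_counts[] = {32, 64, 96};
    for (int nodes : node_counts) {
        int ranks = nodes * tasks_per_node;           // total MPI processes
        if (ranks % (x * y) == 0)                     // x*y*z must equal the rank count
            std::printf("%d nodes: x=%d y=%d z=%d (%d ranks)\n",
                        nodes, x, y, ranks / (x * y), ranks);
    }
    return 0;
}

This prints z = 6, 12, and 18 for 32, 64, and 96 nodes, matching the values above.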

.. figure:: cpu_scale_roci.png
.. figure:: cpu_scale_roci_cubes.png
:align: center
:scale: 50%
:alt:

.. csv-table:: Multi Node Scaling AMG problem 1 and 2
:file: amg_scale_roci_header.csv
:file: amg_scale_roci_cubes_pernode.csv
:align: center
:widths: 10, 10, 10, 10, 10
:header-rows: 1
4 changes: 4 additions & 0 deletions doc/sphinx/02_amg/amg_scale_roci_cubes.csv
@@ -0,0 +1,4 @@
Nodes,Problem1,Problem2
32,6.636799e+09,2.000133e+09
64,2.034274e+09,3.288118e+08
96,2.840158e+09,4.669072e+08
4 changes: 4 additions & 0 deletions doc/sphinx/02_amg/amg_scale_roci_cubes_pernode.csv
@@ -0,0 +1,4 @@
Nodes,Problem1,Problem2,Problem1/Node,Problem2/Node
32,6.64e+09,2e+09,2.07e+08,6.25e+07
64,2.03e+09,3.29e+08,3.18e+07,5.14e+06
96,2.84e+09,4.67e+08,2.96e+07,4.86e+06
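The */Node columns in this file are simply the figures of merit from amg_scale_roci_cubes.csv divided by the node count; a minimal sketch of that derivation (illustration only, not a script in the repository):

#include <cstdio>

int main() {
    // Rows of amg_scale_roci_cubes.csv: Nodes, Problem1, Problem2
    const struct { int nodes; double p1, p2; } rows[] = {
        {32, 6.636799e+09, 2.000133e+09},
        {64, 2.034274e+09, 3.288118e+08},
        {96, 2.840158e+09, 4.669072e+08},
    };
    std::printf("Nodes,Problem1,Problem2,Problem1/Node,Problem2/Node\n");
    for (const auto &r : rows) {
        // Divide each FOM by the node count to get the per-node figures
        std::printf("%d,%.2e,%.2e,%.2e,%.2e\n",
                    r.nodes, r.p1, r.p2, r.p1 / r.nodes, r.p2 / r.nodes);
    }
    return 0;
}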
9 changes: 5 additions & 4 deletions doc/sphinx/02_amg/cpu.gp
@@ -45,12 +45,13 @@ set title "AMG2023 Strong Scaling for Problem 2, 320 x 320 x 320" font "serif,22"
plot "roci_2_320.csv" using 1:2 with linespoints linestyle 1, "" using 1:3 with line linestyle 2

# SCALING PLOTS, Y IS FOM PER NODE
unset logscale xy
set xrange [32:96]
set xlabel "Number of Nodes"
set yrange [1e5:3e8]
set xlabel "Nodes"
set format y "%.1e"
set ylabel "FOM/node"
unset logscale xy
set output "cpu_scale_roci.png"
set output "cpu_scale_roci_cubes.png"
set title "AMG Multi Node Scaling" font "serif,22"
plot "amg_scale_roci.csv" using 1:4 with linespoints linestyle 1, "" using 1:5 with line linestyle 2
plot "amg_scale_roci_cubes_pernode.csv" using 1:4 with linespoints linestyle 1, "" using 1:5 with line linestyle 2

16 changes: 9 additions & 7 deletions doc/sphinx/03_vibe/cpu.gp
@@ -48,10 +48,12 @@ plot "parthenon_roci_scale_nxrange.csv" using 1:2 with linespoints linestyle 1,

# SCALING PLOTS, Y IS FOM PER NODE

# set xrange [32:96]
# set yrange [2.5e6:3.5e6]
# set xlabel "Nodes"
# set ylabel "FOM/node"
# # set title "Branson Multi Node Scaling" font "serif,22"
# set output "parthenon_roci_scale.png"
# plot "parthenon_roci_scale.csv" using 3:5 with linespoints linestyle 1
set xrange [32:96]
set yrange [7e6:1.5e7]
set xlabel "Nodes"
set ylabel "FOM/node"
unset title
unset key
# set title "Branson Multi Node Scaling" font "serif,22"
set output "parthenon_roci_scale_pernode.png"
plot "parthenon_roci_scale_pernode.csv" using 1:5 with linespoints linestyle 1
4 changes: 4 additions & 0 deletions doc/sphinx/03_vibe/parthenon_roci_scale_pernode.csv
@@ -0,0 +1,4 @@
Nodes,NX,Zcycles/sec,NX/Node,Zcycles/sec/Node
32,528,2.6e+08,16.5,8.12e+06
64,672,7.34e+08,10.5,1.15e+07
96,768,1e+09,8,1.04e+07
18 changes: 3 additions & 15 deletions doc/sphinx/03_vibe/parthenon_scale.csv
@@ -1,16 +1,4 @@
Iteration,NX,Nodes,Zcycles/sec
1,384,32,4.88e+08
1,384,64,8.36e+08
1,384,96,1.20e+09
1,448,32,5.13e+08
1,448,64,9.17e+08
1,448,96,9.64e+08
1,512,32,4.63e+08
1,512,64,8.05e+08
1,512,96,1.10e+09
1,576,32,3.86e+08
1,576,64,7.48e+08
1,576,96,1.07e+09
1,640,32,4.12e+08
1,640,64,7.67e+08
1,640,96,1.04e+09
1,528,32,2.60e+08
1,672,64,7.34e+08
1,768,96,1.00e+09
6 changes: 6 additions & 0 deletions doc/sphinx/03_vibe/parthenon_scale_Zcyclessec_pivot.csv
@@ -0,0 +1,6 @@
NX,32,64,96
384,4.88e+08,8.36e+08,1.2e+09
448,5.13e+08,9.17e+08,9.64e+08
512,4.63e+08,8.05e+08,1.1e+09
576,3.86e+08,7.48e+08,1.07e+09
640,4.12e+08,7.67e+08,1.04e+09
4 changes: 4 additions & 0 deletions doc/sphinx/03_vibe/parthenon_scale_pivot.csv
@@ -0,0 +1,4 @@
Nodes,384,448,512,576,640
32,4.88e+08,5.13e+08,4.63e+08,3.86e+08,4.12e+08
64,8.36e+08,9.17e+08,8.05e+08,7.48e+08,7.67e+08
96,1.2e+09,9.64e+08,1.1e+09,1.07e+09,1.04e+09
16 changes: 16 additions & 0 deletions doc/sphinx/03_vibe/parthenon_scale_raw_nxrange.csv
@@ -0,0 +1,16 @@
Iteration,NX,Nodes,Zcycles/sec
1,384,32,4.88e+08
1,384,64,8.36e+08
1,384,96,1.20e+09
1,448,32,5.13e+08
1,448,64,9.17e+08
1,448,96,9.64e+08
1,512,32,4.63e+08
1,512,64,8.05e+08
1,512,96,1.10e+09
1,576,32,3.86e+08
1,576,64,7.48e+08
1,576,96,1.07e+09
1,640,32,4.12e+08
1,640,64,7.67e+08
1,640,96,1.04e+09
38 changes: 23 additions & 15 deletions doc/sphinx/03_vibe/vibe.rst
@@ -206,21 +206,29 @@ Multi-node scaling on Crossroads
The results of the scaling runs performed on the Rocinante HBM partition are presented below.
Parthenon was built with Intel oneAPI 2023.1.0 and cray-mpich 8.1.25.
.. These runs used 32, 64, and 96 nodes with 96 tasks per node.
.. Problems 1 and 2 were run with problem sizes per MPI process, `-n`, of 25,25,125 and 40,40,200 respectively to use 15% of available memory.
.. The product of the x,y,z process topology must equal the number of processors.
.. In this case, x=y=24 for all node counts and z was set to 6, 12, and 18 for 32, 64, and 96 nodes respectively.
.. .. figure:: cpu_scale_roci.png
.. :align: center
.. :scale: 50%
.. :alt:
.. .. csv-table:: Multi Node Scaling Parthenon
.. :file: parthenon_scale_roci_header.csv
.. :align: center
.. :widths: 10, 10, 10, 10, 10
.. :header-rows: 1
These runs used 32, 64, and 96 nodes with 96 tasks per node.
These runs used approximately 1122 mesh blocks per node for a problem size using 50% of the total available memory across nodes.
The problem size for Parthenon-VIBE is determined by parthenon/mesh/nx{1,2,3}, which should all be equal to produce a cubic grid.
To find the appropriate nx value, use:
.. math::
\begin{align}
\mathbf{blocks\_per\_side} &= \mathbf{int}((\mathbf{number\_of\_nodes}\times\mathbf{blocks\_per\_node})^\frac{1}{3}) \\
\mathbf{nx} &= \mathbf{blocks\_per\_side}\times\mathbf{block\_size\_side}
\end{align}
Where :math:`block\_size\_side=parthenon/meshblock/nx1=16`.
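A minimal sketch of the arithmetic above (illustration only, not part of Parthenon or this repository), using the approximate 1122 blocks per node and block_size_side = 16 quoted in this section. Note that rounding the cube root to the nearest integer, rather than truncating, reproduces the NX values tabulated below (528, 672, 768):

#include <cmath>
#include <cstdio>

int main() {
    const double blocks_per_node = 1122.0;   // approximate value quoted above
    const int block_size_side = 16;          // parthenon/meshblock/nx1
    const int node_counts[] = {32, 64, 96};
    for (int nodes : node_counts) {
        // blocks_per_side: cube root of the total mesh-block count, rounded to nearest
        long blocks_per_side = std::lround(std::cbrt(nodes * blocks_per_node));
        long nx = blocks_per_side * block_size_side;
        std::printf("%d nodes: nx1 = nx2 = nx3 = %ld\n", nodes, nx);
    }
    return 0;
}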
.. figure:: parthenon_roci_scale_pernode.png
:align: center
:scale: 50%
:alt: VIBE Weak scaling per node.
.. csv-table:: Multi Node Scaling Parthenon
:file: parthenon_roci_scale_pernode.csv
:align: center
:widths: 10, 10, 10, 10, 10
:header-rows: 1
Validation
==========
90 changes: 63 additions & 27 deletions utils/memory_recorder/memory_recorder.cpp
@@ -3,22 +3,26 @@

MemoryRecorder::MemoryRecorder() {

int k, namelen;
int k;
struct stat st = {0};
local_maxrss=0;
local_maxrss.val=0;
global_maxrss=0;
getrss_summary=0;
getmeminfo=0;

MPI_Comm_rank(MPI_COMM_WORLD, &globalrank);
MPI_Comm_size(MPI_COMM_WORLD, &globalsize);
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
MPI_INFO_NULL, &shmcomm);
MPI_INFO_NULL, &eachnode);

MPI_Comm_rank(shmcomm, &localrank);
MPI_Comm_size(shmcomm, &localsize);
MPI_Comm_rank(eachnode, &localrank);
MPI_Comm_size(eachnode, &localsize);
MPI_Get_processor_name(hostname, &namelen);
MPI_Comm_split(MPI_COMM_WORLD, localrank, globalrank, &bosscomm);
MPI_Comm_rank(bosscomm, &bossrank);
MPI_Comm_size(bosscomm, &bosssize);

local_maxrss.index = bossrank;
nodenum = globalrank/localsize;
numnodes = globalsize/localsize;
pid = getpid();
@@ -28,9 +32,12 @@ MemoryRecorder::MemoryRecorder() {
// Make the hostname a string and add relative host number.
std::string strhost, strhostnum;
for (k=0; k<namelen; k++) {
if (hostname[k] == '.') break;
strhost += hostname[k];
}

this->gethostlist();

// Get meminfo files to read from.
if (localrank == 0) {
k=0;
@@ -75,18 +82,39 @@ void MemoryRecorder::summarizeMaxRSS() {
// Sum all Max Rss to get global, sum maxrss on node comm to get node MaxRss
// Collect all Max Rss to print each out individually.
MPI_Reduce(&maxrss, &global_maxrss, 1, MPI_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
MPI_Gather(&maxrss, 1, MPI_LONG, rss_collect, 1, MPI_LONG, 0, shmcomm);
// MPI_Reduce(&maxrss, &local_maxrss, 1, MPI_LONG, MPI_SUM, 0, shmcomm);
MPI_Gather(&maxrss, 1, MPI_LONG, rss_collect, 1, MPI_LONG, 0, eachnode);
// MPI_Reduce(&maxrss, &local_maxrss, 1, MPI_LONG, MPI_SUM, 0, eachnode);

if (localrank == 0) {
for (i=0; i<localsize; i++) {
local_maxrss += rss_collect[i];
local_maxrss.val += rss_collect[i];
}
}

MPI_Reduce(&local_maxrss, &min_maxrss, 1, MPI_LONG_INT, MPI_MINLOC, 0, bosscomm);
MPI_Reduce(&local_maxrss, &max_maxrss, 1, MPI_LONG_INT, MPI_MAXLOC, 0, bosscomm);
}

void MemoryRecorder::gethostlist() {
std::string strhost;
char hostnames[bosssize][MPI_MAX_PROCESSOR_NAME];
MPI_Gather(hostname, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, hostnames,
MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0, bosscomm);

if (globalrank == 0) {
for (int k=0; k<bosssize; k++) {
for (int b=0; b<namelen; b++) {
if (hostnames[k][b] == '.') break;
strhost+=hostnames[k][b];
}
hostlist.push_back(strhost);
strhost="";
}
}
}

void MemoryRecorder::read_meminfo(std::string const &loc) {
MPI_Barrier(shmcomm);
MPI_Barrier(eachnode);

if (localrank == 0) {
int i;
@@ -155,7 +183,7 @@ void MemoryRecorder::read_meminfo(std::string const &loc) {
freemem_pct.insert({loc, mempctnow});
}
getmeminfo++;
MPI_Barrier(shmcomm);
MPI_Barrier(eachnode);
}

void MemoryRecorder::write_meminfo() {
@@ -219,44 +247,52 @@

outfile.close();
}
MPI_Barrier(shmcomm);
MPI_Barrier(eachnode);
}

void MemoryRecorder::write_rss() {
void MemoryRecorder::write_rss(int filewrite) {

// Get the RSS MAX and summarize it if you haven't already done so.
if (getrss_summary == 0) {
this->summarizeMaxRSS();
}

std::cout << std::fixed << std::dec;
if (localrank == 0) {
unsigned long noderam = this->getRamSize()/kb;
unsigned long totalram = noderam * numnodes;
double pctnoderam = (double)local_maxrss/noderam;
double pctnoderam = (double)local_maxrss.val/noderam;

// Write out Node rss max sum and per rank value.
std::ofstream outfile;
outfile.open(rssfileOut, std::ios::out);

outfile << "Node Number " << nodenum << "/" << numnodes <<std::endl;
outfile << "Mem Used: " << local_maxrss/mb << " (GiB)" << std::endl;
outfile << "Total Ram: " << noderam/mb << " (GiB)" << std::endl;
outfile << "Fraction Ram Used: " << pctnoderam << std::endl;
outfile << "Percent Ram Used: " << round_pct(pctnoderam) << "%" << std::endl;

// Write out Node MaxRSS for each process.
for (int i=0; i<localsize; i++) {
outfile << "Rank: " << i << " MaxRSS: " << rss_collect[i]/kb << " (MiB)" << std::endl;
if (filewrite) {
std::ofstream outfile;
outfile.open(rssfileOut, std::ios::out);

outfile << "Node Number " << nodenum << "/" << numnodes << " " << local_maxrss.index << std::endl;
outfile << "Mem Used: " << local_maxrss.val << " - " << local_maxrss.val/mb << " (GiB)" << std::endl;
outfile << "Total Ram: " << noderam/mb << " (GiB)" << std::endl;
outfile << "Fraction Ram Used: " << pctnoderam << std::endl;
outfile << "Percent Ram Used: " << round_pct(pctnoderam) << "%" << std::endl;

// Write out Node MaxRSS for each process.
for (int i=0; i<localsize; i++) {
outfile << "Rank: " << i << " MaxRSS: " << rss_collect[i]/kb << " (MiB)" << std::endl;
}
outfile.close();
}
outfile.close();

// Write out total program rss max.
if (globalrank == 0) {
double pcttotalram = (double)global_maxrss/totalram;
double pctminram = (double)min_maxrss.val/noderam;
double pctmaxram = (double)max_maxrss.val/noderam;
std::cout << "Mem Used: " << global_maxrss << " Total Ram: " << totalram << " Fraction Ram: " << round_pct(pcttotalram) << "%" << std::endl;
std::cout << "TOTAL RSS MAX: " << global_maxrss/mb << " (GiB) - " << round_pct(pcttotalram) << "%" << std::endl;
std::cout << "MIN RSS MAX: " << min_maxrss.val << " " << min_maxrss.val/mb << " (GiB) - " << round_pct(pctminram) << "%";
std::cout << " -- On NODE: " << min_maxrss.index << " - " << hostlist.at(min_maxrss.index) << std::endl;
std::cout << "MAX RSS MAX: " << max_maxrss.val << " " << max_maxrss.val/mb << " (GiB) - " << round_pct(pctmaxram) << "%";
std::cout << " -- On NODE: " << max_maxrss.index << " - " << hostlist.at(max_maxrss.index) << std::endl;
}

}
}

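As context for the MPI_MINLOC/MPI_MAXLOC reductions added in summarizeMaxRSS() above: the standard idiom pairs each contribution with an owner index in a struct matching MPI_LONG_INT, so the root receives both the extreme value and the rank that produced it (here, the node whose leader reported it). A self-contained sketch, independent of the MemoryRecorder class and illustrative only:

#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    struct { long val; int index; } mine, lo, hi;   // layout expected by MPI_LONG_INT
    mine.val   = 1000 + 37L * rank;                 // stand-in for a per-node MaxRSS sum
    mine.index = rank;                              // owner of the value

    MPI_Reduce(&mine, &lo, 1, MPI_LONG_INT, MPI_MINLOC, 0, MPI_COMM_WORLD);
    MPI_Reduce(&mine, &hi, 1, MPI_LONG_INT, MPI_MAXLOC, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        std::printf("min %ld on rank %d, max %ld on rank %d\n",
                    lo.val, lo.index, hi.val, hi.index);
    }
    MPI_Finalize();
    return 0;
}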
