Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/develop' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
bartgol committed Mar 18, 2024
2 parents eca1d77 + f1156d7 commit f4ffddb
Show file tree
Hide file tree
Showing 18 changed files with 430 additions and 101 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/PR-gcc-openmpi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ jobs:
bash -l -c "module list"
printenv PATH
- name: Cancel Previous Runs
uses: styfle/cancel-workflow-action@b173b6ec0100793626c2d9e6b90435061f4fc3e5 # 0.11.0
uses: styfle/cancel-workflow-action@85880fa0301c86cca9da44039ee3bb12d3bedbfa # 0.12.1
with:
access_token: ${{ github.token }}
- name: make dirs
Expand All @@ -44,7 +44,7 @@ jobs:
mkdir -p /home/Trilinos/src/Trilinos
mkdir -p /home/Trilinos/build
- name: Clone trilinos
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
with:
fetch-depth: 0
- name: Repo status
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/clang_format.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0
- uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
- uses: DoozyX/clang-format-lint-action@1566bcec081dcb246ab02e7c5f9786c0b629dd4d # v0.16.2
with:
source: './packages/muelu ./packages/tempus ./packages/teko ./packages/xpetra'
Expand All @@ -32,7 +32,7 @@ jobs:
# This does not work for PRs from forks.
- name: Post artifact in issue comment
uses: mshick/add-pr-comment@7c0890544fb33b0bdd2e59467fbacb62e028a096 # v2.8.1
uses: mshick/add-pr-comment@b8f338c590a895d50bcbfa6c5859251edc8952fc # v2.8.2
if: ${{ (hashFiles('format_patch.txt') != '') && (github.event.pull_request.head.repo.full_name == github.repository) }}
with:
message: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/dependency-review.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,6 @@ jobs:
egress-policy: audit

- name: 'Checkout Repository'
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
- name: 'Dependency Review'
uses: actions/dependency-review-action@9129d7d40b8c12c1ed0f60400d00c92d437adcce # v4.1.3
2 changes: 1 addition & 1 deletion .github/workflows/detect-git-lfs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:

steps:
- name: Check out code
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
with:
fetch-depth: 0

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/detect-mpi-comm-world.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:

steps:
- name: Check out code
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
with:
fetch-depth: 0

Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/scorecards.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:

steps:
- name: "Checkout code"
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
with:
persist-credentials: false

Expand All @@ -58,14 +58,14 @@ jobs:
# Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
# format to the repository Actions tab.
- name: "Upload artifact"
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # v3.1.3
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
with:
name: SARIF file
path: results.sarif
retention-days: 5

# Upload the results to GitHub's code scanning dashboard.
- name: "Upload to code-scanning"
uses: github/codeql-action/upload-sarif@a56a03b370b87b26fde6d680755f818cfda0372b # v2.24.5
uses: github/codeql-action/upload-sarif@3ab4101902695724f9365a384f86c1074d94e18c # v3.24.7
with:
sarif_file: results.sarif
2 changes: 1 addition & 1 deletion .github/workflows/stale.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
pull-requests: write # for actions/stale to close stale PRs
runs-on: ubuntu-latest
steps:
- uses: actions/stale@a20b814fb01b71def3bd6f56e7494d667ddf28da # v4.1.1
- uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
with:
debug-only: false
ascending: true
Expand Down
4 changes: 4 additions & 0 deletions packages/ifpack2/doc/UsersGuide/options.tex
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,10 @@ \subsection{ILU($k$)}\label{s:ILU}
these streams can run concurrently, the total time can be faster. When
this option is not set (i.e. not using stream), the entire sub-domain is
used instead.}
\ccc{fact: kspiluk reordering in streams}
{bool}
{\false}
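{Whether reverse Cuthill--McKee (RCM) reordering is applied to the diagonal blocks in streams. This option is only read when streams are enabled (i.e., the number of streams is at least one).}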
% All overlap-related code was removed by M. Hoemmen in
%
% commit 162f64572fbf93e2cac73e3034d76a3db918a494
Expand Down
2 changes: 2 additions & 0 deletions packages/ifpack2/src/Ifpack2_RILUK_decl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -647,6 +647,8 @@ class RILUK:
bool isKokkosKernelsStream_;
int num_streams_;
std::vector<execution_space> exec_space_instances_;
bool hasStreamReordered_;
std::vector<typename lno_nonzero_view_t::non_const_type> perm_v_;
};

// NOTE (mfh 11 Feb 2015) This used to exist in order to deal with
Expand Down
194 changes: 131 additions & 63 deletions packages/ifpack2/src/Ifpack2_RILUK_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ RILUK<MatrixType>::RILUK (const Teuchos::RCP<const row_matrix_type>& Matrix_in)
Rthresh_ (Teuchos::ScalarTraits<magnitude_type>::one ()),
isKokkosKernelsSpiluk_(false),
isKokkosKernelsStream_(false),
num_streams_(0)
num_streams_(0),
hasStreamReordered_(false)
{
allocateSolvers();
}
Expand All @@ -116,7 +117,8 @@ RILUK<MatrixType>::RILUK (const Teuchos::RCP<const crs_matrix_type>& Matrix_in)
Rthresh_ (Teuchos::ScalarTraits<magnitude_type>::one ()),
isKokkosKernelsSpiluk_(false),
isKokkosKernelsStream_(false),
num_streams_(0)
num_streams_(0),
hasStreamReordered_(false)
{
allocateSolvers();
}
Expand Down Expand Up @@ -412,7 +414,7 @@ setParameters (const Teuchos::ParameterList& params)
getParamTryingTypes<int, int, global_ordinal_type>
(nstreams, params, paramName, prefix);
}

// Forward to trisolvers.
L_solver_->setParameters(params);
U_solver_->setParameters(params);
Expand All @@ -427,6 +429,9 @@ setParameters (const Teuchos::ParameterList& params)

if (num_streams_ >= 1) {
this->isKokkosKernelsStream_ = true;
// Will we do reordering in streams?
if (params.isParameter("fact: kspiluk reordering in streams"))
hasStreamReordered_ = params.get<bool> ("fact: kspiluk reordering in streams");
}
else {
this->isKokkosKernelsStream_ = false;
Expand Down Expand Up @@ -524,7 +529,7 @@ void RILUK<MatrixType>::initialize ()
"matrix until the matrix is fill complete. If your matrix is a "
"Tpetra::CrsMatrix, please call fillComplete on it (with the domain and "
"range Maps, if appropriate) before calling this method.");

Teuchos::Time timer ("RILUK::initialize");
double startTime = timer.wallTime();
{ // Start timing
Expand Down Expand Up @@ -592,8 +597,10 @@ void RILUK<MatrixType>::initialize ()
}
else {
auto lclMtx = A_local_crs->getLocalMatrixDevice();
KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks);

if (!hasStreamReordered_)
KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks);
else
perm_v_ = KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks, true);
for(int i = 0; i < num_streams_; i++) {
Teuchos::RCP<const crs_map_type> A_local_diagblks_RowMap = rcp (new crs_map_type(A_local_diagblks[i].numRows(),
A_local_diagblks[i].numRows(),
Expand Down Expand Up @@ -654,6 +661,7 @@ void RILUK<MatrixType>::initialize ()
#if !defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) || !defined(KOKKOS_ENABLE_CUDA) || (CUDA_VERSION < 11030)
L_solver_->compute ();//NOTE: It makes sense to do compute here because only the nonzero pattern is involved in trisolve compute
#endif

if (!isKokkosKernelsStream_) {
U_solver_->setMatrix (U_);
}
Expand Down Expand Up @@ -1050,7 +1058,11 @@ void RILUK<MatrixType>::compute ()
A_local_values_ = lclMtx.values;
}
else {
KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks);
if (!hasStreamReordered_)
KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks);
else
perm_v_ = KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks, true);

A_local_diagblks_rowmap_v_ = std::vector<lno_row_view_t>(num_streams_);
A_local_diagblks_entries_v_ = std::vector<lno_nonzero_view_t>(num_streams_);
A_local_diagblks_values_v_ = std::vector<scalar_nonzero_view_t>(num_streams_);
Expand Down Expand Up @@ -1198,77 +1210,133 @@ apply (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_t

const scalar_type one = STS::one ();
const scalar_type zero = STS::zero ();

Teuchos::Time timer ("RILUK::apply");
double startTime = timer.wallTime();
{ // Start timing
Teuchos::TimeMonitor timeMon (timer);
if (alpha == one && beta == zero) {
if (mode == Teuchos::NO_TRANS) { // Solve L (D (U Y)) = X for Y.
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030)
//NOTE (Nov-15-2022):
//This is a workaround for Cuda >= 11.3 (using cusparseSpSV)
//since cusparseSpSV_solve() does not support in-place computation
MV Y_tmp (Y.getMap (), Y.getNumVectors ());

// Start by solving L Y_tmp = X for Y_tmp.
L_solver_->apply (X, Y_tmp, mode);

if (!this->isKokkosKernelsSpiluk_) {
// Solve D Y = Y. The operation lets us do this in place in Y, so we can
// write "solve D Y = Y for Y."
Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero);
if (isKokkosKernelsSpiluk_ && isKokkosKernelsStream_ && hasStreamReordered_) {
MV ReorderedX (X.getMap(), X.getNumVectors());
MV ReorderedY (Y.getMap(), Y.getNumVectors());
for (size_t j = 0; j < X.getNumVectors(); j++) {
auto X_j = X.getVector(j);
auto ReorderedX_j = ReorderedX.getVectorNonConst(j);
auto X_lcl = X_j->getLocalViewDevice(Tpetra::Access::ReadOnly);
auto ReorderedX_lcl = ReorderedX_j->getLocalViewDevice(Tpetra::Access::ReadWrite);
local_ordinal_type stream_begin = 0;
local_ordinal_type stream_end;
for(int i = 0; i < num_streams_; i++) {
auto perm_i = perm_v_[i];
stream_end = stream_begin + perm_i.extent(0);
auto X_lcl_sub = Kokkos::subview (X_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
auto ReorderedX_lcl_sub = Kokkos::subview (ReorderedX_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, static_cast<int>(perm_i.extent(0))), KOKKOS_LAMBDA ( const int& ii ) {
ReorderedX_lcl_sub(perm_i(ii)) = X_lcl_sub(ii);
});
stream_begin = stream_end;
}
}

U_solver_->apply (Y_tmp, Y, mode); // Solve U Y = Y_tmp.
#else
// Start by solving L Y = X for Y.
L_solver_->apply (X, Y, mode);

if (!this->isKokkosKernelsSpiluk_) {
// Solve D Y = Y. The operation lets us do this in place in Y, so we can
// write "solve D Y = Y for Y."
Y.elementWiseMultiply (one, *D_, Y, zero);
Kokkos::fence(); // Make sure X is completely reordered
if (mode == Teuchos::NO_TRANS) { // Solve L (U Y) = X for Y.
// Solve L Y = ReorderedX for Y (Y holds the intermediate result).
L_solver_->apply (ReorderedX, Y, mode);
// Solve U ReorderedY = Y for ReorderedY.
U_solver_->apply (Y, ReorderedY, mode);
}
else { // Solve U^P (L^P Y) = X for Y (where P is * or T).
// Solve U^P Y = ReorderedX for Y (Y holds the intermediate result).
U_solver_->apply (ReorderedX, Y, mode);
// Solve L^P ReorderedY = Y for ReorderedY.
L_solver_->apply (Y, ReorderedY, mode);
}

U_solver_->apply (Y, Y, mode); // Solve U Y = Y.
#endif
for (size_t j = 0; j < Y.getNumVectors(); j++) {
auto Y_j = Y.getVectorNonConst(j);
auto ReorderedY_j = ReorderedY.getVector(j);
auto Y_lcl = Y_j->getLocalViewDevice(Tpetra::Access::ReadWrite);
auto ReorderedY_lcl = ReorderedY_j->getLocalViewDevice(Tpetra::Access::ReadOnly);
local_ordinal_type stream_begin = 0;
local_ordinal_type stream_end;
for(int i = 0; i < num_streams_; i++) {
auto perm_i = perm_v_[i];
stream_end = stream_begin + perm_i.extent(0);
auto Y_lcl_sub = Kokkos::subview (Y_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
auto ReorderedY_lcl_sub = Kokkos::subview (ReorderedY_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, static_cast<int>(perm_i.extent(0))), KOKKOS_LAMBDA ( const int& ii ) {
Y_lcl_sub(ii) = ReorderedY_lcl_sub(perm_i(ii));
});
stream_begin = stream_end;
}
}
}
else { // Solve U^P (D^P (L^P Y)) = X for Y (where P is * or T).
else {
if (mode == Teuchos::NO_TRANS) { // Solve L (D (U Y)) = X for Y.
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030)
//NOTE (Nov-15-2022):
//This is a workaround for Cuda >= 11.3 (using cusparseSpSV)
//since cusparseSpSV_solve() does not support in-place computation
MV Y_tmp (Y.getMap (), Y.getNumVectors ());
//NOTE (Nov-15-2022):
//This is a workaround for Cuda >= 11.3 (using cusparseSpSV)
//since cusparseSpSV_solve() does not support in-place computation
MV Y_tmp (Y.getMap (), Y.getNumVectors ());

// Start by solving L Y_tmp = X for Y_tmp.
L_solver_->apply (X, Y_tmp, mode);

if (!this->isKokkosKernelsSpiluk_) {
// Solve D Y_tmp = Y_tmp. The operation lets us do this in place in Y_tmp, so we can
// write "solve D Y_tmp = Y_tmp for Y_tmp."
Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero);
}

// Start by solving U^P Y_tmp = X for Y_tmp.
U_solver_->apply (X, Y_tmp, mode);
U_solver_->apply (Y_tmp, Y, mode); // Solve U Y = Y_tmp.
#else
// Start by solving L Y = X for Y.
L_solver_->apply (X, Y, mode);

if (!this->isKokkosKernelsSpiluk_) {
// Solve D^P Y = Y.
//
// FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we
// need to do an elementwise multiply with the conjugate of
// D_, not just with D_ itself.
Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero);
}
if (!this->isKokkosKernelsSpiluk_) {
// Solve D Y = Y. The operation lets us do this in place in Y, so we can
// write "solve D Y = Y for Y."
Y.elementWiseMultiply (one, *D_, Y, zero);
}

L_solver_->apply (Y_tmp, Y, mode); // Solve L^P Y = Y_tmp.
U_solver_->apply (Y, Y, mode); // Solve U Y = Y.
#endif
}
else { // Solve U^P (D^P (L^P Y)) = X for Y (where P is * or T).
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030)
//NOTE (Nov-15-2022):
//This is a workaround for Cuda >= 11.3 (using cusparseSpSV)
//since cusparseSpSV_solve() does not support in-place computation
MV Y_tmp (Y.getMap (), Y.getNumVectors ());

// Start by solving U^P Y_tmp = X for Y_tmp.
U_solver_->apply (X, Y_tmp, mode);

if (!this->isKokkosKernelsSpiluk_) {
// Solve D^P Y_tmp = Y_tmp.
//
// FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we
// need to do an elementwise multiply with the conjugate of
// D_, not just with D_ itself.
Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero);
}

L_solver_->apply (Y_tmp, Y, mode); // Solve L^P Y = Y_tmp.
#else
// Start by solving U^P Y = X for Y.
U_solver_->apply (X, Y, mode);

if (!this->isKokkosKernelsSpiluk_) {
// Solve D^P Y = Y.
//
// FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we
// need to do an elementwise multiply with the conjugate of
// D_, not just with D_ itself.
Y.elementWiseMultiply (one, *D_, Y, zero);
}

L_solver_->apply (Y, Y, mode); // Solve L^P Y = Y.
// Start by solving U^P Y = X for Y.
U_solver_->apply (X, Y, mode);

if (!this->isKokkosKernelsSpiluk_) {
// Solve D^P Y = Y.
//
// FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we
// need to do an elementwise multiply with the conjugate of
// D_, not just with D_ itself.
Y.elementWiseMultiply (one, *D_, Y, zero);
}

L_solver_->apply (Y, Y, mode); // Solve L^P Y = Y.
#endif
}
}
}
else { // alpha != 1 or beta != 0
Expand Down
Loading

0 comments on commit f4ffddb

Please sign in to comment.