diff --git a/.github/workflows/PR-gcc-openmpi.yml b/.github/workflows/PR-gcc-openmpi.yml index 7eb91f341044..99b513d3f145 100644 --- a/.github/workflows/PR-gcc-openmpi.yml +++ b/.github/workflows/PR-gcc-openmpi.yml @@ -35,7 +35,7 @@ jobs: bash -l -c "module list" printenv PATH - name: Cancel Previous Runs - uses: styfle/cancel-workflow-action@b173b6ec0100793626c2d9e6b90435061f4fc3e5 # 0.11.0 + uses: styfle/cancel-workflow-action@85880fa0301c86cca9da44039ee3bb12d3bedbfa # 0.12.1 with: access_token: ${{ github.token }} - name: make dirs @@ -44,7 +44,7 @@ jobs: mkdir -p /home/Trilinos/src/Trilinos mkdir -p /home/Trilinos/build - name: Clone trilinos - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2 with: fetch-depth: 0 - name: Repo status diff --git a/.github/workflows/clang_format.yml b/.github/workflows/clang_format.yml index 57760cee24a0..5e6c28e07996 100644 --- a/.github/workflows/clang_format.yml +++ b/.github/workflows/clang_format.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 + - uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2 - uses: DoozyX/clang-format-lint-action@1566bcec081dcb246ab02e7c5f9786c0b629dd4d # v0.16.2 with: source: './packages/muelu ./packages/tempus ./packages/teko ./packages/xpetra' @@ -32,7 +32,7 @@ jobs: # This does not work for PRs from forks. - name: Post artifact in issue comment - uses: mshick/add-pr-comment@7c0890544fb33b0bdd2e59467fbacb62e028a096 # v2.8.1 + uses: mshick/add-pr-comment@b8f338c590a895d50bcbfa6c5859251edc8952fc # v2.8.2 if: ${{ (hashFiles('format_patch.txt') != '') && (github.event.pull_request.head.repo.full_name == github.repository) }} with: message: | diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index 00c327a8f204..671cb478789c 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -22,6 +22,6 @@ jobs: egress-policy: audit - name: 'Checkout Repository' - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 + uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2 - name: 'Dependency Review' uses: actions/dependency-review-action@9129d7d40b8c12c1ed0f60400d00c92d437adcce # v4.1.3 diff --git a/.github/workflows/detect-git-lfs.yml b/.github/workflows/detect-git-lfs.yml index 42811f4d1eac..f8498ce6fd6f 100644 --- a/.github/workflows/detect-git-lfs.yml +++ b/.github/workflows/detect-git-lfs.yml @@ -12,7 +12,7 @@ jobs: steps: - name: Check out code - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 + uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2 with: fetch-depth: 0 diff --git a/.github/workflows/detect-mpi-comm-world.yml b/.github/workflows/detect-mpi-comm-world.yml index 8d6b57826631..e4e0b5269637 100644 --- a/.github/workflows/detect-mpi-comm-world.yml +++ b/.github/workflows/detect-mpi-comm-world.yml @@ -12,7 +12,7 @@ jobs: steps: - name: Check out code - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 + uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2 with: fetch-depth: 0 diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 7dc4117b2396..f8a711409095 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -31,7 +31,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 + uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2 with: persist-credentials: false @@ -58,7 +58,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # v3.1.3 + uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 with: name: SARIF file path: results.sarif @@ -66,6 +66,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@a56a03b370b87b26fde6d680755f818cfda0372b # v2.24.5 + uses: github/codeql-action/upload-sarif@3ab4101902695724f9365a384f86c1074d94e18c # v3.24.7 with: sarif_file: results.sarif diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index bf8e74755e96..3fcfd0d5f391 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -26,7 +26,7 @@ jobs: pull-requests: write # for actions/stale to close stale PRs runs-on: ubuntu-latest steps: - - uses: actions/stale@a20b814fb01b71def3bd6f56e7494d667ddf28da # v4.1.1 + - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0 with: debug-only: false ascending: true diff --git a/packages/ifpack2/doc/UsersGuide/options.tex b/packages/ifpack2/doc/UsersGuide/options.tex index 9800fd4ede33..fc196831ea50 100644 --- a/packages/ifpack2/doc/UsersGuide/options.tex +++ b/packages/ifpack2/doc/UsersGuide/options.tex @@ -502,6 +502,10 @@ \subsection{ILU($k$)}\label{s:ILU} these streams can run concurrently, the total time can be faster. When this option is not set (i.e. not using stream), the entire sub-domain is used instead.} +\ccc{fact: kspiluk reordering in streams} + {bool} + {\false} + {Whether RCM reordering is applied to diagonal blocks in streams.} % All overlap-related code was removed by M. Hoemmen in % % commit 162f64572fbf93e2cac73e3034d76a3db918a494 diff --git a/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp b/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp index 0cfb82080984..850c121922df 100644 --- a/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp @@ -647,6 +647,8 @@ class RILUK: bool isKokkosKernelsStream_; int num_streams_; std::vector exec_space_instances_; + bool hasStreamReordered_; + std::vector perm_v_; }; // NOTE (mfh 11 Feb 2015) This used to exist in order to deal with diff --git a/packages/ifpack2/src/Ifpack2_RILUK_def.hpp b/packages/ifpack2/src/Ifpack2_RILUK_def.hpp index 1e90f9bdf7f3..c42873035843 100644 --- a/packages/ifpack2/src/Ifpack2_RILUK_def.hpp +++ b/packages/ifpack2/src/Ifpack2_RILUK_def.hpp @@ -91,7 +91,8 @@ RILUK::RILUK (const Teuchos::RCP& Matrix_in) Rthresh_ (Teuchos::ScalarTraits::one ()), isKokkosKernelsSpiluk_(false), isKokkosKernelsStream_(false), - num_streams_(0) + num_streams_(0), + hasStreamReordered_(false) { allocateSolvers(); } @@ -116,7 +117,8 @@ RILUK::RILUK (const Teuchos::RCP& Matrix_in) Rthresh_ (Teuchos::ScalarTraits::one ()), isKokkosKernelsSpiluk_(false), isKokkosKernelsStream_(false), - num_streams_(0) + num_streams_(0), + hasStreamReordered_(false) { allocateSolvers(); } @@ -412,7 +414,7 @@ setParameters (const Teuchos::ParameterList& params) getParamTryingTypes (nstreams, params, paramName, prefix); } - + // Forward to trisolvers. L_solver_->setParameters(params); U_solver_->setParameters(params); @@ -427,6 +429,9 @@ setParameters (const Teuchos::ParameterList& params) if (num_streams_ >= 1) { this->isKokkosKernelsStream_ = true; + // Will we do reordering in streams? + if (params.isParameter("fact: kspiluk reordering in streams")) + hasStreamReordered_ = params.get ("fact: kspiluk reordering in streams"); } else { this->isKokkosKernelsStream_ = false; @@ -524,7 +529,7 @@ void RILUK::initialize () "matrix until the matrix is fill complete. If your matrix is a " "Tpetra::CrsMatrix, please call fillComplete on it (with the domain and " "range Maps, if appropriate) before calling this method."); - + Teuchos::Time timer ("RILUK::initialize"); double startTime = timer.wallTime(); { // Start timing @@ -592,8 +597,10 @@ void RILUK::initialize () } else { auto lclMtx = A_local_crs->getLocalMatrixDevice(); - KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks); - + if (!hasStreamReordered_) + KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks); + else + perm_v_ = KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks, true); for(int i = 0; i < num_streams_; i++) { Teuchos::RCP A_local_diagblks_RowMap = rcp (new crs_map_type(A_local_diagblks[i].numRows(), A_local_diagblks[i].numRows(), @@ -654,6 +661,7 @@ void RILUK::initialize () #if !defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) || !defined(KOKKOS_ENABLE_CUDA) || (CUDA_VERSION < 11030) L_solver_->compute ();//NOTE: It makes sense to do compute here because only the nonzero pattern is involved in trisolve compute #endif + if (!isKokkosKernelsStream_) { U_solver_->setMatrix (U_); } @@ -1050,7 +1058,11 @@ void RILUK::compute () A_local_values_ = lclMtx.values; } else { - KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks); + if (!hasStreamReordered_) + KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks); + else + perm_v_ = KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks, true); + A_local_diagblks_rowmap_v_ = std::vector(num_streams_); A_local_diagblks_entries_v_ = std::vector(num_streams_); A_local_diagblks_values_v_ = std::vector(num_streams_); @@ -1198,77 +1210,133 @@ apply (const Tpetra::MultiVector= 11030) - //NOTE (Nov-15-2022): - //This is a workaround for Cuda >= 11.3 (using cusparseSpSV) - //since cusparseSpSV_solve() does not support in-place computation - MV Y_tmp (Y.getMap (), Y.getNumVectors ()); - - // Start by solving L Y_tmp = X for Y_tmp. - L_solver_->apply (X, Y_tmp, mode); - - if (!this->isKokkosKernelsSpiluk_) { - // Solve D Y = Y. The operation lets us do this in place in Y, so we can - // write "solve D Y = Y for Y." - Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero); + if (isKokkosKernelsSpiluk_ && isKokkosKernelsStream_ && hasStreamReordered_) { + MV ReorderedX (X.getMap(), X.getNumVectors()); + MV ReorderedY (Y.getMap(), Y.getNumVectors()); + for (size_t j = 0; j < X.getNumVectors(); j++) { + auto X_j = X.getVector(j); + auto ReorderedX_j = ReorderedX.getVectorNonConst(j); + auto X_lcl = X_j->getLocalViewDevice(Tpetra::Access::ReadOnly); + auto ReorderedX_lcl = ReorderedX_j->getLocalViewDevice(Tpetra::Access::ReadWrite); + local_ordinal_type stream_begin = 0; + local_ordinal_type stream_end; + for(int i = 0; i < num_streams_; i++) { + auto perm_i = perm_v_[i]; + stream_end = stream_begin + perm_i.extent(0); + auto X_lcl_sub = Kokkos::subview (X_lcl, Kokkos::make_pair(stream_begin, stream_end), 0); + auto ReorderedX_lcl_sub = Kokkos::subview (ReorderedX_lcl, Kokkos::make_pair(stream_begin, stream_end), 0); + Kokkos::parallel_for( Kokkos::RangePolicy(0, static_cast(perm_i.extent(0))), KOKKOS_LAMBDA ( const int& ii ) { + ReorderedX_lcl_sub(perm_i(ii)) = X_lcl_sub(ii); + }); + stream_begin = stream_end; + } } - - U_solver_->apply (Y_tmp, Y, mode); // Solve U Y = Y_tmp. -#else - // Start by solving L Y = X for Y. - L_solver_->apply (X, Y, mode); - - if (!this->isKokkosKernelsSpiluk_) { - // Solve D Y = Y. The operation lets us do this in place in Y, so we can - // write "solve D Y = Y for Y." - Y.elementWiseMultiply (one, *D_, Y, zero); + Kokkos::fence(); // Make sure X is completely reordered + if (mode == Teuchos::NO_TRANS) { // Solve L (U Y) = X for Y. + // Solve L Y = X for Y. + L_solver_->apply (ReorderedX, Y, mode); + // Solve U Y = Y for Y. + U_solver_->apply (Y, ReorderedY, mode); + } + else { // Solve U^P (L^P Y) = X for Y (where P is * or T). + // Solve U^P Y = X for Y. + U_solver_->apply (ReorderedX, Y, mode); + // Solve L^P Y = Y for Y. + L_solver_->apply (Y, ReorderedY, mode); } - U_solver_->apply (Y, Y, mode); // Solve U Y = Y. -#endif + for (size_t j = 0; j < Y.getNumVectors(); j++) { + auto Y_j = Y.getVectorNonConst(j); + auto ReorderedY_j = ReorderedY.getVector(j); + auto Y_lcl = Y_j->getLocalViewDevice(Tpetra::Access::ReadWrite); + auto ReorderedY_lcl = ReorderedY_j->getLocalViewDevice(Tpetra::Access::ReadOnly); + local_ordinal_type stream_begin = 0; + local_ordinal_type stream_end; + for(int i = 0; i < num_streams_; i++) { + auto perm_i = perm_v_[i]; + stream_end = stream_begin + perm_i.extent(0); + auto Y_lcl_sub = Kokkos::subview (Y_lcl, Kokkos::make_pair(stream_begin, stream_end), 0); + auto ReorderedY_lcl_sub = Kokkos::subview (ReorderedY_lcl, Kokkos::make_pair(stream_begin, stream_end), 0); + Kokkos::parallel_for( Kokkos::RangePolicy(0, static_cast(perm_i.extent(0))), KOKKOS_LAMBDA ( const int& ii ) { + Y_lcl_sub(ii) = ReorderedY_lcl_sub(perm_i(ii)); + }); + stream_begin = stream_end; + } + } } - else { // Solve U^P (D^P (L^P Y)) = X for Y (where P is * or T). + else { + if (mode == Teuchos::NO_TRANS) { // Solve L (D (U Y)) = X for Y. #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030) - //NOTE (Nov-15-2022): - //This is a workaround for Cuda >= 11.3 (using cusparseSpSV) - //since cusparseSpSV_solve() does not support in-place computation - MV Y_tmp (Y.getMap (), Y.getNumVectors ()); + //NOTE (Nov-15-2022): + //This is a workaround for Cuda >= 11.3 (using cusparseSpSV) + //since cusparseSpSV_solve() does not support in-place computation + MV Y_tmp (Y.getMap (), Y.getNumVectors ()); + + // Start by solving L Y_tmp = X for Y_tmp. + L_solver_->apply (X, Y_tmp, mode); + + if (!this->isKokkosKernelsSpiluk_) { + // Solve D Y = Y. The operation lets us do this in place in Y, so we can + // write "solve D Y = Y for Y." + Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero); + } - // Start by solving U^P Y_tmp = X for Y_tmp. - U_solver_->apply (X, Y_tmp, mode); + U_solver_->apply (Y_tmp, Y, mode); // Solve U Y = Y_tmp. +#else + // Start by solving L Y = X for Y. + L_solver_->apply (X, Y, mode); - if (!this->isKokkosKernelsSpiluk_) { - // Solve D^P Y = Y. - // - // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we - // need to do an elementwise multiply with the conjugate of - // D_, not just with D_ itself. - Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero); - } + if (!this->isKokkosKernelsSpiluk_) { + // Solve D Y = Y. The operation lets us do this in place in Y, so we can + // write "solve D Y = Y for Y." + Y.elementWiseMultiply (one, *D_, Y, zero); + } - L_solver_->apply (Y_tmp, Y, mode); // Solve L^P Y = Y_tmp. + U_solver_->apply (Y, Y, mode); // Solve U Y = Y. +#endif + } + else { // Solve U^P (D^P (L^P Y)) = X for Y (where P is * or T). +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030) + //NOTE (Nov-15-2022): + //This is a workaround for Cuda >= 11.3 (using cusparseSpSV) + //since cusparseSpSV_solve() does not support in-place computation + MV Y_tmp (Y.getMap (), Y.getNumVectors ()); + + // Start by solving U^P Y_tmp = X for Y_tmp. + U_solver_->apply (X, Y_tmp, mode); + + if (!this->isKokkosKernelsSpiluk_) { + // Solve D^P Y = Y. + // + // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we + // need to do an elementwise multiply with the conjugate of + // D_, not just with D_ itself. + Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero); + } + + L_solver_->apply (Y_tmp, Y, mode); // Solve L^P Y = Y_tmp. #else - // Start by solving U^P Y = X for Y. - U_solver_->apply (X, Y, mode); - - if (!this->isKokkosKernelsSpiluk_) { - // Solve D^P Y = Y. - // - // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we - // need to do an elementwise multiply with the conjugate of - // D_, not just with D_ itself. - Y.elementWiseMultiply (one, *D_, Y, zero); - } - - L_solver_->apply (Y, Y, mode); // Solve L^P Y = Y. + // Start by solving U^P Y = X for Y. + U_solver_->apply (X, Y, mode); + + if (!this->isKokkosKernelsSpiluk_) { + // Solve D^P Y = Y. + // + // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we + // need to do an elementwise multiply with the conjugate of + // D_, not just with D_ itself. + Y.elementWiseMultiply (one, *D_, Y, zero); + } + + L_solver_->apply (Y, Y, mode); // Solve L^P Y = Y. #endif + } } } else { // alpha != 1 or beta != 0 diff --git a/packages/ifpack2/test/belos/CMakeLists.txt b/packages/ifpack2/test/belos/CMakeLists.txt index 9e9799eaa643..00fdb64c288f 100644 --- a/packages/ifpack2/test/belos/CMakeLists.txt +++ b/packages/ifpack2/test/belos/CMakeLists.txt @@ -72,11 +72,15 @@ TRIBITS_COPY_FILES_TO_BINARY_DIR(Ifpack2BelosCopyFiles test_2_RILUK_HTS_nos1_hb.xml test_2_RILUK_2streams_nos1_hb.xml test_2_RILUK_4streams_nos1_hb.xml + test_2_RILUK_2streams_rcm_nos1_hb.xml + test_2_RILUK_4streams_rcm_nos1_hb.xml test_4_ILUT_nos1_hb.xml test_4_RILUK_nos1_hb.xml test_4_RILUK_HTS_nos1_hb.xml test_4_RILUK_2streams_nos1_hb.xml test_4_RILUK_4streams_nos1_hb.xml + test_4_RILUK_2streams_rcm_nos1_hb.xml + test_4_RILUK_4streams_rcm_nos1_hb.xml test_SGS_calore1_mm.xml test_MTSGS_calore1_mm.xml calore1.mtx @@ -403,6 +407,41 @@ IF(Kokkos_ENABLE_CUDA) NUM_MPI_PROCS 4 STANDARD_PASS_OUTPUT ) + TRIBITS_ADD_TEST( + tif_belos + NAME RILUK_2streams_rcm_hb_belos + ARGS "--xml_file=test_2_RILUK_2streams_rcm_nos1_hb.xml" + COMM serial mpi + NUM_MPI_PROCS 2 + STANDARD_PASS_OUTPUT + ) + + TRIBITS_ADD_TEST( + tif_belos + NAME RILUK_4streams_rcm_hb_belos + ARGS "--xml_file=test_2_RILUK_4streams_rcm_nos1_hb.xml" + COMM serial mpi + NUM_MPI_PROCS 2 + STANDARD_PASS_OUTPUT + ) + + TRIBITS_ADD_TEST( + tif_belos + NAME RILUK_2streams_rcm_hb_belos + ARGS "--xml_file=test_4_RILUK_2streams_rcm_nos1_hb.xml" + COMM serial mpi + NUM_MPI_PROCS 4 + STANDARD_PASS_OUTPUT + ) + + TRIBITS_ADD_TEST( + tif_belos + NAME RILUK_4streams_rcm_hb_belos + ARGS "--xml_file=test_4_RILUK_4streams_rcm_nos1_hb.xml" + COMM serial mpi + NUM_MPI_PROCS 4 + STANDARD_PASS_OUTPUT + ) ENDIF() ENDIF() diff --git a/packages/ifpack2/test/belos/test_2_RILUK_2streams_rcm_nos1_hb.xml b/packages/ifpack2/test/belos/test_2_RILUK_2streams_rcm_nos1_hb.xml new file mode 100644 index 000000000000..7af01acaa95f --- /dev/null +++ b/packages/ifpack2/test/belos/test_2_RILUK_2streams_rcm_nos1_hb.xml @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + + + + + + + + + diff --git a/packages/ifpack2/test/belos/test_2_RILUK_4streams_rcm_nos1_hb.xml b/packages/ifpack2/test/belos/test_2_RILUK_4streams_rcm_nos1_hb.xml new file mode 100644 index 000000000000..fc3859820a7d --- /dev/null +++ b/packages/ifpack2/test/belos/test_2_RILUK_4streams_rcm_nos1_hb.xml @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + + + + + + + + + diff --git a/packages/ifpack2/test/belos/test_4_RILUK_2streams_rcm_nos1_hb.xml b/packages/ifpack2/test/belos/test_4_RILUK_2streams_rcm_nos1_hb.xml new file mode 100644 index 000000000000..60d217541fde --- /dev/null +++ b/packages/ifpack2/test/belos/test_4_RILUK_2streams_rcm_nos1_hb.xml @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + + + + + + diff --git a/packages/ifpack2/test/belos/test_4_RILUK_4streams_rcm_nos1_hb.xml b/packages/ifpack2/test/belos/test_4_RILUK_4streams_rcm_nos1_hb.xml new file mode 100644 index 000000000000..7ff88b06bfc1 --- /dev/null +++ b/packages/ifpack2/test/belos/test_4_RILUK_4streams_rcm_nos1_hb.xml @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + + + + + + diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils.hpp index f3fbec18369f..2b89c1a2f74e 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils.hpp @@ -25,6 +25,7 @@ #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_BsrMatrix.hpp" #include "Kokkos_Bitset.hpp" +#include "KokkosGraph_RCM.hpp" #ifdef KOKKOSKERNELS_HAVE_PARALLEL_GNUSORT #include @@ -2415,15 +2416,23 @@ void kk_extract_subblock_crsmatrix_sequential( * @tparam crsMat_t The type of the CRS matrix. * @param A [in] The square CrsMatrix. It is expected that column indices are * in ascending order + * @param UseRCMReordering [in] Boolean indicating whether applying (true) RCM + * reordering to diagonal blocks or not (false) (default: false) * @param DiagBlk_v [out] The vector of the extracted the CRS diagonal blocks * (1 <= the number of diagonal blocks <= A_nrows) + * @return a vector of lists of vertices in RCM order (a list per a diagonal + * block) if UseRCMReordering is true, or an empty vector if UseRCMReordering is + * false * * Usage Example: - * kk_extract_diagonal_blocks_crsmatrix_sequential(A_in, diagBlk_in_b); + * perm = kk_extract_diagonal_blocks_crsmatrix_sequential(A_in, diagBlk_out, + * UseRCMReordering); */ template -void kk_extract_diagonal_blocks_crsmatrix_sequential( - const crsMat_t &A, std::vector &DiagBlk_v) { +std::vector +kk_extract_diagonal_blocks_crsmatrix_sequential( + const crsMat_t &A, std::vector &DiagBlk_v, + bool UseRCMReordering = false) { using row_map_type = typename crsMat_t::row_map_type; using entries_type = typename crsMat_t::index_type; using values_type = typename crsMat_t::values_type; @@ -2437,6 +2446,7 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( using ordinal_type = typename crsMat_t::non_const_ordinal_type; using size_type = typename crsMat_t::non_const_size_type; + using value_type = typename crsMat_t::non_const_value_type; using offset_view1d_type = Kokkos::View; @@ -2463,8 +2473,12 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( throw std::runtime_error(os.str()); } + std::vector perm_v; + std::vector perm_h_v; + if (n_blocks == 1) { // One block case: simply shallow copy A to DiagBlk_v[0] + // Note: always not applying RCM reordering, for now DiagBlk_v[0] = crsMat_t(A); } else { // n_blocks > 1 @@ -2487,12 +2501,10 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( ? (A_nrows / n_blocks) : (A_nrows / n_blocks + 1); - std::vector row_map_v(n_blocks); - std::vector entries_v(n_blocks); - std::vector values_v(n_blocks); - std::vector row_map_h_v(n_blocks); - std::vector entries_h_v(n_blocks); - std::vector values_h_v(n_blocks); + if (UseRCMReordering) { + perm_v.resize(n_blocks); + perm_h_v.resize(n_blocks); + } ordinal_type blk_row_start = 0; // first row index of i-th diagonal block ordinal_type blk_col_start = 0; // first col index of i-th diagonal block @@ -2509,37 +2521,110 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( // First round: count i-th non-zeros or size of entries_v[i] and find // the first and last column indices at each row size_type blk_nnz = 0; - offset_view1d_type first("first", blk_nrows); // first position per row - offset_view1d_type last("last", blk_nrows); // last position per row + offset_view1d_type first( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "first"), + blk_nrows); // first position per row + offset_view1d_type last( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "last"), + blk_nrows); // last position per row kk_find_nnz_first_last_indices_subblock_crsmatrix_sequential( A_row_map_h, A_entries_h, blk_row_start, blk_col_start, blk_nrows, blk_ncols, blk_nnz, first, last); // Second round: extract - row_map_v[i] = out_row_map_type("row_map_v", blk_nrows + 1); - entries_v[i] = out_entries_type("entries_v", blk_nnz); - values_v[i] = out_values_type("values_v", blk_nnz); - row_map_h_v[i] = - out_row_map_hostmirror_type("row_map_h_v", blk_nrows + 1); - entries_h_v[i] = out_entries_hostmirror_type("entries_h_v", blk_nnz); - values_h_v[i] = out_values_hostmirror_type("values_h_v", blk_nnz); + out_row_map_type row_map( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_map"), + blk_nrows + 1); + out_entries_type entries( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "entries"), + blk_nnz); + out_values_type values( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "values"), blk_nnz); + out_row_map_hostmirror_type row_map_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_map_h"), + blk_nrows + 1); + out_entries_hostmirror_type entries_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "entries_h"), + blk_nnz); + out_values_hostmirror_type values_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "values_h"), + blk_nnz); kk_extract_subblock_crsmatrix_sequential( A_entries_h, A_values_h, blk_col_start, blk_nrows, blk_nnz, first, - last, row_map_h_v[i], entries_h_v[i], values_h_v[i]); + last, row_map_h, entries_h, values_h); + + if (!UseRCMReordering) { + Kokkos::deep_copy(row_map, row_map_h); + Kokkos::deep_copy(entries, entries_h); + Kokkos::deep_copy(values, values_h); + } else { + perm_h_v[i] = KokkosGraph::Experimental::graph_rcm< + Kokkos::DefaultHostExecutionSpace>(row_map_h, entries_h); + perm_v[i] = out_entries_type( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "perm_v"), + perm_h_v[i].extent(0)); + + out_row_map_hostmirror_type row_map_perm_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_map_perm_h"), + blk_nrows + 1); + out_entries_hostmirror_type entries_perm_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "entries_perm_h"), + blk_nnz); + out_values_hostmirror_type values_perm_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "values_perm_h"), + blk_nnz); + + out_entries_hostmirror_type reverseperm_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "reverseperm_h"), + blk_nrows); + for (ordinal_type ii = 0; ii < blk_nrows; ii++) + reverseperm_h(perm_h_v[i](ii)) = ii; + + std::map colIdx_Value_rcm; + + // Loop through each row of the reordered matrix + size_type cnt = 0; + for (ordinal_type ii = 0; ii < blk_nrows; ii++) { + colIdx_Value_rcm.clear(); + // ii: reordered index + ordinal_type origRow = reverseperm_h( + ii); // get the original row idx of the reordered row idx, ii + for (size_type j = row_map_h(origRow); j < row_map_h(origRow + 1); + j++) { + ordinal_type origEi = entries_h(j); + value_type origV = values_h(j); + ordinal_type Ei = + perm_h_v[i](origEi); // get the reordered col idx of the + // original col idx, origEi + colIdx_Value_rcm[Ei] = origV; + } + row_map_perm_h(ii) = cnt; + for (typename std::map::iterator it = + colIdx_Value_rcm.begin(); + it != colIdx_Value_rcm.end(); ++it) { + entries_perm_h(cnt) = it->first; + values_perm_h(cnt) = it->second; + cnt++; + } + } + row_map_perm_h(blk_nrows) = cnt; - Kokkos::deep_copy(row_map_v[i], row_map_h_v[i]); - Kokkos::deep_copy(entries_v[i], entries_h_v[i]); - Kokkos::deep_copy(values_v[i], values_h_v[i]); + Kokkos::deep_copy(row_map, row_map_perm_h); + Kokkos::deep_copy(entries, entries_perm_h); + Kokkos::deep_copy(values, values_perm_h); + Kokkos::deep_copy(perm_v[i], perm_h_v[i]); + } DiagBlk_v[i] = crsMat_t("CrsMatrix", blk_nrows, blk_ncols, blk_nnz, - values_v[i], row_map_v[i], entries_v[i]); + values, row_map, entries); blk_row_start += blk_nrows; } // for (ordinal_type i = 0; i < n_blocks; i++) } // A_nrows >= 1 } // n_blocks > 1 + return perm_v; } } // namespace Impl diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp index 327780dec32d..28674ad353fe 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp @@ -15,6 +15,8 @@ //@HEADER #include "KokkosSparse_Utils.hpp" +#include "KokkosSparse_spmv.hpp" +#include "KokkosBlas1_nrm2.hpp" #include "KokkosKernels_TestUtils.hpp" namespace Test { @@ -31,6 +33,7 @@ void run_test_extract_diagonal_blocks(int nrows, int nblocks) { crsMat_t A; std::vector DiagBlks(nblocks); + std::vector DiagBlks_rcm(nblocks); if (nrows != 0) { // Generate test matrix @@ -84,6 +87,10 @@ void run_test_extract_diagonal_blocks(int nrows, int nblocks) { KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(A, DiagBlks); + auto perm = + KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential( + A, DiagBlks_rcm, true); + // Checking lno_t numRows = 0; lno_t numCols = 0; @@ -125,6 +132,40 @@ void run_test_extract_diagonal_blocks(int nrows, int nblocks) { col_start += DiagBlks[i].numCols(); } EXPECT_TRUE(flag); + + // Checking RCM + if (!perm.empty()) { + scalar_t one = scalar_t(1.0); + scalar_t zero = scalar_t(0.0); + scalar_t mone = scalar_t(-1.0); + for (int i = 0; i < nblocks; i++) { + ValuesType In("In", DiagBlks[i].numRows()); + ValuesType Out("Out", DiagBlks[i].numRows()); + + ValuesType_hm h_Out = Kokkos::create_mirror_view(Out); + ValuesType_hm h_Out_tmp = Kokkos::create_mirror(Out); + + Kokkos::deep_copy(In, one); + + auto h_perm = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), perm[i]); + + KokkosSparse::spmv("N", one, DiagBlks_rcm[i], In, zero, Out); + + Kokkos::deep_copy(h_Out_tmp, Out); + for (lno_t ii = 0; ii < static_cast(DiagBlks[i].numRows()); + ii++) { + lno_t rcm_ii = h_perm(ii); + h_Out(ii) = h_Out_tmp(rcm_ii); + } + Kokkos::deep_copy(Out, h_Out); + + KokkosSparse::spmv("N", one, DiagBlks[i], In, mone, Out); + + double nrm_val = KokkosBlas::nrm2(Out); + EXPECT_LE(nrm_val, 1e-9); + } + } } } } // namespace Test @@ -136,9 +177,9 @@ void test_extract_diagonal_blocks() { Test::run_test_extract_diagonal_blocks( 0, s); Test::run_test_extract_diagonal_blocks( - 12, s); + 153, s); Test::run_test_extract_diagonal_blocks( - 123, s); + 1553, s); } } diff --git a/packages/xpetra/src/Operator/Xpetra_TpetraHalfPrecisionOperator.hpp b/packages/xpetra/src/Operator/Xpetra_TpetraHalfPrecisionOperator.hpp index 509294a77fdc..b0c87f06d811 100644 --- a/packages/xpetra/src/Operator/Xpetra_TpetraHalfPrecisionOperator.hpp +++ b/packages/xpetra/src/Operator/Xpetra_TpetraHalfPrecisionOperator.hpp @@ -127,12 +127,12 @@ class TpetraHalfPrecisionOperator : public Xpetra::Operator> getDomainMap() const { + const Teuchos::RCP> getDomainMap() const { return Op_->getDomainMap(); } //! Returns the Tpetra::Map object associated with the range of this TpetraOperator. - Teuchos::RCP> getRangeMap() const { + const Teuchos::RCP> getRangeMap() const { return Op_->getRangeMap(); }