diff --git a/.github/workflows/PR-gcc-openmpi.yml b/.github/workflows/PR-gcc-openmpi.yml
index 7eb91f341044..99b513d3f145 100644
--- a/.github/workflows/PR-gcc-openmpi.yml
+++ b/.github/workflows/PR-gcc-openmpi.yml
@@ -35,7 +35,7 @@ jobs:
           bash -l -c "module list"
           printenv PATH
       - name: Cancel Previous Runs
-        uses: styfle/cancel-workflow-action@b173b6ec0100793626c2d9e6b90435061f4fc3e5 # 0.11.0
+        uses: styfle/cancel-workflow-action@85880fa0301c86cca9da44039ee3bb12d3bedbfa # 0.12.1
         with:
           access_token: ${{ github.token }}
       - name: make dirs
@@ -44,7 +44,7 @@ jobs:
           mkdir -p /home/Trilinos/src/Trilinos
           mkdir -p /home/Trilinos/build
       - name: Clone trilinos
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
         with:
           fetch-depth: 0
       - name: Repo status
diff --git a/.github/workflows/clang_format.yml b/.github/workflows/clang_format.yml
index 57760cee24a0..5e6c28e07996 100644
--- a/.github/workflows/clang_format.yml
+++ b/.github/workflows/clang_format.yml
@@ -7,7 +7,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0
+    - uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
     - uses: DoozyX/clang-format-lint-action@1566bcec081dcb246ab02e7c5f9786c0b629dd4d # v0.16.2
       with:
         source: './packages/muelu ./packages/tempus ./packages/teko ./packages/xpetra'
@@ -32,7 +32,7 @@ jobs:
 
     # This does not work for PRs from forks.
     - name: Post artifact in issue comment
-      uses: mshick/add-pr-comment@7c0890544fb33b0bdd2e59467fbacb62e028a096 # v2.8.1
+      uses: mshick/add-pr-comment@b8f338c590a895d50bcbfa6c5859251edc8952fc # v2.8.2
       if: ${{ (hashFiles('format_patch.txt') != '') && (github.event.pull_request.head.repo.full_name == github.repository) }}
       with:
         message: |
diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml
index 00c327a8f204..671cb478789c 100644
--- a/.github/workflows/dependency-review.yml
+++ b/.github/workflows/dependency-review.yml
@@ -22,6 +22,6 @@ jobs:
           egress-policy: audit
 
       - name: 'Checkout Repository'
-        uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
       - name: 'Dependency Review'
         uses: actions/dependency-review-action@9129d7d40b8c12c1ed0f60400d00c92d437adcce # v4.1.3
diff --git a/.github/workflows/detect-git-lfs.yml b/.github/workflows/detect-git-lfs.yml
index 42811f4d1eac..f8498ce6fd6f 100644
--- a/.github/workflows/detect-git-lfs.yml
+++ b/.github/workflows/detect-git-lfs.yml
@@ -12,7 +12,7 @@ jobs:
 
     steps:
     - name: Check out code
-      uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0
+      uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
       with:
         fetch-depth: 0
 
diff --git a/.github/workflows/detect-mpi-comm-world.yml b/.github/workflows/detect-mpi-comm-world.yml
index 8d6b57826631..e4e0b5269637 100644
--- a/.github/workflows/detect-mpi-comm-world.yml
+++ b/.github/workflows/detect-mpi-comm-world.yml
@@ -12,7 +12,7 @@ jobs:
 
     steps:
     - name: Check out code
-      uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0
+      uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
       with:
         fetch-depth: 0
 
diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml
index 7dc4117b2396..f8a711409095 100644
--- a/.github/workflows/scorecards.yml
+++ b/.github/workflows/scorecards.yml
@@ -31,7 +31,7 @@ jobs:
 
     steps:
       - name: "Checkout code"
-        uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
         with:
           persist-credentials: false
 
@@ -58,7 +58,7 @@ jobs:
       # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
       # format to the repository Actions tab.
       - name: "Upload artifact"
-        uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # v3.1.3
+        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
         with:
           name: SARIF file
           path: results.sarif
@@ -66,6 +66,6 @@ jobs:
 
       # Upload the results to GitHub's code scanning dashboard.
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@a56a03b370b87b26fde6d680755f818cfda0372b # v2.24.5
+        uses: github/codeql-action/upload-sarif@3ab4101902695724f9365a384f86c1074d94e18c # v3.24.7
         with:
           sarif_file: results.sarif
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index bf8e74755e96..3fcfd0d5f391 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -26,7 +26,7 @@ jobs:
       pull-requests: write  # for actions/stale to close stale PRs
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/stale@a20b814fb01b71def3bd6f56e7494d667ddf28da # v4.1.1
+    - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
       with:
         debug-only: false
         ascending: true
diff --git a/packages/ifpack2/doc/UsersGuide/options.tex b/packages/ifpack2/doc/UsersGuide/options.tex
index 9800fd4ede33..fc196831ea50 100644
--- a/packages/ifpack2/doc/UsersGuide/options.tex
+++ b/packages/ifpack2/doc/UsersGuide/options.tex
@@ -502,6 +502,10 @@ \subsection{ILU($k$)}\label{s:ILU}
      these streams can run concurrently, the total time can be faster. When
      this option is not set (i.e. not using stream), the entire sub-domain is
      used instead.}
+\ccc{fact: kspiluk reordering in streams}
+    {bool}
+    {\false}
+    {Whether RCM reordering is applied to diagonal blocks in streams.}
 % All overlap-related code was removed by M. Hoemmen in
 %
 % commit 162f64572fbf93e2cac73e3034d76a3db918a494
diff --git a/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp b/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp
index 0cfb82080984..850c121922df 100644
--- a/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp
@@ -647,6 +647,8 @@ class RILUK:
   bool isKokkosKernelsStream_;
   int num_streams_;
   std::vector<execution_space> exec_space_instances_;
+  bool hasStreamReordered_;
+  std::vector<typename lno_nonzero_view_t::non_const_type> perm_v_;
 };
 
 // NOTE (mfh 11 Feb 2015) This used to exist in order to deal with
diff --git a/packages/ifpack2/src/Ifpack2_RILUK_def.hpp b/packages/ifpack2/src/Ifpack2_RILUK_def.hpp
index 1e90f9bdf7f3..c42873035843 100644
--- a/packages/ifpack2/src/Ifpack2_RILUK_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_RILUK_def.hpp
@@ -91,7 +91,8 @@ RILUK<MatrixType>::RILUK (const Teuchos::RCP<const row_matrix_type>& Matrix_in)
     Rthresh_ (Teuchos::ScalarTraits<magnitude_type>::one ()),
     isKokkosKernelsSpiluk_(false),
     isKokkosKernelsStream_(false),
-    num_streams_(0)
+    num_streams_(0),
+    hasStreamReordered_(false)
 {
   allocateSolvers();
 }
@@ -116,7 +117,8 @@ RILUK<MatrixType>::RILUK (const Teuchos::RCP<const crs_matrix_type>& Matrix_in)
     Rthresh_ (Teuchos::ScalarTraits<magnitude_type>::one ()),
     isKokkosKernelsSpiluk_(false),
     isKokkosKernelsStream_(false),
-    num_streams_(0)
+    num_streams_(0),
+    hasStreamReordered_(false)
 {
   allocateSolvers();
 }
@@ -412,7 +414,7 @@ setParameters (const Teuchos::ParameterList& params)
     getParamTryingTypes<int, int, global_ordinal_type>
       (nstreams, params, paramName, prefix);
   }
-  
+
   // Forward to trisolvers.
   L_solver_->setParameters(params);
   U_solver_->setParameters(params);
@@ -427,6 +429,9 @@ setParameters (const Teuchos::ParameterList& params)
 
   if (num_streams_ >= 1) {
     this->isKokkosKernelsStream_ = true;
+    // Will we do reordering in streams?
+    if (params.isParameter("fact: kspiluk reordering in streams"))
+      hasStreamReordered_ = params.get<bool> ("fact: kspiluk reordering in streams");
   }
   else {
     this->isKokkosKernelsStream_ = false;
@@ -524,7 +529,7 @@ void RILUK<MatrixType>::initialize ()
      "matrix until the matrix is fill complete.  If your matrix is a "
      "Tpetra::CrsMatrix, please call fillComplete on it (with the domain and "
      "range Maps, if appropriate) before calling this method.");
-  
+
   Teuchos::Time timer ("RILUK::initialize");
   double startTime = timer.wallTime();
   { // Start timing
@@ -592,8 +597,10 @@ void RILUK<MatrixType>::initialize ()
       }
       else {
         auto lclMtx = A_local_crs->getLocalMatrixDevice();
-        KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks);
-
+        if (!hasStreamReordered_)
+          KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks);
+        else
+          perm_v_ = KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks, true);
         for(int i = 0; i < num_streams_; i++) {
           Teuchos::RCP<const crs_map_type> A_local_diagblks_RowMap = rcp (new crs_map_type(A_local_diagblks[i].numRows(),
                                                                                            A_local_diagblks[i].numRows(),
@@ -654,6 +661,7 @@ void RILUK<MatrixType>::initialize ()
 #if !defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) || !defined(KOKKOS_ENABLE_CUDA) || (CUDA_VERSION < 11030)
     L_solver_->compute ();//NOTE: It makes sense to do compute here because only the nonzero pattern is involved in trisolve compute
 #endif
+
     if (!isKokkosKernelsStream_) {
       U_solver_->setMatrix (U_);
     }
@@ -1050,7 +1058,11 @@ void RILUK<MatrixType>::compute ()
         A_local_values_  = lclMtx.values;
       }
       else {
-        KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks);
+        if (!hasStreamReordered_)
+          KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks);
+        else
+          perm_v_ = KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks, true);
+
         A_local_diagblks_rowmap_v_  = std::vector<lno_row_view_t>(num_streams_);
         A_local_diagblks_entries_v_ = std::vector<lno_nonzero_view_t>(num_streams_);
         A_local_diagblks_values_v_  = std::vector<scalar_nonzero_view_t>(num_streams_);
@@ -1198,77 +1210,133 @@ apply (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_t
 
   const scalar_type one = STS::one ();
   const scalar_type zero = STS::zero ();
-
+  
   Teuchos::Time timer ("RILUK::apply");
   double startTime = timer.wallTime();
   { // Start timing
     Teuchos::TimeMonitor timeMon (timer);
     if (alpha == one && beta == zero) {
-      if (mode == Teuchos::NO_TRANS) { // Solve L (D (U Y)) = X for Y.      
-#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030)
-        //NOTE (Nov-15-2022):
-        //This is a workaround for Cuda >= 11.3 (using cusparseSpSV)
-        //since cusparseSpSV_solve() does not support in-place computation
-        MV Y_tmp (Y.getMap (), Y.getNumVectors ());
-
-        // Start by solving L Y_tmp = X for Y_tmp.
-        L_solver_->apply (X, Y_tmp, mode);
-
-        if (!this->isKokkosKernelsSpiluk_) {
-          // Solve D Y = Y.  The operation lets us do this in place in Y, so we can
-          // write "solve D Y = Y for Y."
-          Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero);
+      if (isKokkosKernelsSpiluk_ && isKokkosKernelsStream_ && hasStreamReordered_) {
+        MV ReorderedX (X.getMap(), X.getNumVectors());
+        MV ReorderedY (Y.getMap(), Y.getNumVectors());
+        for (size_t j = 0; j < X.getNumVectors(); j++) {
+          auto X_j = X.getVector(j);
+          auto ReorderedX_j = ReorderedX.getVectorNonConst(j);
+          auto X_lcl = X_j->getLocalViewDevice(Tpetra::Access::ReadOnly);
+          auto ReorderedX_lcl = ReorderedX_j->getLocalViewDevice(Tpetra::Access::ReadWrite);
+          local_ordinal_type stream_begin = 0;
+          local_ordinal_type stream_end;
+          for(int i = 0; i < num_streams_; i++) {
+            auto perm_i = perm_v_[i];
+            stream_end = stream_begin + perm_i.extent(0);
+            auto X_lcl_sub = Kokkos::subview (X_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
+            auto ReorderedX_lcl_sub = Kokkos::subview (ReorderedX_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
+            Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, static_cast<int>(perm_i.extent(0))), KOKKOS_LAMBDA ( const int& ii ) {
+              ReorderedX_lcl_sub(perm_i(ii)) = X_lcl_sub(ii);
+            });
+            stream_begin = stream_end;
+          }
         }
-
-        U_solver_->apply (Y_tmp, Y, mode); // Solve U Y = Y_tmp.
-#else
-        // Start by solving L Y = X for Y.
-        L_solver_->apply (X, Y, mode);
-
-        if (!this->isKokkosKernelsSpiluk_) {
-          // Solve D Y = Y.  The operation lets us do this in place in Y, so we can
-          // write "solve D Y = Y for Y."
-          Y.elementWiseMultiply (one, *D_, Y, zero);
+        Kokkos::fence(); // Make sure X is completely reordered
+        if (mode == Teuchos::NO_TRANS) { // Solve L (U Y) = X for Y.
+          // Solve L Y = X for Y.
+          L_solver_->apply (ReorderedX, Y, mode);
+          // Solve U Y = Y for Y.
+          U_solver_->apply (Y, ReorderedY, mode);
+        }
+        else { // Solve U^P (L^P Y) = X for Y (where P is * or T).
+          // Solve U^P Y = X for Y.
+          U_solver_->apply (ReorderedX, Y, mode);
+          // Solve L^P Y = Y for Y.
+          L_solver_->apply (Y, ReorderedY, mode);
         }
 
-        U_solver_->apply (Y, Y, mode); // Solve U Y = Y.
-#endif
+        for (size_t j = 0; j < Y.getNumVectors(); j++) {
+          auto Y_j = Y.getVectorNonConst(j);
+          auto ReorderedY_j = ReorderedY.getVector(j);
+          auto Y_lcl = Y_j->getLocalViewDevice(Tpetra::Access::ReadWrite);
+          auto ReorderedY_lcl = ReorderedY_j->getLocalViewDevice(Tpetra::Access::ReadOnly);
+          local_ordinal_type stream_begin = 0;
+          local_ordinal_type stream_end;
+          for(int i = 0; i < num_streams_; i++) {
+            auto perm_i = perm_v_[i];
+            stream_end = stream_begin + perm_i.extent(0);
+            auto Y_lcl_sub = Kokkos::subview (Y_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
+            auto ReorderedY_lcl_sub = Kokkos::subview (ReorderedY_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
+            Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, static_cast<int>(perm_i.extent(0))), KOKKOS_LAMBDA ( const int& ii ) {
+              Y_lcl_sub(ii) = ReorderedY_lcl_sub(perm_i(ii));
+            });
+            stream_begin = stream_end;
+          }
+        }
       }
-      else { // Solve U^P (D^P (L^P Y)) = X for Y (where P is * or T).          
+      else {
+        if (mode == Teuchos::NO_TRANS) { // Solve L (D (U Y)) = X for Y.
 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030)
-        //NOTE (Nov-15-2022):
-        //This is a workaround for Cuda >= 11.3 (using cusparseSpSV)
-        //since cusparseSpSV_solve() does not support in-place computation
-        MV Y_tmp (Y.getMap (), Y.getNumVectors ());
+          //NOTE (Nov-15-2022):
+          //This is a workaround for Cuda >= 11.3 (using cusparseSpSV)
+          //since cusparseSpSV_solve() does not support in-place computation
+          MV Y_tmp (Y.getMap (), Y.getNumVectors ());
+
+          // Start by solving L Y_tmp = X for Y_tmp.
+          L_solver_->apply (X, Y_tmp, mode);
+
+          if (!this->isKokkosKernelsSpiluk_) {
+            // Solve D Y = Y.  The operation lets us do this in place in Y, so we can
+            // write "solve D Y = Y for Y."
+            Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero);
+          }
 
-        // Start by solving U^P Y_tmp = X for Y_tmp.
-        U_solver_->apply (X, Y_tmp, mode);
+          U_solver_->apply (Y_tmp, Y, mode); // Solve U Y = Y_tmp.
+#else
+          // Start by solving L Y = X for Y.
+          L_solver_->apply (X, Y, mode);
 
-        if (!this->isKokkosKernelsSpiluk_) {
-          // Solve D^P Y = Y.
-          //
-          // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we
-          // need to do an elementwise multiply with the conjugate of
-          // D_, not just with D_ itself.
-          Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero);
-	    }
+          if (!this->isKokkosKernelsSpiluk_) {
+            // Solve D Y = Y.  The operation lets us do this in place in Y, so we can
+            // write "solve D Y = Y for Y."
+            Y.elementWiseMultiply (one, *D_, Y, zero);
+          }
 
-        L_solver_->apply (Y_tmp, Y, mode); // Solve L^P Y = Y_tmp.
+          U_solver_->apply (Y, Y, mode); // Solve U Y = Y.
+#endif
+        }
+        else { // Solve U^P (D^P (L^P Y)) = X for Y (where P is * or T).          
+#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030)
+          //NOTE (Nov-15-2022):
+          //This is a workaround for Cuda >= 11.3 (using cusparseSpSV)
+          //since cusparseSpSV_solve() does not support in-place computation
+          MV Y_tmp (Y.getMap (), Y.getNumVectors ());
+
+          // Start by solving U^P Y_tmp = X for Y_tmp.
+          U_solver_->apply (X, Y_tmp, mode);
+
+          if (!this->isKokkosKernelsSpiluk_) {
+            // Solve D^P Y = Y.
+            //
+            // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we
+            // need to do an elementwise multiply with the conjugate of
+            // D_, not just with D_ itself.
+            Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero);
+	      }
+
+          L_solver_->apply (Y_tmp, Y, mode); // Solve L^P Y = Y_tmp.
 #else
-        // Start by solving U^P Y = X for Y.
-        U_solver_->apply (X, Y, mode);
-
-        if (!this->isKokkosKernelsSpiluk_) {
-          // Solve D^P Y = Y.
-          //
-          // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we
-          // need to do an elementwise multiply with the conjugate of
-          // D_, not just with D_ itself.
-          Y.elementWiseMultiply (one, *D_, Y, zero);
-	    }
-
-        L_solver_->apply (Y, Y, mode); // Solve L^P Y = Y.
+          // Start by solving U^P Y = X for Y.
+          U_solver_->apply (X, Y, mode);
+
+          if (!this->isKokkosKernelsSpiluk_) {
+            // Solve D^P Y = Y.
+            //
+            // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we
+            // need to do an elementwise multiply with the conjugate of
+            // D_, not just with D_ itself.
+            Y.elementWiseMultiply (one, *D_, Y, zero);
+	      }
+
+          L_solver_->apply (Y, Y, mode); // Solve L^P Y = Y.
 #endif
+        }
       }
     }
     else { // alpha != 1 or beta != 0
diff --git a/packages/ifpack2/test/belos/CMakeLists.txt b/packages/ifpack2/test/belos/CMakeLists.txt
index 9e9799eaa643..00fdb64c288f 100644
--- a/packages/ifpack2/test/belos/CMakeLists.txt
+++ b/packages/ifpack2/test/belos/CMakeLists.txt
@@ -72,11 +72,15 @@ TRIBITS_COPY_FILES_TO_BINARY_DIR(Ifpack2BelosCopyFiles
   test_2_RILUK_HTS_nos1_hb.xml
   test_2_RILUK_2streams_nos1_hb.xml
   test_2_RILUK_4streams_nos1_hb.xml
+  test_2_RILUK_2streams_rcm_nos1_hb.xml
+  test_2_RILUK_4streams_rcm_nos1_hb.xml
   test_4_ILUT_nos1_hb.xml 
   test_4_RILUK_nos1_hb.xml
   test_4_RILUK_HTS_nos1_hb.xml
   test_4_RILUK_2streams_nos1_hb.xml
   test_4_RILUK_4streams_nos1_hb.xml
+  test_4_RILUK_2streams_rcm_nos1_hb.xml
+  test_4_RILUK_4streams_rcm_nos1_hb.xml
   test_SGS_calore1_mm.xml 
   test_MTSGS_calore1_mm.xml
   calore1.mtx 
@@ -403,6 +407,41 @@ IF(Kokkos_ENABLE_CUDA)
     NUM_MPI_PROCS 4
     STANDARD_PASS_OUTPUT
   )
+  TRIBITS_ADD_TEST(
+    tif_belos
+    NAME RILUK_2streams_rcm_hb_belos
+    ARGS "--xml_file=test_2_RILUK_2streams_rcm_nos1_hb.xml"
+    COMM serial mpi
+    NUM_MPI_PROCS 2
+    STANDARD_PASS_OUTPUT
+  )
+
+  TRIBITS_ADD_TEST(
+    tif_belos
+    NAME RILUK_4streams_rcm_hb_belos
+    ARGS "--xml_file=test_2_RILUK_4streams_rcm_nos1_hb.xml"
+    COMM serial mpi
+    NUM_MPI_PROCS 2
+    STANDARD_PASS_OUTPUT
+  )
+
+  TRIBITS_ADD_TEST(
+    tif_belos
+    NAME RILUK_2streams_rcm_hb_belos
+    ARGS "--xml_file=test_4_RILUK_2streams_rcm_nos1_hb.xml"
+    COMM serial mpi
+    NUM_MPI_PROCS 4
+    STANDARD_PASS_OUTPUT
+  )
+
+  TRIBITS_ADD_TEST(
+    tif_belos
+    NAME RILUK_4streams_rcm_hb_belos
+    ARGS "--xml_file=test_4_RILUK_4streams_rcm_nos1_hb.xml"
+    COMM serial mpi
+    NUM_MPI_PROCS 4
+    STANDARD_PASS_OUTPUT
+  )
 ENDIF()
 ENDIF()
 
diff --git a/packages/ifpack2/test/belos/test_2_RILUK_2streams_rcm_nos1_hb.xml b/packages/ifpack2/test/belos/test_2_RILUK_2streams_rcm_nos1_hb.xml
new file mode 100644
index 000000000000..7af01acaa95f
--- /dev/null
+++ b/packages/ifpack2/test/belos/test_2_RILUK_2streams_rcm_nos1_hb.xml
@@ -0,0 +1,22 @@
+<ParameterList name="test_params">
+  <Parameter name="hb_file" type="string" value="nos1.rsa"/>
+
+  <Parameter name="solver_type" type="string" value="Block Gmres"/>
+  <ParameterList name="Belos">
+    <Parameter name="Num Blocks" type="int" value="300"/>
+    <Parameter name="Verbosity" type="int" value="33"/>
+    <Parameter name="Output Style" type="int" value="1"/>
+    <Parameter name="Output Frequency" type="int" value="1"/>
+  </ParameterList>
+
+  <Parameter name="Ifpack2::Preconditioner" type="string" value="RILUK"/>
+  <ParameterList name="Ifpack2">
+    <Parameter name="fact: iluk level-of-fill" type="int" value="2"/>
+    <Parameter name="fact: type" type="string" value="KSPILUK"/>
+    <Parameter name="fact: kspiluk number-of-streams" type="int" value="2"/>
+    <Parameter name="fact: kspiluk reordering in streams" type="bool" value="true"/>
+    <Parameter name="trisolver: type" type="string" value="KSPTRSV"/>
+  </ParameterList>
+
+  <Parameter name="expectNumIters" type="int" value="50"/>
+</ParameterList>
diff --git a/packages/ifpack2/test/belos/test_2_RILUK_4streams_rcm_nos1_hb.xml b/packages/ifpack2/test/belos/test_2_RILUK_4streams_rcm_nos1_hb.xml
new file mode 100644
index 000000000000..fc3859820a7d
--- /dev/null
+++ b/packages/ifpack2/test/belos/test_2_RILUK_4streams_rcm_nos1_hb.xml
@@ -0,0 +1,22 @@
+<ParameterList name="test_params">
+  <Parameter name="hb_file" type="string" value="nos1.rsa"/>
+
+  <Parameter name="solver_type" type="string" value="Block Gmres"/>
+  <ParameterList name="Belos">
+    <Parameter name="Num Blocks" type="int" value="300"/>
+    <Parameter name="Verbosity" type="int" value="33"/>
+    <Parameter name="Output Style" type="int" value="1"/>
+    <Parameter name="Output Frequency" type="int" value="1"/>
+  </ParameterList>
+
+  <Parameter name="Ifpack2::Preconditioner" type="string" value="RILUK"/>
+  <ParameterList name="Ifpack2">
+    <Parameter name="fact: iluk level-of-fill" type="int" value="2"/>
+    <Parameter name="fact: type" type="string" value="KSPILUK"/>
+    <Parameter name="fact: kspiluk number-of-streams" type="int" value="4"/>
+    <Parameter name="fact: kspiluk reordering in streams" type="bool" value="true"/>
+    <Parameter name="trisolver: type" type="string" value="KSPTRSV"/>
+  </ParameterList>
+
+  <Parameter name="expectNumIters" type="int" value="50"/>
+</ParameterList>
diff --git a/packages/ifpack2/test/belos/test_4_RILUK_2streams_rcm_nos1_hb.xml b/packages/ifpack2/test/belos/test_4_RILUK_2streams_rcm_nos1_hb.xml
new file mode 100644
index 000000000000..60d217541fde
--- /dev/null
+++ b/packages/ifpack2/test/belos/test_4_RILUK_2streams_rcm_nos1_hb.xml
@@ -0,0 +1,23 @@
+<ParameterList name="test_params">
+  <Parameter name="hb_file" type="string" value="nos1.rsa"/>
+
+  <Parameter name="solver_type" type="string" value="Block Gmres"/>
+  <ParameterList name="Belos">
+    <Parameter name="Num Blocks" type="int" value="300"/>
+    <Parameter name="Verbosity" type="int" value="33"/>
+    <Parameter name="Output Style" type="int" value="1"/>
+    <Parameter name="Output Frequency" type="int" value="1"/>
+  </ParameterList>
+
+  <Parameter name="Ifpack2::Preconditioner" type="string" value="RILUK"/>
+  <ParameterList name="Ifpack2">
+    <Parameter name="fact: iluk level-of-fill" type="int" value="2"/>
+    <Parameter name="fact: type" type="string" value="KSPILUK"/>
+    <Parameter name="fact: kspiluk number-of-streams" type="int" value="2"/>
+    <Parameter name="fact: kspiluk reordering in streams" type="bool" value="true"/>
+    <Parameter name="trisolver: type" type="string" value="KSPTRSV"/>
+  </ParameterList>
+
+  <Parameter name="expectNumIters" type="int" value="80"/>
+  <Parameter name="Reuse Pattern" type="bool" value="true"/>
+</ParameterList>
diff --git a/packages/ifpack2/test/belos/test_4_RILUK_4streams_rcm_nos1_hb.xml b/packages/ifpack2/test/belos/test_4_RILUK_4streams_rcm_nos1_hb.xml
new file mode 100644
index 000000000000..7ff88b06bfc1
--- /dev/null
+++ b/packages/ifpack2/test/belos/test_4_RILUK_4streams_rcm_nos1_hb.xml
@@ -0,0 +1,23 @@
+<ParameterList name="test_params">
+  <Parameter name="hb_file" type="string" value="nos1.rsa"/>
+
+  <Parameter name="solver_type" type="string" value="Block Gmres"/>
+  <ParameterList name="Belos">
+    <Parameter name="Num Blocks" type="int" value="300"/>
+    <Parameter name="Verbosity" type="int" value="33"/>
+    <Parameter name="Output Style" type="int" value="1"/>
+    <Parameter name="Output Frequency" type="int" value="1"/>
+  </ParameterList>
+
+  <Parameter name="Ifpack2::Preconditioner" type="string" value="RILUK"/>
+  <ParameterList name="Ifpack2">
+    <Parameter name="fact: iluk level-of-fill" type="int" value="2"/>
+    <Parameter name="fact: type" type="string" value="KSPILUK"/>
+    <Parameter name="fact: kspiluk number-of-streams" type="int" value="4"/>
+    <Parameter name="fact: kspiluk reordering in streams" type="bool" value="true"/>
+    <Parameter name="trisolver: type" type="string" value="KSPTRSV"/>
+  </ParameterList>
+
+  <Parameter name="expectNumIters" type="int" value="80"/>
+  <Parameter name="Reuse Pattern" type="bool" value="true"/>
+</ParameterList>
diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils.hpp
index f3fbec18369f..2b89c1a2f74e 100644
--- a/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils.hpp
+++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils.hpp
@@ -25,6 +25,7 @@
 #include "KokkosSparse_CrsMatrix.hpp"
 #include "KokkosSparse_BsrMatrix.hpp"
 #include "Kokkos_Bitset.hpp"
+#include "KokkosGraph_RCM.hpp"
 
 #ifdef KOKKOSKERNELS_HAVE_PARALLEL_GNUSORT
 #include <parallel/algorithm>
@@ -2415,15 +2416,23 @@ void kk_extract_subblock_crsmatrix_sequential(
  * @tparam crsMat_t The type of the CRS matrix.
  * @param A [in] The square CrsMatrix. It is expected that column indices are
  * in ascending order
+ * @param UseRCMReordering [in] Boolean indicating whether applying (true) RCM
+ * reordering to diagonal blocks or not (false) (default: false)
  * @param DiagBlk_v [out] The vector of the extracted the CRS diagonal blocks
  * (1 <= the number of diagonal blocks <= A_nrows)
+ * @return a vector of lists of vertices in RCM order (a list per a diagonal
+ * block) if UseRCMReordering is true, or an empty vector if UseRCMReordering is
+ * false
  *
  * Usage Example:
- *    kk_extract_diagonal_blocks_crsmatrix_sequential(A_in, diagBlk_in_b);
+ *    perm = kk_extract_diagonal_blocks_crsmatrix_sequential(A_in, diagBlk_out,
+ * UseRCMReordering);
  */
 template <typename crsMat_t>
-void kk_extract_diagonal_blocks_crsmatrix_sequential(
-    const crsMat_t &A, std::vector<crsMat_t> &DiagBlk_v) {
+std::vector<typename crsMat_t::StaticCrsGraphType::entries_type::non_const_type>
+kk_extract_diagonal_blocks_crsmatrix_sequential(
+    const crsMat_t &A, std::vector<crsMat_t> &DiagBlk_v,
+    bool UseRCMReordering = false) {
   using row_map_type     = typename crsMat_t::row_map_type;
   using entries_type     = typename crsMat_t::index_type;
   using values_type      = typename crsMat_t::values_type;
@@ -2437,6 +2446,7 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential(
 
   using ordinal_type = typename crsMat_t::non_const_ordinal_type;
   using size_type    = typename crsMat_t::non_const_size_type;
+  using value_type   = typename crsMat_t::non_const_value_type;
   using offset_view1d_type =
       Kokkos::View<size_type *, Kokkos::LayoutLeft, Kokkos::HostSpace>;
 
@@ -2463,8 +2473,12 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential(
     throw std::runtime_error(os.str());
   }
 
+  std::vector<out_entries_type> perm_v;
+  std::vector<out_entries_hostmirror_type> perm_h_v;
+
   if (n_blocks == 1) {
     // One block case: simply shallow copy A to DiagBlk_v[0]
+    // Note: always not applying RCM reordering, for now
     DiagBlk_v[0] = crsMat_t(A);
   } else {
     // n_blocks > 1
@@ -2487,12 +2501,10 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential(
                                         ? (A_nrows / n_blocks)
                                         : (A_nrows / n_blocks + 1);
 
-      std::vector<out_row_map_type> row_map_v(n_blocks);
-      std::vector<out_entries_type> entries_v(n_blocks);
-      std::vector<out_values_type> values_v(n_blocks);
-      std::vector<out_row_map_hostmirror_type> row_map_h_v(n_blocks);
-      std::vector<out_entries_hostmirror_type> entries_h_v(n_blocks);
-      std::vector<out_values_hostmirror_type> values_h_v(n_blocks);
+      if (UseRCMReordering) {
+        perm_v.resize(n_blocks);
+        perm_h_v.resize(n_blocks);
+      }
 
       ordinal_type blk_row_start = 0;  // first row index of i-th diagonal block
       ordinal_type blk_col_start = 0;  // first col index of i-th diagonal block
@@ -2509,37 +2521,110 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential(
         // First round: count i-th non-zeros or size of entries_v[i] and find
         // the first and last column indices at each row
         size_type blk_nnz = 0;
-        offset_view1d_type first("first", blk_nrows);  // first position per row
-        offset_view1d_type last("last", blk_nrows);    // last position per row
+        offset_view1d_type first(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing, "first"),
+            blk_nrows);  // first position per row
+        offset_view1d_type last(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing, "last"),
+            blk_nrows);  // last position per row
 
         kk_find_nnz_first_last_indices_subblock_crsmatrix_sequential(
             A_row_map_h, A_entries_h, blk_row_start, blk_col_start, blk_nrows,
             blk_ncols, blk_nnz, first, last);
 
         // Second round: extract
-        row_map_v[i] = out_row_map_type("row_map_v", blk_nrows + 1);
-        entries_v[i] = out_entries_type("entries_v", blk_nnz);
-        values_v[i]  = out_values_type("values_v", blk_nnz);
-        row_map_h_v[i] =
-            out_row_map_hostmirror_type("row_map_h_v", blk_nrows + 1);
-        entries_h_v[i] = out_entries_hostmirror_type("entries_h_v", blk_nnz);
-        values_h_v[i]  = out_values_hostmirror_type("values_h_v", blk_nnz);
+        out_row_map_type row_map(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_map"),
+            blk_nrows + 1);
+        out_entries_type entries(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing, "entries"),
+            blk_nnz);
+        out_values_type values(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing, "values"), blk_nnz);
+        out_row_map_hostmirror_type row_map_h(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_map_h"),
+            blk_nrows + 1);
+        out_entries_hostmirror_type entries_h(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing, "entries_h"),
+            blk_nnz);
+        out_values_hostmirror_type values_h(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing, "values_h"),
+            blk_nnz);
 
         kk_extract_subblock_crsmatrix_sequential(
             A_entries_h, A_values_h, blk_col_start, blk_nrows, blk_nnz, first,
-            last, row_map_h_v[i], entries_h_v[i], values_h_v[i]);
+            last, row_map_h, entries_h, values_h);
+
+        if (!UseRCMReordering) {
+          Kokkos::deep_copy(row_map, row_map_h);
+          Kokkos::deep_copy(entries, entries_h);
+          Kokkos::deep_copy(values, values_h);
+        } else {
+          perm_h_v[i] = KokkosGraph::Experimental::graph_rcm<
+              Kokkos::DefaultHostExecutionSpace>(row_map_h, entries_h);
+          perm_v[i] = out_entries_type(
+              Kokkos::view_alloc(Kokkos::WithoutInitializing, "perm_v"),
+              perm_h_v[i].extent(0));
+
+          out_row_map_hostmirror_type row_map_perm_h(
+              Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_map_perm_h"),
+              blk_nrows + 1);
+          out_entries_hostmirror_type entries_perm_h(
+              Kokkos::view_alloc(Kokkos::WithoutInitializing, "entries_perm_h"),
+              blk_nnz);
+          out_values_hostmirror_type values_perm_h(
+              Kokkos::view_alloc(Kokkos::WithoutInitializing, "values_perm_h"),
+              blk_nnz);
+
+          out_entries_hostmirror_type reverseperm_h(
+              Kokkos::view_alloc(Kokkos::WithoutInitializing, "reverseperm_h"),
+              blk_nrows);
+          for (ordinal_type ii = 0; ii < blk_nrows; ii++)
+            reverseperm_h(perm_h_v[i](ii)) = ii;
+
+          std::map<ordinal_type, value_type> colIdx_Value_rcm;
+
+          // Loop through each row of the reordered matrix
+          size_type cnt = 0;
+          for (ordinal_type ii = 0; ii < blk_nrows; ii++) {
+            colIdx_Value_rcm.clear();
+            // ii: reordered index
+            ordinal_type origRow = reverseperm_h(
+                ii);  // get the original row idx of the reordered row idx, ii
+            for (size_type j = row_map_h(origRow); j < row_map_h(origRow + 1);
+                 j++) {
+              ordinal_type origEi = entries_h(j);
+              value_type origV    = values_h(j);
+              ordinal_type Ei =
+                  perm_h_v[i](origEi);  // get the reordered col idx of the
+                                        // original col idx, origEi
+              colIdx_Value_rcm[Ei] = origV;
+            }
+            row_map_perm_h(ii) = cnt;
+            for (typename std::map<ordinal_type, value_type>::iterator it =
+                     colIdx_Value_rcm.begin();
+                 it != colIdx_Value_rcm.end(); ++it) {
+              entries_perm_h(cnt) = it->first;
+              values_perm_h(cnt)  = it->second;
+              cnt++;
+            }
+          }
+          row_map_perm_h(blk_nrows) = cnt;
 
-        Kokkos::deep_copy(row_map_v[i], row_map_h_v[i]);
-        Kokkos::deep_copy(entries_v[i], entries_h_v[i]);
-        Kokkos::deep_copy(values_v[i], values_h_v[i]);
+          Kokkos::deep_copy(row_map, row_map_perm_h);
+          Kokkos::deep_copy(entries, entries_perm_h);
+          Kokkos::deep_copy(values, values_perm_h);
+          Kokkos::deep_copy(perm_v[i], perm_h_v[i]);
+        }
 
         DiagBlk_v[i] = crsMat_t("CrsMatrix", blk_nrows, blk_ncols, blk_nnz,
-                                values_v[i], row_map_v[i], entries_v[i]);
+                                values, row_map, entries);
 
         blk_row_start += blk_nrows;
       }  // for (ordinal_type i = 0; i < n_blocks; i++)
     }    // A_nrows >= 1
   }      // n_blocks > 1
+  return perm_v;
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp
index 327780dec32d..28674ad353fe 100644
--- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp
+++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp
@@ -15,6 +15,8 @@
 //@HEADER
 
 #include "KokkosSparse_Utils.hpp"
+#include "KokkosSparse_spmv.hpp"
+#include "KokkosBlas1_nrm2.hpp"
 #include "KokkosKernels_TestUtils.hpp"
 
 namespace Test {
@@ -31,6 +33,7 @@ void run_test_extract_diagonal_blocks(int nrows, int nblocks) {
 
   crsMat_t A;
   std::vector<crsMat_t> DiagBlks(nblocks);
+  std::vector<crsMat_t> DiagBlks_rcm(nblocks);
 
   if (nrows != 0) {
     // Generate test matrix
@@ -84,6 +87,10 @@ void run_test_extract_diagonal_blocks(int nrows, int nblocks) {
   KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(A,
                                                                       DiagBlks);
 
+  auto perm =
+      KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(
+          A, DiagBlks_rcm, true);
+
   // Checking
   lno_t numRows = 0;
   lno_t numCols = 0;
@@ -125,6 +132,40 @@ void run_test_extract_diagonal_blocks(int nrows, int nblocks) {
       col_start += DiagBlks[i].numCols();
     }
     EXPECT_TRUE(flag);
+
+    // Checking RCM
+    if (!perm.empty()) {
+      scalar_t one  = scalar_t(1.0);
+      scalar_t zero = scalar_t(0.0);
+      scalar_t mone = scalar_t(-1.0);
+      for (int i = 0; i < nblocks; i++) {
+        ValuesType In("In", DiagBlks[i].numRows());
+        ValuesType Out("Out", DiagBlks[i].numRows());
+
+        ValuesType_hm h_Out     = Kokkos::create_mirror_view(Out);
+        ValuesType_hm h_Out_tmp = Kokkos::create_mirror(Out);
+
+        Kokkos::deep_copy(In, one);
+
+        auto h_perm =
+            Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), perm[i]);
+
+        KokkosSparse::spmv("N", one, DiagBlks_rcm[i], In, zero, Out);
+
+        Kokkos::deep_copy(h_Out_tmp, Out);
+        for (lno_t ii = 0; ii < static_cast<lno_t>(DiagBlks[i].numRows());
+             ii++) {
+          lno_t rcm_ii = h_perm(ii);
+          h_Out(ii)    = h_Out_tmp(rcm_ii);
+        }
+        Kokkos::deep_copy(Out, h_Out);
+
+        KokkosSparse::spmv("N", one, DiagBlks[i], In, mone, Out);
+
+        double nrm_val = KokkosBlas::nrm2(Out);
+        EXPECT_LE(nrm_val, 1e-9);
+      }
+    }
   }
 }
 }  // namespace Test
@@ -136,9 +177,9 @@ void test_extract_diagonal_blocks() {
     Test::run_test_extract_diagonal_blocks<scalar_t, lno_t, size_type, device>(
         0, s);
     Test::run_test_extract_diagonal_blocks<scalar_t, lno_t, size_type, device>(
-        12, s);
+        153, s);
     Test::run_test_extract_diagonal_blocks<scalar_t, lno_t, size_type, device>(
-        123, s);
+        1553, s);
   }
 }
 
diff --git a/packages/xpetra/src/Operator/Xpetra_TpetraHalfPrecisionOperator.hpp b/packages/xpetra/src/Operator/Xpetra_TpetraHalfPrecisionOperator.hpp
index 509294a77fdc..b0c87f06d811 100644
--- a/packages/xpetra/src/Operator/Xpetra_TpetraHalfPrecisionOperator.hpp
+++ b/packages/xpetra/src/Operator/Xpetra_TpetraHalfPrecisionOperator.hpp
@@ -127,12 +127,12 @@ class TpetraHalfPrecisionOperator : public Xpetra::Operator<Scalar, LocalOrdinal
   //@}
 
   //! Returns the Tpetra::Map object associated with the domain of this TpetraOperator.
-  Teuchos::RCP<const Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node>> getDomainMap() const {
+  const Teuchos::RCP<const Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node>> getDomainMap() const {
     return Op_->getDomainMap();
   }
 
   //! Returns the Tpetra::Map object associated with the range of this TpetraOperator.
-  Teuchos::RCP<const Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node>> getRangeMap() const {
+  const Teuchos::RCP<const Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node>> getRangeMap() const {
     return Op_->getRangeMap();
   }