diff --git a/.github/workflows/apps.yml b/.github/workflows/apps.yml
index e909193601..19a3c4699a 100644
--- a/.github/workflows/apps.yml
+++ b/.github/workflows/apps.yml
@@ -52,7 +52,9 @@ jobs:
         export AMREX_HOME=${PWD}
         export MICROPHYSICS_HOME=${PWD}/Microphysics
         cd Castro/Exec/hydro_tests/Sedov/
-        make -j4 CCACHE=ccache USE_MPI=FALSE
+        make -j4 CCACHE=ccache USE_MPI=FALSE \
+            USE_LINEAR_SOLVERS_INCFLO=FALSE \
+            USE_LINEAR_SOLVERS_EM=FALSE
 
         ccache -s
         du -hs ~/.cache/ccache
@@ -92,7 +94,9 @@ jobs:
             -DWarpX_QED=OFF                       \
             -DWarpX_OPENPMD=OFF                   \
             -DCMAKE_VERBOSE_MAKEFILE=ON           \
-            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache  \
+            -DAMReX_FFT=ON                        \
+            -DAMReX_LINEAR_SOLVERS_INCFLO=OFF
         cmake --build WarpX/build -j 4
 
         ccache -s
diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml
index c84d19850a..1ed5240164 100644
--- a/.github/workflows/clang.yml
+++ b/.github/workflows/clang.yml
@@ -44,6 +44,7 @@ jobs:
             -DCMAKE_BUILD_TYPE=Debug    \
             -DCMAKE_VERBOSE_MAKEFILE=ON \
             -DCMAKE_INSTALL_PREFIX=/tmp/my-amrex      \
+            -DAMReX_FFT=ON                            \
             -DAMReX_EB=ON                             \
             -DAMReX_FORTRAN=ON                        \
             -DAMReX_MPI=OFF                           \
@@ -104,6 +105,7 @@ jobs:
         cmake ..                                      \
             -DCMAKE_BUILD_TYPE=Debug                  \
             -DCMAKE_VERBOSE_MAKEFILE=ON               \
+            -DAMReX_FFT=ON                            \
             -DAMReX_EB=ON                             \
             -DAMReX_ENABLE_TESTS=ON                   \
             -DAMReX_FORTRAN=ON                        \
@@ -158,6 +160,7 @@ jobs:
         cmake ..                                      \
             -DCMAKE_BUILD_TYPE=RelWithDebInfo         \
             -DCMAKE_VERBOSE_MAKEFILE=ON               \
+            -DAMReX_FFT=ON                            \
             -DAMReX_EB=ON                             \
             -DAMReX_ENABLE_TESTS=ON                   \
             -DAMReX_FORTRAN=OFF                       \
@@ -200,7 +203,7 @@ jobs:
         export CCACHE_LOGFILE=${{ github.workspace }}/ccache.log.txt
         ccache -z
 
-        ./configure --dim 2 --with-fortran no --comp llvm --with-mpi no
+        ./configure --dim 2 --with-fortran no --comp llvm --with-mpi no --enable-fft yes
         make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS="-fno-operator-names" \
             CCACHE=ccache
         make install
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 9e96aefac5..927e99ded4 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -38,6 +38,7 @@ jobs:
 
         cmake -S . -B build                              \
             -DCMAKE_VERBOSE_MAKEFILE=ON                  \
+            -DAMReX_FFT=ON                               \
             -DAMReX_EB=ON                                \
             -DAMReX_ENABLE_TESTS=ON                      \
             -DAMReX_FORTRAN=OFF                          \
@@ -97,6 +98,7 @@ jobs:
         cmake -S . -B build                              \
             -DCMAKE_VERBOSE_MAKEFILE=ON                  \
             -DAMReX_MPI=OFF                              \
+            -DAMReX_FFT=ON                               \
             -DAMReX_EB=ON                                \
             -DAMReX_ENABLE_TESTS=ON                      \
             -DAMReX_FORTRAN=OFF                          \
@@ -153,6 +155,7 @@ jobs:
             -DCMAKE_VERBOSE_MAKEFILE=ON                  \
             -DAMReX_ENABLE_TESTS=ON                      \
             -DAMReX_TEST_TYPE=Small                      \
+            -DAMReX_FFT=ON                               \
             -DAMReX_FORTRAN=ON                           \
             -DAMReX_FORTRAN_INTERFACES=ON                \
             -DAMReX_GPU_BACKEND=CUDA                     \
@@ -196,7 +199,7 @@ jobs:
         ccache -z
 
         export PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
-        ./configure --dim 3 --with-cuda yes --enable-eb yes --enable-xsdk-defaults yes --with-fortran no
+        ./configure --dim 3 --with-cuda yes --enable-eb yes --enable-xsdk-defaults yes --with-fortran no --enable-fft yes
         #
         # /home/runner/work/amrex/amrex/Src/Base/AMReX_GpuLaunchGlobal.H:16:41: error: unused parameter ‘f0’ [-Werror=unused-parameter]
         #    16 |     AMREX_GPU_GLOBAL void launch_global (L f0) { f0(); }
diff --git a/.github/workflows/dependencies/dependencies.sh b/.github/workflows/dependencies/dependencies.sh
index 07e461f577..c7cde49651 100755
--- a/.github/workflows/dependencies/dependencies.sh
+++ b/.github/workflows/dependencies/dependencies.sh
@@ -16,6 +16,7 @@ sudo apt-get update
 
 sudo apt-get install -y --no-install-recommends\
     build-essential \
+    libfftw3-dev    \
     g++ gfortran    \
     libopenmpi-dev  \
     openmpi-bin
diff --git a/.github/workflows/dependencies/dependencies_clang.sh b/.github/workflows/dependencies/dependencies_clang.sh
index 2e96b5196d..4c329321b6 100755
--- a/.github/workflows/dependencies/dependencies_clang.sh
+++ b/.github/workflows/dependencies/dependencies_clang.sh
@@ -16,5 +16,6 @@ sudo apt-get update
 
 sudo apt-get install -y --no-install-recommends \
     build-essential      \
+    libfftw3-dev         \
     gfortran             \
     clang-$1
diff --git a/.github/workflows/dependencies/dependencies_gcc.sh b/.github/workflows/dependencies/dependencies_gcc.sh
index 2a576c0b52..93d9aa27ec 100755
--- a/.github/workflows/dependencies/dependencies_gcc.sh
+++ b/.github/workflows/dependencies/dependencies_gcc.sh
@@ -17,6 +17,7 @@ sudo apt-get update
 
 sudo apt-get install -y --no-install-recommends \
     build-essential    \
+    libfftw3-dev       \
     g++-$1 gfortran-$1 \
     libopenmpi-dev     \
     openmpi-bin
diff --git a/.github/workflows/dependencies/dependencies_hip.sh b/.github/workflows/dependencies/dependencies_hip.sh
index ab5185ce0a..df4f274ef3 100755
--- a/.github/workflows/dependencies/dependencies_hip.sh
+++ b/.github/workflows/dependencies/dependencies_hip.sh
@@ -56,6 +56,7 @@ sudo apt-get install -y --no-install-recommends \
     roctracer-dev   \
     rocprofiler-dev \
     rocrand-dev     \
+    rocfft-dev      \
     rocprim-dev
 
 # hiprand-dev is a new package that does not exist in old versions
diff --git a/.github/workflows/dependencies/dependencies_nvcc.sh b/.github/workflows/dependencies/dependencies_nvcc.sh
index abf9504801..2578bd33fe 100755
--- a/.github/workflows/dependencies/dependencies_nvcc.sh
+++ b/.github/workflows/dependencies/dependencies_nvcc.sh
@@ -35,5 +35,6 @@ sudo apt-get install -y \
     cuda-minimal-build-$VERSION_DASHED      \
     cuda-nvml-dev-$VERSION_DASHED           \
     cuda-nvtx-$VERSION_DASHED               \
+    libcufft-dev-$VERSION_DASHED            \
     libcurand-dev-$VERSION_DASHED
 sudo ln -s cuda-$VERSION_DOTTED /usr/local/cuda
diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml
index f4ae08f76d..88fe47c988 100644
--- a/.github/workflows/gcc.yml
+++ b/.github/workflows/gcc.yml
@@ -42,6 +42,7 @@ jobs:
         mkdir build
         cd build
         cmake ..                                  \
+            -DAMReX_FFT=ON                        \
             -DAMReX_FORTRAN=ON                    \
             -DAMReX_PLOTFILE_TOOLS=ON             \
             -DCMAKE_VERBOSE_MAKEFILE=ON           \
@@ -86,7 +87,7 @@ jobs:
         restore-keys: |
              ccache-${{ github.workflow }}-${{ github.job }}-git-
     - name: Build & Install
-      env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -O1 -Wnon-virtual-dtor -Wlogical-op -Wmisleading-indentation -Wduplicated-cond -Wduplicated-branches -Wmissing-include-dirs -Wno-null-dereference"}
+      env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -O1 -Wnon-virtual-dtor -Wlogical-op -Wmisleading-indentation -Wduplicated-cond -Wduplicated-branches -Wmissing-include-dirs"}
         # It's too slow with -O0
       run: |
         export CCACHE_COMPRESS=1
@@ -99,6 +100,7 @@ jobs:
         cmake -S . -B build             \
             -DCMAKE_BUILD_TYPE=Debug    \
             -DCMAKE_VERBOSE_MAKEFILE=ON \
+            -DAMReX_FFT=ON              \
             -DAMReX_EB=ON               \
             -DAMReX_ENABLE_TESTS=ON     \
             -DAMReX_FORTRAN=ON          \
@@ -147,6 +149,7 @@ jobs:
         cmake -S . -B build             \
             -DCMAKE_BUILD_TYPE=Debug    \
             -DCMAKE_VERBOSE_MAKEFILE=ON \
+            -DAMReX_FFT=ON              \
             -DAMReX_EB=ON               \
             -DAMReX_ENABLE_TESTS=ON     \
             -DAMReX_FORTRAN=ON          \
@@ -196,6 +199,7 @@ jobs:
         cmake -S . -B build             \
             -DCMAKE_BUILD_TYPE=Debug    \
             -DCMAKE_VERBOSE_MAKEFILE=ON \
+            -DAMReX_FFT=ON              \
             -DAMReX_EB=OFF              \
             -DAMReX_ENABLE_TESTS=ON     \
             -DAMReX_FORTRAN=ON          \
@@ -248,6 +252,7 @@ jobs:
             -DCMAKE_VERBOSE_MAKEFILE=ON \
             -DAMReX_ASSERTIONS=ON       \
             -DAMReX_TESTING=ON          \
+            -DAMReX_FFT=ON              \
             -DAMReX_EB=OFF              \
             -DAMReX_ENABLE_TESTS=ON     \
             -DAMReX_BOUND_CHECK=ON      \
@@ -310,6 +315,7 @@ jobs:
             -DAMReX_TESTING=ON          \
             -DAMReX_BOUND_CHECK=ON      \
             -DAMReX_FPE=ON              \
+            -DAMReX_FFT=ON              \
             -DAMReX_EB=ON               \
             -DAMReX_ENABLE_TESTS=ON     \
             -DAMReX_FORTRAN=ON          \
@@ -353,10 +359,7 @@ jobs:
       # /home/runner/work/amrex/amrex/Src/Base/AMReX_IntVect.H:194:92: error: array subscript -1 is below array bounds of ‘int [3]’ [-Werror=array-bounds]
       # int& operator[] (int i) noexcept { BL_ASSERT(i>=0 && i < AMREX_SPACEDIM); return vect[i]; }
       #
-      # inlined from ‘const amrex::MultiFab& amrex::EBFArrayBoxFactory::getVolFrac() const’ at /home/runner/work/amrex/amrex/Src/EB/AMReX_EBFabFactory.H:53:91,
-      # /usr/include/c++/12/bits/shared_ptr_base.h:1666:16: error: potential null pointer dereference [-Werror=null-dereference]
-      #
-      env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wunreachable-code -Wnon-virtual-dtor -Wlogical-op -Wmisleading-indentation -Wduplicated-cond -Wduplicated-branches -Wmissing-include-dirs -Wno-array-bounds -Wno-null-dereference"}
+      env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wunreachable-code -Wnon-virtual-dtor -Wlogical-op -Wmisleading-indentation -Wduplicated-cond -Wduplicated-branches -Wmissing-include-dirs -Wno-array-bounds"}
       run: |
         export CCACHE_COMPRESS=1
         export CCACHE_COMPRESSLEVEL=10
@@ -374,10 +377,10 @@ jobs:
             -DAMReX_TESTING=ON          \
             -DAMReX_BOUND_CHECK=ON      \
             -DAMReX_FPE=ON              \
+            -DAMReX_FFT=ON              \
             -DAMReX_EB=ON               \
             -DAMReX_ENABLE_TESTS=ON     \
             -DAMReX_FORTRAN=OFF         \
-            -DAMReX_FORTRAN=OFF         \
             -DCMAKE_C_COMPILER=$(which gcc-12)     \
             -DCMAKE_CXX_COMPILER=$(which g++-12)   \
             -DCMAKE_CXX_STANDARD=17     \
@@ -461,7 +464,7 @@ jobs:
         export CCACHE_LOGFILE=${{ github.workspace }}/ccache.log.txt
         ccache -z
 
-        ./configure --dim 3 --enable-eb yes --enable-xsdk-defaults yes
+        ./configure --dim 3 --enable-eb yes --enable-xsdk-defaults yes --enable-fft yes
         make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \
             CCACHE=ccache
         make install
@@ -501,7 +504,8 @@ jobs:
         export CCACHE_LOGFILE=${{ github.workspace }}/ccache.log.txt
         ccache -z
 
-        ./configure --dim 3 --enable-eb no --enable-xsdk-defaults no --single-precision yes --single-precision-particles yes --enable-tiny-profile yes
+        ./configure --dim 3 --enable-eb no --enable-xsdk-defaults no --single-precision yes \
+                    --single-precision-particles yes --enable-tiny-profile yes --enable-fft yes
         make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \
             CCACHE=ccache
         make install
@@ -627,6 +631,7 @@ jobs:
             -DAMReX_OMP=ON              \
             -DCMAKE_VERBOSE_MAKEFILE=ON \
             -DAMReX_ENABLE_TESTS=ON     \
+            -DAMReX_FFT=ON              \
             -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
         make -j 4
 
diff --git a/.github/workflows/hip.yml b/.github/workflows/hip.yml
index 345d7c468b..22154d6b01 100644
--- a/.github/workflows/hip.yml
+++ b/.github/workflows/hip.yml
@@ -48,6 +48,7 @@ jobs:
 
         cmake -S . -B build                               \
             -DCMAKE_VERBOSE_MAKEFILE=ON                   \
+            -DAMReX_FFT=ON                                \
             -DAMReX_EB=ON                                 \
             -DAMReX_ENABLE_TESTS=ON                       \
             -DAMReX_FORTRAN=ON                            \
@@ -103,6 +104,7 @@ jobs:
 
         cmake -S . -B build_full_legacywrapper            \
             -DCMAKE_VERBOSE_MAKEFILE=ON                   \
+            -DAMReX_FFT=ON                                \
             -DAMReX_EB=OFF                                \
             -DAMReX_ENABLE_TESTS=ON                       \
             -DAMReX_FORTRAN=ON                            \
@@ -145,7 +147,9 @@ jobs:
         export CCACHE_MAXSIZE=100M
         ccache -z
 
-        ./configure --dim 2 --with-hip yes --enable-eb yes --enable-xsdk-defaults yes --with-mpi no --with-omp no --single-precision yes --single-precision-particles yes
+        ./configure --dim 2 --with-hip yes --enable-eb yes --enable-xsdk-defaults yes \
+                    --with-mpi no --with-omp no --single-precision yes \
+                    --single-precision-particles yes --enable-fft yes
         make -j4 WARN_ALL=TRUE AMD_ARCH=gfx90a CCACHE=ccache
         make install
 
diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml
index 227b0f9738..15c7bbda58 100644
--- a/.github/workflows/intel.yml
+++ b/.github/workflows/intel.yml
@@ -41,6 +41,7 @@ jobs:
         set -e
         cmake -S . -B build                                \
             -DCMAKE_VERBOSE_MAKEFILE=ON                    \
+            -DAMReX_FFT=ON                                 \
             -DAMReX_EB=OFF                                 \
             -DAMReX_ENABLE_TESTS=ON                        \
             -DAMReX_FORTRAN=ON                             \
@@ -89,6 +90,7 @@ jobs:
         set -e
         cmake -S . -B build                                \
             -DCMAKE_VERBOSE_MAKEFILE=ON                    \
+            -DAMReX_FFT=ON                                 \
             -DAMReX_EB=ON                                  \
             -DAMReX_ENABLE_TESTS=ON                        \
             -DAMReX_FORTRAN=OFF                            \
diff --git a/CHANGES b/CHANGES
index 6c0cf8c6fe..2f5d0e5373 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,81 @@
+# 24.10
+
+  -- TinyProfiler: Remove unnecessary precision() call (#4174)
+
+  -- Fix GCC 12 & 13 warnings on null-dereference (#4171)
+
+  -- fix: wavefront_size for gfx11* (#4170)
+
+  -- CI: Test GCC-13 (#4169)
+
+  -- allow hidden dimension when calling FFlux routines (#4162)
+
+  -- Deregister BArena from Profiling in Arena::Finalize (#4164)
+
+  -- FillPatchSingleLevel and FillPatchTwoLevels for ERF (#4158)
+
+  -- `ParmParse:addFile`: User-Friendly Error (#4156)
+
+  -- Adding computation of complete elliptic integrals into amrex::Math (#4151)
+
+  -- Fix roundoff issue in SUNDIALS evolve() (#4148) (#4157)
+
+  -- Add a new InterFromCoarseLevel for ERF (#4150)
+
+  -- Add ParmParse features for WarpX (#4149)
+
+  -- ParmParse::queryAsDouble: Support bool and std::optional (#4152)
+
+  -- add geometric terms for spherical 2D support. (#4141)
+
+  -- Add std::setfill to PrintMemStats (#4147)
+
+  -- Add ParmParse::query_enum_sloppy that can ignore characters (#4145)
+
+  -- Fix ParmParse::query_enum_case_insensitive (#4144)
+
+  -- AMREX_ENUM: Add more capabilities (#4143)
+
+  -- Add ParmParse::eval (#4142)
+
+  -- AnyCTO with arbitrary number of functions (#4135)
+
+  -- IOFormatSaver (#4104)
+
+  -- amrex::Stack (#4139)
+
+  -- Use BL_PROFILE instead of BL_PROFILE_VAR to time in knapsack()swap (#4134)
+
+  -- Add iMultiFab::sum that returns the sum over a region (#4132)
+
+  -- EB Boundary Area: Fix issues for anisotropic cell size (#4131)
+
+  -- `ParmParse`: Prefix to `FILE` (#4126)
+
+  -- MLMG: Minimum domain width (#4129)
+
+  -- Capability adds for ParmParse enum (#4119)
+
+  -- use perl instead of sed in style checks for portability to MacOS (#4127)
+
+  -- Fortran Interfaces: Add new average down functions (#4124)
+
+  -- TinyProfiler: A few updates (#4102)
+
+  -- ArenaProfiler: Fix clang-tidy warning (#4128)
+
+  -- CTOParallelFor with BoxND / add AnyCTO (#4109)
+
+  -- TinyProfiler with BArena and PArena (#4113)
+
+  -- Fix Fortran interface compilation issue using `nvfortran` (#4115)
+
+  -- `AMREX_DEVICE_PRINTF`: Host (#4116)
+
+  -- EB: don't abort for no-op case in unsupported addFineLevels functions (#4123)
+
+  -- Fix FillPatchNLevels (#4117)
+
 # 24.09
 
   -- Curl Curl Solver: Option to use PCG instead of LU (#3812)
diff --git a/Docs/sphinx_documentation/source/BuildingAMReX.rst b/Docs/sphinx_documentation/source/BuildingAMReX.rst
index 90fb4d6eb3..831346765b 100644
--- a/Docs/sphinx_documentation/source/BuildingAMReX.rst
+++ b/Docs/sphinx_documentation/source/BuildingAMReX.rst
@@ -463,12 +463,20 @@ The list of available options is reported in the :ref:`table <tab:cmakevar>` bel
    +------------------------------+-------------------------------------------------+-------------------------+-----------------------+
    | AMReX_LINEAR_SOLVERS         |  Build AMReX linear solvers                     | YES                     | YES, NO               |
    +------------------------------+-------------------------------------------------+-------------------------+-----------------------+
+   | AMReX_LINEAR_SOLVERS_INCFLO  |  Build AMReX linear solvers for incompressible  | YES                     | YES, NO               |
+   |                              |  flow                                           |                         |                       |
+   +------------------------------+-------------------------------------------------+-------------------------+-----------------------+
+   | AMReX_LINEAR_SOLVERS_EM      |  Build AMReX linear solvers for electromagnetic | YES                     | YES, NO               |
+   |                              |  solvers                                        |                         |                       |
+   +------------------------------+-------------------------------------------------+-------------------------+-----------------------+
    | AMReX_AMRDATA                |  Build data services                            | NO                      | YES, NO               |
    +------------------------------+-------------------------------------------------+-------------------------+-----------------------+
    | AMReX_AMRLEVEL               |  Build AmrLevel class                           | YES                     | YES, NO               |
    +------------------------------+-------------------------------------------------+-------------------------+-----------------------+
    | AMReX_EB                     |  Build Embedded Boundary support                | NO                      | YES, NO               |
    +------------------------------+-------------------------------------------------+-------------------------+-----------------------+
+   | AMReX_FFT                    |  Build FFT support                              | NO                      | YES, NO               |
+   +------------------------------+-------------------------------------------------+-------------------------+-----------------------+
    | AMReX_PARTICLES              |  Build particle classes                         | YES                     | YES, NO               |
    +------------------------------+-------------------------------------------------+-------------------------+-----------------------+
    | AMReX_PARTICLES_PRECISION    |  Set reals precision in particle classes        | Same as AMReX_PRECISION | DOUBLE, SINGLE        |
@@ -681,12 +689,18 @@ A list of AMReX component names and related configure options are shown in the t
    +------------------------------+-----------------+
    | AMReX_LINEAR_SOLVERS         | LSOLVERS        |
    +------------------------------+-----------------+
+   | AMReX_LINEAR_SOLVERS_INCFLO  | LSOLVERS_INCFLO |
+   +------------------------------+-----------------+
+   | AMReX_LINEAR_SOLVERS_EM      | LSOLVERS_EM     |
+   +------------------------------+-----------------+
    | AMReX_AMRDATA                | AMRDATA         |
    +------------------------------+-----------------+
    | AMReX_AMRLEVEL               | AMRLEVEL        |
    +------------------------------+-----------------+
    | AMReX_EB                     | EB              |
    +------------------------------+-----------------+
+   | AMReX_FFT                    | FFT             |
+   +------------------------------+-----------------+
    | AMReX_PARTICLES              | PARTICLES       |
    +------------------------------+-----------------+
    | AMReX_PARTICLES_PRECISION    | PDOUBLE, PSINGLE|
diff --git a/Docs/sphinx_documentation/source/FFT.rst b/Docs/sphinx_documentation/source/FFT.rst
new file mode 100644
index 0000000000..3fc24fcab8
--- /dev/null
+++ b/Docs/sphinx_documentation/source/FFT.rst
@@ -0,0 +1,71 @@
+.. role:: cpp(code)
+   :language: c++
+
+.. _sec:FFT:r2c:
+
+FFT::R2C Class
+==============
+
+Class template `FFT::R2C` supports discrete Fourier transforms between real
+and complex data. The name R2C indicates that the forward transform converts
+real data to complex data, while the backward transform converts complex
+data to real data. It should be noted that both directions of transformation
+are supported, not just from real to complex.
+
+The implementation utilizes cuFFT, rocFFT, oneMKL and FFTW, for CUDA, HIP,
+SYCL and CPU builds, respectively. Because the parallel communication is
+handled by AMReX, it does not need the parallel version of
+FFTW. Furthermore, there is no constraint on the domain decomposition such
+as one Box per process. This class performs parallel FFT on AMReX's parallel
+data containers (e.g., :cpp:`MultiFab` and
+:cpp:`FabArray<BaseFab<ComplexData<Real>>>`). For local FFT, the users can
+use FFTW, cuFFT, rocFFT, or oneMKL directly.
+
+Other than using column-major order, AMReX follows the convention of
+FFTW. Applying the forward transform followed by the backward transform
+scales the original data by the size of the input array. The layout of the
+complex data also follows the FFTW convention, where the complex Hermitian
+output array has `(nx/2+1,ny,nz)` elements. Here `nx`, `ny` and `nz` are the
+sizes of the real array and the division is rounded down.
+
+Below are examples of using :cpp:`FFT::R2C`.
+
+.. highlight:: c++
+
+::
+
+    Geometry geom(...);
+    MultiFab mfin(...);
+    MultiFab mfout(...);
+
+    auto scaling = 1. / geom.Domain().d_numPts();
+
+    FFT::R2C r2c(geom.Domain());
+    r2c.forwardThenBackward(mfin, mfout,
+        [=] AMREX_GPU_DEVICE (int, int, int, auto& sp)
+        {
+            sp *= scaling;
+        });
+
+    cMultiFab cmf(...);
+    FFT::R2C<Real,FFT::Direction::forward> r2c_forward(geom.Domain());
+    r2c_forward(mfin, cmf);
+
+    FFT::R2C<Real,FFT::Direction::backward> r2c_backward(geom.Domain());
+    r2c_backward(cmf, mfout);
+
+Note that using :cpp:`forwardThenBackward` is expected to be more efficient
+than separate calls to :cpp:`forward` and :cpp:`backward` because some
+parallel communication can be avoided. It should also be noted that a lot of
+preparation work is done in the construction of an :cpp:`FFT::R2C`
+object. Therefore, one should cache it for reuse if possible.
+
+
+Poisson Solver
+==============
+
+AMReX provides FFT-based Poisson solvers. :cpp:`FFT::Poisson` supports all
+periodic boundaries using purely FFT. :cpp:`FFT::PoissonHybrid` is a 3D only
+solver that supports periodic boundaries in the first two dimensions and
+Neumann boundary in the last dimension. Similar to :cpp:`FFT::R2C`, the
+Poisson solvers should be cached for reuse.
diff --git a/Docs/sphinx_documentation/source/FFT_Chapter.rst b/Docs/sphinx_documentation/source/FFT_Chapter.rst
new file mode 100644
index 0000000000..9d6e9505d4
--- /dev/null
+++ b/Docs/sphinx_documentation/source/FFT_Chapter.rst
@@ -0,0 +1,16 @@
+.. _Chap:FFT:
+
+.. _sec:FFT:FFTOverview:
+
+Discrete Fourier Transform
+==========================
+
+AMReX provides support for parallel discrete Fourier transforms. The
+implementation utilizes cuFFT, rocFFT, oneMKL and FFTW, for CUDA, HIP, SYCL
+and CPU builds, respectively. It also provides FFT-based Poisson
+solvers.
+
+.. toctree::
+   :maxdepth: 1
+
+   FFT
diff --git a/Docs/sphinx_documentation/source/index.rst b/Docs/sphinx_documentation/source/index.rst
index 203545cf40..09ffbb5c0b 100644
--- a/Docs/sphinx_documentation/source/index.rst
+++ b/Docs/sphinx_documentation/source/index.rst
@@ -52,6 +52,7 @@ Documentation on migration from BoxLib is available in the AMReX repository at D
    Fortran_Chapter
    Python_Chapter
    EB_Chapter
+   FFT_Chapter
    TimeIntegration_Chapter
    GPU_Chapter
    Visualization_Chapter
diff --git a/GNUmakefile.in b/GNUmakefile.in
index b85c2e0c35..67c789d97c 100644
--- a/GNUmakefile.in
+++ b/GNUmakefile.in
@@ -26,6 +26,9 @@ ifeq ($(USE_LINEAR_SOLVERS),TRUE)
      Pdirs += F_Interfaces/LinearSolvers
    endif
 endif
+ifeq ($(USE_FFT),TRUE)
+   Pdirs += FFT
+endif
 ifeq ($(USE_EB),TRUE)
    Pdirs += EB
 endif
diff --git a/Src/Base/AMReX_Array.H b/Src/Base/AMReX_Array.H
index 15ddde4d1e..d9f5fd1af5 100644
--- a/Src/Base/AMReX_Array.H
+++ b/Src/Base/AMReX_Array.H
@@ -10,6 +10,7 @@
 #include <AMReX_REAL.H>
 #include <AMReX_Algorithm.H>
 #include <AMReX_Dim3.H>
+#include <AMReX_SmallMatrix.H>
 
 #include <array>
 #include <memory>
@@ -148,10 +149,6 @@ namespace amrex {
      * order (last index moving fastest). If not specified, Fortran order is
      * assumed.
      */
-    namespace Order {
-        struct C {};
-        struct F {};
-    }
 
     /**
      * A GPU-compatible one-dimensional array.
@@ -280,7 +277,7 @@ namespace amrex {
      *               default if not given)
      */
     template <class T, int XLO, int XHI, int YLO, int YHI,
-              class ORDER=Order::F>
+              Order ORDER = Order::F>
     struct Array2D
     {
         /**
@@ -370,8 +367,7 @@ namespace amrex {
          * If the order is not specified, Fortran column-major order is assumed
          * (the index \c i moves the fastest)
          */
-        template <typename O=ORDER,
-                  std::enable_if_t<std::is_same_v<O,Order::F>,int> = 0>
+        template <Order Ord=ORDER, std::enable_if_t<Ord==Order::F,int> = 0>
         [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
         const T& operator() (int i, int j) const noexcept {
             AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI);
@@ -384,8 +380,7 @@ namespace amrex {
          * If the order is not specified, Fortran column-major order is assumed
          * (the index \c i moves the fastest)
          */
-        template <typename O=ORDER,
-                  std::enable_if_t<std::is_same_v<O,Order::F>,int> = 0>
+        template <Order Ord=ORDER, std::enable_if_t<Ord==Order::F,int> = 0>
         [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
         T& operator() (int i, int j) noexcept {
             AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI);
@@ -398,8 +393,7 @@ namespace amrex {
          * When the order is manually specified as Order::C, row-major order
          * is used (the index \c j moves the fastest).
          */
-        template <typename O=ORDER,
-                  std::enable_if_t<std::is_same_v<O,Order::C>,int> = 0>
+        template <Order Ord=ORDER, std::enable_if_t<Ord==Order::C,int> = 0>
         [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
         const T& operator() (int i, int j) const noexcept {
             AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI);
@@ -412,8 +406,7 @@ namespace amrex {
          * When the order is manually specified as Order::C, row-major order
          * is used (the index \c j moves the fastest).
          */
-        template <typename O=ORDER,
-                  std::enable_if_t<std::is_same_v<O,Order::C>,int> = 0>
+        template <Order Ord=ORDER, std::enable_if_t<Ord==Order::C,int> = 0>
         [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
         T& operator() (int i, int j) noexcept {
             AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI);
@@ -551,7 +544,7 @@ namespace amrex {
      *               default if not given)
      */
     template <class T, int XLO, int XHI, int YLO, int YHI, int ZLO, int ZHI,
-              class ORDER=Order::F>
+              Order ORDER=Order::F>
     struct Array3D
     {
         /**
@@ -662,8 +655,7 @@ namespace amrex {
          * If the order is not specified, Fortran column-major order is assumed
          * (the index \c i moves the fastest)
          */
-        template <typename O=ORDER,
-                  std::enable_if_t<std::is_same_v<O,Order::F>,int> = 0>
+        template <Order Ord=ORDER, std::enable_if_t<Ord==Order::F,int> = 0>
         [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
         const T& operator() (int i, int j, int k) const noexcept {
             return arr[i+j*(XHI-XLO+1)+k*((XHI-XLO+1)*(YHI-YLO+1))
@@ -676,8 +668,7 @@ namespace amrex {
          * If the order is not specified, Fortran column-major order is assumed
          * (the index \c i moves the fastest)
          */
-        template <typename O=ORDER,
-                  std::enable_if_t<std::is_same_v<O,Order::F>,int> = 0>
+        template <Order Ord=ORDER, std::enable_if_t<Ord==Order::F,int> = 0>
         [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
         T& operator() (int i, int j, int k) noexcept {
             return arr[i+j*(XHI-XLO+1)+k*((XHI-XLO+1)*(YHI-YLO+1))
@@ -690,8 +681,7 @@ namespace amrex {
          * When the order is manually specified as Order::C, row-major order
          * is used (the index \c k moves the fastest).
          */
-        template <typename O=ORDER,
-                  std::enable_if_t<std::is_same_v<O,Order::C>,int> = 0>
+        template <Order Ord=ORDER, std::enable_if_t<Ord==Order::C,int> = 0>
         [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
         const T& operator() (int i, int j, int k) const noexcept {
             return arr[k+j*(ZHI-ZLO+1)+i*((ZHI-ZLO+1)*(YHI-YLO+1))
@@ -704,8 +694,7 @@ namespace amrex {
          * When the order is manually specified as Order::C, row-major order
          * is used (the index \c k moves the fastest).
          */
-        template <typename O=ORDER,
-                  std::enable_if_t<std::is_same_v<O,Order::C>,int> = 0>
+        template <Order Ord=ORDER, std::enable_if_t<Ord==Order::C,int> = 0>
         [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
         T& operator() (int i, int j, int k) noexcept {
             return arr[k+j*(ZHI-ZLO+1)+i*((ZHI-ZLO+1)*(YHI-YLO+1))
diff --git a/Src/Base/AMReX_BoxArray.H b/Src/Base/AMReX_BoxArray.H
index b3b339c33b..e85946872c 100644
--- a/Src/Base/AMReX_BoxArray.H
+++ b/Src/Base/AMReX_BoxArray.H
@@ -53,6 +53,24 @@ namespace amrex
     //! Note that two BoxArrays that match are not necessarily equal.
     [[nodiscard]] bool match (const BoxArray& x, const BoxArray& y);
 
+    /**
+     * \brief Decompose domain box into BoxArray
+     *
+     * The returned BoxArray has nboxes Boxes, unless the domain is too
+     * small. We aim to decompose the domain into subdomains that are as
+     * cubic as possible, even if this results in Boxes with odd numbers of
+     * cells. Thus, this function is generally not suited for applications
+     * with multiple AMR levels or for multigrid solvers.
+     *
+     * \param domain Domain Box
+     * \param nboxes the target number of Boxes
+     * \param decomp controls whether domain decomposition should be done in
+     *               that direction.
+     */
+    [[nodiscard]] BoxArray decompose (Box const& domain, int nboxes,
+                                      Array<bool,AMREX_SPACEDIM> const& decomp
+                                      = {AMREX_D_DECL(true,true,true)});
+
 struct BARef
 {
     BARef ();
diff --git a/Src/Base/AMReX_BoxArray.cpp b/Src/Base/AMReX_BoxArray.cpp
index 9bca594352..576d4cb870 100644
--- a/Src/Base/AMReX_BoxArray.cpp
+++ b/Src/Base/AMReX_BoxArray.cpp
@@ -12,6 +12,10 @@
 
 #include <AMReX_OpenMP.H>
 
+#include <algorithm>
+#include <cstdlib>
+#include <functional>
 #include <iostream>
+#include <numeric>
 
 namespace amrex {
@@ -1887,6 +1891,173 @@ bool match (const BoxArray& x, const BoxArray& y)
     }
 }
 
+BoxArray decompose (Box const& domain, int nboxes,
+                    Array<bool,AMREX_SPACEDIM> const& decomp)
+{
+    auto ndecomp = std::count(decomp.begin(), decomp.end(), true); // # of directions that may be split
+
+    if (nboxes <= 1 || ndecomp == 0) { // nothing to split: return the domain as a single Box
+        return BoxArray(domain);
+    }
+
+    Box const& ccdomain = amrex::enclosedCells(domain); // the splitting below uses this Box; results are converted back to domain's index type
+    IntVect const& ncells = ccdomain.length();
+    IntVect nprocs(1); // # of pieces per direction; their product must equal nboxes (asserted below)
+
+    if (ndecomp == 1) { // only one splittable direction: it receives all nboxes pieces
+        for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+            if (decomp[idim]) {
+                nprocs[idim] = nboxes;
+            }
+        }
+    } else {
+        // Prime factorization of nboxes by trial division; factors end up in ascending order.
+        Vector<int> factors;
+        {
+            int x = 2;
+            int n = nboxes;
+            while (x*x <= n) {
+                std::div_t dv = std::div(n, x);
+                if (dv.rem == 0) {
+                    factors.push_back(x);
+                    n = dv.quot;
+                } else {
+                    ++x;
+                }
+            }
+            if (n != 1) { // whatever is left after trial division is the last (largest) factor
+                factors.push_back(n);
+            }
+            AMREX_ALWAYS_ASSERT(nboxes == std::accumulate(factors.begin(), factors.end(),
+                                                          1, std::multiplies<>()));
+        }
+
+        struct ProcDim
+        {
+            int nproc;          // # of pieces currently assigned to direction idim
+            int idim;           // direction index
+            Vector<int> procs;  // the individual factors whose product is nproc
+            ProcDim (int np, int dim) : nproc(np), idim(dim) {}
+        };
+
+        Vector<ProcDim> procdim;
+        procdim.reserve(AMREX_SPACEDIM);
+
+        Array<Long,AMREX_SPACEDIM> nblocks; // # of cells per direction (0 if that direction is not decomposed)
+
+        for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+            if (decomp[idim]) {
+                nblocks[idim] = ncells[idim];
+                procdim.emplace_back(1,idim);
+            } else {
+                nblocks[idim] = 0;  // This dimension will not be decomposed.
+            }
+        }
+
+        auto comp = [&] (ProcDim const& a, ProcDim const& b) { // order by cells per piece (nblocks/nproc), ascending
+                          if (nblocks[a.idim]*b.nproc <
+                              nblocks[b.idim]*a.nproc) {
+                              return true;
+                          } else if (nblocks[a.idim]*b.nproc >
+                                     nblocks[b.idim]*a.nproc) {
+                              return false;
+                          } else {
+                              return a.procs.size() > b.procs.size(); // tie: the one holding more factors sorts first
+                          }
+                      };
+
+        int nprocs_tot = 1; // product of the factors handed out so far
+        while (!factors.empty()) { // hand each factor, largest first, to the direction with the most cells per piece
+            std::sort(procdim.begin(), procdim.end(), comp);
+            auto f = factors.back();
+            factors.pop_back();
+            procdim.back().nproc *= f;
+            procdim.back().procs.push_back(f);
+            nprocs_tot *= f;
+            if (nprocs_tot == nboxes) {
+                break;
+            }
+        }
+
+        // Swap single factors between two directions as long as that evens out the cells per piece.
+        while (true)
+        {
+            std::sort(procdim.begin(), procdim.end(), comp);
+            auto fit = std::find_if(procdim.begin(),procdim.end(),
+                                    [] (ProcDim const& x) { return x.nproc > 1; });
+            if (fit == procdim.end()) { break; } // This should not actually happen.
+            auto& light = *fit; // fewest cells per piece among the directions with nproc > 1
+            auto& heavy = procdim.back(); // most cells per piece
+            Long w0 = nblocks[light.idim] * heavy.nproc;
+            Long w1 = nblocks[heavy.idim] * light.nproc;
+            if (w0 >= w1) { break; } // light already has at least as many cells per piece as heavy
+            bool swapped = false;
+            for (auto& f0 : light.procs) {
+                for (auto& f1 : heavy.procs) {
+                    if ((f0 > f1) && (w0*f0 < w1*f1)) { // heavy gets more pieces; light stays below heavy's old load
+                        light.nproc /= f0;
+                        light.nproc *= f1;
+                        heavy.nproc /= f1;
+                        heavy.nproc *= f0;
+                        std::swap(f0,f1);
+                        swapped = true;
+                        break;
+                    }
+                }
+                if (swapped) { break;}
+            }
+            if (!swapped) { break; }
+        }
+
+        for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { // directions that are not decomposed keep a single piece
+            if (!decomp[idim]) {
+                procdim.emplace_back(1,idim);
+            }
+        }
+        for (auto const& pd : procdim) {
+            nprocs[pd.idim] = pd.nproc;
+        }
+    }
+
+    AMREX_ALWAYS_ASSERT(AMREX_D_TERM(nprocs[0],*nprocs[1],*nprocs[2]) == nboxes);
+
+    IntVect const domlo = ccdomain.smallEnd();
+    IntVect const sz = ncells / nprocs; // base # of cells per piece in each direction
+    IntVect const extra = ncells - sz*nprocs; // remainder, handed out one cell each to the first pieces
+    auto ixtyp = domain.ixType();
+    BoxList bl(ixtyp);
+#if (AMREX_SPACEDIM == 3)
+    for (int k = 0; k < nprocs[2]; ++k) {
+        // The first extra[2] blocks get one extra cell with a total of
+        // sz[2]+1.  The rest get sz[2] cells.  The decomposition in y
+        // and x directions are similar.
+        int klo = (k < extra[2]) ? k*(sz[2]+1) : (k*sz[2]+extra[2]);
+        int khi = (k < extra[2]) ? klo+(sz[2]+1)-1 : klo+sz[2]-1;
+        klo += domlo[2];
+        khi += domlo[2];
+#endif
+#if (AMREX_SPACEDIM >= 2)
+        for (int j = 0; j < nprocs[1]; ++j) {
+            int jlo = (j < extra[1]) ? j*(sz[1]+1) : (j*sz[1]+extra[1]);
+            int jhi = (j < extra[1]) ? jlo+(sz[1]+1)-1 : jlo+sz[1]-1;
+            jlo += domlo[1];
+            jhi += domlo[1];
+#endif
+            for (int i = 0; i < nprocs[0]; ++i) {
+                int ilo = (i < extra[0]) ? i*(sz[0]+1) : (i*sz[0]+extra[0]);
+                int ihi = (i < extra[0]) ? ilo+(sz[0]+1)-1 : ilo+sz[0]-1;
+                ilo += domlo[0];
+                ihi += domlo[0];
+                Box b{IntVect(AMREX_D_DECL(ilo,jlo,klo)),
+                      IntVect(AMREX_D_DECL(ihi,jhi,khi))};
+                if (b.ok()) { // presumably skips pieces that received no cells (hi < lo when sz is 0)
+                    bl.push_back(b.convert(ixtyp));
+                }
+    AMREX_D_TERM(},},})
+
+    return BoxArray(std::move(bl));
+}
+
 std::ostream&
 operator<< (std::ostream& os, const BoxArray::RefID& id)
 {
diff --git a/Src/Base/AMReX_ConstexprFor.H b/Src/Base/AMReX_ConstexprFor.H
new file mode 100644
index 0000000000..972dd1ac30
--- /dev/null
+++ b/Src/Base/AMReX_ConstexprFor.H
@@ -0,0 +1,38 @@
+#ifndef AMREX_CONSTEXPR_FOR_H_
+#define AMREX_CONSTEXPR_FOR_H_
+#include <AMReX_Config.H>
+
+#include <AMReX_GpuQualifiers.H>
+#include <AMReX_Extension.H>
+
+#include <type_traits>
+
+namespace amrex {
+
+// Implementation of "constexpr for" based on
+// https://artificial-mind.net/blog/2020/10/31/constexpr-for
+//
+// Approximates what one would get from a compile-time
+// unrolling of the loop
+// for (auto i = I; i < N; ++i) {
+//    f(i);
+// }
+//
+// The mechanism is recursive: we evaluate f(i) at the current
+// i and then call the for loop at i+1. f() is a callable that
+// provides the body of the loop; its only argument is a
+// std::integral_constant holding i, usable as a compile-time integer.
+
+template<auto I, auto N, class F>
+AMREX_GPU_HOST_DEVICE AMREX_INLINE
+constexpr void constexpr_for (F const& f)
+{
+    if constexpr (I < N) {
+        f(std::integral_constant<decltype(I), I>());
+        constexpr_for<I+1, N>(f);
+    }
+}
+
+}
+
+#endif
diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H
index 69970d6401..a67a72f0a3 100644
--- a/Src/Base/AMReX_FabArray.H
+++ b/Src/Base/AMReX_FabArray.H
@@ -13,6 +13,7 @@
 #include <AMReX_FabFactory.H>
 #include <AMReX_DistributionMapping.H>
 #include <AMReX_Geometry.H>
+#include <AMReX_GpuComplex.H>
 #include <AMReX_ParallelDescriptor.H>
 #include <AMReX_Utility.H>
 #include <AMReX_ccse-mpi.H>
@@ -3679,6 +3680,8 @@ FabArray<FAB>::norminf (FabArray<IFAB> const& mask, int comp, int ncomp,
     return nm0;
 }
 
+using cMultiFab = FabArray<BaseFab<GpuComplex<Real> > >;
+
 }
 
 #endif /*BL_FABARRAY_H*/
diff --git a/Src/Base/AMReX_GpuComplex.H b/Src/Base/AMReX_GpuComplex.H
index 274da82604..42dfc7626e 100644
--- a/Src/Base/AMReX_GpuComplex.H
+++ b/Src/Base/AMReX_GpuComplex.H
@@ -41,16 +41,16 @@ struct alignas(2*sizeof(T)) GpuComplex
     /**
      * \brief Return the real part.
      */
-    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
     constexpr T real () const noexcept { return m_real; }
 
     /**
      * \brief Return the imaginary part.
      */
-    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
     constexpr T imag () const noexcept { return m_imag; }
 
-   /**
+    /**
      * \brief Add a real number to this complex number.
      */
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
diff --git a/Src/Base/AMReX_GpuError.H b/Src/Base/AMReX_GpuError.H
index ce3ac188a8..65457c8f4e 100644
--- a/Src/Base/AMReX_GpuError.H
+++ b/Src/Base/AMReX_GpuError.H
@@ -84,6 +84,16 @@ namespace Gpu {
     std::string errStr(std::string("CURAND error in file ") + __FILE__  \
                        + " line " + std::to_string(__LINE__));          \
     amrex::Abort(errStr); }} while(0)
+
+// Abort with code/file/line on cuFFT failure; do-while(0) makes it a single statement like the macros above.
+#define AMREX_CUFFT_SAFE_CALL(call) do { \
+    cufftResult_t amrex_i_err = (call); \
+    if (CUFFT_SUCCESS != amrex_i_err) { \
+        std::string errStr(std::string("CUFFT error ")+std::to_string(amrex_i_err) \
+                           + std::string(" in file ") + __FILE__        \
+                           + " line " + std::to_string(__LINE__));      \
+        amrex::Abort(errStr); \
+    }} while(0)
 #endif
 
 #ifdef AMREX_USE_HIP
@@ -100,6 +110,16 @@ namespace Gpu {
     std::string errStr(std::string("HIPRAND error in file ") + __FILE__  \
                        + " line " + std::to_string(__LINE__));          \
     amrex::Abort(errStr); }} while(0)
+
+// Abort with code/file/line on rocFFT failure; do-while(0) makes it a single statement like the macros above.
+#define AMREX_ROCFFT_SAFE_CALL(call) do { \
+    auto amrex_i_err = (call); \
+    if (rocfft_status_success != amrex_i_err) { \
+        std::string errStr(std::string("rocFFT error ")+std::to_string(amrex_i_err) \
+                           + std::string(" in file ") + __FILE__        \
+                           + " line " + std::to_string(__LINE__));      \
+        amrex::Abort(errStr); \
+    }} while(0)
 #endif
 
 #define AMREX_GPU_ERROR_CHECK() amrex::Gpu::ErrorCheck(__FILE__, __LINE__)
diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H
index 435a11f342..5f1e61e008 100644
--- a/Src/Base/AMReX_GpuLaunch.H
+++ b/Src/Base/AMReX_GpuLaunch.H
@@ -21,6 +21,7 @@
 #include <AMReX_RandomEngine.H>
 #include <AMReX_Algorithm.H>
 #include <AMReX_Math.H>
+#include <AMReX_Vector.H>
 #include <cstddef>
 #include <limits>
 #include <algorithm>
@@ -176,6 +177,47 @@ namespace Gpu {
     {
         return makeExecutionConfig<MT>(box.numPts());
     }
+
+    struct ExecConfig
+    {
+        Long start_idx;
+        int nblocks;
+    };
+
+    template <int MT>
+    Vector<ExecConfig> makeNExecutionConfigs (Long N) noexcept
+    {
+        // Max # of blocks in a kernel launch
+        int numblocks_max = std::numeric_limits<int>::max();
+        // Max # of threads in a kernel launch
+        Long nmax = Long(MT) * numblocks_max;
+        // # of launches needed for N elements without using grid-stride
+        // loops inside GPU kernels.
+        auto nlaunches = int((N+nmax-1)/nmax);
+        Vector<ExecConfig> r(nlaunches);
+        Long ndone = 0;
+        for (int i = 0; i < nlaunches; ++i) {
+            int nblocks;
+            if (N > nmax) {
+                nblocks = numblocks_max;
+                N -= nmax;
+            } else {
+                nblocks = int((N+MT-1)/MT);
+            }
+            // At which element ID the kernel should start
+            r[i].start_idx = ndone;
+            ndone += Long(nblocks) * MT;
+            // # of blocks in this launch
+            r[i].nblocks = nblocks;
+        }
+        return r;
+    }
+
+    template <int MT, int dim>
+    Vector<ExecConfig> makeNExecutionConfigs (BoxND<dim> const& box) noexcept
+    {
+        return makeNExecutionConfigs<MT>(box.numPts());
+    }
 #endif
 
 }
diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
index 7955410f8b..56a95dbc5b 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsG.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -747,17 +747,45 @@ void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noe
     launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
 }
 
-template<int MT, typename T, typename L>
+template<int MT, typename T, typename L, std::enable_if_t<std::is_integral_v<T>,int> FOO = 0>
 void launch (T const& n, L const& f) noexcept
 {
+    static_assert(sizeof(T) >= 2);
     if (amrex::isEmpty(n)) { return; }
-    const auto ec = Gpu::makeExecutionConfig<MT>(n);
-    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE () noexcept {
-        for (auto const i : Gpu::Range(n)) {
-            f(i);
-        }
-    });
+    const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
+    for (auto const& ec : nec) {
+        const T start_idx = T(ec.start_idx);
+        const T nleft = n - start_idx;
+        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            // This will not overflow, even though nblocks*MT might.
+            auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
+            if (tid < nleft) {
+                f(tid+start_idx);
+            }
+        });
+    }
+    AMREX_GPU_ERROR_CHECK();
+}
+
+template<int MT, int dim, typename L>
+void launch (BoxND<dim> const& box, L const& f) noexcept
+{
+    if (box.isEmpty()) { return; }
+    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
+    const BoxIndexerND<dim> indexer(box);
+    const auto type = box.ixType();
+    for (auto const& ec : nec) {
+        const auto start_idx = std::uint64_t(ec.start_idx);
+        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
+            if (icell < indexer.numPts()) {
+                auto iv = indexer.intVect(icell);
+                f(BoxND<dim>(iv,iv,type));
+            }
+        });
+    }
     AMREX_GPU_ERROR_CHECK();
 }
 
@@ -765,17 +793,23 @@ template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_in
 std::enable_if_t<MaybeDeviceRunnable<L>::value>
 ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
 {
+    static_assert(sizeof(T) >= 2);
     if (amrex::isEmpty(n)) { return; }
-    const auto ec = Gpu::makeExecutionConfig<MT>(n);
-    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE () noexcept {
-        for (Long i = Long(blockDim.x)*blockIdx.x+threadIdx.x, stride = Long(blockDim.x)*gridDim.x;
-             i < Long(n); i += stride) {
-            detail::call_f_scalar_handler(f, T(i),
-                Gpu::Handler(amrex::min((std::uint64_t(n)-i+(std::uint64_t)threadIdx.x),
-                (std::uint64_t)blockDim.x)));
-        }
-    });
+    const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
+    for (auto const& ec : nec) {
+        const T start_idx = T(ec.start_idx);
+        const T nleft = n - start_idx;
+        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            // This will not overflow, even though nblocks*MT might.
+            auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
+            if (tid < nleft) {
+                detail::call_f_scalar_handler(f, tid+start_idx,
+                    Gpu::Handler(amrex::min((std::uint64_t(nleft-tid)+(std::uint64_t)threadIdx.x),
+                    (std::uint64_t)blockDim.x)));
+            }
+        });
+    }
     AMREX_GPU_ERROR_CHECK();
 }
 
@@ -785,18 +819,20 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
     const BoxIndexerND<dim> indexer(box);
-    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
-    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE () noexcept {
-        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
-             icell < indexer.numPts(); icell += stride)
-        {
-            auto iv = indexer.intVect(icell);
-            detail::call_f_intvect_handler(f, iv,
-                Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
-                (std::uint64_t)blockDim.x)));
-        }
-    });
+    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
+    for (auto const& ec : nec) {
+        const auto start_idx = std::uint64_t(ec.start_idx);
+        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
+            if (icell < indexer.numPts()) {
+                auto iv = indexer.intVect(icell);
+                detail::call_f_intvect_handler(f, iv,
+                    Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
+                    (std::uint64_t)blockDim.x)));
+            }
+        });
+    }
     AMREX_GPU_ERROR_CHECK();
 }
 
@@ -806,17 +842,20 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f)
 {
     if (amrex::isEmpty(box)) { return; }
     const BoxIndexerND<dim> indexer(box);
-    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
-    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE () noexcept {
-        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
-             icell < indexer.numPts(); icell += stride) {
-            auto iv = indexer.intVect(icell);
-            detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
-                Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
-                (std::uint64_t)blockDim.x)));
-        }
-    });
+    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
+    for (auto const& ec : nec) {
+        const auto start_idx = std::uint64_t(ec.start_idx);
+        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
+            if (icell < indexer.numPts()) {
+                auto iv = indexer.intVect(icell);
+                detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
+                    Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
+                    (std::uint64_t)blockDim.x)));
+            }
+        });
+    }
     AMREX_GPU_ERROR_CHECK();
 }
 
diff --git a/Src/Base/AMReX_Loop.H b/Src/Base/AMReX_Loop.H
index fe76b8c988..fe216bac45 100644
--- a/Src/Base/AMReX_Loop.H
+++ b/Src/Base/AMReX_Loop.H
@@ -3,6 +3,7 @@
 #include <AMReX_Config.H>
 
 #include <AMReX_Box.H>
+#include <AMReX_ConstexprFor.H>
 #include <AMReX_Extension.H>
 
 namespace amrex {
@@ -567,30 +568,6 @@ void LoopConcurrentOnCpu (BoxND<dim> const& bx, int ncomp, F const& f) noexcept
     }
 }
 
-// Implementation of "constexpr for" based on
-// https://artificial-mind.net/blog/2020/10/31/constexpr-for
-//
-// Approximates what one would get from a compile-time
-// unrolling of the loop
-// for (int i = 0; i < N; ++i) {
-//    f(i);
-// }
-//
-// The mechanism is recursive: we evaluate f(i) at the current
-// i and then call the for loop at i+1. f() is a lambda function
-// that provides the body of the loop and takes only an integer
-// i as its argument.
-
-template<auto I, auto N, class F>
-AMREX_GPU_HOST_DEVICE AMREX_INLINE
-constexpr void constexpr_for (F const& f)
-{
-    if constexpr (I < N) {
-        f(std::integral_constant<decltype(I), I>());
-        constexpr_for<I+1, N>(f);
-    }
-}
-
 #include <AMReX_Loop.nolint.H>
 
 }
diff --git a/Src/Base/AMReX_MultiFabUtil.H b/Src/Base/AMReX_MultiFabUtil.H
index 228070a13c..0740069c68 100644
--- a/Src/Base/AMReX_MultiFabUtil.H
+++ b/Src/Base/AMReX_MultiFabUtil.H
@@ -169,7 +169,8 @@ namespace amrex
     std::unique_ptr<MultiFab> get_slice_data(int dir, Real coord,
                                              const MultiFab& cc,
                                              const Geometry& geom, int start_comp, int ncomp,
-                                             bool interpolate=false);
+                                             bool interpolate=false,
+                                             RealBox const& bnd_rbx = RealBox());
 
     /**
      * \brief Get data in a cell of MultiFab/FabArray
@@ -188,7 +189,7 @@ namespace amrex
      * specified by a direction and a cell.
      */
     template <typename MF, std::enable_if_t<IsFabArray<MF>::value,int> FOO = 0>
-    MF get_line_data (MF const& mf, int dir, IntVect const& cell);
+    MF get_line_data (MF const& mf, int dir, IntVect const& cell, Box const& bnd_bx = Box());
 
     //! Return an iMultiFab that has the same BoxArray and DistributionMapping
     //! as the coarse MultiFab cmf. Cells covered by the coarsened fine grids
@@ -996,8 +997,10 @@ Vector<typename MF::value_type> get_cell_data (MF const& mf, IntVect const& cell
 }
 
 template <typename MF, std::enable_if_t<IsFabArray<MF>::value,int> FOO>
-MF get_line_data (MF const& mf, int dir, IntVect const& cell)
+MF get_line_data (MF const& mf, int dir, IntVect const& cell, Box const& bnd_bx)
 {
+    bool do_bnd = (!bnd_bx.isEmpty());
+
     BoxArray const& ba = mf.boxArray();
     DistributionMapping const& dm = mf.DistributionMap();
     const auto nboxes = static_cast<int>(ba.size());
@@ -1005,17 +1008,29 @@ MF get_line_data (MF const& mf, int dir, IntVect const& cell)
     BoxList bl(ba.ixType());
     Vector<int> procmap;
     Vector<int> index_map;
-    for (int i = 0; i < nboxes; ++i) {
-        Box const& b = ba[i];
-        IntVect lo = cell;
-        lo[dir] = b.smallEnd(dir);
-        if (b.contains(lo)) {
-            IntVect hi = lo;
-            hi[dir] = b.bigEnd(dir);
-            Box b1d(lo,hi,b.ixType());
-            bl.push_back(b1d);
-            procmap.push_back(dm[i]);
-            index_map.push_back(i);
+    if (!do_bnd) {
+        for (int i = 0; i < nboxes; ++i) {
+            Box const& b = ba[i];
+            IntVect lo = cell;
+            lo[dir] = b.smallEnd(dir);
+            if (b.contains(lo)) {
+                IntVect hi = lo;
+                hi[dir] = b.bigEnd(dir);
+                Box b1d(lo,hi,b.ixType());
+                bl.push_back(b1d);
+                procmap.push_back(dm[i]);
+                index_map.push_back(i);
+            }
+        }
+    } else {
+        for (int i = 0; i < nboxes; ++i) {
+            Box const& b   = ba[i];
+            Box const& b1d = bnd_bx & b;
+            if (b1d.ok()) {
+                bl.push_back(b1d);
+                procmap.push_back(dm[i]);
+                index_map.push_back(i);
+            }
         }
     }
 
diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp
index 86a1e29054..721919f509 100644
--- a/Src/Base/AMReX_MultiFabUtil.cpp
+++ b/Src/Base/AMReX_MultiFabUtil.cpp
@@ -9,7 +9,7 @@ namespace {
     using namespace amrex;
 
     Box
-    getIndexBox(const RealBox& real_box, const Geometry& geom) {
+    getIndexBox (const RealBox& real_box, const Geometry& geom) {
         IntVect slice_lo, slice_hi;
 
         AMREX_D_TERM(slice_lo[0]=static_cast<int>(std::floor((real_box.lo(0) - geom.ProbLo(0))/geom.CellSize(0)));,
@@ -24,12 +24,11 @@ namespace {
     }
 
 
-    std::unique_ptr<MultiFab> allocateSlice(int dir, const MultiFab& cell_centered_data,
-                                            int ncomp, const Geometry& geom, Real dir_coord,
-                                            Vector<int>& slice_to_full_ba_map) {
+    std::unique_ptr<MultiFab> allocateSlice (int dir, const MultiFab& cell_centered_data,
+                                             int ncomp, const Geometry& geom, Real dir_coord,
+                                             Vector<int>& slice_to_full_ba_map, RealBox real_slice) {
 
         // Get our slice and convert to index space
-        RealBox real_slice = geom.ProbDomain();
         real_slice.setLo(dir, dir_coord);
         real_slice.setHi(dir, dir_coord);
         Box slice_box = getIndexBox(real_slice, geom);
@@ -550,7 +549,7 @@ namespace amrex
         return amrex::cast<FabArray<BaseFab<Long> > > (imf);
     }
 
-    std::unique_ptr<MultiFab> get_slice_data(int dir, Real coord, const MultiFab& cc, const Geometry& geom, int start_comp, int ncomp, bool interpolate) {
+    std::unique_ptr<MultiFab> get_slice_data(int dir, Real coord, const MultiFab& cc, const Geometry& geom, int start_comp, int ncomp, bool interpolate, RealBox const& bnd_rbx) {
 
         BL_PROFILE("amrex::get_slice_data");
 
@@ -559,9 +558,15 @@ namespace amrex
         }
 
         const auto geomdata = geom.data();
+        RealBox real_slice;
+        if (bnd_rbx.ok()) {
+            real_slice = bnd_rbx;
+        } else {
+            real_slice = geom.ProbDomain();
+        }
 
         Vector<int> slice_to_full_ba_map;
-        std::unique_ptr<MultiFab> slice = allocateSlice(dir, cc, ncomp, geom, coord, slice_to_full_ba_map);
+        std::unique_ptr<MultiFab> slice = allocateSlice(dir, cc, ncomp, geom, coord, slice_to_full_ba_map, real_slice);
 
         if (!slice) {
             return nullptr;
diff --git a/Src/Base/AMReX_ParmParse.cpp b/Src/Base/AMReX_ParmParse.cpp
index 34d383c99a..3ecfc8503a 100644
--- a/Src/Base/AMReX_ParmParse.cpp
+++ b/Src/Base/AMReX_ParmParse.cpp
@@ -349,7 +349,7 @@ getToken (const char*& str, std::string& ostr, int& num_linefeeds)
 //
 // Return the index of the n'th occurrence of a parameter name,
 // except if n==-1, return the index of the last occurrence.
-// Return 0 if the specified occurrence does not exist.
+// Return nullptr if the specified occurrence does not exist.
 //
 std::vector<std::string> const*
 ppindex (const ParmParse::Table& table, int n, const std::string& name)
@@ -365,6 +365,9 @@ ppindex (const ParmParse::Table& table, int n, const std::string& name)
     if (n == ParmParse::LAST) {
         return &(found->second.m_vals.back());
     } else {
+        if(found->second.m_vals.size() < (std::size_t)n + 1) {
+            return nullptr;
+        }
         return &(found->second.m_vals[n]);
     }
 }
@@ -642,7 +645,7 @@ squeryval (const ParmParse::Table& table,
            int                     occurrence)
 {
     //
-    // Get last occurrence of name in table.
+    // Get specified occurrence of name in table.
     //
     auto const* def = ppindex(table, occurrence, name);
     if ( def == nullptr )
diff --git a/Src/Base/AMReX_SmallMatrix.H b/Src/Base/AMReX_SmallMatrix.H
new file mode 100644
index 0000000000..05305d1839
--- /dev/null
+++ b/Src/Base/AMReX_SmallMatrix.H
@@ -0,0 +1,490 @@
+#ifndef AMREX_SMALL_MATRIX_H_
+#define AMREX_SMALL_MATRIX_H_
+#include <AMReX_Config.H>
+
+#include <AMReX_Algorithm.H>
+#include <AMReX_BLassert.H>
+#include <AMReX_Extension.H>
+#include <AMReX_GpuQualifiers.H>
+#include <AMReX_ConstexprFor.H>
+
+#include <algorithm>
+#include <initializer_list>
+#include <iostream>
+#include <tuple>
+#include <type_traits>
+
+namespace amrex {
+
+    enum struct Order { C, F, RowMajor=C, ColumnMajor=F }; //!< Memory layout: C (row-major) or F (column-major)
+
+    /**
+     * \brief Matrix class with compile-time size
+     *
+     * Note that column vectors and row vectors are special cases of a
+     * Matrix.
+     *
+     * \tparam T Matrix element data type.
+     * \tparam NRows Number of rows.
+     * \tparam NCols Number of columns.
+     * \tparam ORDER Memory layout order. Order::F (i.e., column-major) by default.
+     * \tparam StartIndex Starting index. Either 0 or 1.
+     */
+    template <class T, int NRows, int NCols, Order ORDER = Order::F, int StartIndex = 0>
+    struct SmallMatrix
+    {
+        using value_type = T;
+        using reference_type = T&;
+        static constexpr int row_size = NRows;
+        static constexpr int column_size = NCols;
+        static constexpr Order ordering = ORDER;
+        static constexpr int starting_index = StartIndex;
+
+        /**
+         * \brief Default constructor
+         *
+         * The data are uninitialized by default. If you want to initialize
+         * to zero, you can do `SmallMatrix<T,NRows,NCols> M{};`.
+         */
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        constexpr SmallMatrix () = default;
+
+        /**
+         * \brief Constructs column- or row-vector
+         *
+         * The data are initialized with the given variadic arguments. If
+         * the number of arguments is less than the size of the vector, the
+         * rest of the vector is initialized to zero.
+         */
+        template <typename... Ts, int MM=NRows, int NN=NCols,
+                  std::enable_if_t<MM==1 || NN==1, int> = 0>
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        constexpr explicit SmallMatrix (Ts... vs)
+            : m_mat{vs...}
+        {
+            static_assert(sizeof...(vs) <= std::max(NRows,NCols));
+        }
+
+        /**
+         * \brief Constructs SmallMatrix with nested std::initializer_list
+         *
+         * The initializer list is assumed to be in row-major order, even when
+         * the ordering for the SmallMatrix object is column-major. Below is
+         * an example of constructing a matrix with 2 rows and 3 columns.
+         \verbatim
+             SmallMatrix<double,2,3> M{{11., 12., 13.},
+                                       {21., 22., 23.}};
+         \endverbatim
+         */
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        explicit SmallMatrix (std::initializer_list<std::initializer_list<T>> const& init)
+        {
+            // The nested list is row-major: each inner list holds one row.
+            AMREX_ASSERT(NRows == init.size());
+            int irow = StartIndex;
+            for (auto const& row_values : init) {
+                AMREX_ASSERT(NCols == row_values.size());
+                int jcol = StartIndex;
+                for (auto const& v : row_values) {
+                    (*this)(irow,jcol++) = v;
+                }
+                ++irow;
+            }
+        }
+
+        //! Returns a const reference to the element at row i and column j (indices start at StartIndex).
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        const T& operator() (int i, int j) const noexcept {
+            static_assert(StartIndex == 0 || StartIndex == 1);
+            if constexpr (StartIndex == 1) {
+                --i; // convert 1-based indices to 0-based storage offsets
+                --j;
+            }
+            AMREX_ASSERT(i >= 0 && i < NRows && j >= 0 && j < NCols); // also rejects indices below StartIndex
+            if constexpr (ORDER == Order::F) {
+                return m_mat[i+j*NRows]; // column-major storage
+            } else {
+                return m_mat[j+i*NCols]; // row-major storage
+            }
+        }
+
+        //! Returns a reference to the element at row i and column j (indices start at StartIndex).
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        T& operator() (int i, int j) noexcept {
+            static_assert(StartIndex == 0 || StartIndex == 1);
+            if constexpr (StartIndex == 1) {
+                --i; // convert 1-based indices to 0-based storage offsets
+                --j;
+            }
+            AMREX_ASSERT(i >= 0 && i < NRows && j >= 0 && j < NCols); // also rejects indices below StartIndex
+            if constexpr (ORDER == Order::F) {
+                return m_mat[i+j*NRows]; // column-major storage
+            } else {
+                return m_mat[j+i*NCols]; // row-major storage
+            }
+        }
+
+        //! Returns a const reference to element i of a vector (index starts at StartIndex)
+        template <int MM=NRows, int NN=NCols, std::enable_if_t<(MM==1 || NN==1), int> = 0>
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        const T& operator() (int i) const noexcept {
+            static_assert(StartIndex == 0 || StartIndex == 1);
+            if constexpr (StartIndex == 1) {
+                --i; // convert the 1-based index to a 0-based storage offset
+            }
+            AMREX_ASSERT(i >= 0 && i < NRows*NCols); // also rejects an index below StartIndex
+            return m_mat[i];
+        }
+
+        //! Returns a reference to element i of a vector (index starts at StartIndex)
+        template <int MM=NRows, int NN=NCols, std::enable_if_t<(MM==1 || NN==1), int> = 0>
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        T& operator() (int i) noexcept {
+            static_assert(StartIndex == 0 || StartIndex == 1);
+            if constexpr (StartIndex == 1) {
+                --i; // convert the 1-based index to a 0-based storage offset
+            }
+            AMREX_ASSERT(i >= 0 && i < NRows*NCols); // also rejects an index below StartIndex
+            return m_mat[i];
+        }
+
+        //! Returns a const reference to element i of a vector (index starts at StartIndex)
+        template <int MM=NRows, int NN=NCols, std::enable_if_t<(MM==1 || NN==1), int> = 0>
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        const T& operator[] (int i) const noexcept {
+            static_assert(StartIndex == 0 || StartIndex == 1);
+            if constexpr (StartIndex == 1) {
+                --i; // convert the 1-based index to a 0-based storage offset
+            }
+            AMREX_ASSERT(i >= 0 && i < NRows*NCols); // also rejects an index below StartIndex
+            return m_mat[i];
+        }
+
+        //! Returns a reference to element i of a vector (index starts at StartIndex)
+        template <int MM=NRows, int NN=NCols, std::enable_if_t<(MM==1 || NN==1), int> = 0>
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        T& operator[] (int i) noexcept {
+            static_assert(StartIndex == 0 || StartIndex == 1);
+            if constexpr (StartIndex == 1) {
+                --i; // convert the 1-based index to a 0-based storage offset
+            }
+            AMREX_ASSERT(i >= 0 && i < NRows*NCols); // also rejects an index below StartIndex
+            return m_mat[i];
+        }
+
+        /**
+         * Returns a \c const pointer address to the first element of the
+         * SmallMatrix object, as if the object is treated as one-dimensional.
+         */
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        const T* begin () const noexcept { return m_mat; }
+
+        /**
+         * Returns a \c const pointer address right after the last element of the
+         * SmallMatrix object, as if the object is treated as one-dimensional.
+         */
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        const T* end () const noexcept { return m_mat + NRows*NCols; }
+
+        /**
+         * Returns a pointer address to the first element of the
+         * SmallMatrix object, as if the object is treated as one-dimensional.
+         */
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        T* begin () noexcept { return m_mat; }
+
+        /**
+         * Returns a pointer address right after the last element of the
+         * SmallMatrix object, as if the object is treated as one-dimensional.
+         */
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        T* end () noexcept { return m_mat + NRows*NCols; }
+
+        //! Assigns val to every element and returns a reference to this matrix
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        SmallMatrix<T,NRows,NCols,ORDER,StartIndex>&
+        setVal (T val)
+        {
+            for (int n = 0; n < NRows*NCols; ++n) { m_mat[n] = val; }
+            return *this;
+        }
+
+        //! Returns an identity matrix (only available for square matrices, NRows == NCols)
+        template <int MM=NRows, int NN=NCols, std::enable_if_t<MM==NN, int> = 0>
+        static constexpr
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        SmallMatrix<T,NRows,NCols,ORDER,StartIndex>
+        Identity () noexcept {
+            static_assert(StartIndex == 0 || StartIndex == 1);
+            SmallMatrix<T,NRows,NCols,ORDER,StartIndex> I{}; // empty braces zero-initialize all elements
+            constexpr_for<StartIndex,NRows+StartIndex>(
+                [&] (int i) { I(i,i) = T(1); }); // set each diagonal element to one
+            return I;
+        }
+
+        //! Returns a matrix whose elements are all zero
+        static constexpr
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        SmallMatrix<T,NRows,NCols,ORDER,StartIndex>
+        Zero () noexcept {
+            // Empty braces value-initialize the object, which zeroes its storage.
+            return SmallMatrix<T,NRows,NCols,ORDER,StartIndex>{};
+        }
+
+        //! Returns a new NCols-by-NRows matrix holding the transpose of this matrix
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        SmallMatrix<T,NCols,NRows,ORDER,StartIndex>
+        transpose () const
+        {
+            SmallMatrix<T,NCols,NRows,ORDER,StartIndex> result;
+            for (int col = StartIndex; col < NCols+StartIndex; ++col) {
+                for (int row = StartIndex; row < NRows+StartIndex; ++row) {
+                    result(col,row) = (*this)(row,col);
+                }
+            }
+            return result;
+        }
+
+        //! Swaps every off-diagonal pair (r,c) and (c,r) of a square matrix in place and returns *this.
+        template <int MM=NRows, int NN=NCols, std::enable_if_t<MM==NN,int> = 0>
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        SmallMatrix<T,NRows,NCols,ORDER,StartIndex>&
+        transposeInPlace ()
+        {
+            static_assert(StartIndex == 0 || StartIndex == 1);
+            for (int r = StartIndex; r < NRows+StartIndex; ++r) {
+                for (int c = r+1; c < NCols+StartIndex; ++c) {
+                    amrex::Swap((*this)(r,c), (*this)(c,r));
+                }
+            }
+            return *this;
+        }
+
+        //! Multiplies all elements of the matrix together and returns the result
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        T product () const
+        {
+            T result = 1;
+            for (int n = 0; n < NRows*NCols; ++n) {
+                result *= m_mat[n];
+            }
+            return result;
+        }
+
+        //! Adds all elements of the matrix together and returns the result
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        T sum () const
+        {
+            T total = 0;
+            for (int n = 0; n < NRows*NCols; ++n) {
+                total += m_mat[n];
+            }
+            return total;
+        }
+
+        //! Returns the trace (sum of the diagonal elements) of a square matrix
+        template <int MM=NRows, int NN=NCols, std::enable_if_t<MM==NN,int> = 0>
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        T trace () const
+        {
+            T t = 0;
+            constexpr_for<StartIndex,MM+StartIndex>([&] (int i) { t += (*this)(i,i); }); // accumulate the diagonal
+            return t;
+        }
+
+        //! Adds rhs to this matrix element by element and returns a reference to this matrix
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        SmallMatrix<T,NRows,NCols,ORDER,StartIndex>&
+        operator += (SmallMatrix<T,NRows,NCols,ORDER,StartIndex> const& rhs)
+        {
+            // Both operands have the same type, hence the same layout: add the flat storage in order.
+            T const* src = rhs.m_mat;
+            for (auto& dst : m_mat) { dst += *src++; }
+            return *this;
+        }
+
+        //! Binary operator + returning the result of matrix addition, lhs+rhs
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        friend SmallMatrix<T,NRows,NCols,ORDER,StartIndex>
+        operator+ (SmallMatrix<T,NRows,NCols,ORDER,StartIndex>        lhs,
+                   SmallMatrix<T,NRows,NCols,ORDER,StartIndex> const& rhs)
+        {
+            lhs += rhs; // lhs is taken by value, so it serves as the result
+            return lhs;
+        }
+
+        //! Subtracts rhs from this matrix element by element and returns a reference to this matrix
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        SmallMatrix<T,NRows,NCols,ORDER,StartIndex>&
+        operator -= (SmallMatrix<T,NRows,NCols,ORDER,StartIndex> const& rhs)
+        {
+            // Both operands have the same type, hence the same layout: subtract the flat storage in order.
+            T const* src = rhs.m_mat;
+            for (auto& dst : m_mat) { dst -= *src++; }
+            return *this;
+        }
+
+        //! Binary operator - returning the result of matrix subtraction, lhs-rhs
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        friend SmallMatrix<T,NRows,NCols,ORDER,StartIndex>
+        operator- (SmallMatrix<T,NRows,NCols,ORDER,StartIndex>        lhs,
+                   SmallMatrix<T,NRows,NCols,ORDER,StartIndex> const& rhs)
+        {
+            lhs -= rhs; // lhs is taken by value, so it serves as the result
+            return lhs;
+        }
+
+        //! Unary minus operator
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        SmallMatrix<T,NRows,NCols,ORDER,StartIndex>
+        operator- () const
+        {
+            return (*this) * T(-1);
+        }
+
+        //! Multiplies every element of this matrix by the scalar a and returns a reference to this matrix.
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        SmallMatrix<T,NRows,NCols,ORDER,StartIndex>&
+        operator *= (T a)
+        {
+            for (int n = 0; n < NRows*NCols; ++n) {
+                m_mat[n] *= a;
+            }
+            return *this;
+        }
+
+        //! Returns the product of a matrix and a scalar
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        friend SmallMatrix<T,NRows,NCols,ORDER,StartIndex>
+        operator* (SmallMatrix<T,NRows,NCols,ORDER,StartIndex> m, T a)
+        {
+            m *= a;
+            return m;
+        }
+
+        //! Returns the product of a scalar and a matrix
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        friend SmallMatrix<T,NRows,NCols,ORDER,StartIndex>
+        operator* (T a, SmallMatrix<T,NRows,NCols,ORDER,StartIndex> m)
+        {
+            m *= a;
+            return m;
+        }
+
+        //! Returns matrix product of two matrices
+        template <class U, int N1, int N2, int N3, Order Ord, int SI>
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        friend SmallMatrix<U,N1,N3,Ord,SI>
+        operator* (SmallMatrix<U,N1,N2,Ord,SI> const& lhs,
+                   SmallMatrix<U,N2,N3,Ord,SI> const& rhs);
+
+        //! Returns the sum of the element-wise products with rhs (the dot product of two vectors)
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        T dot (SmallMatrix<T,NRows,NCols,ORDER,StartIndex> const& rhs) const
+        {
+            // Walk both flat storages in the same order, accumulating the products.
+            T acc = 0;
+            T const* other = rhs.m_mat;
+            for (auto const& x : m_mat) { acc += x * (*other++); }
+            return acc;
+        }
+
+        template <int N, std::enable_if_t<(N<NRows*NCols),int> = 0>
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        constexpr T const& get () const { return m_mat[N]; }
+
+        template <int N, std::enable_if_t<(N<NRows*NCols),int> = 0>
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        constexpr T& get () { return m_mat[N]; }
+
+    private:
+        T m_mat[NRows*NCols];
+    };
+
+    template <class U, int N1, int N2, int N3, Order Ord, int SI>
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    SmallMatrix<U,N1,N3,Ord,SI>
+    operator* (SmallMatrix<U,N1,N2,Ord,SI> const& lhs,
+               SmallMatrix<U,N2,N3,Ord,SI> const& rhs)
+    { // matrix product: (N1 x N2) * (N2 x N3) -> (N1 x N3)
+        static_assert(SI == 0 || SI == 1);
+        SmallMatrix<U,N1,N3,Ord,SI> r; // left uninitialized; each element is zeroed below before accumulation
+        if constexpr (Ord == Order::F) { // column-major: build the result one column j at a time
+            for (int j = SI; j < N3+SI; ++j) {
+                constexpr_for<SI,N1+SI>([&] (int i) { r(i,j) = U(0); });
+                for (int k = SI; k < N2+SI; ++k) {
+                    auto b = rhs(k,j); // read once: does not depend on i
+                    constexpr_for<SI,N1+SI>([&] (int i)
+                    {
+                        r(i,j) += lhs(i,k) * b;
+                    });
+                }
+            }
+        } else { // row-major: build the result one row i at a time
+            for (int i = SI; i < N1+SI; ++i) {
+                constexpr_for<SI,N3+SI>([&] (int j) { r(i,j) = U(0); });
+                for (int k = SI; k < N2+SI; ++k) {
+                    auto a = lhs(i,k); // read once: does not depend on j
+                    constexpr_for<SI,N3+SI>([&] (int j)
+                    {
+                        r(i,j) += a * rhs(k,j);
+                    });
+                }
+            }
+        }
+        return r;
+    }
+
+    template <class T, int NRows, int NCols, Order ORDER, int SI>
+    std::ostream& operator<< (std::ostream& os,
+                              SmallMatrix<T,NRows,NCols,ORDER,SI> const& mat)
+    {
+        for (int row = SI; row < NRows+SI; ++row) {
+            for (int col = SI; col < NCols+SI; ++col) {
+                if (col > SI) { os << " "; } // single space between columns, none before the first
+                os << mat(row,col);
+            }
+            os << "\n";
+        }
+        return os;
+    }
+
+    template <class T, int N, int StartIndex = 0>
+    using SmallVector = SmallMatrix<T,N,1,Order::F,StartIndex>;
+
+    template <class T, int N, int StartIndex = 0>
+    using SmallRowVector = SmallMatrix<T,1,N,Order::F,StartIndex>;
+}
+
+template <class T, int NRows, int NCols, amrex::Order ORDER, int StartIndex>
+struct std::tuple_size<amrex::SmallMatrix<T,NRows,NCols,ORDER,StartIndex> >
+    : std::integral_constant<std::size_t,NRows*NCols> {};
+
+template <std::size_t N, class T, int NRows, int NCols, amrex::Order ORDER, int StartIndex>
+struct std::tuple_element<N, amrex::SmallMatrix<T,NRows,NCols,ORDER,StartIndex> >
+{
+    using type = T;
+};
+
+#endif
+
+/*
+ * Notes on why SmallMatrix matrix{} is zero initialized.
+ *
+ * SmallMatrix is not an aggregate, because it has a user declared default
+ * constructor. The rule is that, for `SmallMatrix matrix{}` with an empty
+ * brace-enclosed initializer list, value-initialization is performed. The
+ * effects of value-initialization of SmallMatrix (which has a user-declared
+ * but not user-provided default constructor) are that the matrix object is
+ * first zero-initialized and then the object's default constructor is
+ * applied. Since the default constructor does nothing, the final result is
+ * the object is zero-initialized.
+ *
+ * Why is SmallMatrix's default constructor user-declared not user-provided?
+ * It's because we first declare it with `SmallMatrix () = default`.
+ *
+ * Reference:
+ *   https://en.cppreference.com/w/cpp/language/list_initialization
+ *   https://en.cppreference.com/w/cpp/language/value_initialization
+ *   https://en.cppreference.com/w/cpp/language/zero_initialization
+ */
diff --git a/Src/Base/AMReX_TableData.H b/Src/Base/AMReX_TableData.H
index 9c2a0d06ca..ee2471d36c 100644
--- a/Src/Base/AMReX_TableData.H
+++ b/Src/Base/AMReX_TableData.H
@@ -72,7 +72,7 @@ struct Table1D
 #endif
 };
 
-template <typename T, typename ORDER = Order::F>
+template <typename T, Order ORDER = Order::F>
 struct Table2D
 {
     T* AMREX_RESTRICT p = nullptr;
@@ -110,9 +110,7 @@ struct Table2D
 #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK)
         index_assert(i,j);
 #endif
-        static_assert(std::is_same_v<ORDER,Order::F> ||
-                      std::is_same_v<ORDER,Order::C>);
-        if constexpr (std::is_same_v<ORDER,Order::F>) {
+        if constexpr (ORDER == Order::F) {
             return p[(i-begin[0])+(j-begin[1])*stride1];
         } else {
             return p[(i-begin[0])*stride1+(j-begin[1])];
@@ -146,7 +144,7 @@ private:
     static constexpr int len0 (GpuArray<int,2> const& a_begin,
                                GpuArray<int,2> const& a_end) noexcept
     {
-        if constexpr (std::is_same_v<ORDER,Order::F>) {
+        if constexpr (ORDER == Order::F) {
             return a_end[0] - a_begin[0];
         } else {
             return a_end[1] - a_begin[1];
@@ -154,7 +152,7 @@ private:
     }
 };
 
-template <typename T, typename ORDER = Order::F>
+template <typename T, Order ORDER = Order::F>
 struct Table3D
 {
     T* AMREX_RESTRICT p = nullptr;
@@ -195,9 +193,7 @@ struct Table3D
 #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK)
         index_assert(i,j,k);
 #endif
-        static_assert(std::is_same_v<ORDER,Order::F> ||
-                      std::is_same_v<ORDER,Order::C>);
-        if constexpr (std::is_same_v<ORDER,Order::F>) {
+        if constexpr (ORDER == Order::F) {
             return p[(i-begin[0])+(j-begin[1])*stride1+(k-begin[2])*stride2];
         } else {
             return p[(i-begin[0])*stride2+(j-begin[1])*stride1+(k-begin[2])];
@@ -234,7 +230,7 @@ private:
     static constexpr int len0 (GpuArray<int,3> const& a_begin,
                                GpuArray<int,3> const& a_end) noexcept
     {
-        if constexpr (std::is_same_v<ORDER,Order::F>) {
+        if constexpr (ORDER == Order::F) {
             return a_end[0] - a_begin[0];
         } else {
             return a_end[2] - a_begin[2];
@@ -248,7 +244,7 @@ private:
     }
 };
 
-template <typename T, typename ORDER = Order::F>
+template <typename T, Order ORDER = Order::F>
 struct Table4D
 {
     T* AMREX_RESTRICT p = nullptr;
@@ -292,9 +288,7 @@ struct Table4D
 #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK)
         index_assert(i,j,k,n);
 #endif
-        static_assert(std::is_same_v<ORDER,Order::F> ||
-                      std::is_same_v<ORDER,Order::C>);
-        if constexpr (std::is_same_v<ORDER,Order::F>) {
+        if constexpr (ORDER == Order::F) {
             return p[(i-begin[0])+(j-begin[1])*stride1+(k-begin[2])*stride2+(n-begin[3])*stride3];
         } else {
             return p[(i-begin[0])*stride3+(j-begin[1])*stride2+(k-begin[2])*stride1+(n-begin[3])];
@@ -333,7 +327,7 @@ private:
     static constexpr int len0 (GpuArray<int,4> const& a_begin,
                                GpuArray<int,4> const& a_end) noexcept
     {
-        if constexpr (std::is_same_v<ORDER,Order::F>) {
+        if constexpr (ORDER == Order::F) {
             return a_end[0] - a_begin[0];
         } else {
             return a_end[3] - a_begin[3];
@@ -343,7 +337,7 @@ private:
     static constexpr int len1 (GpuArray<int,4> const& a_begin,
                                GpuArray<int,4> const& a_end) noexcept
     {
-        if constexpr (std::is_same_v<ORDER,Order::F>) {
+        if constexpr (ORDER == Order::F) {
             return a_end[1] - a_begin[1];
         } else {
             return a_end[2] - a_begin[2];
@@ -353,7 +347,7 @@ private:
     static constexpr int len2 (GpuArray<int,4> const& a_begin,
                                GpuArray<int,4> const& a_end) noexcept
     {
-        if constexpr (std::is_same_v<ORDER,Order::F>) {
+        if constexpr (ORDER == Order::F) {
             return a_end[2] - a_begin[2];
         } else {
             return a_end[1] - a_begin[1];
@@ -399,13 +393,13 @@ private:
  *      // We can now use table in device lambda.
  * \endcode
  */
-template <typename T, int N, typename ORDER = Order::F>
+template <typename T, int N, Order ORDER = Order::F>
 class TableData
     : public DataAllocator
 {
 public:
 
-    template <class U, int M, class O> friend class TableData;
+    template <class U, int M, Order O> friend class TableData;
     using value_type = T;
     using table_type = std::conditional_t<N==1, Table1D<T>,
                        std::conditional_t<N==2, Table2D<T, ORDER>,
@@ -459,7 +453,7 @@ private:
     bool m_ptr_owner = false;
 };
 
-template <typename T, int N, typename ORDER>
+template <typename T, int N, Order ORDER>
 TableData<T,N,ORDER>::TableData (Array<int,N> const& lo, Array<int,N> const& hi, Arena* ar)
     : DataAllocator{ar}, m_lo(lo), m_hi(hi)
 {
@@ -467,7 +461,7 @@ TableData<T,N,ORDER>::TableData (Array<int,N> const& lo, Array<int,N> const& hi,
 }
 
 
-template <typename T, int N, typename ORDER>
+template <typename T, int N, Order ORDER>
 TableData<T,N,ORDER>::TableData (TableData<T,N,ORDER>&& rhs) noexcept
     : DataAllocator{rhs.arena()},
       m_dptr(rhs.m_dptr),
@@ -480,7 +474,7 @@ TableData<T,N,ORDER>::TableData (TableData<T,N,ORDER>&& rhs) noexcept
     rhs.m_ptr_owner = false;
 }
 
-template <typename T, int N, typename ORDER>
+template <typename T, int N, Order ORDER>
 TableData<T,N,ORDER>&
 TableData<T,N,ORDER>::operator= (TableData<T,N,ORDER> && rhs) noexcept
 {
@@ -498,20 +492,17 @@ TableData<T,N,ORDER>::operator= (TableData<T,N,ORDER> && rhs) noexcept
     return *this;
 }
 
-template <typename T, int N, typename ORDER>
+template <typename T, int N, Order ORDER>
 TableData<T,N,ORDER>::~TableData () noexcept
 {
     static_assert(std::is_trivially_copyable<T>() &&
                   std::is_trivially_destructible<T>(),
                   "TableData<T,N,ORDER>: T must be trivially copyable and trivially destructible");
     static_assert(N>=1 && N <=4, "TableData<T,N,ORDER>: N must be in the range of [1,4]");
-    static_assert(std::is_same_v<ORDER,Order::F> ||
-                  std::is_same_v<ORDER,Order::C>,
-                  "TableDat<T,N,ORDER>: ORDER must be either Order::F or Order::C");
     clear();
 }
 
-template <typename T, int N, typename ORDER>
+template <typename T, int N, Order ORDER>
 void
 TableData<T,N,ORDER>::resize (Array<int,N> const& lo, Array<int,N> const& hi, Arena* ar)
 {
@@ -535,7 +526,7 @@ TableData<T,N,ORDER>::resize (Array<int,N> const& lo, Array<int,N> const& hi, Ar
     }
 }
 
-template <typename T, int N, typename ORDER>
+template <typename T, int N, Order ORDER>
 Long
 TableData<T,N,ORDER>::size () const noexcept
 {
@@ -546,7 +537,7 @@ TableData<T,N,ORDER>::size () const noexcept
     return r;
 }
 
-template <typename T, int N, typename ORDER>
+template <typename T, int N, Order ORDER>
 void
 TableData<T,N,ORDER>::clear () noexcept
 {
@@ -559,7 +550,7 @@ TableData<T,N,ORDER>::clear () noexcept
     }
 }
 
-template <typename T, int N, typename ORDER>
+template <typename T, int N, Order ORDER>
 void
 TableData<T,N,ORDER>::define ()
 {
@@ -574,46 +565,46 @@ TableData<T,N,ORDER>::define ()
 }
 
 namespace detail {
-    template <typename T, typename>
+    template <typename T, Order>
     Table1D<T> make_table (T* p, Array<int,1> const& lo, Array<int,1> const& hi) {
         return Table1D<T>(p, lo[0], hi[0]+1);
     }
-    template <typename T, typename ORDER>
+    template <typename T, Order ORDER>
     Table2D<T,ORDER> make_table (T* p, Array<int,2> const& lo, Array<int,2> const& hi) {
         return Table2D<T,ORDER>(p, {lo[0],lo[1]}, {hi[0]+1,hi[1]+1});
     }
-    template <typename T, typename ORDER>
-    Table3D<T> make_table (T* p, Array<int,3> const& lo, Array<int,3> const& hi) {
+    template <typename T, Order ORDER>
+    Table3D<T,ORDER> make_table (T* p, Array<int,3> const& lo, Array<int,3> const& hi) { // keep ORDER in the return type; plain Table3D<T> means Order::F and breaks Order::C tables
         return Table3D<T,ORDER>(p, {lo[0],lo[1],lo[2]}, {hi[0]+1,hi[1]+1,hi[2]+1});
     }
-    template <typename T, typename ORDER>
-    Table4D<T> make_table (T* p, Array<int,4> const& lo, Array<int,4> const& hi) {
+    template <typename T, Order ORDER>
+    Table4D<T,ORDER> make_table (T* p, Array<int,4> const& lo, Array<int,4> const& hi) { // keep ORDER in the return type; plain Table4D<T> means Order::F and breaks Order::C tables
         return Table4D<T,ORDER>(p, {lo[0],lo[1],lo[2],lo[3]}, {hi[0]+1,hi[1]+1,hi[2]+1,hi[3]+1});
     }
 }
 
-template <typename T, int N, typename ORDER>
+template <typename T, int N, Order ORDER>
 typename TableData<T,N,ORDER>::table_type
 TableData<T,N,ORDER>::table () noexcept
 {
     return detail::make_table<T,ORDER>(m_dptr, m_lo, m_hi);
 }
 
-template <typename T, int N, typename ORDER>
+template <typename T, int N, Order ORDER>
 typename TableData<T,N,ORDER>::const_table_type
 TableData<T,N,ORDER>::table () const noexcept
 {
     return detail::make_table<T const, ORDER>(m_dptr, m_lo, m_hi);
 }
 
-template <typename T, int N, typename ORDER>
+template <typename T, int N, Order ORDER>
 typename TableData<T,N,ORDER>::const_table_type
 TableData<T,N,ORDER>::const_table () const noexcept
 {
     return detail::make_table<T const, ORDER>(m_dptr, m_lo, m_hi);
 }
 
-template <typename T, int N, typename ORDER>
+template <typename T, int N, Order ORDER>
 void
 TableData<T,N,ORDER>::copy (TableData<T,N,ORDER> const& rhs) noexcept
 {
diff --git a/Src/Base/AMReX_TinyProfiler.cpp b/Src/Base/AMReX_TinyProfiler.cpp
index 32b35bf770..78a1685855 100644
--- a/Src/Base/AMReX_TinyProfiler.cpp
+++ b/Src/Base/AMReX_TinyProfiler.cpp
@@ -465,7 +465,6 @@ TinyProfiler::MemoryFinalize (bool bFlushing) noexcept
 
     std::ofstream ofs;
     std::ostream* os = nullptr;
-    std::streamsize oldprec = 0;
     if (ParallelDescriptor::IOProcessor()) {
         auto const& ofile = get_output_file();
         if (ofile.empty()) {
@@ -487,8 +486,6 @@ TinyProfiler::MemoryFinalize (bool bFlushing) noexcept
         all_memstats.clear();
         all_memnames.clear();
     }
-
-    if(os) { os->precision(oldprec); }
 }
 
 bool
diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt
index 882f401228..2b6387ece1 100644
--- a/Src/Base/CMakeLists.txt
+++ b/Src/Base/CMakeLists.txt
@@ -14,6 +14,8 @@ foreach(D IN LISTS AMReX_SPACEDIM)
        AMReX_BlockMutex.cpp
        AMReX_Enum.H
        AMReX_GpuComplex.H
+       AMReX_SmallMatrix.H
+       AMReX_ConstexprFor.H
        AMReX_Vector.H
        AMReX_TableData.H
        AMReX_Tuple.H
diff --git a/Src/Base/Make.package b/Src/Base/Make.package
index c64fa50f11..264de0581f 100644
--- a/Src/Base/Make.package
+++ b/Src/Base/Make.package
@@ -4,6 +4,7 @@ AMREX_BASE=EXE
 C$(AMREX_BASE)_headers += AMReX_ccse-mpi.H AMReX_Algorithm.H AMReX_Any.H AMReX_Array.H
 C$(AMREX_BASE)_headers += AMReX_Enum.H
 C$(AMREX_BASE)_headers += AMReX_Vector.H AMReX_TableData.H AMReX_Tuple.H AMReX_Math.H
+C$(AMREX_BASE)_headers += AMReX_SmallMatrix.H AMReX_ConstexprFor.H
 
 C$(AMREX_BASE)_headers += AMReX_TypeList.H
 
diff --git a/Src/CMakeLists.txt b/Src/CMakeLists.txt
index 6e8af043e0..25455d7263 100644
--- a/Src/CMakeLists.txt
+++ b/Src/CMakeLists.txt
@@ -136,6 +136,10 @@ if (AMReX_PARTICLES)
    add_subdirectory(Particle)
 endif ()
 
+if (AMReX_FFT)
+   add_subdirectory(FFT)
+endif ()
+
 #
 # Optional external components
 #
diff --git a/Src/EB/AMReX_EB2.cpp b/Src/EB/AMReX_EB2.cpp
index f99eb504d2..4f7a5cb84b 100644
--- a/Src/EB/AMReX_EB2.cpp
+++ b/Src/EB/AMReX_EB2.cpp
@@ -216,6 +216,8 @@ Build (const Geometry& geom, int required_coarsening_level,
         pp.queryAdd("stl_center", stl_center);
         bool stl_reverse_normal = false;
         pp.queryAdd("stl_reverse_normal", stl_reverse_normal);
+        bool stl_use_bvh = true;
+        pp.queryAdd("stl_use_bvh", stl_use_bvh);
         IndexSpace::push(new IndexSpaceSTL(stl_file, stl_scale, // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks)
                                            {stl_center[0], stl_center[1], stl_center[2]},
                                            int(stl_reverse_normal),
@@ -223,7 +225,8 @@ Build (const Geometry& geom, int required_coarsening_level,
                                            max_coarsening_level, ngrow,
                                            build_coarse_level_by_coarsening,
                                            a_extend_domain_face,
-                                           a_num_coarsen_opt));
+                                           a_num_coarsen_opt,
+                                           stl_use_bvh));
     }
     else
     {
diff --git a/Src/EB/AMReX_EB2_2D_C.cpp b/Src/EB/AMReX_EB2_2D_C.cpp
index b99b5559c7..231faf0cb8 100644
--- a/Src/EB/AMReX_EB2_2D_C.cpp
+++ b/Src/EB/AMReX_EB2_2D_C.cpp
@@ -342,7 +342,8 @@ int build_faces (Box const& bx, Array4<EBCellFlag> const& cell,
     nsmallfaces += *(hp+1);
 
     if (*hp > 0 && !cover_multiple_cuts) {
-        amrex::Abort("amrex::EB2::build_faces: more than 2 cuts not supported");
+        amrex::Abort("amrex::EB2::build_faces: more than 2 cuts not supported. "
+                     "You can try to fix it by using runtime parameter eb2.cover_multiple_cuts=1.");
     }
 
     return *hp;
diff --git a/Src/EB/AMReX_EB2_3D_C.cpp b/Src/EB/AMReX_EB2_3D_C.cpp
index 2d02e53bdc..ec7d643391 100644
--- a/Src/EB/AMReX_EB2_3D_C.cpp
+++ b/Src/EB/AMReX_EB2_3D_C.cpp
@@ -768,7 +768,8 @@ int build_faces (Box const& bx, Array4<EBCellFlag> const& cell,
                 }
             });
         } else {
-            amrex::Abort("amrex::EB2::build_faces: more than 2 cuts not supported");
+            amrex::Abort("amrex::EB2::build_faces: more than 2 cuts not supported. "
+                         "You can try to fix it by using runtime parameter eb2.cover_multiple_cuts=1.");
         }
     }
 
@@ -932,7 +933,8 @@ void build_cells (Box const& bx, Array4<EBCellFlag> const& cell,
 
     if (nsmallcells > 0 || nmulticuts > 0) {
         if (!cover_multiple_cuts && nmulticuts > 0) {
-            amrex::Abort("amrex::EB2::build_cells: multi-cuts not supported");
+            amrex::Abort("amrex::EB2::build_cells: multi-cuts not supported. "
+                         "You can try to fix it by using runtime parameter eb2.cover_multiple_cuts=1.");
         }
         return;
     } else {
diff --git a/Src/EB/AMReX_EB2_IndexSpace_STL.H b/Src/EB/AMReX_EB2_IndexSpace_STL.H
index 0c72d076ea..f974daba7a 100644
--- a/Src/EB/AMReX_EB2_IndexSpace_STL.H
+++ b/Src/EB/AMReX_EB2_IndexSpace_STL.H
@@ -19,7 +19,7 @@ public:
                   const Geometry& geom, int required_coarsening_level,
                   int max_coarsening_level, int ngrow,
                   bool build_coarse_level_by_coarsening,
-                  bool extend_domain_face, int num_coarsen_opt);
+                  bool extend_domain_face, int num_coarsen_opt, bool bvh_optimization);
 
     IndexSpaceSTL (IndexSpaceSTL const&) = delete;
     IndexSpaceSTL (IndexSpaceSTL &&) = delete;
diff --git a/Src/EB/AMReX_EB2_IndexSpace_STL.cpp b/Src/EB/AMReX_EB2_IndexSpace_STL.cpp
index 70e3b492d8..f8f62684f2 100644
--- a/Src/EB/AMReX_EB2_IndexSpace_STL.cpp
+++ b/Src/EB/AMReX_EB2_IndexSpace_STL.cpp
@@ -7,11 +7,13 @@ IndexSpaceSTL::IndexSpaceSTL (const std::string& stl_file, Real stl_scale,
                               const Geometry& geom, int required_coarsening_level,
                               int max_coarsening_level, int ngrow,
                               bool build_coarse_level_by_coarsening,
-                              bool extend_domain_face, int num_coarsen_opt)
+                              bool extend_domain_face, int num_coarsen_opt,
+                              bool bvh_optimization)
 {
     Gpu::LaunchSafeGuard lsg(true); // Always use GPU
 
     STLtools stl_tools;
+    stl_tools.setBVHOptimization(bvh_optimization);
     stl_tools.read_stl_file(stl_file, stl_scale, stl_center, stl_reverse_normal);
 
     // build finest level (i.e., level 0) first
diff --git a/Src/EB/AMReX_EB_STL_utils.H b/Src/EB/AMReX_EB_STL_utils.H
index eb277202cd..828d5a120c 100644
--- a/Src/EB/AMReX_EB_STL_utils.H
+++ b/Src/EB/AMReX_EB_STL_utils.H
@@ -7,6 +7,11 @@
 #include <AMReX_Dim3.H>
 #include <AMReX_EB2_Graph.H>
 
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <utility>
+
 namespace amrex
 {
 
@@ -15,33 +20,47 @@ class STLtools
 public:
     struct Triangle {
         XDim3 v1, v2, v3;
-    };
-
-    static constexpr int allregular = -1;
-    static constexpr int mixedcells = 0;
-    static constexpr int allcovered = 1;
-
-private:
 
-    Gpu::PinnedVector<Triangle> m_tri_pts_h;
-    Gpu::DeviceVector<Triangle> m_tri_pts_d;
-    Gpu::DeviceVector<XDim3> m_tri_normals_d;
+        [[nodiscard]] Real cent (int d) const  // Centroid coordinate along direction d: the mean of the three vertices' d-th components.
+        {
+            static_assert(sizeof(XDim3) == sizeof(Real)*3);  // The (&v.x)[d] indexing below needs XDim3 to be exactly three Reals with no padding.
+            return Real(1./3.)*((&v1.x)[d] + (&v2.x)[d] + (&v3.x)[d]);  // d counts from member x; presumably 0/1/2 = x/y/z (depends on XDim3's member order, not shown here).
+        }
+
+        [[nodiscard]] std::pair<Real,Real> minmax (int d) const  // Returns {smallest, largest} d-th coordinate among the three vertices.
+        {
+            static_assert(sizeof(XDim3) == sizeof(Real)*3);  // The (&v.x)[d] indexing below needs XDim3 to be exactly three Reals with no padding.
+            return std::minmax({(&v1.x)[d], (&v2.x)[d], (&v3.x)[d]});  // Initializer-list overload: the pair is returned by value, not as references.
+        }
+    };
 
-    int m_num_tri=0;
+    template <int M, int N>  // M: capacity of triangles[]/trinorm[]; N: capacity of children[].
+    struct BVHNodeT  // One node of the bounding volume hierarchy; nodes refer to each other by integer index.
+    {
+        RealBox boundingbox{AMREX_D_DECL(std::numeric_limits<Real>::max(),
+                                         std::numeric_limits<Real>::max(),
+                                         std::numeric_limits<Real>::max()),
+                            AMREX_D_DECL(std::numeric_limits<Real>::lowest(),
+                                         std::numeric_limits<Real>::lowest(),
+                                         std::numeric_limits<Real>::lowest())};  // Starts inverted (lo = max, hi = lowest) so min/max updates can only grow it.
+        STLtools::Triangle triangles[M];  // Triangles held by a leaf; build_bvh fills the first ntriangles entries.
+        XDim3 trinorm[M];  // trinorm[i] is the normal computed for triangles[i].
+        int children[N];  // Indices of child nodes in the node array; build_bvh fills the first nchildren entries.
+        std::int8_t ntriangles = 0;  // Number of valid entries in triangles[]/trinorm[].
+        std::int8_t nchildren = 0;  // Number of valid entries in children[]; the traversal treats 0 as a leaf.
+    };
 
-    XDim3 m_ptmin;  // All triangles are inside the bounding box defined by
-    XDim3 m_ptmax;  //     m_ptmin and m_ptmax.
-    XDim3 m_ptref;  // The reference point is slightly outside the bounding box.
-    bool m_boundry_is_outside; // Is the bounding box boundary outside or inside the object?
+    static constexpr int m_bvh_max_size = 4; // max # of triangles in a leaf node
+    static constexpr int m_bvh_max_splits = 4; // max # of children
+    static constexpr int m_bvh_max_stack_size = 12; // max depth of the tree
 
-    void read_ascii_stl_file (std::string const& fname, Real scale,
-                              Array<Real,3> const& center, int reverse_normal);
-    void read_binary_stl_file (std::string const& fname, Real scale,
-                               Array<Real,3> const& center, int reverse_normal);
+    using Node = BVHNodeT<m_bvh_max_size,m_bvh_max_splits>;
 
-public:
+    static constexpr int allregular = -1;
+    static constexpr int mixedcells = 0;
+    static constexpr int allcovered = 1;
 
-    void prepare ();  // public for cuda
+    void setBVHOptimization (bool flag) { m_bvh_optimization = flag; }
 
     void read_stl_file (std::string const& fname, Real scale, Array<Real,3> const& center,
                         int reverse_normal);
@@ -65,6 +84,32 @@ public:
                           Array<Array4<EB2::Type_t const>,AMREX_SPACEDIM> const& type_arr,
                           Array4<Real const> const& lst, Geometry const& geom) ;
 
+    void prepare (Gpu::PinnedVector<Triangle> a_tri_pts);  // public for cuda
+
+private:
+
+    bool m_bvh_optimization = true;
+
+    Gpu::DeviceVector<Triangle> m_tri_pts_d;
+    Gpu::DeviceVector<XDim3> m_tri_normals_d;
+    Gpu::DeviceVector<Node> m_bvh_nodes;
+
+    int m_num_tri=0;
+
+    XDim3 m_ptmin;  // All triangles are inside the bounding box defined by
+    XDim3 m_ptmax;  //     m_ptmin and m_ptmax.
+    XDim3 m_ptref;  // The reference point is slightly outside the bounding box.
+    bool m_boundry_is_outside; // Is the bounding box boundary outside or inside the object?
+
+    void read_ascii_stl_file (std::string const& fname, Real scale,
+                              Array<Real,3> const& center, int reverse_normal,
+                              Gpu::PinnedVector<Triangle>& a_tri_pts);
+    void read_binary_stl_file (std::string const& fname, Real scale,
+                               Array<Real,3> const& center, int reverse_normal,
+                               Gpu::PinnedVector<Triangle>& a_tri_pts);
+
+    static void build_bvh (Triangle* begin, Triangle * end, Gpu::PinnedVector<Node>& bvh_nodes);
+    static void bvh_size (int ntri, std::size_t& nnodes);
 };
 
 }
diff --git a/Src/EB/AMReX_EB_STL_utils.cpp b/Src/EB/AMReX_EB_STL_utils.cpp
index f7ce3045d9..3a3070f188 100644
--- a/Src/EB/AMReX_EB_STL_utils.cpp
+++ b/Src/EB/AMReX_EB_STL_utils.cpp
@@ -1,15 +1,34 @@
+#include <AMReX_BLProfiler.H>
 #include <AMReX_EB_STL_utils.H>
 #include <AMReX_EB_triGeomOps_K.H>
 #include <AMReX_IntConv.H>
+#include <AMReX_Math.H>
+#include <AMReX_Stack.H>
+
 #include <cstring>
 
+// Reference for BVH: https://rmrsk.github.io/EBGeometry/Concepts.html#bounding-volume-hierarchies
+
 namespace amrex
 {
 
 namespace {
+
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    XDim3 triangle_norm (STLtools::Triangle const& tri)  // Unit normal of tri, computed from its vertices as normalized (v2-v1) x (v3-v2).
+    {
+        XDim3 vec1{tri.v2.x-tri.v1.x, tri.v2.y-tri.v1.y, tri.v2.z-tri.v1.z};  // Edge from v1 to v2.
+        XDim3 vec2{tri.v3.x-tri.v2.x, tri.v3.y-tri.v2.y, tri.v3.z-tri.v2.z};  // Edge from v2 to v3.
+        XDim3 norm{vec1.y*vec2.z-vec1.z*vec2.y,  // Cross product vec1 x vec2, so the direction follows the vertex ordering.
+                   vec1.z*vec2.x-vec1.x*vec2.z,
+                   vec1.x*vec2.y-vec1.y*vec2.x};
+        Real tmp = 1._rt / std::sqrt(norm.x*norm.x + norm.y*norm.y + norm.z*norm.z);  // Reciprocal length; there is no guard for a zero-length cross product (degenerate triangle).
+        return {norm.x * tmp, norm.y * tmp, norm.z * tmp};
+    }
+
     // Does line ab intersect with the triangle?
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-    bool line_tri_intersects (Real a[3], Real b[3], STLtools::Triangle const& tri)
+    bool line_tri_intersects (Real const a[3], Real const b[3], STLtools::Triangle const& tri)
     {
         if (amrex::max(a[0],b[0]) < amrex::min(tri.v1.x,tri.v2.x,tri.v3.x) ||
             amrex::min(a[0],b[0]) > amrex::max(tri.v1.x,tri.v2.x,tri.v3.x) ||
@@ -89,12 +108,95 @@ namespace {
             return std::make_pair(false,0.0_rt);
         }
     }
+
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    bool line_box_intersects (Real const a[3], Real const b[3], RealBox const& box)  // Does the segment from a to b touch box? Only the first AMREX_SPACEDIM components are examined.
+    {
+        for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {  // Quick reject: both endpoints lie beyond the same side of the box in some direction.
+            if ((a[idim] < box.lo(idim) && b[idim] < box.lo(idim)) ||
+                (a[idim] > box.hi(idim) && b[idim] > box.hi(idim))) {
+                return false;
+            }
+        }
+        if (box.contains(a) || box.contains(b)) {  // Quick accept: presumably RealBox::contains tests whether the point lies in the box.
+            return true;
+        }
+        for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {  // Both endpoints are outside: test the segment against the lo and hi face planes of each direction.
+            // Note that we have made bounding box slightly bigger. So it's
+            // safe to assume that a line in the plane does not intersect
+            // with the actual bounding box.
+            if (a[idim] == b[idim]) { continue; }  // Also keeps the division for w below away from a zero denominator.
+            Real xi[] = {box.lo(idim), box.hi(idim)};
+            for (auto xface : xi) {
+                if (!((a[idim] > xface && b[idim] > xface) ||  // Proceed only if the endpoints are not strictly on the same side of the plane.
+                      (a[idim] < xface && b[idim] < xface)))
+                {
+                    Real w = (xface-a[idim]) / (b[idim]-a[idim]);  // Fraction of the way from a to b at which the segment meets the plane.
+                    bool inside = true;
+                    for (int jdim = 0; jdim < AMREX_SPACEDIM; ++jdim) {
+                        if (idim != jdim) {
+                            Real xpt = a[jdim] + (b[jdim]-a[jdim]) * w;  // Crossing point's coordinate in the other direction jdim.
+                            inside = inside && (xpt >= box.lo(jdim)
+                                            &&  xpt <= box.hi(jdim));
+                        }
+                    }
+                    if (inside) { return true; }  // The crossing point lies within the face's extent, so the segment hits the box.
+                }
+            }
+        }
+
+        return false;
+    }
+
+    template <int M, int N, typename F>
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    void bvh_line_tri_intersects (Real const a[3], Real const b[3],  // Walks the tree and calls f on every leaf whose bounding box passes line_box_intersects for segment a-b.
+                                  STLtools::BVHNodeT<M,N> const* root,  // Node 0 of the node array; every other node is reached as root[index].
+                                  F const& f)  // Invoked as f(ntriangles, triangles, trinorm) for a leaf; a non-zero return ends the walk.
+    {
+        // Use stack to avoid recursion
+        Stack<int, STLtools::m_bvh_max_stack_size> nodes_to_do;  // Indices of the nodes on the path from the root to the node being processed.
+        Stack<std::int8_t, STLtools::m_bvh_max_stack_size> nchildren_done;  // Pushed and popped in step with nodes_to_do: how many children of each path node were already examined.
+
+        if (line_box_intersects(a, b, root->boundingbox)) {  // If the root box is missed nothing is pushed, so the loop below never runs.
+            nodes_to_do.push(0);
+            nchildren_done.push(0);
+        }
+
+        while (!nodes_to_do.empty()) {
+            auto const& node = root[nodes_to_do.top()];
+            if (node.nchildren == 0) { // leaf node
+                int ret = f(node.ntriangles, node.triangles, node.trinorm);
+                if (ret != 0) { break; }  // The callback asked to stop: leave the loop without visiting the remaining nodes.
+                nodes_to_do.pop();
+                nchildren_done.pop();
+            } else {
+                auto& ndone = nchildren_done.top();  // Bound by reference so that ++ndone below updates the stack entry itself.
+                if (ndone < node.nchildren) {
+                    for (auto ichild = ndone; ichild < node.nchildren; ++ichild) {
+                        ++ndone;  // The child counts as examined whether or not its box is hit.
+                        int inode = node.children[ichild];
+                        if (line_box_intersects(a, b, root[inode].boundingbox)) {
+                            nodes_to_do.push(inode);
+                            nchildren_done.push(0);
+                            break;  // Descend into this child now; the remaining siblings resume from ndone once it is popped.
+                        }
+                    }
+                } else {
+                    nodes_to_do.pop();  // Every child has been examined: go back up to the parent.
+                    nchildren_done.pop();
+                }
+            }
+        }
+    }
 }
 
 void
 STLtools::read_stl_file (std::string const& fname, Real scale, Array<Real,3> const& center,
                          int reverse_normal)
 {
+    Gpu::PinnedVector<Triangle> tri_pts;
+
     if (ParallelDescriptor::IOProcessor()) {
         char header[6];
         header[5] = '\0';
@@ -107,18 +209,19 @@ STLtools::read_stl_file (std::string const& fname, Real scale, Array<Real,3> con
         }
         int is_binary = std::strcmp(header, "solid");
         if (is_binary) {
-            read_binary_stl_file(fname, scale, center, reverse_normal);
+            read_binary_stl_file(fname, scale, center, reverse_normal, tri_pts);
         } else {
-            read_ascii_stl_file(fname, scale, center, reverse_normal);
+            read_ascii_stl_file(fname, scale, center, reverse_normal, tri_pts);
         }
     }
 
-    prepare();
+    prepare(std::move(tri_pts));
 }
 
 void
 STLtools::read_binary_stl_file (std::string const& fname, Real scale,
-                                Array<Real,3> const& center, int reverse_normal)
+                                Array<Real,3> const& center, int reverse_normal,
+                                Gpu::PinnedVector<Triangle>& a_tri_pts)
 {
     if (ParallelDescriptor::IOProcessor()) {
         if (amrex::Verbose()) {
@@ -140,9 +243,13 @@ STLtools::read_binary_stl_file (std::string const& fname, Real scale,
 
         uint32_t numtris; // uint32 - Number of triangles - 4 bytes
         amrex::readIntData<uint32_t,uint32_t>(&numtris, 1, is, uint32_descr);
-        AMREX_ASSERT(numtris < uint32_t(std::numeric_limits<int>::max()));
+        AMREX_ALWAYS_ASSERT(numtris < uint32_t(std::numeric_limits<int>::max()));
         m_num_tri = static_cast<int>(numtris);
-        m_tri_pts_h.resize(m_num_tri);
+        // maximum number of triangles allowed for traversing the BVH tree
+        // using stack.
+        int max_tri_stack = Math::powi<m_bvh_max_stack_size-1>(m_bvh_max_splits)*m_bvh_max_size;
+        AMREX_ALWAYS_ASSERT(m_num_tri <= max_tri_stack);
+        a_tri_pts.resize(m_num_tri);
 
         if (amrex::Verbose()) {
             Print() << "    Number of triangles: " << m_num_tri << "\n";
@@ -150,7 +257,7 @@ STLtools::read_binary_stl_file (std::string const& fname, Real scale,
 
         for (int i=0; i < m_num_tri; ++i) {
             is.read(tmp, 50);  // 50 bytes for each triangle. Vertex 1 starts at 12 bytes.
-            Real* p = &(m_tri_pts_h[i].v1.x);
+            Real* p = &(a_tri_pts[i].v1.x);
             RealDescriptor::convertToNativeFormat(p, 9, tmp+12, real32_descr);
             for (int j = 0; j < 3; ++j) {
                 p[0] = p[0] * scale + center[0];
@@ -159,7 +266,7 @@ STLtools::read_binary_stl_file (std::string const& fname, Real scale,
                 p += 3;
             }
             if (reverse_normal) {
-                std::swap(m_tri_pts_h[i].v1, m_tri_pts_h[i].v2);
+                std::swap(a_tri_pts[i].v1, a_tri_pts[i].v2);
             }
         }
     }
@@ -167,7 +274,8 @@ STLtools::read_binary_stl_file (std::string const& fname, Real scale,
 
 void
 STLtools::read_ascii_stl_file (std::string const& fname, Real scale,
-                               Array<Real,3> const& center, int reverse_normal)
+                               Array<Real,3> const& center, int reverse_normal,
+                               Gpu::PinnedVector<Triangle>& a_tri_pts)
 {
     if (ParallelDescriptor::IOProcessor()) {
         if (amrex::Verbose()) {
@@ -200,9 +308,9 @@ STLtools::read_ascii_stl_file (std::string const& fname, Real scale,
         }
 
         m_num_tri = nlines / nlines_per_facet;
-        m_tri_pts_h.resize(m_num_tri);
+        a_tri_pts.resize(m_num_tri);
         static_assert(sizeof(Triangle) == sizeof(Real)*9, "sizeof(Triangle) is wrong");
-        Real* p = &(m_tri_pts_h[0].v1.x);
+        Real* p = &(a_tri_pts[0].v1.x);
 
         if (amrex::Verbose()) {
             Print() << "    Number of triangles: " << m_num_tri << "\n";
@@ -230,45 +338,52 @@ STLtools::read_ascii_stl_file (std::string const& fname, Real scale,
             std::getline(is,tmp); //end facet
 
             if (reverse_normal) {
-                std::swap(m_tri_pts_h[i].v1, m_tri_pts_h[i].v2);
+                std::swap(a_tri_pts[i].v1, a_tri_pts[i].v2);
             }
         }
     }
 }
 
 void
-STLtools::prepare ()
+STLtools::prepare (Gpu::PinnedVector<Triangle> a_tri_pts)
 {
+    BL_PROFILE("STLtools::prepare");
+
     ParallelDescriptor::Bcast(&m_num_tri, 1);
     if (!ParallelDescriptor::IOProcessor()) {
-        m_tri_pts_h.resize(m_num_tri);
+        a_tri_pts.resize(m_num_tri);
     }
-    ParallelDescriptor::Bcast((char*)(m_tri_pts_h.dataPtr()), m_num_tri*sizeof(Triangle));
+    ParallelDescriptor::Bcast((char*)(a_tri_pts.dataPtr()), m_num_tri*sizeof(Triangle));
 
-    //device vectors
-    m_tri_pts_d.resize(m_num_tri);
-    m_tri_normals_d.resize(m_num_tri);
+    Gpu::PinnedVector<Node> bvh_nodes;
+    if (m_bvh_optimization) {
+        BL_PROFILE("STLtools::build_bvh");
+        std::size_t nnodes = 0;
+        bvh_size(int(a_tri_pts.size()), nnodes);
+        bvh_nodes.reserve(nnodes);
+        build_bvh(a_tri_pts.data(), a_tri_pts.data()+a_tri_pts.size(), bvh_nodes);
+#ifdef AMREX_USE_GPU
+        m_bvh_nodes.resize(bvh_nodes.size());
+        Gpu::copyAsync(Gpu::hostToDevice, bvh_nodes.begin(), bvh_nodes.end(),
+                       m_bvh_nodes.begin());
+#else
+        m_bvh_nodes = std::move(bvh_nodes);
+#endif
+    }
 
-    Gpu::copyAsync(Gpu::hostToDevice, m_tri_pts_h.begin(), m_tri_pts_h.end(),
-                   m_tri_pts_d.begin());
+    auto const tri0 = a_tri_pts[0];
 
+#ifdef AMREX_USE_GPU
+    m_tri_pts_d.resize(m_num_tri);
+    Gpu::copyAsync(Gpu::hostToDevice, a_tri_pts.begin(), a_tri_pts.end(),
+                   m_tri_pts_d.begin());
+#else
+    m_tri_pts_d = std::move(a_tri_pts);
+#endif
     Triangle const* tri_pts = m_tri_pts_d.data();
-    XDim3* tri_norm = m_tri_normals_d.data();
 
-    // Compute normals in case the STL file does not have valid data for normals
-    ParallelFor(m_num_tri, [=] AMREX_GPU_DEVICE (int i) noexcept
-    {
-        Triangle const& tri = tri_pts[i];
-        XDim3 vec1{tri.v2.x-tri.v1.x, tri.v2.y-tri.v1.y, tri.v2.z-tri.v1.z};
-        XDim3 vec2{tri.v3.x-tri.v2.x, tri.v3.y-tri.v2.y, tri.v3.z-tri.v2.z};
-        XDim3 norm{vec1.y*vec2.z-vec1.z*vec2.y,
-                   vec1.z*vec2.x-vec1.x*vec2.z,
-                   vec1.x*vec2.y-vec1.y*vec2.x};
-        Real tmp = 1._rt / std::sqrt(norm.x*norm.x + norm.y*norm.y + norm.z*norm.z);
-        tri_norm[i].x = norm.x * tmp;
-        tri_norm[i].y = norm.y * tmp;
-        tri_norm[i].z = norm.z * tmp;
-    });
+    m_tri_normals_d.resize(m_num_tri);
+    XDim3* tri_norm = m_tri_normals_d.data();
 
     ReduceOps<ReduceOpMin,ReduceOpMin,ReduceOpMin,ReduceOpMax,ReduceOpMax,ReduceOpMax> reduce_op;
     ReduceData<Real,Real,Real,Real,Real,Real> reduce_data(reduce_op);
@@ -276,6 +391,7 @@ STLtools::prepare ()
     reduce_op.eval(m_num_tri, reduce_data,
                    [=] AMREX_GPU_DEVICE (int i) -> ReduceTuple
                    {
+                       tri_norm[i] = triangle_norm(tri_pts[i]);
                        return {amrex::min(tri_pts[i].v1.x,
                                           tri_pts[i].v2.x,
                                           tri_pts[i].v3.x),
@@ -309,24 +425,12 @@ STLtools::prepare ()
 
     // Choose a reference point by extending the normal vector of the first
     // triangle until it's slightly outside the bounding box.
-    XDim3 cent0; // centroid of the first triangle
+    XDim3 cent0{tri0.cent(0), tri0.cent(1), tri0.cent(2)};
     int is_ref_positive;
     {
-        Triangle const& tri = m_tri_pts_h[0];
-        cent0 = XDim3{(tri.v1.x + tri.v2.x + tri.v3.x) / 3._rt,
-                      (tri.v1.y + tri.v2.y + tri.v3.y) / 3._rt,
-                      (tri.v1.z + tri.v2.z + tri.v3.z) / 3._rt};
         // We are computing the normal ourselves in case the stl file does
         // not have valid data on normal.
-        XDim3 vec1{tri.v2.x-tri.v1.x, tri.v2.y-tri.v1.y, tri.v2.z-tri.v1.z};
-        XDim3 vec2{tri.v3.x-tri.v2.x, tri.v3.y-tri.v2.y, tri.v3.z-tri.v2.z};
-        XDim3 norm{vec1.y*vec2.z-vec1.z*vec2.y,
-                   vec1.z*vec2.x-vec1.x*vec2.z,
-                   vec1.x*vec2.y-vec1.y*vec2.x};
-        Real tmp = 1._rt / std::sqrt(norm.x*norm.x + norm.y*norm.y + norm.z*norm.z);
-        norm.x *= tmp;
-        norm.y *= tmp;
-        norm.z *= tmp;
+        XDim3 norm = triangle_norm(tri0);
         // Now we need to find out where the normal vector will intersect
         // with the bounding box defined by m_ptmin and m_ptmax.
         Real Lx, Ly, Lz;
@@ -415,10 +519,113 @@ STLtools::prepare ()
     m_boundry_is_outside = num_isects % 2 == 0;
 }
 
+void
+STLtools::build_bvh (Triangle* begin, Triangle* end, Gpu::PinnedVector<Node>& bvh_nodes)  // Recursively appends the subtree for triangles [begin,end) to bvh_nodes, subtree root first; reorders the triangles in place.
+{
+    auto ntri = int(end - begin);
+
+    if (ntri <= m_bvh_max_size) {  // Few enough triangles to store directly in one node.
+        // This is a leaf node
+        bvh_nodes.push_back(Node());
+        auto& node = bvh_nodes.back();
+        auto& bbox = node.boundingbox;  // A fresh Node starts with an inverted box, so the min/max updates below define it.
+        for (int tr = 0; tr < ntri; ++tr) {
+            auto const& tri = begin[tr];
+            for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+                auto const& [xmin,xmax] = tri.minmax(idim);
+                bbox.setLo(idim,amrex::min(xmin, bbox.lo(idim)));  // Grow the box to cover this triangle's extent in direction idim.
+                bbox.setHi(idim,amrex::max(xmax, bbox.hi(idim)));
+            }
+            node.triangles[tr] = tri;  // The leaf keeps its own copy of the triangle together with its normal.
+            node.trinorm[tr] = triangle_norm(tri);
+        }
+#ifdef AMREX_USE_FLOAT
+        constexpr Real eps = Real(1.e-5);
+#else
+        constexpr Real eps = Real(1.e-10);
+#endif
+        Real small = eps*std::max({AMREX_D_DECL(bbox.length(0),
+                                                bbox.length(1),
+                                                bbox.length(2))});  // Padding is eps times the box's longest side; eps is larger when Real is single precision.
+        // Make bounding box slightly bigger for robustness.
+        for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+            bbox.setLo(idim,bbox.lo(idim)-small);
+            bbox.setHi(idim,bbox.hi(idim)+small);
+        }
+        node.ntriangles = int(ntri); // NOLINT
+        return;
+    }
+
+    RealVect centmin(std::numeric_limits<Real>::max());  // Running per-direction minimum of the triangle centroids.
+    RealVect centmax(std::numeric_limits<Real>::lowest());  // Running per-direction maximum of the triangle centroids.
+    for (auto* p = begin; p != end; ++p) {
+        RealVect cent(AMREX_D_DECL(p->cent(0), p->cent(1), p->cent(2)));
+        centmin.min(cent);
+        centmax.max(cent);
+    }
+    int max_dir = (centmax-centmin).maxDir(false);  // Presumably the direction with the largest centroid spread; the meaning of the false argument is not visible here.
+    std::sort(begin, end, [max_dir] (Triangle const& a, Triangle const& b) -> bool  // Order by centroid along max_dir so each child below gets a contiguous slice.
+                              { return a.cent(max_dir) < b.cent(max_dir); });
+
+    int nsplits = std::min((ntri + (m_bvh_max_size-1)) / m_bvh_max_size, m_bvh_max_splits);  // ceil(ntri / m_bvh_max_size), capped at m_bvh_max_splits children.
+    int tsize = ntri / nsplits;  // Base number of triangles per child.
+    int nleft = ntri - tsize*nsplits;  // Remainder: the first nleft children take one extra triangle.
+
+    bvh_nodes.push_back(Node());
+    bvh_nodes.back().nchildren = std::int8_t(nsplits);
+    auto this_node = bvh_nodes.size()-1;  // Kept as an index and looked up again on each use, because the recursive calls below append to bvh_nodes.
+
+    for (int isplit = 0; isplit < nsplits; ++isplit) {
+        int tbegin, tend;
+        if (isplit < nleft) {
+            tbegin = isplit * (tsize+1);
+            tend = tbegin + tsize + 1;
+        } else {
+            tbegin = isplit * tsize + nleft;  // Skip past the nleft larger slices that came before.
+            tend = tbegin + tsize;
+        }
+        bvh_nodes[this_node].children[isplit] = int(bvh_nodes.size());  // The child's root is the next node the recursive call appends.
+        build_bvh(begin+tbegin, begin+tend, bvh_nodes);
+    }
+
+    // update bounding box
+    auto& node = bvh_nodes[this_node];  // Nothing is appended after this point, so holding a reference is fine here.
+    for (int ichild = 0; ichild < node.nchildren; ++ichild) {
+        int inode = node.children[ichild];
+        for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+            auto lo = node.boundingbox.lo(idim);
+            auto hi = node.boundingbox.hi(idim);
+            auto clo = bvh_nodes[inode].boundingbox.lo(idim);
+            auto chi = bvh_nodes[inode].boundingbox.hi(idim);
+            node.boundingbox.setLo(idim, std::min(lo,clo));  // The parent box becomes the union of its children's boxes.
+            node.boundingbox.setHi(idim, std::max(hi,chi));
+        }
+    }
+}
+
+void
+STLtools::bvh_size (int ntri, std::size_t& nnodes)  // Adds to nnodes the number of nodes in a subtree holding ntri triangles; the caller supplies the starting count.
+{
+    ++nnodes;  // Count this node itself.
+
+    if (ntri <= m_bvh_max_size) { return; } // This is a leaf node
+
+    int nsplits = std::min((ntri + (m_bvh_max_size-1)) / m_bvh_max_size, m_bvh_max_splits);  // Same split arithmetic as build_bvh, so the count matches the nodes it appends.
+    int tsize = ntri / nsplits;
+    int nleft = ntri - tsize*nsplits;
+
+    for (int isplit = 0; isplit < nsplits; ++isplit) {
+        int child_size = (isplit < nleft) ? (tsize+1) : tsize;  // The first nleft children take one extra triangle.
+        bvh_size(child_size, nnodes);
+    }
+}
+
 void
 STLtools::fill (MultiFab& mf, IntVect const& nghost, Geometry const& geom,
                 Real outside_value, Real inside_value) const
 {
+    BL_PROFILE("STLtools::fill");
+
     int num_triangles = m_num_tri;
 
     const auto plo = geom.ProbLoArray();
@@ -432,8 +639,15 @@ STLtools::fill (MultiFab& mf, IntVect const& nghost, Geometry const& geom,
     Real other_value     = m_boundry_is_outside ?  inside_value : outside_value;
 
     auto const& ma = mf.arrays();
+    auto const* bvh_root = m_bvh_nodes.data();
+
+    enum bvh_opt_options : int { no_bvh, yes_bvh };
+    int bvh_opt_runtime_option = m_bvh_optimization ? yes_bvh : no_bvh;
 
-    ParallelFor(mf, nghost, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept
+    AnyCTO(TypeList<CompileTimeOptions<no_bvh, yes_bvh>>{},
+           {bvh_opt_runtime_option},
+           [&] (auto cto_func) { ParallelFor(mf, nghost, cto_func); },
+           [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, auto control) noexcept
     {
         Real coords[3];
         coords[0]=plo[0]+static_cast<Real>(i)*dx[0];
@@ -449,9 +663,26 @@ STLtools::fill (MultiFab& mf, IntVect const& nghost, Geometry const& geom,
             coords[2] >= ptmin.z && coords[2] <= ptmax.z)
         {
             Real pr[]={ptref.x, ptref.y, ptref.z};
-            for (int tr=0; tr < num_triangles; ++tr) {
-                if (line_tri_intersects(pr, coords, tri_pts[tr])) {
-                    ++num_intersects;
+#ifdef AMREX_USE_CUDA
+            amrex::ignore_unused(bvh_root, num_triangles, tri_pts);
+#endif
+            if constexpr (control == yes_bvh) {
+                bvh_line_tri_intersects(pr, coords, bvh_root,
+                                        [&] (int ntri, Triangle const* tri,
+                                             XDim3 const*) -> int
+                {
+                    for (int tr=0; tr < ntri; ++tr) {
+                        if (line_tri_intersects(pr, coords, tri[tr])) {
+                            ++num_intersects;
+                        }
+                    }
+                    return 0;
+                });
+            } else {
+                for (int tr=0; tr < num_triangles; ++tr) {
+                    if (line_tri_intersects(pr, coords, tri_pts[tr])) {
+                        ++num_intersects;
+                    }
                 }
             }
         }
@@ -463,6 +694,8 @@ STLtools::fill (MultiFab& mf, IntVect const& nghost, Geometry const& geom,
 int
 STLtools::getBoxType (Box const& box, Geometry const& geom, RunOn) const
 {
+    BL_PROFILE("STLtools::getBoxType");
+
     const auto plo = geom.ProbLoArray();
     const auto dx  = geom.CellSizeArray();
 
@@ -498,11 +731,19 @@ STLtools::getBoxType (Box const& box, Geometry const& geom, RunOn) const
         XDim3 ptref = m_ptref;
         int ref_value = m_boundry_is_outside ? 1 : 0;
 
+        auto const* bvh_root = m_bvh_nodes.data();
+
         ReduceOps<ReduceOpSum> reduce_op;
         ReduceData<int> reduce_data(reduce_op);
         using ReduceTuple = typename decltype(reduce_data)::Type;
-        reduce_op.eval(box, reduce_data,
-        [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple
+
+        enum bvh_opt_options : int { no_bvh, yes_bvh };
+        int bvh_opt_runtime_option = m_bvh_optimization ? yes_bvh : no_bvh;
+
+        AnyCTO(TypeList<CompileTimeOptions<no_bvh, yes_bvh>>{},
+               {bvh_opt_runtime_option},
+               [&] (auto cto_func) { reduce_op.eval(box, reduce_data, cto_func); },
+               [=] AMREX_GPU_DEVICE (int i, int j, int k, auto control) -> ReduceTuple
         {
             Real coords[3];
             coords[0]=plo[0]+static_cast<Real>(i)*dx[0];
@@ -519,9 +760,26 @@ STLtools::getBoxType (Box const& box, Geometry const& geom, RunOn) const
                 coords[2] >= ptmin.z && coords[2] <= ptmax.z)
             {
                 Real pr[]={ptref.x, ptref.y, ptref.z};
-                for (int tr=0; tr < num_triangles; ++tr) {
-                    if (line_tri_intersects(pr, coords, tri_pts[tr])) {
-                        ++num_intersects;
+#ifdef AMREX_USE_CUDA
+                amrex::ignore_unused(bvh_root,num_triangles,tri_pts);
+#endif
+                if constexpr (control == yes_bvh) {
+                    bvh_line_tri_intersects(pr, coords, bvh_root,
+                                            [&] (int ntri, Triangle const* tri,
+                                                 XDim3 const*) -> int
+                    {
+                        for (int tr=0; tr < ntri; ++tr) {
+                            if (line_tri_intersects(pr, coords, tri[tr])) {
+                                ++num_intersects;
+                            }
+                        }
+                        return 0;
+                    });
+                } else {
+                    for (int tr=0; tr < num_triangles; ++tr) {
+                        if (line_tri_intersects(pr, coords, tri_pts[tr])) {
+                            ++num_intersects;
+                        }
                     }
                 }
             }
@@ -556,9 +814,17 @@ STLtools::fillFab (BaseFab<Real>& levelset, const Geometry& geom, RunOn, Box con
     Real reference_value = m_boundry_is_outside ? -1.0_rt :  1.0_rt;
     Real other_value     = m_boundry_is_outside ?  1.0_rt : -1.0_rt;
 
+    auto const* bvh_root = m_bvh_nodes.data();
+
     auto const& a = levelset.array();
     const Box& bx = levelset.box();
-    ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+
+    enum bvh_opt_options : int { no_bvh, yes_bvh };
+    int bvh_opt_runtime_option = m_bvh_optimization ? yes_bvh : no_bvh;
+
+    ParallelFor(TypeList<CompileTimeOptions<no_bvh, yes_bvh>>{},
+                {bvh_opt_runtime_option},
+                bx, [=] AMREX_GPU_DEVICE (int i, int j, int k, auto control) noexcept
     {
         Real coords[3];
         coords[0]=plo[0]+static_cast<Real>(i)*dx[0];
@@ -574,9 +840,26 @@ STLtools::fillFab (BaseFab<Real>& levelset, const Geometry& geom, RunOn, Box con
             coords[2] >= ptmin.z && coords[2] <= ptmax.z)
         {
             Real pr[]={ptref.x, ptref.y, ptref.z};
-            for (int tr=0; tr < num_triangles; ++tr) {
-                if (line_tri_intersects(pr, coords, tri_pts[tr])) {
-                    ++num_intersects;
+#ifdef AMREX_USE_CUDA
+            amrex::ignore_unused(bvh_root,num_triangles,tri_pts);
+#endif
+            if constexpr (control == yes_bvh) {
+                bvh_line_tri_intersects(pr, coords, bvh_root,
+                                        [&] (int ntri, Triangle const* tri,
+                                             XDim3 const*) -> int
+                {
+                    for (int tr=0; tr < ntri; ++tr) {
+                        if (line_tri_intersects(pr, coords, tri[tr])) {
+                            ++num_intersects;
+                        }
+                    }
+                    return 0;
+                });
+            } else {
+                for (int tr=0; tr < num_triangles; ++tr) {
+                    if (line_tri_intersects(pr, coords, tri_pts[tr])) {
+                        ++num_intersects;
+                    }
                 }
             }
         }
@@ -597,13 +880,22 @@ STLtools::getIntercept (Array<Array4<Real>,AMREX_SPACEDIM> const& inter_arr,
 
     const Triangle* tri_pts = m_tri_pts_d.data();
     const XDim3* tri_norm = m_tri_normals_d.data();
+    const Node* bvh_root = m_bvh_nodes.data();
+
+    enum bvh_opt_options : int { no_bvh, yes_bvh };
+    int bvh_opt_runtime_option = m_bvh_optimization ? yes_bvh : no_bvh;
 
     for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
         Array4<Real> const& inter = inter_arr[idim];
         Array4<EB2::Type_t const> const& type = type_arr[idim];
         const Box bx{inter};
-        ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+        ParallelFor(TypeList<CompileTimeOptions<no_bvh, yes_bvh>>{},
+                    {bvh_opt_runtime_option},
+            bx, [=] AMREX_GPU_DEVICE (int i, int j, int k, auto bvh_control) noexcept
         {
+#ifdef AMREX_USE_CUDA
+            amrex::ignore_unused(num_triangles,tri_pts,tri_norm,lst,bvh_root);
+#endif
             Real r = std::numeric_limits<Real>::quiet_NaN();
             if (type(i,j,k) == EB2::Type::irregular) {
                 XDim3 p1{plo[0]+static_cast<Real>(i)*dx[0],
@@ -616,62 +908,143 @@ STLtools::getIntercept (Array<Array4<Real>,AMREX_SPACEDIM> const& inter_arr,
                 };
                 if (idim == 0) {
                     Real x2 = plo[0]+static_cast<Real>(i+1)*dx[0];
-                    int it;
-                    for (it=0; it < num_triangles; ++it) {
-                        auto const& tri = tri_pts[it];
-                        auto tmp = edge_tri_intersects(p1.x, x2, p1.y, p1.z,
-                                                       tri.v1, tri.v2, tri.v3,
-                                                       tri_norm[it],
-                                                       lst(i+1,j,k)-lst(i,j,k));
-                        if (tmp.first) {
-                            r = tmp.second;
-                            break;
+                    bool found = false;
+                    if constexpr (bvh_control == no_bvh) {
+                        for (int it=0; it < num_triangles; ++it) {
+                            auto const& tri = tri_pts[it];
+                            auto tmp = edge_tri_intersects(p1.x, x2, p1.y, p1.z,
+                                                           tri.v1, tri.v2, tri.v3,
+                                                           tri_norm[it],
+                                                           lst(i+1,j,k)-lst(i,j,k));
+                            if (tmp.first) {
+                                r = tmp.second;
+                                found = true;
+                                break;
+                            }
                         }
+                    } else {
+                        Real a[3] = {p1.x , p1.y, p1.z};
+                        Real b[3] = {   x2, p1.y, p1.z};
+                        bvh_line_tri_intersects(a, b, bvh_root,
+                                                [&] (int ntri, Triangle const* ptri,
+                                                     XDim3 const* ptrinorm) -> int
+                        {
+                            for (int it=0; it < ntri; ++it) {
+                                auto const& tri = ptri[it];
+                                auto tmp = edge_tri_intersects(p1.x, x2, p1.y, p1.z,
+                                                               tri.v1, tri.v2, tri.v3,
+                                                               ptrinorm[it],
+                                                               lst(i+1,j,k)-lst(i,j,k));
+                                if (tmp.first) {
+                                    r = tmp.second;
+                                    found = true;
+                                    return 1;
+                                }
+                            }
+                            return 0;
+                        });
                     }
-                    if (it == num_triangles) {
+                    if (!found) {
                         r = (lst(i,j,k) > 0._rt) ? p1.x : x2;
                     }
                 } else if (idim == 1) {
                     Real y2 = plo[1]+static_cast<Real>(j+1)*dx[1];
-                    int it;
-                    for (it=0; it < num_triangles; ++it) {
-                        auto const& tri = tri_pts[it];
-                        auto const& norm = tri_norm[it];
-                        auto tmp = edge_tri_intersects(p1.y, y2, p1.z, p1.x,
-                                                       {tri.v1.y, tri.v1.z, tri.v1.x},
-                                                       {tri.v2.y, tri.v2.z, tri.v2.x},
-                                                       {tri.v3.y, tri.v3.z, tri.v3.x},
-                                                       {  norm.y,   norm.z,   norm.x},
-                                                       lst(i,j+1,k)-lst(i,j,k));
-                        if (tmp.first) {
-                            r = tmp.second;
-                            break;
+                    bool found = false;
+                    if constexpr (bvh_control == no_bvh) {
+                        for (int it=0; it < num_triangles; ++it) {
+                            auto const& tri = tri_pts[it];
+                            auto const& norm = tri_norm[it];
+                            auto tmp = edge_tri_intersects(p1.y, y2, p1.z, p1.x,
+                                                           {tri.v1.y, tri.v1.z, tri.v1.x},
+                                                           {tri.v2.y, tri.v2.z, tri.v2.x},
+                                                           {tri.v3.y, tri.v3.z, tri.v3.x},
+                                                           {  norm.y,   norm.z,   norm.x},
+                                                           lst(i,j+1,k)-lst(i,j,k));
+                            if (tmp.first) {
+                                r = tmp.second;
+                                found = true;
+                                break;
+                            }
                         }
+                    } else {
+                        Real a[3] = {p1.x, p1.y , p1.z};
+                        Real b[3] = {p1.x,    y2, p1.z};
+                        bvh_line_tri_intersects(a, b, bvh_root,
+                                                [&] (int ntri, Triangle const* ptri,
+                                                     XDim3 const* ptrinorm) -> int
+                        {
+                            for (int it=0; it < ntri; ++it) {
+                                auto const& tri = ptri[it];
+                                auto const& norm = ptrinorm[it];
+                                auto tmp = edge_tri_intersects(p1.y, y2, p1.z, p1.x,
+                                                               {tri.v1.y, tri.v1.z, tri.v1.x},
+                                                               {tri.v2.y, tri.v2.z, tri.v2.x},
+                                                               {tri.v3.y, tri.v3.z, tri.v3.x},
+                                                               {  norm.y,   norm.z,   norm.x},
+                                                               lst(i,j+1,k)-lst(i,j,k));
+                                if (tmp.first) {
+                                    r = tmp.second;
+                                    found = true;
+                                    return 1;
+                                }
+                            }
+                            return 0;
+                        });
                     }
-                    if (it == num_triangles) {
+                    if (!found) {
                         r = (lst(i,j,k) > 0._rt) ? p1.y : y2;
                     }
-                } else {
+                }
+#if (AMREX_SPACEDIM == 3)
+                else {
                     Real z2 = plo[2]+static_cast<Real>(k+1)*dx[2];
-                    int it;
-                    for (it=0; it < num_triangles; ++it) {
-                        auto const& tri = tri_pts[it];
-                        auto const& norm = tri_norm[it];
-                        auto tmp = edge_tri_intersects(p1.z, z2, p1.x, p1.y,
-                                                       {tri.v1.z, tri.v1.x, tri.v1.y},
-                                                       {tri.v2.z, tri.v2.x, tri.v2.y},
-                                                       {tri.v3.z, tri.v3.x, tri.v3.y},
-                                                       {  norm.z,   norm.x,   norm.y},
-                                                       lst(i,j,k+1)-lst(i,j,k));
-                        if (tmp.first) {
-                            r = tmp.second;
-                            break;
+                    bool found = false;
+                    if constexpr (bvh_control == no_bvh) {
+                        for (int it=0; it < num_triangles; ++it) {
+                            auto const& tri = tri_pts[it];
+                            auto const& norm = tri_norm[it];
+                            auto tmp = edge_tri_intersects(p1.z, z2, p1.x, p1.y,
+                                                           {tri.v1.z, tri.v1.x, tri.v1.y},
+                                                           {tri.v2.z, tri.v2.x, tri.v2.y},
+                                                           {tri.v3.z, tri.v3.x, tri.v3.y},
+                                                           {  norm.z,   norm.x,   norm.y},
+                                                           lst(i,j,k+1)-lst(i,j,k));
+                            if (tmp.first) {
+                                r = tmp.second;
+                                found = true;
+                                break;
+                            }
                         }
+                    } else {
+                        Real a[3] = {p1.x, p1.y, p1.z };
+                        Real b[3] = {p1.x, p1.y,    z2};
+                        bvh_line_tri_intersects(a, b, bvh_root,
+                                                [&] (int ntri, Triangle const* ptri,
+                                                     XDim3 const* ptrinorm) -> int
+                        {
+                            for (int it=0; it < ntri; ++it) {
+                                auto const& tri = ptri[it];
+                                auto const& norm = ptrinorm[it];
+                                auto tmp = edge_tri_intersects(p1.z, z2, p1.x, p1.y,
+                                                               {tri.v1.z, tri.v1.x, tri.v1.y},
+                                                               {tri.v2.z, tri.v2.x, tri.v2.y},
+                                                               {tri.v3.z, tri.v3.x, tri.v3.y},
+                                                               {  norm.z,   norm.x,   norm.y},
+                                                               lst(i,j,k+1)-lst(i,j,k));
+                                if (tmp.first) {
+                                    r = tmp.second;
+                                    found = true;
+                                    return 1;
+                                }
+                            }
+                            return 0;
+                        });
                     }
-                    if (it == num_triangles) {
+                    if (!found) {
                         r = (lst(i,j,k) > 0._rt) ? p1.z : z2;
                     }
                 }
+#endif
             }
             inter(i,j,k) = r;
         });
@@ -723,7 +1096,7 @@ STLtools::updateIntercept (Array<Array4<Real>,AMREX_SPACEDIM> const& inter_arr,
                         (lst(i,j,k) > Real(0.0) && is_nan))
                     {
                         inter(i,j,k) = problo[2] + static_cast<Real>(k)*dx[2];
-                        }
+                    }
                     else if (lst(i,j,k+1) == Real(0.0) ||
                              (lst(i,j,k+1) > Real(0.0) && is_nan))
                     {
diff --git a/Src/EB/AMReX_EB_triGeomOps_K.H b/Src/EB/AMReX_EB_triGeomOps_K.H
index 25a803892b..7ab517efe9 100644
--- a/Src/EB/AMReX_EB_triGeomOps_K.H
+++ b/Src/EB/AMReX_EB_triGeomOps_K.H
@@ -63,8 +63,8 @@ namespace amrex::tri_geom_ops
             L[5] = v2[1]       - v1[1];
         }
         //================================================================================
-        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void side_op3(Real v1[3],Real v2[3],
-                Real t1[3],Real t2[3],Real t3[3],
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void side_op3(const Real v1[3], const Real v2[3],
+                const Real t1[3], const Real t2[3], const Real t3[3],
                 Real &S1, Real &S2, Real &S3)
         {
 
@@ -81,8 +81,8 @@ namespace amrex::tri_geom_ops
         }
         //================================================================================
         //get normal of triangle pointing at a test-point
-        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void tri_n(Real P1[3],Real P2[3],Real P3[3],
-                Real testp[3],Real n[3])
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void tri_n(const Real P1[3], const Real P2[3], const Real P3[3],
+                const Real testp[3], Real n[3])
         {
             Real v1[3],v2[3],magn;
             Real centr[3],c_tp_vec[3];
@@ -92,9 +92,9 @@ namespace amrex::tri_geom_ops
             CrossProd(v1,v2,n);
 
 
-            centr[0]=Real(0.333333)*(P1[0]+P2[0]+P3[0]);
-            centr[1]=Real(0.333333)*(P1[1]+P2[1]+P3[1]);
-            centr[2]=Real(0.333333)*(P1[2]+P2[2]+P3[2]);
+            centr[0]=Real(1./3.)*(P1[0]+P2[0]+P3[0]);
+            centr[1]=Real(1./3.)*(P1[1]+P2[1]+P3[1]);
+            centr[2]=Real(1./3.)*(P1[2]+P2[2]+P3[2]);
 
             getvec(centr,testp,c_tp_vec);
             magn=std::sqrt(n[0]*n[0]+n[1]*n[1]+n[2]*n[2]);
@@ -109,7 +109,7 @@ namespace amrex::tri_geom_ops
             n[2]=n[2]/magn;
         }
         //================================================================================
-        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Real triangle_area(Real P1[3],Real P2[3],Real P3[3])
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Real triangle_area(const Real P1[3], const Real P2[3], const Real P3[3])
         {
             Real v1[3],v2[3],area[3];
 
@@ -121,7 +121,7 @@ namespace amrex::tri_geom_ops
         //================================================================================
         //this is only useful when v1-v2 segment intersects the triangle
         AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE bool find_intersection_point(const Real v1[3],const Real v2[3],
-                Real t1[3], Real t2[3], Real t3[3],Real ip[3],int bisect_iters=20,Real tol=1e-6)
+                const Real t1[3], const Real t2[3], const Real t3[3], Real ip[3],int bisect_iters=20,Real tol=1e-6)
         {
             Real plane_eq_mid,plane_eq1,plane_eq2;
 
@@ -166,13 +166,13 @@ namespace amrex::tri_geom_ops
                     break;
                 }
 
-                if(plane_eq_mid*plane_eq1 < 0.0)
+                if(plane_eq_mid*plane_eq1 < Real(0.0))
                 {
                     p2[0]=midp[0];
                     p2[1]=midp[1];
                     p2[2]=midp[2];
                 }
-                else if(plane_eq_mid*plane_eq2 < 0.0)
+                else if(plane_eq_mid*plane_eq2 < Real(0.0))
                 {
                     p1[0]=midp[0];
                     p1[1]=midp[1];
@@ -182,7 +182,7 @@ namespace amrex::tri_geom_ops
                     //or error: p1,midp and p2 are on the same side
                     //which is not what this function is meant for
                 {
-                    if(plane_eq_mid*plane_eq1 > 0.0 && plane_eq_mid*plane_eq2 > 0.0)
+                    if(plane_eq_mid*plane_eq1 > Real(0.0) && plane_eq_mid*plane_eq2 > Real(0.0))
                     {
                         all_ok=false;
                     }
@@ -197,8 +197,8 @@ namespace amrex::tri_geom_ops
             return(all_ok);
         }
         //================================================================================
-        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int lineseg_tri_intersect(Real v1[3],Real v2[3],
-                Real t1[3],Real t2[3],Real t3[3])
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int lineseg_tri_intersect(const Real v1[3], const Real v2[3],
+                const Real t1[3], const Real t2[3], const Real t3[3])
         {
             //see plucker coordinates based method
             //https://members.loria.fr/SLazard/ARC-Visi3D/Pant-project/files/Line_Triangle.html
@@ -233,11 +233,11 @@ namespace amrex::tri_geom_ops
                 }
             }
             //proper and edge intersection
-            else if( (S1 < 0.0 && S2 < 0.0 && S3 < 0.0) ||
-                    (S1 > 0.0 && S2 > 0.0 && S3 > 0.0) ||
-                    (std::abs(S1) < eps && S2*S3 > 0.0) ||     //S1=0
-                    (std::abs(S2) < eps && S3*S1 > 0.0) ||     //S2=0
-                    (std::abs(S3) < eps && S1*S2 > 0.0) )      //S3=0
+            else if( (S1 < Real(0.0) && S2 < Real(0.0) && S3 < Real(0.0)) ||
+                    (S1 > Real(0.0) && S2 > Real(0.0) && S3 > Real(0.0)) ||
+                    (std::abs(S1) < eps && S2*S3 > Real(0.0)) ||     //S1=0
+                    (std::abs(S2) < eps && S3*S1 > Real(0.0)) ||     //S2=0
+                    (std::abs(S3) < eps && S1*S2 > Real(0.0)) )      //S3=0
             {
 
                 get_plucker_coords(v1,t1,L2);
@@ -253,7 +253,7 @@ namespace amrex::tri_geom_ops
                 ls_s1 = side_op(L4,L3);
                 ls_s2 = side_op(L4,L2);
 
-                if(ls_s1*ls_s2 > 0.0)
+                if(ls_s1*ls_s2 > Real(0.0))
                 {
                     no_intersections = 0;
                 }
diff --git a/Src/EB/AMReX_algoim.cpp b/Src/EB/AMReX_algoim.cpp
index 254e15dab0..864ec626a0 100644
--- a/Src/EB/AMReX_algoim.cpp
+++ b/Src/EB/AMReX_algoim.cpp
@@ -66,8 +66,16 @@ compute_integrals (MultiFab& intgmf, IntVect nghost)
 
             if (Gpu::inLaunchRegion())
             {
+#if defined(AMREX_USE_CUDA)
+                // It appears that there is an nvcc bug. We have to use the
+                // 4D ParallelFor here, even though ncomp is 1.
+                int ncomp = fg.nComp();
+                amrex::ParallelFor(bx, ncomp,
+                [=] AMREX_GPU_DEVICE (int i, int j, int k, int) noexcept
+#else
                 amrex::ParallelFor(bx,
                 [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+#endif
                 {
                     const auto ebflag = fg(i,j,k);
                     if (ebflag.isRegular()) {
diff --git a/Src/Extern/HYPRE/AMReX_Habec_2D_K.H b/Src/Extern/HYPRE/AMReX_Habec_2D_K.H
index 731ad04d4b..2d1d63432a 100644
--- a/Src/Extern/HYPRE/AMReX_Habec_2D_K.H
+++ b/Src/Extern/HYPRE/AMReX_Habec_2D_K.H
@@ -6,7 +6,7 @@
 #include <AMReX_EBMultiFabUtil.H>
 #include <AMReX_MultiCutFab.H>
 #include <AMReX_EBFabFactory.H>
-#include <AMReX_MLEBABecLap_K.H>
+#include <AMReX_MLLinOp_K.H>
 #endif
 
 namespace amrex {
diff --git a/Src/Extern/HYPRE/AMReX_Habec_3D_K.H b/Src/Extern/HYPRE/AMReX_Habec_3D_K.H
index 5d5c054758..6b4e67587d 100644
--- a/Src/Extern/HYPRE/AMReX_Habec_3D_K.H
+++ b/Src/Extern/HYPRE/AMReX_Habec_3D_K.H
@@ -6,7 +6,7 @@
 #include <AMReX_EBMultiFabUtil.H>
 #include <AMReX_MultiCutFab.H>
 #include <AMReX_EBFabFactory.H>
-#include <AMReX_MLEBABecLap_K.H>
+#include <AMReX_MLLinOp_K.H>
 #endif
 
 namespace amrex {
diff --git a/Src/FFT/AMReX_FFT.H b/Src/FFT/AMReX_FFT.H
new file mode 100644
index 0000000000..f8050fff93
--- /dev/null
+++ b/Src/FFT/AMReX_FFT.H
@@ -0,0 +1,969 @@
+#ifndef AMREX_FFT_H_
+#define AMREX_FFT_H_
+#include <AMReX_Config.H>
+
+#include <AMReX_MultiFab.H>
+#include <AMReX_FFT_Helper.H>
+#include <numeric>
+#include <tuple>
+#include <utility>
+
+#if defined(AMREX_USE_CUDA)
+#  include <cufft.h>
+#  include <cuComplex.h>
+#elif defined(AMREX_USE_HIP)
+#  if __has_include(<rocfft/rocfft.h>)  // ROCm 5.3+
+#    include <rocfft/rocfft.h>
+#  else
+#    include <rocfft.h>
+#  endif
+#  include <hip/hip_complex.h>
+#elif defined(AMREX_USE_SYCL)
+#  include <oneapi/mkl/dfti.hpp>
+#else
+#  include <fftw3.h>
+#endif
+
+namespace amrex::FFT
+{
+
+/**
+ * \brief Discrete Fourier Transform
+ *
+ * This class supports Fourier transforms between real and complex data. The
+ * name R2C indicates that the forward transform converts real data to
+ * complex data, while the backward transform converts complex data to real
+ * data. It should be noted that both directions of transformation are
+ * supported, not just from real to complex. The scaling follows the FFTW
+ * convention, where applying the forward transform followed by the backward
+ * transform scales the original data by the size of the input array.
+ *
+ * For more details, we refer the users to
+ * https://amrex-codes.github.io/amrex/docs_html/FFT_Chapter.html.
+ */
+template <typename T = Real, FFT::Direction D = FFT::Direction::both>
+class R2C
+{
+public:
+    using MF = std::conditional_t<std::is_same_v<T,Real>,
+                                  MultiFab, FabArray<BaseFab<T> > >;
+    using cMF = FabArray<BaseFab<GpuComplex<T> > >;
+
+    /**
+     * \brief Constructor
+     *
+     * \param domain the forward domain (i.e., the domain of the real data)
+     * \param info optional information
+     */
+    explicit R2C (Box const& domain, Info const& info = Info{});
+
+    ~R2C ();
+
+    R2C (R2C const&) = delete;
+    R2C (R2C &&) = delete;
+    R2C& operator= (R2C const&) = delete;
+    R2C& operator= (R2C &&) = delete;
+
+    /**
+     * \brief Forward and then backward transform
+     *
+     * This function is available only when this class template is
+     * instantiated for transforms in both directions. It's more efficient
+     * than calling the forward function that stores the spectral data in a
+     * caller provided container followed by the backward function, because
+     * this can avoid parallel communication between the internal data and
+     * the caller's data container.
+     *
+     * \param inmf         input data in MultiFab or FabArray<BaseFab<T>>
+     * \param outmf        output data in MultiFab or FabArray<BaseFab<T>>
+     * \param post_forward a callable object for processing the post-forward
+     *                     data before the backward transform. Its interface
+     *                     is `(int,int,int,GpuComplex<T>&)`, where the integers
+     *                     are indices in the spectral space, and the reference
+     *                     to the complex number allows for the modification of
+     *                     the spectral data at that location.
+     */
+    template <typename F, Direction DIR=D,
+              std::enable_if_t<DIR == Direction::both, int> = 0>
+    void forwardThenBackward (MF const& inmf, MF& outmf, F const& post_forward)
+    {
+        this->forward(inmf);
+        this->post_forward_doit(post_forward);
+        this->backward(outmf);
+    }
+
+    /**
+     * \brief Forward transform
+     *
+     * The output is stored in this object's internal data. This function is
+     * not available when this class template is instantiated for
+     * backward-only transform.
+     *
+     * \param inmf input data in MultiFab or FabArray<BaseFab<T>>
+     */
+    template <Direction DIR=D, std::enable_if_t<DIR == Direction::forward ||
+                                                DIR == Direction::both, int> = 0>
+    void forward (MF const& inmf);
+
+    /**
+     * \brief Forward transform
+     *
+     * This function is not available when this class template is
+     * instantiated for backward-only transform.
+     *
+     * \param inmf input data in MultiFab or FabArray<BaseFab<T>>
+     * \param outmf output data in FabArray<BaseFab<GpuComplex<T>>>
+     */
+    template <Direction DIR=D, std::enable_if_t<DIR == Direction::forward ||
+                                                DIR == Direction::both, int> = 0>
+    void forward (MF const& inmf, cMF& outmf);
+
+    /**
+     * \brief Backward transform
+     *
+     * This function is available only when this class template is
+     * instantiated for transforms in both directions.
+     *
+     * \param outmf output data in MultiFab or FabArray<BaseFab<T>>
+     */
+    template <Direction DIR=D, std::enable_if_t<DIR == Direction::both, int> = 0>
+    void backward (MF& outmf);
+
+    /**
+     * \brief Backward transform
+     *
+     * This function is not available when this class template is
+     * instantiated for forward-only transform.
+     *
+     * \param inmf input data in FabArray<BaseFab<GpuComplex<T>>>
+     * \param outmf output data in MultiFab or FabArray<BaseFab<T>>
+     */
+    template <Direction DIR=D, std::enable_if_t<DIR == Direction::backward ||
+                                                DIR == Direction::both, int> = 0>
+    void backward (cMF const& inmf, MF& outmf);
+
+    /**
+     * \brief Get the internal spectral data
+     *
+     * This function is not available when this class template is
+     * instantiated for backward-only transform. For performance reasons,
+     * the returned data array does not have the usual ordering of
+     * `(x,y,z)`. The order is specified in the second part of the return
+     * value.
+     */
+    template <Direction DIR=D, std::enable_if_t<DIR == Direction::forward ||
+                                                DIR == Direction::both, int> = 0>
+    std::pair<cMF*,IntVect> getSpectralData ();
+
+    struct Swap01
+    {
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept
+        {
+            return {i.y, i.x, i.z};
+        }
+
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept
+        {
+            return {i.y, i.x, i.z};
+        }
+
+        [[nodiscard]] IndexType operator() (IndexType it) const noexcept
+        {
+            return it;
+        }
+
+        [[nodiscard]] IndexType Inverse (IndexType it) const noexcept
+        {
+            return it;
+        }
+    };
+
+    struct Swap02
+    {
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept
+        {
+            return {i.z, i.y, i.x};
+        }
+
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept
+        {
+            return {i.z, i.y, i.x};
+        }
+
+        [[nodiscard]] IndexType operator() (IndexType it) const noexcept
+        {
+            return it;
+        }
+
+        [[nodiscard]] IndexType Inverse (IndexType it) const noexcept
+        {
+            return it;
+        }
+    };
+
+    struct RotateFwd
+    {
+        // dest -> src: (x,y,z) -> (y,z,x)
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept
+        {
+            return {i.y, i.z, i.x};
+        }
+
+        // src -> dest: (x,y,z) -> (z,x,y)
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept
+        {
+            return {i.z, i.x, i.y};
+        }
+
+        [[nodiscard]] IndexType operator() (IndexType it) const noexcept
+        {
+            return it;
+        }
+
+        [[nodiscard]] IndexType Inverse (IndexType it) const noexcept
+        {
+            return it;
+        }
+    };
+
+    struct RotateBwd
+    {
+        // dest -> src: (x,y,z) -> (z,x,y)
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept
+        {
+            return {i.z, i.x, i.y};
+        }
+
+        // src -> dest: (x,y,z) -> (y,z,x)
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept
+        {
+            return {i.y, i.z, i.x};
+        }
+
+        [[nodiscard]] IndexType operator() (IndexType it) const noexcept
+        {
+            return it;
+        }
+
+        [[nodiscard]] IndexType Inverse (IndexType it) const noexcept
+        {
+            return it;
+        }
+    };
+
+    // public for cuda
+    template <typename F>
+    void post_forward_doit (F const& post_forward);
+
+private:
+
+#if defined(AMREX_USE_CUDA)
+    using VendorPlan = cufftHandle;
+    using VendorPlan2 = VendorPlan;
+    using FFTComplex = std::conditional_t<std::is_same_v<float,T>,
+                                          cuComplex, cuDoubleComplex>;
+#elif defined(AMREX_USE_HIP)
+    using VendorPlan = rocfft_plan;
+    using VendorPlan2 = VendorPlan;
+    using FFTComplex = std::conditional_t<std::is_same_v<float,T>,
+                                          float2, double2>;
+#elif defined(AMREX_USE_SYCL)
+    using VendorPlan = oneapi::mkl::dft::descriptor<
+        std::is_same_v<float,T> ? oneapi::mkl::dft::precision::SINGLE
+                                : oneapi::mkl::dft::precision::DOUBLE,
+        oneapi::mkl::dft::domain::REAL> *;
+    using VendorPlan2 = oneapi::mkl::dft::descriptor<
+        std::is_same_v<float,T> ? oneapi::mkl::dft::precision::SINGLE
+                                : oneapi::mkl::dft::precision::DOUBLE,
+        oneapi::mkl::dft::domain::COMPLEX> *;
+    using FFTComplex = GpuComplex<T>;
+#else
+    using VendorPlan = std::conditional_t<std::is_same_v<float,T>,
+                                          fftwf_plan, fftw_plan>;
+    using VendorPlan2 = VendorPlan;
+    using FFTComplex = std::conditional_t<std::is_same_v<float,T>,
+                                          fftwf_complex, fftw_complex>;
+#endif
+
+    struct Plan {
+        bool defined = false;
+        VendorPlan plan = 0; // NOLINT
+    };
+
+    struct Plan2 {
+        bool defined = false;
+        VendorPlan2 plan = 0; // NOLINT
+    };
+
+    template <typename FA>
+    static typename FA::FABType::value_type *
+    get_fab (FA& fa) {
+        auto myproc = ParallelContext::MyProcSub();
+        if (myproc < fa.size()) {
+            return fa.fabPtr(myproc);
+        } else {
+            return nullptr;
+        }
+    }
+
+    static void exec_r2c (Plan plan, MF& in, cMF& out);
+    static void exec_c2r (Plan plan, cMF& in, MF& out);
+    template <Direction direction>
+    static void exec_c2c (Plan2 plan, cMF& inout);
+
+    template <typename P>
+    static void destroy_plan (P plan);
+    static std::pair<Plan2,Plan2> make_c2c_plans (cMF& inout);
+
+    void backward_doit (MF& outmf);
+
+    Plan m_fft_fwd_x{};
+    Plan m_fft_bwd_x{};
+    Plan2 m_fft_fwd_y{};
+    Plan2 m_fft_bwd_y{};
+    Plan2 m_fft_fwd_z{};
+    Plan2 m_fft_bwd_z{};
+
+    // Comm meta-data. In the forward phase, we start with (x,y,z),
+    // transpose to (y,x,z) and then (z,x,y). In the backward phase, we
+    // perform inverse transpose.
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_x2y; // (x,y,z) -> (y,x,z)
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_y2x; // (y,x,z) -> (x,y,z)
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_y2z; // (y,x,z) -> (z,x,y)
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_z2y; // (z,x,y) -> (y,x,z)
+    Swap01 m_dtos_x2y{};
+    Swap01 m_dtos_y2x{};
+    Swap02 m_dtos_y2z{};
+    Swap02 m_dtos_z2y{};
+
+    MF  m_rx;
+    cMF m_cx;
+    cMF m_cy;
+    cMF m_cz;
+
+    Box m_real_domain;
+    Box m_spectral_domain_x;
+    Box m_spectral_domain_y;
+    Box m_spectral_domain_z;
+
+    Info m_info;
+};
+
+template <typename T, Direction D>
+R2C<T,D>::R2C (Box const& domain, Info const& info) // builds the per-rank 1D FFT plans and the transpose meta-data
+    : m_real_domain(domain),
+      m_spectral_domain_x(IntVect(0), IntVect(AMREX_D_DECL(domain.length(0)/2, // x keeps indices 0..n_x/2
+                                                           domain.bigEnd(1),
+                                                           domain.bigEnd(2)))),
+#if (AMREX_SPACEDIM >= 2)
+      m_spectral_domain_y(IntVect(0), IntVect(AMREX_D_DECL(domain.bigEnd(1), // same box with x and y swapped
+                                                           domain.length(0)/2,
+                                                           domain.bigEnd(2)))),
+#if (AMREX_SPACEDIM == 3)
+      m_spectral_domain_z(IntVect(0), IntVect(AMREX_D_DECL(domain.bigEnd(2), // index order is (z,x,y)
+                                                           domain.length(0)/2,
+                                                           domain.bigEnd(1)))),
+#endif
+#endif
+      m_info(info)
+{
+    static_assert(std::is_same_v<float,T> || std::is_same_v<double,T>); // only float and double are supported
+    AMREX_ALWAYS_ASSERT(m_real_domain.smallEnd() == 0 && // domain must start at index 0,
+                        m_real_domain.length(0) > 1 && // have more than one cell in x,
+                        m_real_domain.cellCentered()); // and be cell-centered
+#if (AMREX_SPACEDIM == 3)
+    AMREX_ALWAYS_ASSERT(m_real_domain.length(2) > 1 || ! m_info.batch_mode); // batch mode needs more than one cell in z
+    AMREX_ALWAYS_ASSERT(m_real_domain.length(1) > 1 || m_real_domain.length(2) == 1); // one cell in y requires one cell in z
+#else
+    AMREX_ALWAYS_ASSERT(! m_info.batch_mode); // batch mode is rejected outside 3D
+#endif
+
+    int myproc = ParallelContext::MyProcSub();
+    int nprocs = ParallelContext::NProcsSub();
+
+    auto bax = amrex::decompose(m_real_domain, nprocs, {AMREX_D_DECL(false,true,true)});
+    DistributionMapping dmx = detail::make_iota_distromap(bax.size()); // box i is assigned to local rank i
+    m_rx.define(bax, dmx, 1, 0); // real-space input of the x transform
+
+    {
+        BoxList bl = bax.boxList();
+        for (auto & b : bl) {
+            b.setBig(0, m_spectral_domain_x.bigEnd(0)); // shrink the x extent to the spectral size
+        }
+        BoxArray cbax(std::move(bl));
+        m_cx.define(cbax, dmx, 1, 0); // complex output of the x transform, same distribution as m_rx
+    }
+
+    // plans for x-direction (created only on ranks that own a box)
+    if (myproc < m_rx.size())
+    {
+        Box const local_box = m_rx.boxArray()[myproc];
+        int n = local_box.length(0); // length of each 1D transform
+        int howmany = AMREX_D_TERM(1, *local_box.length(1), *local_box.length(2)); // batch count
+
+#if defined(AMREX_USE_CUDA)
+        if constexpr (D == Direction::both || D == Direction::forward) {
+            cufftType fwd_type = std::is_same_v<float,T> ? CUFFT_R2C : CUFFT_D2Z;
+            AMREX_CUFFT_SAFE_CALL
+                (cufftPlanMany(&m_fft_fwd_x.plan, 1, &n,
+                               nullptr, 1, m_real_domain.length(0),
+                               nullptr, 1, m_spectral_domain_x.length(0),
+                               fwd_type, howmany));
+            AMREX_CUFFT_SAFE_CALL(cufftSetStream(m_fft_fwd_x.plan, Gpu::gpuStream()));
+        }
+        if constexpr (D == Direction::both || D == Direction::backward) {
+            cufftType bwd_type = std::is_same_v<float,T> ? CUFFT_C2R : CUFFT_Z2D;
+            AMREX_CUFFT_SAFE_CALL
+                (cufftPlanMany(&m_fft_bwd_x.plan, 1, &n,
+                               nullptr, 1, m_spectral_domain_x.length(0),
+                               nullptr, 1, m_real_domain.length(0),
+                               bwd_type, howmany));
+            AMREX_CUFFT_SAFE_CALL(cufftSetStream(m_fft_bwd_x.plan, Gpu::gpuStream()));
+        }
+#elif defined(AMREX_USE_HIP)
+
+        auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
+        const std::size_t length = n;
+        if constexpr (D == Direction::both || D == Direction::forward) {
+            AMREX_ROCFFT_SAFE_CALL
+                (rocfft_plan_create(&m_fft_fwd_x.plan, rocfft_placement_notinplace,
+                                    rocfft_transform_type_real_forward, prec, 1,
+                                    &length, howmany, nullptr));
+        }
+        if constexpr (D == Direction::both || D == Direction::backward) {
+            AMREX_ROCFFT_SAFE_CALL
+                (rocfft_plan_create(&m_fft_bwd_x.plan, rocfft_placement_notinplace,
+                                    rocfft_transform_type_real_inverse, prec, 1,
+                                    &length, howmany, nullptr));
+        }
+
+#elif defined(AMREX_USE_SYCL)
+
+        m_fft_fwd_x.plan = new std::remove_pointer_t<VendorPlan>(n); // created regardless of D
+        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                                    DFTI_NOT_INPLACE);
+        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
+                                    howmany);
+        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE,
+                                    m_real_domain.length(0));
+        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE,
+                                    m_spectral_domain_x.length(0));
+        std::array<std::int64_t,2> strides{0,1};
+        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES,
+                                    strides.data());
+        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES,
+                                    strides.data());
+        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
+                                    oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
+        m_fft_fwd_x.plan->commit(amrex::Gpu::Device::streamQueue());
+
+        m_fft_bwd_x.plan = m_fft_fwd_x.plan; // the backward plan aliases the forward descriptor
+
+#else /* FFTW */
+
+        auto* in = m_rx[myproc].dataPtr(); // FFTW plans are bound to these buffers at creation
+        auto* out = (FFTComplex*)(m_cx[myproc].dataPtr());
+
+        if constexpr (std::is_same_v<float,T>) {
+            if constexpr (D == Direction::both || D == Direction::forward) {
+                m_fft_fwd_x.plan = fftwf_plan_many_dft_r2c
+                    (1, &n, howmany, in, nullptr, 1, m_real_domain.length(0),
+                     out, nullptr, 1, m_spectral_domain_x.length(0),
+                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
+            }
+            if constexpr (D == Direction::both || D == Direction::backward) {
+                m_fft_bwd_x.plan = fftwf_plan_many_dft_c2r
+                    (1, &n, howmany, out, nullptr, 1, m_spectral_domain_x.length(0),
+                     in, nullptr, 1, m_real_domain.length(0),
+                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
+            }
+        } else {
+            if constexpr (D == Direction::both || D == Direction::forward) {
+                m_fft_fwd_x.plan = fftw_plan_many_dft_r2c
+                    (1, &n, howmany, in, nullptr, 1, m_real_domain.length(0),
+                     out, nullptr, 1, m_spectral_domain_x.length(0),
+                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
+            }
+            if constexpr (D == Direction::both || D == Direction::backward) {
+                m_fft_bwd_x.plan = fftw_plan_many_dft_c2r
+                    (1, &n, howmany, out, nullptr, 1, m_spectral_domain_x.length(0),
+                     in, nullptr, 1, m_real_domain.length(0),
+                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
+            }
+        }
+#endif
+        if constexpr (D == Direction::both || D == Direction::forward) {
+            m_fft_fwd_x.defined = true; // only directions enabled by D are marked as defined
+        }
+        if constexpr (D == Direction::both || D == Direction::backward) {
+            m_fft_bwd_x.defined = true;
+        }
+    }
+
+#if (AMREX_SPACEDIM >= 2)
+    DistributionMapping cdmy;
+    if (m_real_domain.length(1) > 1) { // the y stage exists only with more than one cell in y
+        auto cbay = amrex::decompose(m_spectral_domain_y, nprocs, {AMREX_D_DECL(false,true,true)});
+        if (cbay.size() == dmx.size()) {
+            cdmy = dmx; // reuse the x-stage mapping when the box counts match
+        } else {
+            cdmy = detail::make_iota_distromap(cbay.size());
+        }
+        m_cy.define(cbay, cdmy, 1, 0);
+
+        std::tie(m_fft_fwd_y, m_fft_bwd_y) = make_c2c_plans(m_cy);
+
+        // comm meta-data between x and y phases
+        m_cmd_x2y = std::make_unique<MultiBlockCommMetaData>
+            (m_cy, m_spectral_domain_y, m_cx, IntVect(0), m_dtos_x2y);
+        m_cmd_y2x = std::make_unique<MultiBlockCommMetaData>
+            (m_cx, m_spectral_domain_x, m_cy, IntVect(0), m_dtos_y2x);
+    }
+
+#if (AMREX_SPACEDIM == 3)
+    if (m_real_domain.length(1) > 1 && // the z stage needs a y stage and
+        (! m_info.batch_mode && m_real_domain.length(2) > 1)) // is skipped in batch mode or with a single cell in z
+    {
+        auto cbaz = amrex::decompose(m_spectral_domain_z, nprocs, {false,true,true});
+        DistributionMapping cdmz;
+        if (cbaz.size() == dmx.size()) {
+            cdmz = dmx;
+        } else if (cbaz.size() == cdmy.size()) {
+            cdmz = cdmy;
+        } else {
+            cdmz = detail::make_iota_distromap(cbaz.size());
+        }
+         m_cz.define(cbaz, cdmz, 1, 0);
+
+        std::tie(m_fft_fwd_z, m_fft_bwd_z) = make_c2c_plans(m_cz);
+
+        // comm meta-data between y and z phases
+        m_cmd_y2z = std::make_unique<MultiBlockCommMetaData>
+            (m_cz, m_spectral_domain_z, m_cy, IntVect(0), m_dtos_y2z);
+        m_cmd_z2y = std::make_unique<MultiBlockCommMetaData>
+            (m_cy, m_spectral_domain_y, m_cz, IntVect(0), m_dtos_z2y);
+    }
+#endif
+#endif
+}
+
+template <typename T, Direction D>
+template <typename P>
+void R2C<T,D>::destroy_plan (P plan) // releases the backend plan held by `plan`, if one was created
+{
+    if (! plan.defined) { return; } // nothing was created for this plan
+
+#if defined(AMREX_USE_CUDA)
+    AMREX_CUFFT_SAFE_CALL(cufftDestroy(plan.plan));
+#elif defined(AMREX_USE_HIP)
+    AMREX_ROCFFT_SAFE_CALL(rocfft_plan_destroy(plan.plan));
+#elif defined(AMREX_USE_SYCL)
+    delete plan.plan;
+#else
+    if constexpr (std::is_same_v<float,T>) { // FFTW: single- and double-precision APIs differ
+        fftwf_destroy_plan(plan.plan);
+    } else {
+        fftw_destroy_plan(plan.plan);
+    }
+#endif
+
+    plan.defined = false; // NOTE(review): `plan` is a by-value copy, so the caller's flag is not reset
+}
+
+template <typename T, Direction D>
+R2C<T,D>::~R2C () // plain destructor name: a template-id here is not allowed in C++20
+{
+#if defined(AMREX_USE_SYCL) // fwd and bwd plans alias one descriptor here, so free each pair only once
+    if constexpr (D == Direction::both || D == Direction::forward) {
+        destroy_plan(m_fft_fwd_x); // the forward plans are the ones marked defined for these D
+        destroy_plan(m_fft_fwd_y);
+        destroy_plan(m_fft_fwd_z);
+    } else {
+        destroy_plan(m_fft_bwd_x); // backward-only: only the backward plans are marked defined
+        destroy_plan(m_fft_bwd_y);
+        destroy_plan(m_fft_bwd_z);
+    }
+#else
+    destroy_plan(m_fft_fwd_x); // destroy_plan is a no-op for plans that were never defined
+    destroy_plan(m_fft_fwd_y);
+    destroy_plan(m_fft_fwd_z);
+    destroy_plan(m_fft_bwd_x);
+    destroy_plan(m_fft_bwd_y);
+    destroy_plan(m_fft_bwd_z);
+#endif
+}
+
+#ifdef AMREX_USE_HIP
+namespace detail { void hip_execute (rocfft_plan plan, void **in, void **out); }
+#endif
+
+#ifdef AMREX_USE_SYCL
+namespace detail
+{
+template <typename T, Direction direction, typename P, typename TI, typename TO>
+void sycl_execute (P plan, TI* in, TO* out)
+{
+    // Attach a temporary workspace to the plan, run the transform, wait, then free it.
+    std::size_t nbytes = 0;
+    plan->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES, &nbytes);
+    auto* workspace = (T*)amrex::The_Arena()->alloc(nbytes);
+    plan->set_workspace(workspace);
+    sycl::event done;
+    if (!std::is_same_v<TI,TO>) { // different element types: separate input and output
+        if constexpr (direction != Direction::forward) {
+            done = oneapi::mkl::dft::compute_backward(*plan, in, out);
+        } else {
+            done = oneapi::mkl::dft::compute_forward(*plan, in, out);
+        }
+    } else { // same element type: only `out` is passed to the transform
+        amrex::ignore_unused(in);
+        if constexpr (direction != Direction::forward) {
+            done = oneapi::mkl::dft::compute_backward(*plan, out);
+        } else {
+            done = oneapi::mkl::dft::compute_forward(*plan, out);
+        }
+    }
+    done.wait();
+    amrex::The_Arena()->free(workspace);
+}
+}
+#endif
+
+template <typename T, Direction D>
+void R2C<T,D>::exec_r2c (Plan plan, MF& in, cMF& out) // runs the real-to-complex plan on this rank's box
+{
+    if (! plan.defined) { return; } // no plan was created on this rank
+
+#if defined(AMREX_USE_GPU)
+    auto* pin = in[ParallelContext::MyProcSub()].dataPtr(); // GPU backends take the data pointers at execution time
+    auto* pout = out[ParallelContext::MyProcSub()].dataPtr();
+#else
+    amrex::ignore_unused(in,out); // the FFTW branch below executes the plan without passing pointers
+#endif
+
+#if defined(AMREX_USE_CUDA)
+    if constexpr (std::is_same_v<float,T>) {
+        AMREX_CUFFT_SAFE_CALL(cufftExecR2C(plan.plan, pin, (FFTComplex*)pout));
+    } else {
+        AMREX_CUFFT_SAFE_CALL(cufftExecD2Z(plan.plan, pin, (FFTComplex*)pout));
+    }
+#elif defined(AMREX_USE_HIP)
+    detail::hip_execute(plan.plan, (void**)&pin, (void**)&pout);
+#elif defined(AMREX_USE_SYCL)
+    detail::sycl_execute<T,Direction::forward>(plan.plan, pin, (std::complex<T>*)pout);
+#else
+    if constexpr (std::is_same_v<float,T>) {
+        fftwf_execute(plan.plan);
+    } else {
+        fftw_execute(plan.plan);
+    }
+#endif
+}
+
+template <typename T, Direction D>
+void R2C<T,D>::exec_c2r (Plan plan, cMF& in, MF& out) // runs the complex-to-real plan on this rank's box
+{
+    if (! plan.defined) { return; } // no plan was created on this rank
+
+#if defined(AMREX_USE_GPU)
+    auto* pin = in[ParallelContext::MyProcSub()].dataPtr(); // GPU backends take the data pointers at execution time
+    auto* pout = out[ParallelContext::MyProcSub()].dataPtr();
+#else
+    amrex::ignore_unused(in,out); // the FFTW branch below executes the plan without passing pointers
+#endif
+
+#if defined(AMREX_USE_CUDA)
+    if constexpr (std::is_same_v<float,T>) {
+        AMREX_CUFFT_SAFE_CALL(cufftExecC2R(plan.plan, (FFTComplex*)pin, pout));
+    } else {
+        AMREX_CUFFT_SAFE_CALL(cufftExecZ2D(plan.plan, (FFTComplex*)pin, pout));
+    }
+#elif defined(AMREX_USE_HIP)
+    detail::hip_execute(plan.plan, (void**)&pin, (void**)&pout);
+#elif defined(AMREX_USE_SYCL)
+    detail::sycl_execute<T,Direction::backward>(plan.plan, (std::complex<T>*)pin, pout);
+#else
+    if constexpr (std::is_same_v<float,T>) {
+        fftwf_execute(plan.plan);
+    } else {
+        fftw_execute(plan.plan);
+    }
+#endif
+}
+
+template <typename T, Direction D>
+template <Direction direction>
+void R2C<T,D>::exec_c2c (Plan2 plan, cMF& inout) // runs a complex-to-complex plan in place on this rank's box
+{
+    if (! plan.defined) { return; } // no plan was created on this rank
+
+    amrex::ignore_unused(inout); // not referenced in the FFTW branch
+#if defined(AMREX_USE_GPU)
+    auto* p = inout[ParallelContext::MyProcSub()].dataPtr(); // same pointer is used as input and output below
+#endif
+
+#if defined(AMREX_USE_CUDA)
+    auto cufft_direction = (direction == Direction::forward) ? CUFFT_FORWARD : CUFFT_INVERSE;
+    if constexpr (std::is_same_v<float,T>) {
+        AMREX_CUFFT_SAFE_CALL(cufftExecC2C(plan.plan, (FFTComplex*)p, (FFTComplex*)p,
+                                           cufft_direction));
+    } else {
+        AMREX_CUFFT_SAFE_CALL(cufftExecZ2Z(plan.plan, (FFTComplex*)p, (FFTComplex*)p,
+                                           cufft_direction));
+    }
+#elif defined(AMREX_USE_HIP)
+    detail::hip_execute(plan.plan, (void**)&p, (void**)&p);
+#elif defined(AMREX_USE_SYCL)
+    detail::sycl_execute<T,direction>(plan.plan, (std::complex<T>*)p, (std::complex<T>*)p);
+#else
+    if constexpr (std::is_same_v<float,T>) {
+        fftwf_execute(plan.plan);
+    } else {
+        fftw_execute(plan.plan);
+    }
+#endif
+}
+
+template <typename T, Direction D>
+template <Direction DIR, std::enable_if_t<DIR == Direction::forward ||
+                                          DIR == Direction::both, int> >
+void R2C<T,D>::forward (MF const& inmf)
+{
+    m_rx.ParallelCopy(inmf, 0, 0, 1); // bring the input onto the internal layout
+    exec_r2c(m_fft_fwd_x, m_rx, m_cx); // x stage: real -> complex
+    // y stage: transpose (x,y,z) -> (y,x,z) when that stage exists, then transform
+    if (m_cmd_x2y != nullptr) {
+        ParallelCopy(m_cy, m_cx, *m_cmd_x2y, 0, 0, 1, m_dtos_x2y);
+    }
+    exec_c2c<Direction::forward>(m_fft_fwd_y, m_cy);
+    // z stage: transpose (y,x,z) -> (z,x,y) when that stage exists, then transform
+    if (m_cmd_y2z != nullptr) {
+        ParallelCopy(m_cz, m_cy, *m_cmd_y2z, 0, 0, 1, m_dtos_y2z);
+    }
+    exec_c2c<Direction::forward>(m_fft_fwd_z, m_cz);
+}
+
+template <typename T, Direction D>
+template <Direction DIR, std::enable_if_t<DIR == Direction::both, int> >
+void R2C<T,D>::backward (MF& outmf)
+{
+    this->backward_doit(outmf); // inverse-transform the internal spectral data into outmf
+}
+
+template <typename T, Direction D>
+void R2C<T,D>::backward_doit (MF& outmf)
+{
+    exec_c2c<Direction::backward>(m_fft_bwd_z, m_cz); // undo the z stage
+    if (m_cmd_z2y != nullptr) {
+        ParallelCopy(m_cy, m_cz, *m_cmd_z2y, 0, 0, 1, m_dtos_z2y);
+    }
+    // undo the y stage, then transpose (y,x,z) -> (x,y,z) when that stage exists
+    exec_c2c<Direction::backward>(m_fft_bwd_y, m_cy);
+    if (m_cmd_y2x != nullptr) {
+        ParallelCopy(m_cx, m_cy, *m_cmd_y2x, 0, 0, 1, m_dtos_y2x);
+    }
+    // x stage: complex -> real, then copy the result to the caller's layout
+    exec_c2r(m_fft_bwd_x, m_cx, m_rx);
+    outmf.ParallelCopy(m_rx, 0, 0, 1);
+}
+
+template <typename T, Direction D>
+std::pair<typename R2C<T,D>::Plan2, typename R2C<T,D>::Plan2>
+R2C<T,D>::make_c2c_plans (cMF& inout) // returns {forward, backward} complex-to-complex plans for this rank's box
+{
+    Plan2 fwd;
+    Plan2 bwd;
+
+    auto* fab = get_fab(inout);
+    if (!fab) { return {fwd, bwd};} // this rank owns no box: both plans are returned untouched
+
+    Box const& local_box = fab->box();
+
+    int n = local_box.length(0); // transform length: extent along the first index
+    int howmany = AMREX_D_TERM(1, *local_box.length(1), *local_box.length(2)); // batch count
+
+#if defined(AMREX_USE_CUDA)
+
+    if constexpr (D == Direction::both || D == Direction::forward) {
+        cufftType fwd_type = std::is_same_v<float,T> ? CUFFT_C2C : CUFFT_Z2Z;
+        AMREX_CUFFT_SAFE_CALL
+            (cufftPlanMany(&fwd.plan, 1, &n, nullptr, 1, n, nullptr, 1, n,
+                           fwd_type, howmany));
+        AMREX_CUFFT_SAFE_CALL(cufftSetStream(fwd.plan, Gpu::gpuStream()));
+    }
+    if constexpr (D == Direction::both || D == Direction::backward) {
+        cufftType bwd_type = std::is_same_v<float,T> ? CUFFT_C2C : CUFFT_Z2Z; // same type as forward; direction is chosen at execution
+        AMREX_CUFFT_SAFE_CALL
+            (cufftPlanMany(&bwd.plan, 1, &n, nullptr, 1, n, nullptr, 1, n,
+                           bwd_type, howmany));
+        AMREX_CUFFT_SAFE_CALL(cufftSetStream(bwd.plan, Gpu::gpuStream()));
+    }
+
+#elif defined(AMREX_USE_HIP)
+
+    auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
+    const std::size_t length = n;
+    if constexpr (D == Direction::both || D == Direction::forward) {
+        AMREX_ROCFFT_SAFE_CALL
+            (rocfft_plan_create(&fwd.plan, rocfft_placement_inplace,
+                                rocfft_transform_type_complex_forward, prec, 1,
+                                &length, howmany, nullptr));
+    }
+    if constexpr (D == Direction::both || D == Direction::backward) {
+        AMREX_ROCFFT_SAFE_CALL
+            (rocfft_plan_create(&bwd.plan, rocfft_placement_inplace,
+                                rocfft_transform_type_complex_inverse, prec, 1,
+                                &length, howmany, nullptr));
+    }
+
+#elif defined(AMREX_USE_SYCL)
+
+    fwd.plan = new std::remove_pointer_t<VendorPlan2>(n); // created regardless of D
+    fwd.plan->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                        DFTI_INPLACE);
+    fwd.plan->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
+                        howmany);
+    fwd.plan->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, n);
+    fwd.plan->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, n);
+    std::array<std::int64_t,2> strides{0,1};
+    fwd.plan->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
+    fwd.plan->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides.data());
+    fwd.plan->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
+                        oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
+    fwd.plan->commit(amrex::Gpu::Device::streamQueue());
+
+    bwd.plan = fwd.plan; // the backward plan aliases the forward descriptor
+
+#else
+    auto* pinout = (FFTComplex*)fab->dataPtr(); // in place: the same pointer is passed as input and output
+
+    if constexpr (std::is_same_v<float,T>) {
+        if constexpr (D == Direction::both || D == Direction::forward) {
+            fwd.plan = fftwf_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n,
+                                           pinout, nullptr, 1, n, -1, FFTW_ESTIMATE); // sign -1 for the forward plan
+        }
+        if constexpr (D == Direction::both || D == Direction::backward) {
+            bwd.plan = fftwf_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n,
+                                           pinout, nullptr, 1, n, +1, FFTW_ESTIMATE); // sign +1 for the backward plan
+        }
+    } else {
+        if constexpr (D == Direction::both || D == Direction::forward) {
+            fwd.plan = fftw_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n,
+                                          pinout, nullptr, 1, n, -1, FFTW_ESTIMATE);
+        }
+        if constexpr (D == Direction::both || D == Direction::backward) {
+            bwd.plan = fftw_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n,
+                                          pinout, nullptr, 1, n, +1, FFTW_ESTIMATE);
+        }
+    }
+#endif
+
+    if constexpr (D == Direction::both || D == Direction::forward) {
+        fwd.defined = true; // only directions enabled by D are marked as defined
+    }
+    if constexpr (D == Direction::both || D == Direction::backward) {
+        bwd.defined = true;
+    }
+
+    return {fwd,bwd};
+}
+
+template <typename T, Direction D>
+template <typename F>
+void R2C<T,D>::post_forward_doit (F const& post_forward) // calls post_forward(x,y,z,value) on every local spectral cell
+{
+    if (m_info.batch_mode) {
+        amrex::Abort("xxxxx todo: post_forward"); // batch mode is not implemented here
+    } else {
+        if (                           ! m_cz.empty()) { // z stage exists: data lives in m_cz
+            auto* spectral_fab = get_fab(m_cz);
+            if (spectral_fab) { // null when this rank owns no box
+                auto const& a = spectral_fab->array(); // m_cz's ordering is z,x,y
+                ParallelFor(spectral_fab->box(),
+                [=] AMREX_GPU_DEVICE (int iz, int jx, int ky)
+                {
+                    post_forward(jx,ky,iz,a(iz,jx,ky)); // indices are passed back in (x,y,z) order
+                });
+            }
+        } else if (                    ! m_cy.empty()) { // only x and y stages: data lives in m_cy
+            auto* spectral_fab = get_fab(m_cy);
+            if (spectral_fab) {
+                auto const& a = spectral_fab->array(); // m_cy's ordering is y,x,z
+                ParallelFor(spectral_fab->box(),
+                [=] AMREX_GPU_DEVICE (int iy, int jx, int k)
+                {
+                    post_forward(jx,iy,k,a(iy,jx,k)); // indices are passed back in (x,y,z) order
+                });
+            }
+        } else { // only the x stage: m_cx is already in (x,y,z) order
+            auto* spectral_fab = get_fab(m_cx);
+            if (spectral_fab) {
+                auto const& a = spectral_fab->array();
+                ParallelFor(spectral_fab->box(),
+                [=] AMREX_GPU_DEVICE (int i, int j, int k)
+                {
+                    post_forward(i,j,k,a(i,j,k));
+                });
+            }
+        }
+    }
+}
+
+template <typename T, Direction D>
+template <Direction DIR, std::enable_if_t<DIR == Direction::forward ||
+                                          DIR == Direction::both, int> >
+std::pair<typename R2C<T,D>::cMF *, IntVect>
+R2C<T,D>::getSpectralData ()
+{
+    // Return a pointer to the last-stage spectral array that was defined,
+    // paired with an IntVect giving, for each index of that array, the
+    // original dimension it holds (m_cz is ordered z,x,y; m_cy is y,x,z).
+    if (!m_cz.empty()) { return std::make_pair(&m_cz, IntVect{AMREX_D_DECL(2,0,1)}); }
+    if (!m_cy.empty()) { return std::make_pair(&m_cy, IntVect{AMREX_D_DECL(1,0,2)}); }
+    // Neither a y nor a z stage exists: the data is in m_cx in natural order.
+    return std::make_pair(&m_cx, IntVect{AMREX_D_DECL(0,1,2)});
+}
+
+template <typename T, Direction D>
+template <Direction DIR, std::enable_if_t<DIR == Direction::forward ||
+                                          DIR == Direction::both, int> >
+void R2C<T,D>::forward (MF const& inmf, cMF& outmf)
+{
+    forward(inmf); // the result is left in the last-stage internal array
+    if (m_cz.empty() && m_cy.empty()) { // only the x stage ran: plain copy
+        outmf.ParallelCopy(m_cx, 0, 0, 1);
+    } else if (m_cz.empty()) { // m_cy's order (y,x,z) -> (x,y,z)
+        MultiBlockCommMetaData meta
+            (outmf, m_spectral_domain_x, m_cy, IntVect(0), m_dtos_y2x);
+        ParallelCopy(outmf, m_cy, meta, 0, 0, 1, m_dtos_y2x);
+    } else { // m_cz's order (z,x,y) -> (x,y,z)
+        RotateBwd rotate{};
+        MultiBlockCommMetaData meta
+            (outmf, m_spectral_domain_x, m_cz, IntVect(0), rotate);
+        ParallelCopy(outmf, m_cz, meta, 0, 0, 1, rotate);
+    }
+}
+
+template <typename T, Direction D>
+template <Direction DIR, std::enable_if_t<DIR == Direction::backward ||
+                                          DIR == Direction::both, int> >
+void R2C<T,D>::backward (cMF const& inmf, MF& outmf)
+{
+    if (m_cz.empty() && m_cy.empty()) { // only the x stage exists: plain copy
+        m_cx.ParallelCopy(inmf, 0, 0, 1);
+    } else if (m_cz.empty()) { // (x,y,z) -> m_cy's ordering (y,x,z)
+        MultiBlockCommMetaData meta
+            (m_cy, m_spectral_domain_y, inmf, IntVect(0), m_dtos_x2y);
+        ParallelCopy(m_cy, inmf, meta, 0, 0, 1, m_dtos_x2y);
+    } else { // (x,y,z) -> m_cz's order (z,x,y)
+        RotateFwd rotate{};
+        MultiBlockCommMetaData meta
+            (m_cz, m_spectral_domain_z, inmf, IntVect(0), rotate);
+        ParallelCopy(m_cz, inmf, meta, 0, 0, 1, rotate);
+    }
+    backward_doit(outmf); // inverse-transform the internal data into outmf
+}
+
+}
+
+#endif
diff --git a/Src/FFT/AMReX_FFT.cpp b/Src/FFT/AMReX_FFT.cpp
new file mode 100644
index 0000000000..68984a8f24
--- /dev/null
+++ b/Src/FFT/AMReX_FFT.cpp
@@ -0,0 +1,40 @@
+#include <AMReX_FFT.H>
+#include <algorithm>
+
+namespace amrex::FFT::detail
+{
+
+DistributionMapping make_iota_distromap (Long n)
+{
+    // Entry i of the map is local rank i converted to its global rank.
+    AMREX_ASSERT(n <= ParallelContext::NProcsSub());
+    Vector<int> ranks(n);
+    int local = 0;
+    for (auto& r : ranks) { r = ParallelContext::local_to_global_rank(local++); }
+    return DistributionMapping(std::move(ranks));
+}
+
+#ifdef AMREX_USE_HIP
+void hip_execute (rocfft_plan plan, void **in, void **out)
+{
+    rocfft_execution_info info = nullptr;
+    AMREX_ROCFFT_SAFE_CALL(rocfft_execution_info_create(&info));
+    // Give the execution a work buffer of the size the plan asks for.
+    std::size_t nbytes = 0;
+    AMREX_ROCFFT_SAFE_CALL(rocfft_plan_get_work_buffer_size(plan, &nbytes));
+    void* workspace = amrex::The_Arena()->alloc(nbytes);
+    AMREX_ROCFFT_SAFE_CALL(rocfft_execution_info_set_work_buffer(info, workspace, nbytes));
+
+    // Run on the current GPU stream.
+    AMREX_ROCFFT_SAFE_CALL(rocfft_execution_info_set_stream(info, amrex::Gpu::gpuStream()));
+
+    AMREX_ROCFFT_SAFE_CALL(rocfft_execute(plan, in, out, info));
+    // Synchronize before the work buffer is released.
+    amrex::Gpu::streamSynchronize();
+    amrex::The_Arena()->free(workspace);
+
+    AMREX_ROCFFT_SAFE_CALL(rocfft_execution_info_destroy(info));
+}
+#endif
+
+}
diff --git a/Src/FFT/AMReX_FFT_Helper.H b/Src/FFT/AMReX_FFT_Helper.H
new file mode 100644
index 0000000000..c8ae2b74ea
--- /dev/null
+++ b/Src/FFT/AMReX_FFT_Helper.H
@@ -0,0 +1,29 @@
+#ifndef AMREX_FFT_HELPER_H_
+#define AMREX_FFT_HELPER_H_
+#include <AMReX_Config.H>
+
+#include <AMReX_DistributionMapping.H>
+
+namespace amrex::FFT
+{
+
+enum struct Direction { forward, backward, both };
+
+struct Info // options passed to the R2C constructor
+{
+    //! Supported only in 3D. When batch_mode is true, FFT is performed on
+    //! the first two dimensions only and the third dimension size is the
+    //! batch size.
+    bool batch_mode = false;
+
+    Info& setBatchMode (bool x) { batch_mode = x; return *this; } //!< Sets batch_mode and returns *this.
+};
+
+namespace detail
+{
+    DistributionMapping make_iota_distromap (Long n);
+}
+
+}
+
+#endif
diff --git a/Src/FFT/AMReX_FFT_Poisson.H b/Src/FFT/AMReX_FFT_Poisson.H
new file mode 100644
index 0000000000..6206206210
--- /dev/null
+++ b/Src/FFT/AMReX_FFT_Poisson.H
@@ -0,0 +1,259 @@
+#ifndef AMREX_FFT_POISSON_H_
+#define AMREX_FFT_POISSON_H_
+
+#include <AMReX_FFT.H>
+#include <AMReX_Geometry.H>
+
+namespace amrex::FFT
+{
+
+/**
+ * \brief Poisson solver for all periodic boundaries using FFT
+ */
+template <typename MF>
+class Poisson
+{
+public:
+
+    template <typename FA=MF, std::enable_if_t<IsFabArray_v<FA>,int> = 0> // enabled only when MF is a FabArray type
+    explicit Poisson (Geometry const& geom)
+        : m_geom(geom), m_r2c(geom.Domain()) // the FFT covers the whole domain of geom
+    {
+        AMREX_ALWAYS_ASSERT(geom.isAllPeriodic()); // every direction must be periodic
+    }
+
+    void solve (MF& soln, MF const& rhs); //!< Computes soln from rhs; see the definition below.
+
+private:
+    Geometry m_geom; // copy of the geometry passed to the constructor
+    R2C<typename MF::value_type, Direction::both> m_r2c; // forward and backward transforms are both needed
+};
+
+/**
+ * \brief 3D Poisson solver for periodic boundaries in the first two
+ * dimensions and Neumann in the last dimension.
+ */
+template <typename MF>
+class PoissonHybrid
+{
+public:
+
+    template <typename FA=MF, std::enable_if_t<IsFabArray_v<FA>,int> = 0> // enabled only when MF is a FabArray type
+    explicit PoissonHybrid (Geometry const& geom)
+        : m_geom(geom), m_r2c(geom.Domain(), Info().setBatchMode(true)) // batch mode: FFT in x and y only
+    {
+#if (AMREX_SPACEDIM == 3)
+        AMREX_ALWAYS_ASSERT(geom.isPeriodic(0) && geom.isPeriodic(1)); // x and y must be periodic
+#else
+        amrex::Abort("FFT::PoissonHybrid: 1D & 2D todo");
+#endif
+    }
+
+    void solve (MF& soln, MF const& rhs); //!< Computes soln from rhs; a no-op when AMREX_SPACEDIM < 3.
+
+private:
+    Geometry m_geom; // copy of the geometry passed to the constructor
+    R2C<typename MF::value_type, Direction::both> m_r2c; // forward and backward transforms are both needed
+};
+
+template <typename MF>
+void Poisson<MF>::solve (MF& soln, MF const& rhs) // forward FFT of rhs, divide by k2 in spectral space, backward FFT into soln
+{
+    using T = typename MF::value_type;
+
+    GpuArray<T,AMREX_SPACEDIM> fac // 2*pi / domain length, per dimension
+        {AMREX_D_DECL(T(2)*Math::pi<T>()/T(m_geom.ProbLength(0)),
+                      T(2)*Math::pi<T>()/T(m_geom.ProbLength(1)),
+                      T(2)*Math::pi<T>()/T(m_geom.ProbLength(2)))};
+    GpuArray<T,AMREX_SPACEDIM> dx // cell size, per dimension
+        {AMREX_D_DECL(T(m_geom.CellSize(0)),
+                      T(m_geom.CellSize(1)),
+                      T(m_geom.CellSize(2)))};
+    auto scale = T(1.0/m_geom.Domain().d_numPts()); // 1 / number of cells; presumably the FFT round-trip normalization
+#if (AMREX_SPACEDIM > 1)
+    auto const& len = m_geom.Domain().length(); // used below to fold the upper half of y and z
+#endif
+
+    m_r2c.forwardThenBackward(rhs, soln,
+                              [=] AMREX_GPU_DEVICE (int i, int j, int k,
+                                                    GpuComplex<T>& spectral_data)
+    {
+        amrex::ignore_unused(i,j,k); // j and k are unused when AMREX_SPACEDIM is below 2 or 3
+        // the values in the upper-half of the spectral array in y and z
+        // are here interpreted as negative wavenumbers
+        AMREX_D_TERM(T a = fac[0]*i;,
+                     T b = (j < len[1]/2) ? fac[1]*j : fac[1]*(len[1]-j);,
+                     T c = (k < len[2]/2) ? fac[2]*k : fac[2]*(len[2]-k));
+        T k2 = AMREX_D_TERM(T(2)*(std::cos(a*dx[0])-T(1))/(dx[0]*dx[0]),
+                           +T(2)*(std::cos(b*dx[1])-T(1))/(dx[1]*dx[1]),
+                           +T(2)*(std::cos(c*dx[2])-T(1))/(dx[2]*dx[2]));
+        if (k2 != T(0)) { // NOTE(review): k2 looks like the symbol of the 3-point centered Laplacian — confirm
+            spectral_data /= k2;
+        } else {
+            // interpretation here is that the average value of the
+            // solution is zero
+            spectral_data = 0;
+        }
+        spectral_data *= scale;
+    });
+}
+
+template <typename MF>
+void PoissonHybrid<MF>::solve (MF& soln, MF const& rhs) // FFT in x and y, tridiagonal solve along z, then inverse FFT into soln
+{
+#if (AMREX_SPACEDIM < 3)
+    amrex::ignore_unused(soln, rhs); // 3D only; the constructor aborts in 1D and 2D
+#else
+    using T = typename MF::value_type;
+
+    auto facx = T(2)*Math::pi<T>()/T(m_geom.ProbLength(0)); // 2*pi / domain length in x
+    auto facy = T(2)*Math::pi<T>()/T(m_geom.ProbLength(1)); // 2*pi / domain length in y
+    auto dx = T(m_geom.CellSize(0));
+    auto dy = T(m_geom.CellSize(1));
+    auto scale = T(1.0)/(T(m_geom.Domain().length(0)) * // 1 / (nx*ny): only x and y are transformed
+                         T(m_geom.Domain().length(1)));
+    auto ny = m_geom.Domain().length(1);
+    auto nz = m_geom.Domain().length(2); // delz[k+1] at k == 0 needs nz >= 2; R2C asserts this in batch mode
+
+    Gpu::DeviceVector<T> delzv(nz, T(m_geom.CellSize(2))); // per-cell z spacing, uniform here
+    auto const* delz = delzv.data();
+
+    Box cdomain = m_geom.Domain();
+    cdomain.setBig(0,cdomain.length(0)/2); // spectral x extent is nx/2+1
+    auto cba = amrex::decompose(cdomain, ParallelContext::NProcsSub(),
+                                {AMREX_D_DECL(true,true,false)}); // never split in z: each box holds whole z columns
+    DistributionMapping dm = detail::make_iota_distromap(cba.size());
+    FabArray<BaseFab<GpuComplex<T> > > spmf(cba, dm, 1, 0);
+
+    m_r2c.forward(rhs, spmf); // spectral data in (x,y,z) index order
+
+    for (MFIter mfi(spmf); mfi.isValid(); ++mfi)
+    {
+        auto const& spectral = spmf.array(mfi);
+        auto const& box = mfi.validbox();
+        auto const& xybox = amrex::makeSlab(box, 2, 0); // one (i,j) work item per z column
+
+#ifdef AMREX_USE_GPU
+        // xxxxx TODO: We need to explore how to optimize this
+        // function. Maybe we can use cusparse. Maybe we should make
+        // z-direction to be the unit stride direction.
+
+        BaseFab<T> tridiag_workspace(box,4); // element type T (not Real) so it matches the GpuComplex<T> spectral data
+        auto const& ald = tridiag_workspace.array(0); // lower diagonal
+        auto const& bd = tridiag_workspace.array(1); // main diagonal
+        auto const& cud = tridiag_workspace.array(2); // upper diagonal
+        auto const& scratch = tridiag_workspace.array(3); // modified upper diagonal of the forward sweep
+
+        amrex::ParallelFor(xybox, [=] AMREX_GPU_DEVICE (int i, int j, int)
+        {
+            T a = facx*i;
+            T b = (j < ny/2) ? facy*j : facy*(ny-j); // upper half of y is folded back
+
+            T k2 = T(2)*(std::cos(a*dx)-T(1))/(dx*dx)
+                +  T(2)*(std::cos(b*dy)-T(1))/(dy*dy);
+
+            // Tridiagonal solve with homogeneous Neumann
+            for(int k=0; k < nz; k++) {
+                if(k==0) {
+                    ald(i,j,k) = 0.;
+                    cud(i,j,k) = 2.0 /(delz[k]*(delz[k]+delz[k+1]));
+                    bd(i,j,k) = k2 -ald(i,j,k)-cud(i,j,k);
+                } else if (k == nz-1) {
+                    ald(i,j,k) = 2.0 /(delz[k]*(delz[k]+delz[k-1]));
+                    cud(i,j,k) = 0.;
+                    bd(i,j,k) = k2 -ald(i,j,k)-cud(i,j,k);
+                    if (i == 0 && j == 0) { // k2 == 0 here; presumably doubled to keep the system non-singular
+                        bd(i,j,k) *= 2.0;
+                    }
+                } else {
+                    ald(i,j,k) = 2.0 /(delz[k]*(delz[k]+delz[k-1]));
+                    cud(i,j,k) = 2.0 /(delz[k]*(delz[k]+delz[k+1]));
+                    bd(i,j,k) = k2 -ald(i,j,k)-cud(i,j,k);
+                }
+            }
+
+            scratch(i,j,0) = cud(i,j,0)/bd(i,j,0); // forward sweep
+            spectral(i,j,0) = spectral(i,j,0)/bd(i,j,0);
+
+            for (int k = 1; k < nz; k++) {
+                if (k < nz-1) {
+                    scratch(i,j,k) = cud(i,j,k) / (bd(i,j,k) - ald(i,j,k) * scratch(i,j,k-1));
+                }
+                spectral(i,j,k) = (spectral(i,j,k) - ald(i,j,k) * spectral(i,j,k - 1))
+                    / (bd(i,j,k) - ald(i,j,k) * scratch(i,j,k-1));
+            }
+
+            for (int k = nz - 2; k >= 0; k--) { // back substitution
+                spectral(i,j,k) -= scratch(i,j,k) * spectral(i,j,k + 1);
+            }
+
+            for (int k = 0; k < nz; ++k) {
+                spectral(i,j,k) *= scale;
+            }
+        });
+        Gpu::streamSynchronize(); // finish the kernel before tridiag_workspace goes out of scope
+
+#else
+
+        Gpu::DeviceVector<GpuComplex<T>> ald(nz); // GpuComplex<T> (not Real) so arithmetic with spectral compiles for any T
+        Gpu::DeviceVector<GpuComplex<T>> bd(nz);
+        Gpu::DeviceVector<GpuComplex<T>> cud(nz);
+        Gpu::DeviceVector<GpuComplex<T>> scratch(nz);
+
+        amrex::LoopOnCpu(xybox, [&] (int i, int j, int)
+        {
+            T a = facx*i;
+            T b = (j < ny/2) ? facy*j : facy*(ny-j); // upper half of y is folded back
+
+            T k2 = T(2)*(std::cos(a*dx)-T(1))/(dx*dx)
+                +  T(2)*(std::cos(b*dy)-T(1))/(dy*dy);
+
+            // Tridiagonal solve with homogeneous Neumann
+            for(int k=0; k < nz; k++) {
+                if(k==0) {
+                    ald[k] = 0.;
+                    cud[k] = 2.0 /(delz[k]*(delz[k]+delz[k+1]));
+                    bd[k] = k2 -ald[k]-cud[k];
+                } else if (k == nz-1) {
+                    ald[k] = 2.0 /(delz[k]*(delz[k]+delz[k-1]));
+                    cud[k] = 0.;
+                    bd[k] = k2 -ald[k]-cud[k];
+                    if (i == 0 && j == 0) { // k2 == 0 here; presumably doubled to keep the system non-singular
+                        bd[k] *= 2.0;
+                    }
+                } else {
+                    ald[k] = 2.0 /(delz[k]*(delz[k]+delz[k-1]));
+                    cud[k] = 2.0 /(delz[k]*(delz[k]+delz[k+1]));
+                    bd[k] = k2 -ald[k]-cud[k];
+                }
+            }
+
+            scratch[0] = cud[0]/bd[0]; // forward sweep
+            spectral(i,j,0) = spectral(i,j,0)/bd[0];
+
+            for (int k = 1; k < nz; k++) {
+                if (k < nz-1) {
+                    scratch[k] = cud[k] / (bd[k] - ald[k] * scratch[k-1]);
+                }
+                spectral(i,j,k) = (spectral(i,j,k) - ald[k] * spectral(i,j,k - 1))
+                    / (bd[k] - ald[k] * scratch[k-1]);
+            }
+
+            for (int k = nz - 2; k >= 0; k--) { // back substitution
+                spectral(i,j,k) -= scratch[k] * spectral(i,j,k + 1);
+            }
+
+            for (int k = 0; k < nz; ++k) {
+                spectral(i,j,k) *= scale;
+            }
+        });
+#endif
+    }
+
+    m_r2c.backward(spmf, soln);
+#endif
+}
+
+}
+
+#endif
diff --git a/Src/FFT/CMakeLists.txt b/Src/FFT/CMakeLists.txt
new file mode 100644
index 0000000000..2c695a9aec
--- /dev/null
+++ b/Src/FFT/CMakeLists.txt
@@ -0,0 +1,14 @@
+add_amrex_define(AMREX_USE_FFT NO_LEGACY)
+
+foreach(D IN LISTS AMReX_SPACEDIM)  # one amrex_${D}d target per entry in AMReX_SPACEDIM
+    target_include_directories(amrex_${D}d PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>)
+
+    target_sources(amrex_${D}d  # FFT headers and the single FFT translation unit
+       PRIVATE
+       AMReX_FFT.H
+       AMReX_FFT.cpp
+       AMReX_FFT_Helper.H
+       AMReX_FFT_Poisson.H
+       )
+
+endforeach()
diff --git a/Src/FFT/Make.package b/Src/FFT/Make.package
new file mode 100644
index 0000000000..1dcd714f64
--- /dev/null
+++ b/Src/FFT/Make.package
@@ -0,0 +1,10 @@
+ifndef AMREX_FFT_MAKE  # include guard: the body below is processed only once
+       AMREX_FFT_MAKE := 1
+
+CEXE_headers += AMReX_FFT.H AMReX_FFT_Helper.H AMReX_FFT_Poisson.H
+CEXE_sources += AMReX_FFT.cpp
+
+VPATH_LOCATIONS += $(AMREX_HOME)/Src/FFT
+INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/FFT
+
+endif  # AMREX_FFT_MAKE
diff --git a/Src/F_Interfaces/Base/AMReX_boxarray_fi.cpp b/Src/F_Interfaces/Base/AMReX_boxarray_fi.cpp
index dd7916a9ad..48d8eae7a6 100644
--- a/Src/F_Interfaces/Base/AMReX_boxarray_fi.cpp
+++ b/Src/F_Interfaces/Base/AMReX_boxarray_fi.cpp
@@ -86,4 +86,9 @@ extern "C" {
         Box bx(IntVect(lo), IntVect(hi), ba->ixType());
         return ba->intersects(bx);
     }
+
+    int amrex_fi_boxarray_issame (const BoxArray* baa, const BoxArray* bab)
+    {
+        return *baa == *bab; // BoxArray operator== result as int; pointers are not null-checked
+    }
 }
diff --git a/Src/F_Interfaces/Base/AMReX_boxarray_mod.F90 b/Src/F_Interfaces/Base/AMReX_boxarray_mod.F90
index 0181c6cfb9..f85b5e8d74 100644
--- a/Src/F_Interfaces/Base/AMReX_boxarray_mod.F90
+++ b/Src/F_Interfaces/Base/AMReX_boxarray_mod.F90
@@ -9,7 +9,8 @@ module amrex_boxarray_module
 
   private
 
-  public :: amrex_boxarray_build, amrex_boxarray_destroy, amrex_print
+  public :: amrex_boxarray_build, amrex_boxarray_destroy, amrex_print, &
+            operator(==)
 
   type, public :: amrex_boxarray
      logical     :: owner = .false.
@@ -36,6 +37,10 @@ module amrex_boxarray_module
 #endif
   end type amrex_boxarray
 
+  interface operator(==)  ! lets two amrex_boxarray values be compared with ==
+    module procedure amrex_boxarray_issame
+  end interface operator(==)
+
   interface amrex_boxarray_build
      module procedure amrex_boxarray_build_bx
      module procedure amrex_boxarray_build_bxs
@@ -122,12 +127,18 @@ pure function amrex_fi_boxarray_numpts (ba) bind(c)
        integer(amrex_long) :: amrex_fi_boxarray_numpts
      end function amrex_fi_boxarray_numpts
 
-     pure integer function amrex_fi_boxarray_intersects_box (ba, lo, hi) bind(c)
+     pure integer(c_int) function amrex_fi_boxarray_intersects_box (ba, lo, hi) bind(c)
        import
        implicit none
        type(c_ptr), value, intent(in) :: ba
        integer, intent(in) :: lo(*), hi(*)
      end function amrex_fi_boxarray_intersects_box
+
+     pure integer(c_int) function amrex_fi_boxarray_issame (baa, bab) bind(c)
+       import
+       implicit none
+       type(c_ptr), value, intent(in) :: baa, bab  ! C pointers, passed by value
+     end function amrex_fi_boxarray_issame
   end interface
 
 contains
@@ -258,4 +269,10 @@ pure function amrex_boxarray_intersects_box (this, bx) result(r)
     r = ir .ne. 0
   end function amrex_boxarray_intersects_box
 
+  pure logical function amrex_boxarray_issame(baa, bab) result(r)  ! backs operator(==)
+    type(amrex_boxarray), intent(in) :: baa
+    type(amrex_boxarray), intent(in) :: bab
+    r = amrex_fi_boxarray_issame(baa%p, bab%p) .ne. 0  ! nonzero C result maps to .true.
+  end function amrex_boxarray_issame
+
 end module amrex_boxarray_module
diff --git a/Src/F_Interfaces/Base/AMReX_distromap_fi.cpp b/Src/F_Interfaces/Base/AMReX_distromap_fi.cpp
index e50031a588..7fc7adc171 100644
--- a/Src/F_Interfaces/Base/AMReX_distromap_fi.cpp
+++ b/Src/F_Interfaces/Base/AMReX_distromap_fi.cpp
@@ -41,4 +41,9 @@ extern "C" {
     {
         AllPrint() << *dm;
     }
+
+    int amrex_fi_distromap_issame (const DistributionMapping* dma, const DistributionMapping* dmb)
+    {
+        return *dma == *dmb; // DistributionMapping operator== result as int; pointers are not null-checked
+    }
 }
diff --git a/Src/F_Interfaces/Base/AMReX_distromap_mod.F90 b/Src/F_Interfaces/Base/AMReX_distromap_mod.F90
index adbb91b442..9c0884168e 100644
--- a/Src/F_Interfaces/Base/AMReX_distromap_mod.F90
+++ b/Src/F_Interfaces/Base/AMReX_distromap_mod.F90
@@ -8,7 +8,8 @@ module amrex_distromap_module
 
   private
 
-  public :: amrex_distromap_build, amrex_distromap_destroy, amrex_print
+  public :: amrex_distromap_build, amrex_distromap_destroy, amrex_print, &
+            operator(==)
 
   type, public :: amrex_distromap
      logical     :: owner = .false.
@@ -25,6 +26,10 @@ module amrex_distromap_module
 #endif
   end type amrex_distromap
 
+  interface operator(==)  ! lets two amrex_distromap values be compared with ==
+    module procedure amrex_distromap_issame
+  end interface operator(==)
+
   interface amrex_distromap_build
      module procedure amrex_distromap_build_ba
      module procedure amrex_distromap_build_pmap
@@ -89,6 +94,12 @@ subroutine amrex_fi_print_distromap (dm) bind(c)
        implicit none
        type(c_ptr), value :: dm
      end subroutine amrex_fi_print_distromap
+
+     pure integer(c_int) function amrex_fi_distromap_issame (dma, dmb) bind(c)
+       import
+       implicit none
+       type(c_ptr), value, intent(in) :: dma, dmb  ! C pointers, passed by value
+     end function amrex_fi_distromap_issame
   end interface
 
 contains
@@ -158,4 +169,9 @@ subroutine amrex_distromap_print (dm)
     call amrex_fi_print_distromap(dm%p)
   end subroutine amrex_distromap_print
 
+  pure logical function amrex_distromap_issame (dma, dmb) result(r)  ! backs operator(==)
+     type(amrex_distromap), intent(in) :: dma, dmb
+     r = amrex_fi_distromap_issame(dma%p, dmb%p) .ne. 0  ! nonzero C result maps to .true.
+  end function amrex_distromap_issame
+
 end module amrex_distromap_module
diff --git a/Src/LinearSolvers/CMakeLists.txt b/Src/LinearSolvers/CMakeLists.txt
index 6287ef4b42..ddf2de454a 100644
--- a/Src/LinearSolvers/CMakeLists.txt
+++ b/Src/LinearSolvers/CMakeLists.txt
@@ -16,6 +16,8 @@ foreach(D IN LISTS AMReX_SPACEDIM)
        MLMG/AMReX_MLLinOp_K.H
        MLMG/AMReX_MLCellLinOp.H
        MLMG/AMReX_MLNodeLinOp.H
+       MLMG/AMReX_MLNodeLinOp_K.H
+       MLMG/AMReX_MLNodeLinOp_${D}D_K.H
        MLMG/AMReX_MLNodeLinOp.cpp
        MLMG/AMReX_MLCellABecLap.H
        MLMG/AMReX_MLCellABecLap_K.H
@@ -31,30 +33,6 @@ foreach(D IN LISTS AMReX_SPACEDIM)
        MLMG/AMReX_MLPoisson.H
        MLMG/AMReX_MLPoisson_K.H
        MLMG/AMReX_MLPoisson_${D}D_K.H
-       MLMG/AMReX_MLNodeLaplacian.H
-       MLMG/AMReX_MLNodeLaplacian.cpp
-       MLMG/AMReX_MLNodeLaplacian_sync.cpp
-       MLMG/AMReX_MLNodeLaplacian_sten.cpp
-       MLMG/AMReX_MLNodeLaplacian_misc.cpp
-       MLMG/AMReX_MLNodeLap_K.H
-       MLMG/AMReX_MLNodeLap_${D}D_K.H
-       MLMG/AMReX_MLNodeTensorLaplacian.H
-       MLMG/AMReX_MLNodeTensorLaplacian.cpp
-       MLMG/AMReX_MLNodeTensorLap_K.H
-       MLMG/AMReX_MLNodeTensorLap_${D}D_K.H
-       MLMG/AMReX_MLTensorOp.H
-       MLMG/AMReX_MLTensorOp.cpp
-       MLMG/AMReX_MLTensorOp_grad.cpp
-       MLMG/AMReX_MLTensor_K.H
-       MLMG/AMReX_MLTensor_${D}D_K.H
-       MLMG/AMReX_MLEBNodeFDLaplacian.H
-       MLMG/AMReX_MLEBNodeFDLaplacian.cpp
-       MLMG/AMReX_MLEBNodeFDLap_K.H
-       MLMG/AMReX_MLEBNodeFDLap_${D}D_K.H
-       MLMG/AMReX_MLNodeABecLaplacian.H
-       MLMG/AMReX_MLNodeABecLaplacian.cpp
-       MLMG/AMReX_MLNodeABecLap_K.H
-       MLMG/AMReX_MLNodeABecLap_${D}D_K.H
        AMReX_GMRES.H
        AMReX_GMRES_MLMG.H
        )
@@ -68,30 +46,72 @@ foreach(D IN LISTS AMReX_SPACEDIM)
           )
     endif ()
 
-    if (NOT D EQUAL 1)
+    if (AMReX_LINEAR_SOLVERS_EM)
+       if (NOT D EQUAL 1)  # MLCurlCurl sources are not added to the 1D target
+          target_sources(amrex_${D}d
+             PRIVATE
+             MLMG/AMReX_MLCurlCurl.H
+             MLMG/AMReX_MLCurlCurl.cpp
+             MLMG/AMReX_MLCurlCurl_K.H
+             )
+       endif ()
+
        target_sources(amrex_${D}d
           PRIVATE
-          MLMG/AMReX_MLCurlCurl.H
-          MLMG/AMReX_MLCurlCurl.cpp
-          MLMG/AMReX_MLCurlCurl_K.H
-          )
+          MLMG/AMReX_MLEBNodeFDLaplacian.H
+          MLMG/AMReX_MLEBNodeFDLaplacian.cpp
+          MLMG/AMReX_MLEBNodeFDLap_K.H
+          MLMG/AMReX_MLEBNodeFDLap_${D}D_K.H
+          MLMG/AMReX_MLNodeTensorLaplacian.H
+          MLMG/AMReX_MLNodeTensorLaplacian.cpp
+          MLMG/AMReX_MLNodeTensorLap_K.H
+          MLMG/AMReX_MLNodeTensorLap_${D}D_K.H
+       )
+    endif ()
+
+    if (AMReX_LINEAR_SOLVERS_INCFLO)  # nodal (ABec)Laplacian and tensor-op sources are opt-in
+       target_sources(amrex_${D}d
+          PRIVATE
+          MLMG/AMReX_MLNodeABecLaplacian.H
+          MLMG/AMReX_MLNodeABecLaplacian.cpp
+          MLMG/AMReX_MLNodeABecLap_K.H
+          MLMG/AMReX_MLNodeABecLap_${D}D_K.H
+          MLMG/AMReX_MLNodeLaplacian.H
+          MLMG/AMReX_MLNodeLaplacian.cpp
+          MLMG/AMReX_MLNodeLaplacian_sync.cpp
+          MLMG/AMReX_MLNodeLaplacian_sten.cpp
+          MLMG/AMReX_MLNodeLaplacian_misc.cpp
+          MLMG/AMReX_MLNodeLap_K.H
+          MLMG/AMReX_MLNodeLap_${D}D_K.H
+          MLMG/AMReX_MLTensorOp.H
+          MLMG/AMReX_MLTensorOp.cpp
+          MLMG/AMReX_MLTensorOp_grad.cpp
+          MLMG/AMReX_MLTensor_K.H
+          MLMG/AMReX_MLTensor_${D}D_K.H
+       )
     endif ()
 
     if (AMReX_EB AND NOT D EQUAL 1)
        target_sources(amrex_${D}d
           PRIVATE
-          MLMG/AMReX_MLNodeLaplacian_eb.cpp
           MLMG/AMReX_MLEBABecLap.H
           MLMG/AMReX_MLEBABecLap.cpp
           MLMG/AMReX_MLEBABecLap_F.cpp
           MLMG/AMReX_MLEBABecLap_K.H
           MLMG/AMReX_MLEBABecLap_${D}D_K.H
-          MLMG/AMReX_MLEBTensorOp.H
-          MLMG/AMReX_MLEBTensorOp.cpp
-          MLMG/AMReX_MLEBTensorOp_bc.cpp
-          MLMG/AMReX_MLEBTensor_K.H
-          MLMG/AMReX_MLEBTensor_${D}D_K.H
           )
+
+       if (AMReX_LINEAR_SOLVERS_INCFLO)  # EB nodal Laplacian / EB tensor-op sources need EB and INCFLO
+          target_sources(amrex_${D}d
+             PRIVATE
+             MLMG/AMReX_MLNodeLaplacian_eb.cpp
+             MLMG/AMReX_MLEBTensorOp.H
+             MLMG/AMReX_MLEBTensorOp.cpp
+             MLMG/AMReX_MLEBTensorOp_bc.cpp
+             MLMG/AMReX_MLEBTensor_K.H
+             MLMG/AMReX_MLEBTensor_${D}D_K.H
+             )
+       endif ()
     endif ()
 
     if (AMReX_FORTRAN)
@@ -102,7 +122,7 @@ foreach(D IN LISTS AMReX_SPACEDIM)
           )
     endif ()
 
-    if (AMReX_HYPRE)
+    if (AMReX_HYPRE AND AMReX_LINEAR_SOLVERS_INCFLO)
        target_sources(amrex_${D}d
           PRIVATE
           MLMG/AMReX_MLNodeLaplacian_hypre.cpp
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H
index e04e16f8bd..b318b318eb 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H
@@ -536,7 +536,11 @@ MLCellLinOpT<MF>::setLevelBC (int amrlev, const MF* a_levelbcdata, const MF* rob
     {
         if (this->needsCoarseDataForBC())
         {
-            AMREX_ALWAYS_ASSERT(!this->hasHiddenDimension());
+            // With a hidden dimension, m_coarse_data_crse_ratio in that direction must be 1.
+            if (this->hasHiddenDimension()) {
+                int hidden_dir = this->hiddenDirection();
+                AMREX_ALWAYS_ASSERT(this->m_coarse_data_crse_ratio[hidden_dir] == 1);
+            }
             br_ref_ratio = this->m_coarse_data_crse_ratio.allGT(0) ? this->m_coarse_data_crse_ratio : IntVect(2);
             if (m_crse_sol_br[amrlev] == nullptr && br_ref_ratio.allGT(0))
             {
@@ -1946,8 +1950,6 @@ MLCellLinOpT<MF>::computeVolInv () const
         m_volinv[amrlev].resize(this->NMGLevels(amrlev));
     }
 
-    AMREX_ASSERT(this->m_coarse_fine_bc_type == LinOpBCType::Dirichlet || ! this->hasHiddenDimension());
-
     // We don't need to compute for every level
 
     auto f = [&] (int amrlev, int mglev) {
@@ -2009,11 +2011,11 @@ MLCellLinOpT<MF>::normInf (int amrlev, MF const& mf, bool local) const -> RT
     const int finest_level = this->NAMRLevels() - 1;
     RT norm = RT(0.0);
 #ifdef AMREX_USE_EB
-    if (! mf.isAllRegular()) {
+    const auto *factory = dynamic_cast<EBFArrayBoxFactory const*>(this->Factory(amrlev));
+    if (factory && !factory->isAllRegular()) {
         if constexpr (!std::is_same<MF,MultiFab>()) {
             amrex::Abort("MLCellLinOpT with EB only works with MultiFab");
         } else {
-            const auto *factory = dynamic_cast<EBFArrayBoxFactory const*>(this->Factory(amrlev));
             const MultiFab& vfrac = factory->getVolFrac();
             if (amrlev == finest_level) {
 #ifdef AMREX_USE_GPU
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBABecLap_K.H b/Src/LinearSolvers/MLMG/AMReX_MLEBABecLap_K.H
index 517b1875bc..434789c5c5 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLEBABecLap_K.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLEBABecLap_K.H
@@ -6,13 +6,6 @@
 
 #include <AMReX_EBCellFlag.H>
 
-namespace amrex {
-    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-    Real get_dx_eb (Real kappa) noexcept {
-        return amrex::max(Real(0.3),(kappa*kappa-Real(0.25))/(Real(2.0)*kappa));
-    }
-}
-
 #if (AMREX_SPACEDIM == 2)
 #include <AMReX_MLEBABecLap_2D_K.H>
 #else
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp
index af4a6a6d74..0a0bdf39a1 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp
@@ -1,6 +1,6 @@
 #include <AMReX_MLEBNodeFDLaplacian.H>
 #include <AMReX_MLEBNodeFDLap_K.H>
-#include <AMReX_MLNodeLap_K.H>
+#include <AMReX_MLNodeLinOp_K.H>
 #include <AMReX_MLNodeTensorLap_K.H>
 #include <AMReX_MultiFabUtil.H>
 
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLLinOp_K.H b/Src/LinearSolvers/MLMG/AMReX_MLLinOp_K.H
index a6bc651736..74fcc1302f 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLLinOp_K.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLLinOp_K.H
@@ -1053,6 +1053,15 @@ void mllinop_apply_innu_zhi (int i, int j, int k,
     }
 }
 
+#ifdef AMREX_USE_EB
+
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+Real get_dx_eb (Real kappa) noexcept { // returns (kappa^2 - 1/4)/(2*kappa), floored at 0.3
+    return amrex::max(Real(0.3),(kappa*kappa-Real(0.25))/(Real(2.0)*kappa));
+}
+
+#endif
+
 }
 
 #endif
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_1D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_1D_K.H
index 91d0225739..7b6bc1e1fc 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_1D_K.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_1D_K.H
@@ -4,79 +4,6 @@
 
 namespace amrex {
 
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void mlndlap_set_nodal_mask (int i, int, int, Array4<int> const& nmsk,
-                             Array4<int const> const& cmsk) noexcept
-{
-    using namespace nodelap_detail;
-
-    int s = cmsk(i-1,0,0) + cmsk(i,0,0);
-    if (s == 2*crse_cell) {
-        nmsk(i,0,0) = crse_node;
-    } else if (s == 2*fine_cell) {
-        nmsk(i,0,0) = fine_node;
-    } else {
-        nmsk(i,0,0) = crse_fine_node;
-    }
-}
-
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void mlndlap_set_dirichlet_mask (Box const& bx, Array4<int> const& dmsk,
-                                 Array4<int const> const& omsk, Box const& dom,
-                                 GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bclo,
-                                 GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bchi) noexcept
-{
-    const auto lo = bx.smallEnd(0);
-    const auto hi = bx.bigEnd(0);
-    AMREX_PRAGMA_SIMD
-    for (int i = lo; i <= hi; ++i) {
-        if (!dmsk(i,0,0)) {
-            dmsk(i,0,0) = (omsk(i-1,0,0) == 1 || omsk(i,0,0) == 1);
-        }
-    }
-
-    const auto domlo = dom.smallEnd(0);
-    const auto domhi = dom.bigEnd(0);
-
-    if (bclo[0] == LinOpBCType::Dirichlet && lo == domlo) {
-        dmsk(lo,0,0) = 1;
-    }
-
-    if (bchi[0] == LinOpBCType::Dirichlet && hi == domhi) {
-        dmsk(hi,0,0) = 1;
-    }
-}
-
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void mlndlap_set_dot_mask (Box const& bx, Array4<Real> const& dmsk,
-                           Array4<int const> const& omsk, Box const& dom,
-                           GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bclo,
-                           GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bchi) noexcept
-{
-    const auto lo = bx.smallEnd(0);
-    const auto hi = bx.bigEnd(0);
-
-    AMREX_PRAGMA_SIMD
-    for (int i = lo; i <= hi; ++i) {
-        dmsk(i,0,0) = static_cast<Real>(omsk(i,0,0));
-    }
-
-    const auto domlo = dom.smallEnd(0);
-    const auto domhi = dom.bigEnd(0);
-
-    if ((bclo[0] == LinOpBCType::Neumann || bclo[0] == LinOpBCType::inflow)
-        && lo == domlo)
-    {
-        dmsk(lo,0,0) *= Real(0.5);
-    }
-
-    if ((bchi[0] == LinOpBCType::Neumann || bchi[0] == LinOpBCType::inflow)
-        && hi == domhi)
-    {
-        dmsk(hi,0,0) *= Real(0.5);
-    }
-}
-
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
 void mlndlap_zero_fine (int i, int, int, Array4<Real> const& phi,
                         Array4<int const> const& msk, int fine_flag) noexcept
@@ -106,39 +33,6 @@ void mlndlap_semi_avgdown_coeff (int i, int j, int k, Array4<Real> const& crse,
     mlndlap_avgdown_coeff_x(i,j,k,crse,fine);
 }
 
-template <typename T>
-void mlndlap_bc_doit (Box const& vbx, Array4<T> const& a, Box const& domain,
-                      GpuArray<bool,AMREX_SPACEDIM> const& bflo,
-                      GpuArray<bool,AMREX_SPACEDIM> const& bfhi) noexcept
-{
-    Box gdomain = domain;
-    int const idim = 0;
-    if (! bflo[idim]) { gdomain.growLo(idim,1); }
-    if (! bfhi[idim]) { gdomain.growHi(idim,1); }
-
-    if (gdomain.strictly_contains(vbx)) { return; }
-
-    const int offset = domain.cellCentered() ? 0 : 1;
-
-    const auto dlo = domain.smallEnd(0);
-    const auto dhi = domain.bigEnd(0);
-
-    Box const& sbox = amrex::grow(vbx,1);
-    AMREX_HOST_DEVICE_FOR_3D(sbox, i, j, k,
-    {
-        if (! gdomain.contains(IntVect(i))) {
-            if (i == dlo-1 && bflo[0])
-            {
-                a(i,0,0) = a(i+1+offset, j, k);
-            }
-            else if (i == dhi+1 && bfhi[0])
-            {
-                a(i,0,0) = a(i-1-offset, j, k);
-            }
-        }
-    });
-}
-
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
 Real mlndlap_adotx_c (int i, int, int, Array4<Real const> const& x,
                       Real sigma, Array4<int const> const& msk,
@@ -335,59 +229,6 @@ void mlndlap_gauss_seidel_with_line_solve_aa(Box const&, Array4<Real> const&,
     amrex::Abort("mlndlap_gauss_seidel_with_line_solve_aa: not implemented in 1D");
 }
 
-
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void mlndlap_restriction (int i, int, int, Array4<Real> const& crse,
-                          Array4<Real const> const& fine, Array4<int const> const& msk) noexcept
-{
-    int ii = i*2;
-    if (msk(ii,0,0)) {
-        crse(i,0,0) = Real(0.0);
-    } else {
-        crse(i,0,0) = Real(1./4.)  *(fine(ii-1,0,0)
-                         + Real(2.)* fine(ii  ,0,0)
-                         +           fine(ii+1,0,0));
-    }
-}
-
-template <int rr>
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void mlndlap_restriction (int i, int, int, Array4<Real> const& crse,
-                          Array4<Real const> const& fine, Array4<int const> const& msk,
-                          Box const& fdom,
-                          GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bclo,
-                          GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bchi) noexcept
-
-{
-    const int ii = i*rr;
-    if (msk(ii,0,0)) {
-        crse(i,0,0) = Real(0.0);
-    } else {
-        const auto ndlo = fdom.smallEnd(0);
-        const auto ndhi = fdom.bigEnd(0);
-        Real tmp = Real(0.0);
-        for (int ioff = -rr+1; ioff <= rr-1; ++ioff) {
-            Real wx = rr - std::abs(ioff);
-            int itmp = ii + ioff;
-            if ((itmp < ndlo && (bclo[0] == LinOpBCType::Neumann ||
-                                 bclo[0] == LinOpBCType::inflow)) ||
-                (itmp > ndhi && (bchi[0] == LinOpBCType::Neumann ||
-                                 bchi[0] == LinOpBCType::inflow))) {
-                itmp = ii - ioff;
-            }
-            tmp += wx*fine(itmp,0,0);
-        }
-        crse(i,0,0) = tmp*(Real(1.0)/Real(rr*rr));
-    }
-}
-
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void mlndlap_semi_restriction (int /*i*/, int /*j*/, int /*k*/, Array4<Real> const&,
-                          Array4<Real const> const&, Array4<int const> const&, int) noexcept
-{
-    amrex::Abort("mlndlap_semi_restriction: not implemented in 1D");
-}
-
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
 void mlndlap_interpadd_c (int i, int , int, Array4<Real> const& fine,
                           Array4<Real const> const& crse,
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_2D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_2D_K.H
index 05f02aaa92..db6922c410 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_2D_K.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_2D_K.H
@@ -4,127 +4,6 @@
 
 namespace amrex {
 
-//
-// masks
-//
-
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void mlndlap_set_nodal_mask (int i, int j, int k, Array4<int> const& nmsk,
-                             Array4<int const> const& cmsk) noexcept
-{
-    using namespace nodelap_detail;
-
-    int s = cmsk(i-1,j-1,k) + cmsk(i  ,j-1,k)
-        +   cmsk(i-1,j  ,k) + cmsk(i  ,j  ,k);
-    if (s == 4*crse_cell) {
-        nmsk(i,j,k) = crse_node;
-    }
-    else if (s == 4*fine_cell) {
-        nmsk(i,j,k) = fine_node;
-    } else {
-        nmsk(i,j,k) = crse_fine_node;
-    }
-}
-
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void mlndlap_set_dirichlet_mask (Box const& bx, Array4<int> const& dmsk,
-                                 Array4<int const> const& omsk, Box const& dom,
-                                 GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bclo,
-                                 GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bchi) noexcept
-{
-    const auto lo = amrex::lbound(bx);
-    const auto hi = amrex::ubound(bx);
-    for (int j = lo.y; j <= hi.y; ++j) {
-    AMREX_PRAGMA_SIMD
-    for (int i = lo.x; i <= hi.x; ++i) {
-        if (!dmsk(i,j,0)) {
-            dmsk(i,j,0) = (omsk(i-1,j-1,0) == 1 || omsk(i,j-1,0) == 1 ||
-                           omsk(i-1,j  ,0) == 1 || omsk(i,j  ,0) == 1);
-        }
-    }}
-
-    const auto domlo = amrex::lbound(dom);
-    const auto domhi = amrex::ubound(dom);
-
-    if (bclo[0] == LinOpBCType::Dirichlet && lo.x == domlo.x) {
-        for (int j = lo.y; j <= hi.y; ++j) {
-            dmsk(lo.x,j,0) = 1;
-        }
-    }
-
-    if (bchi[0] == LinOpBCType::Dirichlet && hi.x == domhi.x) {
-        for (int j = lo.y; j <= hi.y; ++j) {
-            dmsk(hi.x,j,0) = 1;
-        }
-    }
-
-    if (bclo[1] == LinOpBCType::Dirichlet && lo.y == domlo.y) {
-        AMREX_PRAGMA_SIMD
-        for (int i = lo.x; i <= hi.x; ++i) {
-            dmsk(i,lo.y,0) = 1;
-        }
-    }
-
-    if (bchi[1] == LinOpBCType::Dirichlet && hi.y == domhi.y) {
-        AMREX_PRAGMA_SIMD
-        for (int i = lo.x; i <= hi.x; ++i) {
-            dmsk(i,hi.y,0) = 1;
-        }
-    }
-}
-
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void mlndlap_set_dot_mask (Box const& bx, Array4<Real> const& dmsk,
-                           Array4<int const> const& omsk, Box const& dom,
-                           GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bclo,
-                           GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bchi) noexcept
-{
-    const auto lo = amrex::lbound(bx);
-    const auto hi = amrex::ubound(bx);
-    for (int j = lo.y; j <= hi.y; ++j) {
-    AMREX_PRAGMA_SIMD
-    for (int i = lo.x; i <= hi.x; ++i) {
-        dmsk(i,j,0) = static_cast<Real>(omsk(i,j,0));
-    }}
-
-    const auto domlo = amrex::lbound(dom);
-    const auto domhi = amrex::ubound(dom);
-
-    if ((bclo[0] == LinOpBCType::Neumann || bclo[0] == LinOpBCType::inflow)
-        && lo.x == domlo.x)
-    {
-        for (int j = lo.y; j <= hi.y; ++j) {
-            dmsk(lo.x,j,0) *= Real(0.5);
-        }
-    }
-
-    if ((bchi[0] == LinOpBCType::Neumann || bchi[0] == LinOpBCType::inflow)
-        && hi.x == domhi.x)
-    {
-        for (int j = lo.y; j <= hi.y; ++j) {
-            dmsk(hi.x,j,0) *= Real(0.5);
-        }
-    }
-
-    if ((bclo[1] == LinOpBCType::Neumann || bclo[1] == LinOpBCType::inflow)
-        && lo.y == domlo.y)
-    {
-        AMREX_PRAGMA_SIMD
-        for (int i = lo.x; i <= hi.x; ++i) {
-            dmsk(i,lo.y,0) *= Real(0.5);
-        }
-    }
-
-    if ((bchi[1] == LinOpBCType::Neumann || bchi[1] == LinOpBCType::inflow)
-        && hi.y == domhi.y)
-    {
-        AMREX_PRAGMA_SIMD
-        for (int i = lo.x; i <= hi.x; ++i) {
-            dmsk(i,hi.y,0) *= Real(0.5);
-        }
-    }
-}
-
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
 void mlndlap_zero_fine (int i, int j, int, Array4<Real> const& phi,
                         Array4<int const> const& msk, int fine_flag) noexcept
@@ -177,116 +56,6 @@ void mlndlap_semi_avgdown_coeff (int i, int j, int k, Array4<Real> const& crse,
     }
 }
 
-//
-// bc
-//
-
-template <typename T>
-void mlndlap_bc_doit (Box const& vbx, Array4<T> const& a, Box const& domain,
-                      GpuArray<bool,AMREX_SPACEDIM> const& bflo,
-                      GpuArray<bool,AMREX_SPACEDIM> const& bfhi) noexcept
-{
-    Box gdomain = domain;
-    for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
-        if (! bflo[idim]) { gdomain.growLo(idim,1); }
-        if (! bfhi[idim]) { gdomain.growHi(idim,1); }
-    }
-
-    if (gdomain.strictly_contains(vbx)) { return; }
-
-    const int offset = domain.cellCentered() ? 0 : 1;
-
-    const auto dlo = amrex::lbound(domain);
-    const auto dhi = amrex::ubound(domain);
-
-    Box const& sbox = amrex::grow(vbx,1);
-    AMREX_HOST_DEVICE_FOR_3D(sbox, i, j, k,
-    {
-        if (! gdomain.contains(IntVect(i,j))) {
-            // xlo & ylo
-            if (i == dlo.x-1 && j == dlo.y-1 && (bflo[0] || bflo[1]))
-            {
-                if (bflo[0] && bflo[1])
-                {
-                    a(i,j,k) = a(i+1+offset, j+1+offset, k);
-                }
-                else if (bflo[0])
-                {
-                    a(i,j,k) = a(i+1+offset, j, k);
-                }
-                else if (bflo[1])
-                {
-                    a(i,j,k) = a(i, j+1+offset, k);
-                }
-            }
-            // xhi & ylo
-            else if (i == dhi.x+1 && j == dlo.y-1 && (bfhi[0] || bflo[1]))
-            {
-                if (bfhi[0] && bflo[1])
-                {
-                    a(i,j,k) = a(i-1-offset, j+1+offset, k);
-                }
-                else if (bfhi[0])
-                {
-                    a(i,j,k) = a(i-1-offset, j, k);
-                }
-                else if (bflo[1])
-                {
-                    a(i,j,k) = a(i, j+1+offset, k);
-                }
-            }
-            // xlo & yhi
-            else if (i == dlo.x-1 && j == dhi.y+1 && (bflo[0] || bfhi[1]))
-            {
-                if (bflo[0] && bfhi[1])
-                {
-                    a(i,j,k) = a(i+1+offset, j-1-offset, k);
-                }
-                else if (bflo[0])
-                {
-                    a(i,j,k) = a(i+1+offset, j, k);
-                }
-                else if (bfhi[1])
-                {
-                    a(i,j,k) = a(i, j-1-offset, k);
-                }
-            }
-            // xhi & yhi
-            else if (i == dhi.x+1 && j == dhi.y+1 && (bfhi[0] || bfhi[1]))
-            {
-                if (bfhi[0] && bfhi[1])
-                {
-                    a(i,j,k) = a(i-1-offset, j-1-offset, k);
-                }
-                else if (bfhi[0])
-                {
-                    a(i,j,k) = a(i-1-offset, j, k);
-                }
-                else if (bfhi[1])
-                {
-                    a(i,j,k) = a(i, j-1-offset, k);
-                }
-            }
-            else if (i == dlo.x-1 && bflo[0])
-            {
-                a(i,j,k) = a(i+1+offset, j, k);
-            }
-            else if (i == dhi.x+1 && bfhi[0])
-            {
-                a(i,j,k) = a(i-1-offset, j, k);
-            }
-            else if (j == dlo.y-1 && bflo[1])
-            {
-                a(i,j,k) = a(i, j+1+offset, k);
-            }
-            else if (j == dhi.y+1 && bfhi[1])
-            {
-                a(i,j,k) = a(i, j-1-offset, k);
-            }
-        }
-    });
-}
-
 //
 // operator
 //
@@ -796,91 +565,6 @@ void mlndlap_gauss_seidel_with_line_solve_aa (Box const& bx, Array4<Real> const&
 
 }
 
-//
-// restriction
-//
-
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void mlndlap_restriction (int i, int j, int k, Array4<Real> const& crse,
-                          Array4<Real const> const& fine, Array4<int const> const& msk) noexcept
-{
-    int ii = i*2;
-    int jj = j*2;
-    int kk = 0;
-    if (msk(ii,jj,kk)) {
-        crse(i,j,k) = Real(0.0);
-    } else {
-        crse(i,j,k) = Real(1./16.)*(fine(ii-1,jj-1,kk) + Real(2.)*fine(ii  ,jj-1,kk) +          fine(ii+1,jj-1,kk)
-                         + Real(2.)*fine(ii-1,jj  ,kk) + Real(4.)*fine(ii  ,jj  ,kk) + Real(2.)*fine(ii+1,jj  ,kk)
-                                  + fine(ii-1,jj+1,kk) + Real(2.)*fine(ii  ,jj+1,kk) +          fine(ii+1,jj+1,kk));
-    }
-}
-
-template <int rr>
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void mlndlap_restriction (int i, int j, int k, Array4<Real> const& crse,
-                          Array4<Real const> const& fine, Array4<int const> const& msk,
-                          Box const& fdom,
-                          GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bclo,
-                          GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bchi) noexcept
-{
-    const int ii = i*rr;
-    const int jj = j*rr;
-    if (msk(ii,jj,0)) {
-        crse(i,j,k) = Real(0.0);
-    } else {
-        const auto ndlo = amrex::lbound(fdom);
-        const auto ndhi = amrex::ubound(fdom);
-        Real tmp = Real(0.0);
-        for (int joff = -rr+1; joff <= rr-1; ++joff) {
-            Real wy = rr - std::abs(joff);
-            for (int ioff = -rr+1; ioff <= rr-1; ++ioff) {
-                Real wx = rr - std::abs(ioff);
-                int itmp = ii + ioff;
-                int jtmp = jj + joff;
-                if ((itmp < ndlo.x && (bclo[0] == LinOpBCType::Neumann ||
-                                       bclo[0] == LinOpBCType::inflow)) ||
-                    (itmp > ndhi.x && (bchi[0] == LinOpBCType::Neumann ||
-                                       bchi[0] == LinOpBCType::inflow))) {
-                    itmp = ii - ioff;
-                }
-                if ((jtmp < ndlo.y && (bclo[1] == LinOpBCType::Neumann ||
-                                       bclo[1] == LinOpBCType::inflow)) ||
-                    (jtmp > ndhi.y && (bchi[1] == LinOpBCType::Neumann ||
-                                       bchi[1] == LinOpBCType::inflow))) {
-                    jtmp = jj - joff;
-                }
-                tmp += wx*wy*fine(itmp,jtmp,0);
-            }
-        }
-        crse(i,j,k) = tmp*(Real(1.0)/Real(rr*rr*rr*rr));
-    }
-}
-
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void mlndlap_semi_restriction (int i, int j, int k, Array4<Real> const& crse,
-                          Array4<Real const> const& fine, Array4<int const> const& msk, int idir) noexcept
-{
-    int kk = 0;
-    if (idir == 1) {
-        int ii = i*2;
-        int jj = j;
-        if (msk(ii,jj,kk)) {
-            crse(i,j,k) = Real(0.0);
-        } else {
-            crse(i,j,k) = Real(1./4.)*(fine(ii-1,jj,kk) + Real(2.)*fine(ii,jj,kk) + fine(ii+1,jj,kk));
-        }
-    } else if (idir == 0) {
-        int ii = i;
-        int jj = j*2;
-        if (msk(ii,jj,kk)) {
-            crse(i,j,k) = Real(0.0);
-        } else {
-            crse(i,j,k) = Real(1./4.)*(fine(ii,jj-1,kk) + Real(2.)*fine(ii,jj,kk) + fine(ii,jj+1,kk));
-        }
-    }
-}
-
 //
 // interpolation
 //
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_3D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_3D_K.H
index 5d31de0271..2ddcecfe37 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_3D_K.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_3D_K.H
@@ -4,177 +4,6 @@
 
 namespace amrex {
 
-//
-// masks
-//
-
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void mlndlap_set_nodal_mask (int i, int j, int k, Array4<int> const& nmsk,
-                             Array4<int const> const& cmsk) noexcept
-{
-    using namespace nodelap_detail;
-
-    int s = cmsk(i-1,j-1,k-1) + cmsk(i  ,j-1,k-1)
-        +   cmsk(i-1,j  ,k-1) + cmsk(i  ,j  ,k-1)
-        +   cmsk(i-1,j-1,k  ) + cmsk(i  ,j-1,k  )
-        +   cmsk(i-1,j  ,k  ) + cmsk(i  ,j  ,k  );
-    if (s == 8*crse_cell) {
-        nmsk(i,j,k) = crse_node;
-    }
-    else if (s == 8*fine_cell) {
-        nmsk(i,j,k) = fine_node;
-    } else {
-        nmsk(i,j,k) = crse_fine_node;
-    }
-}
-
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void mlndlap_set_dirichlet_mask (Box const& bx, Array4<int> const& dmsk,
-                                 Array4<int const> const& omsk, Box const& dom,
-                                 GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bclo,
-                                 GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bchi) noexcept
-{
-    const auto lo = amrex::lbound(bx);
-    const auto hi = amrex::ubound(bx);
-    for (int k = lo.z; k <= hi.z; ++k) {
-    for (int j = lo.y; j <= hi.y; ++j) {
-    AMREX_PRAGMA_SIMD
-    for (int i = lo.x; i <= hi.x; ++i) {
-        if (!dmsk(i,j,k)) {
-            dmsk(i,j,k) = (omsk(i-1,j-1,k-1) == 1 || omsk(i,j-1,k-1) == 1 ||
-                           omsk(i-1,j  ,k-1) == 1 || omsk(i,j  ,k-1) == 1 ||
-                           omsk(i-1,j-1,k  ) == 1 || omsk(i,j-1,k  ) == 1 ||
-                           omsk(i-1,j  ,k  ) == 1 || omsk(i,j  ,k  ) == 1);
-        }
-    }}}
-
-    const auto domlo = amrex::lbound(dom);
-    const auto domhi = amrex::ubound(dom);
-
-    if (bclo[0] == LinOpBCType::Dirichlet && lo.x == domlo.x) {
-        for (int k = lo.z; k <= hi.z; ++k) {
-        for (int j = lo.y; j <= hi.y; ++j) {
-            dmsk(lo.x,j,k) = 1;
-        }}
-    }
-
-    if (bchi[0] == LinOpBCType::Dirichlet && hi.x == domhi.x) {
-        for (int k = lo.z; k <= hi.z; ++k) {
-        for (int j = lo.y; j <= hi.y; ++j) {
-            dmsk(hi.x,j,k) = 1;
-        }}
-    }
-
-    if (bclo[1] == LinOpBCType::Dirichlet && lo.y == domlo.y) {
-        for (int k = lo.z; k <= hi.z; ++k) {
-        AMREX_PRAGMA_SIMD
-        for (int i = lo.x; i <= hi.x; ++i) {
-            dmsk(i,lo.y,k) = 1;
-        }}
-    }
-
-    if (bchi[1] == LinOpBCType::Dirichlet && hi.y == domhi.y) {
-        for (int k = lo.z; k <= hi.z; ++k) {
-        AMREX_PRAGMA_SIMD
-        for (int i = lo.x; i <= hi.x; ++i) {
-            dmsk(i,hi.y,k) = 1;
-        }}
-    }
-
-    if (bclo[2] == LinOpBCType::Dirichlet && lo.z == domlo.z) {
-        for (int j = lo.y; j <= hi.y; ++j) {
-        AMREX_PRAGMA_SIMD
-        for (int i = lo.x; i <= hi.x; ++i) {
-            dmsk(i,j,lo.z) = 1;
-        }}
-    }
-
-    if (bchi[2] == LinOpBCType::Dirichlet && hi.z == domhi.z) {
-        for (int j = lo.y; j <= hi.y; ++j) {
-        AMREX_PRAGMA_SIMD
-        for (int i = lo.x; i <= hi.x; ++i) {
-            dmsk(i,j,hi.z) = 1;
-        }}
-    }
-}
-
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void mlndlap_set_dot_mask (Box const& bx, Array4<Real> const& dmsk,
-                           Array4<int const> const& omsk, Box const& dom,
-                           GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bclo,
-                           GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bchi) noexcept
-{
-    const auto lo = amrex::lbound(bx);
-    const auto hi = amrex::ubound(bx);
-    for (int k = lo.z; k <= hi.z; ++k) {
-    for (int j = lo.y; j <= hi.y; ++j) {
-    AMREX_PRAGMA_SIMD
-    for (int i = lo.x; i <= hi.x; ++i) {
-        dmsk(i,j,k) = static_cast<Real>(omsk(i,j,k));
-    }}}
-
-    const auto domlo = amrex::lbound(dom);
-    const auto domhi = amrex::ubound(dom);
-
-    if ((bclo[0] == LinOpBCType::Neumann || bclo[0] == LinOpBCType::inflow)
-        && lo.x == domlo.x)
-    {
-        for (int k = lo.z; k <= hi.z; ++k) {
-        for (int j = lo.y; j <= hi.y; ++j) {
-            dmsk(lo.x,j,k) *= Real(0.5);
-        }}
-    }
-
-    if ((bchi[0] == LinOpBCType::Neumann || bchi[0] == LinOpBCType::inflow)
-        && hi.x == domhi.x)
-    {
-        for (int k = lo.z; k <= hi.z; ++k) {
-        for (int j = lo.y; j <= hi.y; ++j) {
-            dmsk(hi.x,j,k) *= Real(0.5);
-        }}
-    }
-
-    if ((bclo[1] == LinOpBCType::Neumann || bclo[1] == LinOpBCType::inflow)
-        && lo.y == domlo.y)
-    {
-        for (int k = lo.z; k <= hi.z; ++k) {
-        AMREX_PRAGMA_SIMD
-        for (int i = lo.x; i <= hi.x; ++i) {
-            dmsk(i,lo.y,k) *= Real(0.5);
-        }}
-    }
-
-    if ((bchi[1] == LinOpBCType::Neumann || bchi[1] == LinOpBCType::inflow)
-        && hi.y == domhi.y)
-    {
-        for (int k = lo.z; k <= hi.z; ++k) {
-        AMREX_PRAGMA_SIMD
-        for (int i = lo.x; i <= hi.x; ++i) {
-            dmsk(i,hi.y,k) *= Real(0.5);
-        }}
-    }
-
-    if ((bclo[2] == LinOpBCType::Neumann || bclo[2] == LinOpBCType::inflow)
-        && lo.z == domlo.z)
-    {
-        for (int j = lo.y; j <= hi.y; ++j) {
-        AMREX_PRAGMA_SIMD
-        for (int i = lo.x; i <= hi.x; ++i) {
-            dmsk(i,j,lo.z) *= Real(0.5);
-        }}
-    }
-
-    if ((bchi[2] == LinOpBCType::Neumann || bchi[2] == LinOpBCType::inflow)
-        && hi.z == domhi.z)
-    {
-        for (int j = lo.y; j <= hi.y; ++j) {
-        AMREX_PRAGMA_SIMD
-        for (int i = lo.x; i <= hi.x; ++i) {
-            dmsk(i,j,hi.z) *= Real(0.5);
-        }}
-    }
-}
-
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
 void mlndlap_zero_fine (int i, int j, int k, Array4<Real> const& phi,
                         Array4<int const> const& msk, int fine_flag) noexcept
@@ -249,507 +78,6 @@ void mlndlap_semi_avgdown_coeff (int i, int j, int k, Array4<Real> const& crse,
         crse(i,j,k) = cl*cr/(cl+cr);
     }
 }
-//
-// bc
-//
-
-template <typename T>
-inline void mlndlap_bc_doit (Box const& vbx, Array4<T> const& a, Box const& domain,
-                             GpuArray<bool,AMREX_SPACEDIM> const& bflo,
-                             GpuArray<bool,AMREX_SPACEDIM> const& bfhi) noexcept
-{
-    Box gdomain = domain;
-    for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
-        if (! bflo[idim]) { gdomain.growLo(idim,1); }
-        if (! bfhi[idim]) { gdomain.growHi(idim,1); }
-    }
-
-    if (gdomain.strictly_contains(vbx)) { return; }
-
-    const int offset = domain.cellCentered() ? 0 : 1;
-
-    const auto dlo = amrex::lbound(domain);
-    const auto dhi = amrex::ubound(domain);
-
-    Box const& sbox = amrex::grow(vbx,1);
-    AMREX_HOST_DEVICE_FOR_3D(sbox, i, j, k,
-    {
-        if (! gdomain.contains(IntVect(i,j,k))) {
-            // xlo & ylo & zlo
-            if (i == dlo.x-1 && j == dlo.y-1 && k == dlo.z-1 && (bflo[0] || bflo[1] || bflo[2]))
-            {
-                if (bflo[0] && bflo[1] && bflo[2])
-                {
-                    a(i,j,k) = a(i+1+offset, j+1+offset, k+1+offset);
-                }
-                else if (bflo[0] && bflo[1])
-                {
-                    a(i,j,k) = a(i+1+offset, j+1+offset, k);
-                }
-                else if (bflo[0] && bflo[2])
-                {
-                    a(i,j,k) = a(i+1+offset, j, k+1+offset);
-                }
-                else if (bflo[1] && bflo[2])
-                {
-                    a(i,j,k) = a(i, j+1+offset, k+1+offset);
-                }
-                else if (bflo[0])
-                {
-                    a(i,j,k) = a(i+1+offset, j, k);
-                }
-                else if (bflo[1])
-                {
-                    a(i,j,k) = a(i, j+1+offset, k);
-                }
-                else if (bflo[2])
-                {
-                    a(i,j,k) = a(i, j, k+1+offset);
-                }
-            }
-            // xhi & ylo & zlo
-            else if (i == dhi.x+1 && j == dlo.y-1 && k == dlo.z-1 && (bfhi[0] || bflo[1] || bflo[2]))
-            {
-                if (bfhi[0] && bflo[1] && bflo[2])
-                {
-                    a(i,j,k) = a(i-1-offset, j+1+offset, k+1+offset);
-                }
-                else if (bfhi[0] && bflo[1])
-                {
-                    a(i,j,k) = a(i-1-offset, j+1+offset, k);
-                }
-                else if (bfhi[0] && bflo[2])
-                {
-                    a(i,j,k) = a(i-1-offset, j, k+1+offset);
-                }
-                else if (bflo[1] && bflo[2])
-                {
-                    a(i,j,k) = a(i, j+1+offset, k+1+offset);
-                }
-                else if (bfhi[0])
-                {
-                    a(i,j,k) = a(i-1-offset, j, k);
-                }
-                else if (bflo[1])
-                {
-                    a(i,j,k) = a(i, j+1+offset, k);
-                }
-                else if (bflo[2])
-                {
-                    a(i,j,k) = a(i, j, k+1+offset);
-                }
-            }
-            // xlo & yhi & zlo
-            else if (i == dlo.x-1 && j == dhi.y+1 && k == dlo.z-1 && (bflo[0] || bfhi[1] || bflo[2]))
-            {
-                if (bflo[0] && bfhi[1] && bflo[2])
-                {
-                    a(i,j,k) = a(i+1+offset, j-1-offset, k+1+offset);
-                }
-                else if (bflo[0] && bfhi[1])
-                {
-                    a(i,j,k) = a(i+1+offset, j-1-offset, k);
-                }
-                else if (bflo[0] && bflo[2])
-                {
-                    a(i,j,k) = a(i+1+offset, j, k+1+offset);
-                }
-                else if (bfhi[1] && bflo[2])
-                {
-                    a(i,j,k) = a(i, j-1-offset, k+1+offset);
-                }
-                else if (bflo[0])
-                {
-                    a(i,j,k) = a(i+1+offset, j, k);
-                }
-                else if (bfhi[1])
-                {
-                    a(i,j,k) = a(i, j-1-offset, k);
-                }
-                else if (bflo[2])
-                {
-                    a(i,j,k) = a(i, j, k+1+offset);
-                }
-            }
-            // xhi & yhi & zlo
-            else if (i == dhi.x+1 && j == dhi.y+1 && k == dlo.z-1 && (bfhi[0] || bfhi[1] || bflo[2]))
-            {
-                if (bfhi[0] && bfhi[1] && bflo[2])
-                {
-                    a(i,j,k) = a(i-1-offset, j-1-offset, k+1+offset);
-                }
-                else if (bfhi[0] && bfhi[1])
-                {
-                    a(i,j,k) = a(i-1-offset, j-1-offset, k);
-                }
-                else if (bfhi[0] && bflo[2])
-                {
-                    a(i,j,k) = a(i-1-offset, j, k+1+offset);
-                }
-                else if (bfhi[1] && bflo[2])
-                {
-                    a(i,j,k) = a(i, j-1-offset, k+1+offset);
-                }
-                else if (bfhi[0])
-                {
-                    a(i,j,k) = a(i-1-offset, j, k);
-                }
-                else if (bfhi[1])
-                {
-                    a(i,j,k) = a(i, j-1-offset, k);
-                }
-                else if (bflo[2])
-                {
-                    a(i,j,k) = a(i, j, k+1+offset);
-                }
-            }
-            // xlo & ylo & zhi
-            else if (i == dlo.x-1 && j == dlo.y-1 && k == dhi.z+1 && (bflo[0] || bflo[1] || bfhi[2]))
-            {
-                if (bflo[0] && bflo[1] && bfhi[2])
-                {
-                    a(i,j,k) = a(i+1+offset, j+1+offset, k-1-offset);
-                }
-                else if (bflo[0] && bflo[1])
-                {
-                    a(i,j,k) = a(i+1+offset, j+1+offset, k);
-                }
-                else if (bflo[0] && bfhi[2])
-                {
-                    a(i,j,k) = a(i+1+offset, j, k-1-offset);
-                }
-                else if (bflo[1] && bfhi[2])
-                {
-                    a(i,j,k) = a(i, j+1+offset, k-1-offset);
-                }
-                else if (bflo[0])
-                {
-                    a(i,j,k) = a(i+1+offset, j, k);
-                }
-                else if (bflo[1])
-                {
-                    a(i,j,k) = a(i, j+1+offset, k);
-                }
-                else if (bfhi[2])
-                {
-                    a(i,j,k) = a(i, j, k-1-offset);
-                }
-            }
-            // xhi & ylo & zhi
-            else if (i == dhi.x+1 && j == dlo.y-1 && k == dhi.z+1 && (bfhi[0] || bflo[1] || bfhi[2]))
-            {
-                if (bfhi[0] && bflo[1] && bfhi[2])
-                {
-                    a(i,j,k) = a(i-1-offset, j+1+offset, k-1-offset);
-                }
-                else if (bfhi[0] && bflo[1])
-                {
-                    a(i,j,k) = a(i-1-offset, j+1+offset, k);
-                }
-                else if (bfhi[0] && bfhi[2])
-                {
-                    a(i,j,k) = a(i-1-offset, j, k-1-offset);
-                }
-                else if (bflo[1] && bfhi[2])
-                {
-                    a(i,j,k) = a(i, j+1+offset, k-1-offset);
-                }
-                else if (bfhi[0])
-                {
-                    a(i,j,k) = a(i-1-offset, j, k);
-                }
-                else if (bflo[1])
-                {
-                    a(i,j,k) = a(i, j+1+offset, k);
-                }
-                else if (bfhi[2])
-                {
-                    a(i,j,k) = a(i, j, k-1-offset);
-                }
-            }
-            // xlo & yhi & zhi
-            else if (i == dlo.x-1 && j == dhi.y+1 && k == dhi.z+1 && (bflo[0] || bfhi[1] || bfhi[2]))
-            {
-                if (bflo[0] && bfhi[1] && bfhi[2])
-                {
-                    a(i,j,k) = a(i+1+offset, j-1-offset, k-1-offset);
-                }
-                else if (bflo[0] && bfhi[1])
-                {
-                    a(i,j,k) = a(i+1+offset, j-1-offset, k);
-                }
-                else if (bflo[0] && bfhi[2])
-                {
-                    a(i,j,k) = a(i+1+offset, j, k-1-offset);
-                }
-                else if (bfhi[1] && bfhi[2])
-                {
-                    a(i,j,k) = a(i, j-1-offset, k-1-offset);
-                }
-                else if (bflo[0])
-                {
-                    a(i,j,k) = a(i+1+offset, j, k);
-                }
-                else if (bfhi[1])
-                {
-                    a(i,j,k) = a(i, j-1-offset, k);
-                }
-                else if (bfhi[2])
-                {
-                    a(i,j,k) = a(i, j, k-1-offset);
-                }
-            }
-            // xhi & yhi & zhi
-            else if (i == dhi.x+1 && j == dhi.y+1 && k == dhi.z+1 && (bfhi[0] || bfhi[1] || bfhi[2]))
-            {
-                if (bfhi[0] && bfhi[1] && bfhi[2])
-                {
-                    a(i,j,k) = a(i-1-offset, j-1-offset, k-1-offset);
-                }
-                else if (bfhi[0] && bfhi[1])
-                {
-                    a(i,j,k) = a(i-1-offset, j-1-offset, k);
-                }
-                else if (bfhi[0] && bfhi[2])
-                {
-                    a(i,j,k) = a(i-1-offset, j, k-1-offset);
-                }
-                else if (bfhi[1] && bfhi[2])
-                {
-                    a(i,j,k) = a(i, j-1-offset, k-1-offset);
-                }
-                else if (bfhi[0])
-                {
-                    a(i,j,k) = a(i-1-offset, j, k);
-                }
-                else if (bfhi[1])
-                {
-                    a(i,j,k) = a(i, j-1-offset, k);
-                }
-                else if (bfhi[2])
-                {
-                    a(i,j,k) = a(i, j, k-1-offset);
-                }
-            }
-            // xlo & ylo
-            else if (i == dlo.x-1 && j == dlo.y-1 && (bflo[0] || bflo[1]))
-            {
-                if (bflo[0] && bflo[1])
-                {
-                    a(i,j,k) = a(i+1+offset, j+1+offset, k);
-                }
-                else if (bflo[0])
-                {
-                    a(i,j,k) = a(i+1+offset, j, k);
-                }
-                else if (bflo[1])
-                {
-                    a(i,j,k) = a(i, j+1+offset, k);
-                }
-            }
-            // xhi & ylo
-            else if (i == dhi.x+1 && j == dlo.y-1 && (bfhi[0] || bflo[1]))
-            {
-                if (bfhi[0] && bflo[1])
-                {
-                    a(i,j,k) = a(i-1-offset, j+1+offset, k);
-                }
-                else if (bfhi[0])
-                {
-                    a(i,j,k) = a(i-1-offset, j, k);
-                }
-                else if (bflo[1])
-                {
-                    a(i,j,k) = a(i, j+1+offset, k);
-                }
-            }
-            // xlo & yhi
-            else if (i == dlo.x-1 && j == dhi.y+1 && (bflo[0] || bfhi[1]))
-            {
-                if (bflo[0] && bfhi[1])
-                {
-                    a(i,j,k) = a(i+1+offset, j-1-offset, k);
-                }
-                else if (bflo[0])
-                {
-                    a(i,j,k) = a(i+1+offset, j, k);
-                }
-                else if (bfhi[1])
-                {
-                    a(i,j,k) = a(i, j-1-offset, k);
-                }
-            }
-            // xhi & yhi
-            else if (i == dhi.x+1 && j == dhi.y+1 && (bfhi[0] || bfhi[1]))
-            {
-                if (bfhi[0] && bfhi[1])
-                {
-                    a(i,j,k) = a(i-1-offset, j-1-offset, k);
-                }
-                else if (bfhi[0])
-                {
-                    a(i,j,k) = a(i-1-offset, j, k);
-                }
-                else if (bfhi[1])
-                {
-                    a(i,j,k) = a(i, j-1-offset, k);
-                }
-            }
-            // xlo & zlo
-            else if (i == dlo.x-1 && k == dlo.z-1 && (bflo[0] || bflo[2]))
-            {
-                if (bflo[0] && bflo[2])
-                {
-                    a(i,j,k) = a(i+1+offset, j, k+1+offset);
-                }
-                else if (bflo[0])
-                {
-                    a(i,j,k) = a(i+1+offset, j, k);
-                }
-                else if (bflo[2])
-                {
-                    a(i,j,k) = a(i, j, k+1+offset);
-                }
-            }
-            // xhi & zlo
-            else if (i == dhi.x+1 && k == dlo.z-1 && (bfhi[0] || bflo[2]))
-            {
-                if (bfhi[0] && bflo[2])
-                {
-                    a(i,j,k) = a(i-1-offset, j, k+1+offset);
-                }
-                else if (bfhi[0])
-                {
-                    a(i,j,k) = a(i-1-offset, j, k);
-                }
-                else if (bflo[2])
-                {
-                    a(i,j,k) = a(i, j, k+1+offset);
-                }
-            }
-            // xlo & zhi
-            else if (i == dlo.x-1 && k == dhi.z+1 && (bflo[0] || bfhi[2]))
-            {
-                if (bflo[0] && bfhi[2])
-                {
-                    a(i,j,k) = a(i+1+offset, j, k-1-offset);
-                }
-                else if (bflo[0])
-                {
-                    a(i,j,k) = a(i+1+offset, j, k);
-                }
-                else if (bfhi[2])
-                {
-                    a(i,j,k) = a(i, j, k-1-offset);
-                }
-            }
-            // xhi & zhi
-            else if (i == dhi.x+1 && k == dhi.z+1 && (bfhi[0] || bfhi[2]))
-            {
-                if (bfhi[0] && bfhi[2])
-                {
-                    a(i,j,k) = a(i-1-offset, j, k-1-offset);
-                }
-                else if (bfhi[0])
-                {
-                    a(i,j,k) = a(i-1-offset, j, k);
-                }
-                else if (bfhi[2])
-                {
-                    a(i,j,k) = a(i, j, k-1-offset);
-                }
-            }
-            // ylo & zlo
-            else if (j == dlo.y-1 && k == dlo.z-1 && (bflo[1] || bflo[2]))
-            {
-                if (bflo[1] && bflo[2])
-                {
-                    a(i,j,k) = a(i, j+1+offset, k+1+offset);
-                }
-                else if (bflo[1])
-                {
-                    a(i,j,k) = a(i, j+1+offset, k);
-                }
-                else if (bflo[2])
-                {
-                    a(i,j,k) = a(i, j, k+1+offset);
-                }
-            }
-            // yhi & zlo
-            else if (j == dhi.y+1 && k == dlo.z-1 && (bfhi[1] || bflo[2]))
-            {
-                if (bfhi[1] && bflo[2])
-                {
-                    a(i,j,k) = a(i, j-1-offset, k+1+offset);
-                }
-                else if (bfhi[1])
-                {
-                    a(i,j,k) = a(i, j-1-offset, k);
-                }
-                else if (bflo[2])
-                {
-                    a(i,j,k) = a(i, j, k+1+offset);
-                }
-            }
-            // ylo & zhi
-            else if (j == dlo.y-1 && k == dhi.z+1 && (bflo[1] || bfhi[2]))
-            {
-                if (bflo[1] && bfhi[2])
-                {
-                    a(i,j,k) = a(i, j+1+offset, k-1-offset);
-                }
-                else if (bflo[1])
-                {
-                    a(i,j,k) = a(i, j+1+offset, k);
-                }
-                else if (bfhi[2])
-                {
-                    a(i,j,k) = a(i, j, k-1-offset);
-                }
-            }
-            // yhi & zhi
-            else if (j == dhi.y+1 && k == dhi.z+1 && (bfhi[1] || bfhi[2]))
-            {
-                if (bfhi[1] && bfhi[2])
-                {
-                    a(i,j,k) = a(i, j-1-offset, k-1-offset);
-                }
-                else if (bfhi[1])
-                {
-                    a(i,j,k) = a(i, j-1-offset, k);
-                }
-                else if (bfhi[2])
-                {
-                    a(i,j,k) = a(i, j, k-1-offset);
-                }
-            }
-            else if (i == dlo.x-1 && bflo[0])
-            {
-                a(i,j,k) = a(i+1+offset, j, k);
-            }
-            else if (i == dhi.x+1 && bfhi[0])
-            {
-                a(i,j,k) = a(i-1-offset, j, k);
-            }
-            else if (j == dlo.y-1 && bflo[1])
-            {
-                a(i,j,k) = a(i, j+1+offset, k);
-            }
-            else if (j == dhi.y+1 && bfhi[1])
-            {
-                a(i,j,k) = a(i, j-1-offset, k);
-            }
-            else if (k == dlo.z-1 && bflo[2])
-            {
-                a(i,j,k) = a(i, j, k+1+offset);
-            }
-            else if (k == dhi.z+1 && bfhi[2])
-            {
-                a(i,j,k) = a(i, j, k-1-offset);
-            }
-        }
-    });
-}
 
 //
 // operator
@@ -1587,138 +915,6 @@ void mlndlap_gauss_seidel_with_line_solve_aa (Box const& bx, Array4<Real> const&
     }
 }
 
-//
-// restriction
-//
-
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void mlndlap_restriction (int i, int j, int k, Array4<Real> const& crse,
-                          Array4<Real const> const& fine, Array4<int const> const& msk) noexcept
-{
-    int ii = i*2;
-    int jj = j*2;
-    int kk = k*2;
-    if (msk(ii,jj,kk)) {
-        crse(i,j,k) = Real(0.0);
-    } else {
-        crse(i,j,k) = Real(1./64.)*(fine(ii-1,jj-1,kk-1)+fine(ii+1,jj-1,kk-1)
-                                   +fine(ii-1,jj+1,kk-1)+fine(ii+1,jj+1,kk-1)
-                                   +fine(ii-1,jj-1,kk+1)+fine(ii+1,jj-1,kk+1)
-                                   +fine(ii-1,jj+1,kk+1)+fine(ii+1,jj+1,kk+1))
-                    + Real(1./32.)*(fine(ii  ,jj-1,kk-1)+fine(ii  ,jj+1,kk-1)
-                                   +fine(ii  ,jj-1,kk+1)+fine(ii  ,jj+1,kk+1)
-                                   +fine(ii-1,jj  ,kk-1)+fine(ii+1,jj  ,kk-1)
-                                   +fine(ii-1,jj  ,kk+1)+fine(ii+1,jj  ,kk+1)
-                                   +fine(ii-1,jj-1,kk  )+fine(ii+1,jj-1,kk  )
-                                   +fine(ii-1,jj+1,kk  )+fine(ii+1,jj+1,kk  ))
-                    + Real(1./16.)*(fine(ii-1,jj,kk)+fine(ii+1,jj,kk)
-                                   +fine(ii,jj-1,kk)+fine(ii,jj+1,kk)
-                                   +fine(ii,jj,kk-1)+fine(ii,jj,kk+1))
-                      + Real(1./8.)*fine(ii,jj,kk);
-    }
-}
-
-template <int rr>
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void mlndlap_restriction (int i, int j, int k, Array4<Real> const& crse,
-                          Array4<Real const> const& fine, Array4<int const> const& msk,
-                          Box const& fdom,
-                          GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bclo,
-                          GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bchi) noexcept
-{
-    const int ii = i*rr;
-    const int jj = j*rr;
-    const int kk = k*rr;
-    if (msk(ii,jj,kk)) {
-        crse(i,j,k) = Real(0.0);
-    } else {
-        const auto ndlo = amrex::lbound(fdom);
-        const auto ndhi = amrex::ubound(fdom);
-        Real tmp = Real(0.0);
-        for (int koff = -rr+1; koff <= rr-1; ++koff) {
-            Real wz = rr - std::abs(koff);
-            for (int joff = -rr+1; joff <= rr-1; ++joff) {
-                Real wy = rr - std::abs(joff);
-                for (int ioff = -rr+1; ioff <= rr-1; ++ioff) {
-                    Real wx = rr - std::abs(ioff);
-                    int itmp = ii + ioff;
-                    int jtmp = jj + joff;
-                    int ktmp = kk + koff;
-                    if ((itmp < ndlo.x && (bclo[0] == LinOpBCType::Neumann ||
-                                           bclo[0] == LinOpBCType::inflow)) ||
-                        (itmp > ndhi.x && (bchi[0] == LinOpBCType::Neumann ||
-                                           bchi[0] == LinOpBCType::inflow))) {
-                        itmp = ii - ioff;
-                    }
-                    if ((jtmp < ndlo.y && (bclo[1] == LinOpBCType::Neumann ||
-                                           bclo[1] == LinOpBCType::inflow)) ||
-                        (jtmp > ndhi.y && (bchi[1] == LinOpBCType::Neumann ||
-                                           bchi[1] == LinOpBCType::inflow))) {
-                        jtmp = jj - joff;
-                    }
-                    if ((ktmp < ndlo.z && (bclo[2] == LinOpBCType::Neumann ||
-                                           bclo[2] == LinOpBCType::inflow)) ||
-                        (ktmp > ndhi.z && (bchi[2] == LinOpBCType::Neumann ||
-                                           bchi[2] == LinOpBCType::inflow))) {
-                        ktmp = kk - koff;
-                    }
-                    tmp += wx*wy*wz*fine(itmp,jtmp,ktmp);
-                }
-            }
-        }
-        crse(i,j,k) = tmp/Real(rr*rr*rr*rr*rr*rr);
-    }
-}
-
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-void mlndlap_semi_restriction (int i, int j, int k, Array4<Real> const& crse,
-                               Array4<Real const> const& fine, Array4<int const> const& msk, int idir) noexcept
-{
-    if (idir == 2)
-    {
-        int ii = i*2;
-        int jj = j*2;
-        int kk = k;
-        if (msk(ii,jj,kk)) {
-            crse(i,j,k) = Real(0.0);
-        } else { // use 2-D formula
-            crse(i,j,k) = Real(1./16.)*(fine(ii-1,jj-1,kk) + Real(2.)*fine(ii  ,jj-1,kk) +          fine(ii+1,jj-1,kk)
-                             + Real(2.)*fine(ii-1,jj  ,kk) + Real(4.)*fine(ii  ,jj  ,kk) + Real(2.)*fine(ii+1,jj  ,kk)
-                                      + fine(ii-1,jj+1,kk) + Real(2.)*fine(ii  ,jj+1,kk) +          fine(ii+1,jj+1,kk));
-        }
-    }
-    else if (idir == 1)
-    {
-        int ii = i*2;
-        int jj = j;
-        int kk = k*2;
-        if (msk(ii,jj,kk)) {
-            crse(i,j,k) = Real(0.0);
-        } else { // use 2-D formula
-            crse(i,j,k) = Real(1./16.)*(fine(ii-1,jj,kk-1) + Real(2.)*fine(ii  ,jj,kk-1) +          fine(ii+1,jj,kk-1)
-                             + Real(2.)*fine(ii-1,jj  ,kk) + Real(4.)*fine(ii  ,jj,kk  ) + Real(2.)*fine(ii+1,jj,kk  )
-                                      + fine(ii-1,jj,kk+1) + Real(2.)*fine(ii  ,jj,kk+1) +          fine(ii+1,jj,kk+1));
-        }
-    }
-    else if (idir == 0)
-    {
-        int ii = i;
-        int jj = j*2;
-        int kk = k*2;
-        if (msk(ii,jj,kk)) {
-            crse(i,j,k) = Real(0.0);
-        } else { // use 2-D formula
-            crse(i,j,k) = Real(1./16.)*(fine(ii,jj-1,kk-1) + Real(2.)*fine(ii  ,jj,kk-1) +          fine(ii,jj+1,kk-1)
-                             + Real(2.)*fine(ii,jj-1  ,kk) + Real(4.)*fine(ii  ,jj,kk  ) + Real(2.)*fine(ii,jj+1,kk  )
-                                      + fine(ii,jj-1,kk+1) + Real(2.)*fine(ii  ,jj,kk+1) +          fine(ii,jj+1,kk+1));
-        }
-    }
-    else
-    {
-        amrex::Abort("mlndlap_semi_restriction semi direction wrong semi-direction. ");
-    }
-}
-
 //
 // interpolation
 //
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_K.H
index 4e76a48689..97f8e07817 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_K.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_K.H
@@ -7,57 +7,12 @@
 #ifdef AMREX_USE_EB
 #include <AMReX_EBCellFlag.H>
 #endif
+#include <AMReX_MLNodeLinOp_K.H>
 
 namespace amrex {
 
 namespace nodelap_detail {
 
-    struct GetNode {
-        AMREX_GPU_DEVICE Dim3 operator() (Dim3 const& lo, Dim3 const& len, int& offset)
-        {
-            Dim3 node;
-            constexpr int nsten = AMREX_D_TERM(3,*3,*3);
-            int icell = offset / nsten;
-            node.z =  icell /        (len.x*len.y);
-            node.y = (icell - node.z*(len.x*len.y)) /        len.x;
-            node.x = (icell - node.z*(len.x*len.y)) - node.y*len.x;
-            node.x += lo.x;
-            node.y += lo.y;
-            node.z += lo.z;
-            offset -= icell*nsten;
-            return node;
-        }
-    };
-
-    struct GetNode2 {
-        AMREX_GPU_DEVICE Dim3 operator() (int offset, Dim3 const& node)
-        {
-            // In 2D the offsets are
-            //   6 7 8
-            //   4 0 5
-            //   1 2 3
-            constexpr int nstenhalf = AMREX_SPACEDIM == 2 ? 4 : 13;
-            if (offset == 0) {
-                return node;
-            } else {
-                if (offset <= nstenhalf) { --offset; }
-                Dim3 node2;
-                node2.z = offset / 9;
-                node2.y = (offset - node2.z*9) / 3;
-                node2.x = (offset - node2.z*9) - node2.y*3;
-                AMREX_D_TERM(node2.x += node.x-1;,
-                             node2.y += node.y-1;,
-                             node2.z += node.z-1);
-                return node2;
-            }
-        }
-    };
-
-    constexpr int crse_cell = 0; // Do NOT change the values
-    constexpr int fine_cell = 1;
-    constexpr int crse_node = 0;
-    constexpr int crse_fine_node = 1;
-    constexpr int fine_node = 2;
 #if (BL_USE_FLOAT)
     constexpr float eps = 1.e-30_rt;
 #else
@@ -123,40 +78,6 @@ mlndlap_unimpose_neumann_bc (Box const& bx, Array4<Real> const& rhs, Box const&
 
 namespace amrex {
 
-template <typename T>
-void mlndlap_fillbc_cc (Box const& vbx, Array4<T> const& sigma, Box const& domain,
-                        GpuArray<LinOpBCType, AMREX_SPACEDIM> bclo,
-                        GpuArray<LinOpBCType, AMREX_SPACEDIM> bchi) noexcept
-{
-    GpuArray<bool,AMREX_SPACEDIM> bflo{{AMREX_D_DECL(bclo[0] != LinOpBCType::Periodic,
-                                                     bclo[1] != LinOpBCType::Periodic,
-                                                     bclo[2] != LinOpBCType::Periodic)}};
-    GpuArray<bool,AMREX_SPACEDIM> bfhi{{AMREX_D_DECL(bchi[0] != LinOpBCType::Periodic,
-                                                     bchi[1] != LinOpBCType::Periodic,
-                                                     bchi[2] != LinOpBCType::Periodic)}};
-    mlndlap_bc_doit(vbx, sigma, domain, bflo, bfhi);
-}
-
-template <typename T>
-void mlndlap_applybc (Box const& vbx, Array4<T> const& phi, Box const& domain,
-                      GpuArray<LinOpBCType, AMREX_SPACEDIM> bclo,
-                      GpuArray<LinOpBCType, AMREX_SPACEDIM> bchi) noexcept
-{
-    GpuArray<bool,AMREX_SPACEDIM> bflo{{AMREX_D_DECL(bclo[0] == LinOpBCType::Neumann ||
-                                                     bclo[0] == LinOpBCType::inflow,
-                                                     bclo[1] == LinOpBCType::Neumann ||
-                                                     bclo[1] == LinOpBCType::inflow,
-                                                     bclo[2] == LinOpBCType::Neumann ||
-                                                     bclo[2] == LinOpBCType::inflow)}};
-    GpuArray<bool,AMREX_SPACEDIM> bfhi{{AMREX_D_DECL(bchi[0] == LinOpBCType::Neumann ||
-                                                     bchi[0] == LinOpBCType::inflow,
-                                                     bchi[1] == LinOpBCType::Neumann ||
-                                                     bchi[1] == LinOpBCType::inflow,
-                                                     bchi[2] == LinOpBCType::Neumann ||
-                                                     bchi[2] == LinOpBCType::inflow)}};
-    mlndlap_bc_doit(vbx, phi, domain, bflo, bfhi);
-}
-
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
 void mlndlap_normalize_sten (int i, int j, int k, Array4<Real> const& x,
                              Array4<Real const> const& sten,
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp
index 38f58b70bb..929d05dc5a 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp
@@ -1,6 +1,6 @@
 
 #include <AMReX_MLNodeLinOp.H>
-#include <AMReX_MLNodeLap_K.H>
+#include <AMReX_MLNodeLinOp_K.H>
 #include <AMReX_MLMG_K.H>
 #include <AMReX_MultiFabUtil.H>
 
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_1D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_1D_K.H
new file mode 100644
index 0000000000..b842dd81b8
--- /dev/null
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_1D_K.H
@@ -0,0 +1,167 @@
+#ifndef AMREX_ML_NODE_LINOP_1D_K_H_
+#define AMREX_ML_NODE_LINOP_1D_K_H_
+#include <AMReX_Config.H>
+
+namespace amrex {
+
+template <typename T>
+void mlndlap_bc_doit (Box const& vbx, Array4<T> const& a, Box const& domain,
+                      GpuArray<bool,AMREX_SPACEDIM> const& bflo,
+                      GpuArray<bool,AMREX_SPACEDIM> const& bfhi) noexcept
+{
+    // 1D ghost fill: on each flagged side, the entry one index outside `domain`
+    // is overwritten with the entry `shift` indices further toward the interior.
+    // Sides whose flag is false are grown by one and therefore count as inside.
+    Box gdomain = domain;
+    if (! bflo[0]) { gdomain.growLo(0,1); }
+    if (! bfhi[0]) { gdomain.growHi(0,1); }
+
+    // No work when the grown domain strictly contains vbx.
+    if (gdomain.strictly_contains(vbx)) { return; }
+
+    // Source distance: 1 for a cell-centered domain, 2 otherwise.
+    const int shift = domain.cellCentered() ? 1 : 2;
+    const auto below = domain.smallEnd(0) - 1; // index just under the domain
+    const auto above = domain.bigEnd(0) + 1;   // index just over the domain
+
+    Box const& sbox = amrex::grow(vbx,1);
+    AMREX_HOST_DEVICE_FOR_3D(sbox, i, j, k,
+    {
+        if (gdomain.contains(IntVect(i))) {
+            // inside the grown domain: leave the entry untouched
+        } else if (i == below && bflo[0]) {
+            a(i,0,0) = a(i+shift, j, k);
+        } else if (i == above && bfhi[0]) {
+            a(i,0,0) = a(i-shift, j, k);
+        }
+    });
+}
+
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+void mlndlap_restriction (int i, int, int, Array4<Real> const& crse,
+                          Array4<Real const> const& fine, Array4<int const> const& msk) noexcept
+{
+    // 1D restriction with ratio 2: coarse node i sits on fine node 2*i.
+    // A nonzero mask on that fine node zeroes the coarse value; otherwise the
+    // coarse value is the (1,2,1)/4 weighted sum of the three nearest fine nodes.
+    const int fc = 2*i;
+    const bool masked = (msk(fc,0,0) != 0);
+    const Real wsum = masked ? Real(0.0)
+        : Real(1./4.)*(fine(fc-1,0,0) + Real(2.)*fine(fc,0,0) + fine(fc+1,0,0));
+    crse(i,0,0) = wsum;
+}
+
+template <int rr>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+void mlndlap_restriction (int i, int, int, Array4<Real> const& crse,
+                          Array4<Real const> const& fine, Array4<int const> const& msk,
+                          Box const& fdom,
+                          GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bclo,
+                          GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bchi) noexcept
+{
+    // 1D restriction for ratio rr: coarse node i sits on fine node i*rr. Fine nodes
+    // at offsets -(rr-1)..(rr-1) are summed with weights rr-|offset|, scaled by 1/rr^2.
+    // A sample outside fdom on a Neumann/inflow side uses the opposite offset instead.
+    const int ic = i*rr;
+    if (msk(ic,0,0)) {
+        crse(i,0,0) = Real(0.0);
+        return;
+    }
+    const bool fold_lo = (bclo[0] == LinOpBCType::Neumann || bclo[0] == LinOpBCType::inflow);
+    const bool fold_hi = (bchi[0] == LinOpBCType::Neumann || bchi[0] == LinOpBCType::inflow);
+    const auto xlo = fdom.smallEnd(0);
+    const auto xhi = fdom.bigEnd(0);
+
+    Real acc = Real(0.0);
+    for (int d = 1-rr; d < rr; ++d) {
+        const Real w = rr - std::abs(d);
+        const int is = ic + d;
+        const bool folded = (is < xlo && fold_lo) || (is > xhi && fold_hi);
+        acc += w*fine(folded ? ic - d : is,0,0);
+    }
+    crse(i,0,0) = acc*(Real(1.0)/Real(rr*rr));
+}
+
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+void mlndlap_semi_restriction (int /*i*/, int /*j*/, int /*k*/, Array4<Real> const&,
+                          Array4<Real const> const&, Array4<int const> const&, int) noexcept
+{ // 1D placeholder taking the same parameter list as the 2D version; all arguments are ignored.
+    amrex::Abort("mlndlap_semi_restriction: not implemented in 1D"); // always aborts
+}
+
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+void mlndlap_set_nodal_mask (int i, int, int, Array4<int> const& nmsk,
+                             Array4<int const> const& cmsk) noexcept
+{
+    // Classify node i from the two cmsk entries at i-1 and i: a sum equal to
+    // 2*crse_cell gives crse_node, a sum equal to 2*fine_cell gives fine_node,
+    // and any other sum gives crse_fine_node.
+    using namespace nodelap_detail;
+
+    const int total = cmsk(i-1,0,0) + cmsk(i,0,0);
+    int kind = crse_fine_node;
+    if      (total == 2*crse_cell) { kind = crse_node; }
+    else if (total == 2*fine_cell) { kind = fine_node; }
+    nmsk(i,0,0) = kind;
+}
+
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+void mlndlap_set_dirichlet_mask (Box const& bx, Array4<int> const& dmsk,
+                                 Array4<int const> const& omsk, Box const& dom,
+                                 GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bclo,
+                                 GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bchi) noexcept
+{
+    // Pass 1: every node of bx whose mask is still zero becomes 1 when either
+    // adjacent omsk entry (i-1 or i) equals 1, and stays 0 otherwise.
+    // Pass 2: an end node of bx that coincides with the matching end of dom is
+    // forced to 1 when that side's boundary type is Dirichlet.
+    const auto first = bx.smallEnd(0);
+    const auto last  = bx.bigEnd(0);
+
+    AMREX_PRAGMA_SIMD
+    for (int n = first; n <= last; ++n) {
+        if (dmsk(n,0,0) == 0) {
+            const bool touched = (omsk(n-1,0,0) == 1) || (omsk(n,0,0) == 1);
+            dmsk(n,0,0) = touched ? 1 : 0;
+        }
+    }
+
+    const bool at_lo = (first == dom.smallEnd(0));
+    const bool at_hi = (last  == dom.bigEnd(0));
+    if (at_lo && bclo[0] == LinOpBCType::Dirichlet) { dmsk(first,0,0) = 1; }
+    if (at_hi && bchi[0] == LinOpBCType::Dirichlet) { dmsk(last,0,0) = 1; }
+}
+
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+void mlndlap_set_dot_mask (Box const& bx, Array4<Real> const& dmsk,
+                           Array4<int const> const& omsk, Box const& dom,
+                           GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bclo,
+                           GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bchi) noexcept
+{
+    // Start from a Real copy of omsk over bx, then halve the entry at an end of
+    // bx that coincides with the matching end of dom when that side's boundary
+    // type is Neumann or inflow.
+    const auto first = bx.smallEnd(0);
+    const auto last  = bx.bigEnd(0);
+
+    AMREX_PRAGMA_SIMD
+    for (int n = first; n <= last; ++n) {
+        dmsk(n,0,0) = static_cast<Real>(omsk(n,0,0));
+    }
+
+    const bool lo_neumann_like = (bclo[0] == LinOpBCType::Neumann ||
+                                  bclo[0] == LinOpBCType::inflow);
+    const bool hi_neumann_like = (bchi[0] == LinOpBCType::Neumann ||
+                                  bchi[0] == LinOpBCType::inflow);
+
+    if (lo_neumann_like && first == dom.smallEnd(0)) {
+        dmsk(first,0,0) *= Real(0.5);
+    }
+    if (hi_neumann_like && last == dom.bigEnd(0)) {
+        dmsk(last,0,0) *= Real(0.5);
+    }
+}
+
+}
+
+#endif
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_2D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_2D_K.H
new file mode 100644
index 0000000000..3d8746cf05
--- /dev/null
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_2D_K.H
@@ -0,0 +1,317 @@
+#ifndef AMREX_ML_NODE_LINOP_2D_K_H_
+#define AMREX_ML_NODE_LINOP_2D_K_H_
+#include <AMReX_Config.H>
+
+namespace amrex {
+
+template <typename T>
+void mlndlap_bc_doit (Box const& vbx, Array4<T> const& a, Box const& domain,
+                      GpuArray<bool,AMREX_SPACEDIM> const& bflo,
+                      GpuArray<bool,AMREX_SPACEDIM> const& bfhi) noexcept
+{ // 2D ghost fill: entries of `a` one index outside `domain` on flagged sides are copied from interior-side entries.
+    Box gdomain = domain; // domain grown by one on every side whose flag is false
+    for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+        if (! bflo[idim]) { gdomain.growLo(idim,1); }
+        if (! bfhi[idim]) { gdomain.growHi(idim,1); }
+    }
+
+    if (gdomain.strictly_contains(vbx)) { return; } // early exit: nothing is written
+
+    const int offset = domain.cellCentered() ? 0 : 1; // extra source shift when the domain is not cell-centered
+
+    const auto dlo = amrex::lbound(domain);
+    const auto dhi = amrex::ubound(domain);
+
+    Box const& sbox = amrex::grow(vbx,1); // loop region: vbx grown by one in every direction
+    AMREX_HOST_DEVICE_FOR_3D(sbox, i, j, k,
+    {
+        if (! gdomain.contains(IntVect(i,j))) { // only points outside the grown domain are candidates
+            // xlo & ylo corner (corners are tested before the single-side cases below);
+            // the source index is shifted in every direction whose flag is set
+            if (i == dlo.x-1 && j == dlo.y-1 && (bflo[0] || bflo[1]))
+            {
+                if (bflo[0] && bflo[1])
+                {
+                    a(i,j,k) = a(i+1+offset, j+1+offset, k);
+                }
+                else if (bflo[0])
+                {
+                    a(i,j,k) = a(i+1+offset, j, k);
+                }
+                else if (bflo[1])
+                {
+                    a(i,j,k) = a(i, j+1+offset, k);
+                }
+            }
+            // xhi & ylo corner
+            else if (i == dhi.x+1 && j == dlo.y-1 && (bfhi[0] || bflo[1]))
+            {
+                if (bfhi[0] && bflo[1])
+                {
+                    a(i,j,k) = a(i-1-offset, j+1+offset, k);
+                }
+                else if (bfhi[0])
+                {
+                    a(i,j,k) = a(i-1-offset, j, k);
+                }
+                else if (bflo[1])
+                {
+                    a(i,j,k) = a(i, j+1+offset, k);
+                }
+            }
+            // xlo & yhi corner
+            else if (i == dlo.x-1 && j == dhi.y+1 && (bflo[0] || bfhi[1]))
+            {
+                if (bflo[0] && bfhi[1])
+                {
+                    a(i,j,k) = a(i+1+offset, j-1-offset, k);
+                }
+                else if (bflo[0])
+                {
+                    a(i,j,k) = a(i+1+offset, j, k);
+                }
+                else if (bfhi[1])
+                {
+                    a(i,j,k) = a(i, j-1-offset, k);
+                }
+            }
+            // xhi & yhi corner
+            else if (i == dhi.x+1 && j == dhi.y+1 && (bfhi[0] || bfhi[1]))
+            {
+                if (bfhi[0] && bfhi[1])
+                {
+                    a(i,j,k) = a(i-1-offset, j-1-offset, k);
+                }
+                else if (bfhi[0])
+                {
+                    a(i,j,k) = a(i-1-offset, j, k);
+                }
+                else if (bfhi[1])
+                {
+                    a(i,j,k) = a(i, j-1-offset, k);
+                }
+            }
+            else if (i == dlo.x-1 && bflo[0]) // xlo side only
+            {
+                a(i,j,k) = a(i+1+offset, j, k);
+            }
+            else if (i == dhi.x+1 && bfhi[0]) // xhi side only
+            {
+                a(i,j,k) = a(i-1-offset, j, k);
+            }
+            else if (j == dlo.y-1 && bflo[1]) // ylo side only
+            {
+                a(i,j,k) = a(i, j+1+offset, k);
+            }
+            else if (j == dhi.y+1 && bfhi[1]) // yhi side only
+            {
+                a(i,j,k) = a(i, j-1-offset, k);
+            }
+        }
+    });
+}
+
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+void mlndlap_restriction (int i, int j, int k, Array4<Real> const& crse,
+                          Array4<Real const> const& fine, Array4<int const> const& msk) noexcept
+{
+    // 2D restriction, ratio 2: coarse node (i,j) sits on fine node (2i,2j) of plane 0.
+    // Masked nodes get zero; others get the 3x3 stencil (1,2,1; 2,4,2; 1,2,1)/16.
+    const int fi = 2*i;
+    const int fj = 2*j;
+    if (msk(fi,fj,0)) { crse(i,j,k) = Real(0.0); return; }
+    // Terms are added row by row (fj-1, fj, fj+1), left to right within a row.
+    crse(i,j,k) = Real(1./16.)*(
+                 fine(fi-1,fj-1,0) + Real(2.)*fine(fi,fj-1,0) +          fine(fi+1,fj-1,0)
+      + Real(2.)*fine(fi-1,fj  ,0) + Real(4.)*fine(fi,fj  ,0) + Real(2.)*fine(fi+1,fj  ,0)
+      +          fine(fi-1,fj+1,0) + Real(2.)*fine(fi,fj+1,0) +          fine(fi+1,fj+1,0));
+}
+
+template <int rr>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+void mlndlap_restriction (int i, int j, int k, Array4<Real> const& crse,
+                          Array4<Real const> const& fine, Array4<int const> const& msk,
+                          Box const& fdom,
+                          GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bclo,
+                          GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bchi) noexcept
+{
+    // 2D restriction for ratio rr: coarse node (i,j) sits on fine node (i*rr,j*rr), plane 0.
+    // Fine nodes at offsets -(rr-1)..(rr-1) get weight (rr-|di|)*(rr-|dj|), scaled by 1/rr^4.
+    // Per direction, a sample outside fdom on a Neumann/inflow side uses the opposite offset.
+    const int ic = i*rr;
+    const int jc = j*rr;
+    if (msk(ic,jc,0)) {
+        crse(i,j,k) = Real(0.0);
+        return;
+    }
+    const auto flo = amrex::lbound(fdom);
+    const auto fhi = amrex::ubound(fdom);
+    const bool xlo_fold = (bclo[0] == LinOpBCType::Neumann || bclo[0] == LinOpBCType::inflow);
+    const bool xhi_fold = (bchi[0] == LinOpBCType::Neumann || bchi[0] == LinOpBCType::inflow);
+    const bool ylo_fold = (bclo[1] == LinOpBCType::Neumann || bclo[1] == LinOpBCType::inflow);
+    const bool yhi_fold = (bchi[1] == LinOpBCType::Neumann || bchi[1] == LinOpBCType::inflow);
+
+    Real acc = Real(0.0);
+    for (int dj = 1-rr; dj < rr; ++dj) {
+        const Real wj = rr - std::abs(dj);
+        const int js_raw = jc + dj;
+        const bool yfold = (js_raw < flo.y && ylo_fold) || (js_raw > fhi.y && yhi_fold);
+        const int js = yfold ? jc - dj : js_raw;
+        for (int di = 1-rr; di < rr; ++di) {
+            const Real wi = rr - std::abs(di);
+            const int is_raw = ic + di;
+            const bool xfold = (is_raw < flo.x && xlo_fold) || (is_raw > fhi.x && xhi_fold);
+            const int is = xfold ? ic - di : is_raw;
+            acc += wi*wj*fine(is,js,0);
+        }
+    }
+    crse(i,j,k) = acc*(Real(1.0)/Real(rr*rr*rr*rr));
+}
+
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+void mlndlap_semi_restriction (int i, int j, int k, Array4<Real> const& crse,
+                          Array4<Real const> const& fine, Array4<int const> const& msk, int idir) noexcept
+{
+    // 2D restriction that coarsens by 2 in one direction only (fine plane 0):
+    //   idir == 1 : x index is doubled, stencil (1,2,1)/4 runs along x;
+    //   idir == 0 : y index is doubled, stencil (1,2,1)/4 runs along y;
+    //   any other idir leaves crse untouched.
+    if (idir != 0 && idir != 1) { return; }
+
+    const int dx = (idir == 1) ? 1 : 0;  // unit step along the coarsened direction
+    const int dy = 1 - dx;
+    const int fi = i*(1+dx);
+    const int fj = j*(1+dy);
+
+    if (msk(fi,fj,0)) {
+        crse(i,j,k) = Real(0.0);
+    } else {
+        crse(i,j,k) = Real(1./4.)*(fine(fi-dx,fj-dy,0) + Real(2.)*fine(fi,fj,0)
+                                   + fine(fi+dx,fj+dy,0));
+    }
+}
+
+//
+// masks
+//
+
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+void mlndlap_set_nodal_mask (int i, int j, int k, Array4<int> const& nmsk,
+                             Array4<int const> const& cmsk) noexcept
+{
+    // Classify node (i,j) from the four cmsk entries at (i-1..i, j-1..j):
+    // a sum equal to 4*crse_cell gives crse_node, a sum equal to 4*fine_cell
+    // gives fine_node, and any other sum gives crse_fine_node.
+    using namespace nodelap_detail;
+
+    const int lower = cmsk(i-1,j-1,k) + cmsk(i,j-1,k);
+    const int upper = cmsk(i-1,j  ,k) + cmsk(i,j  ,k);
+    const int total = lower + upper;
+    int kind = crse_fine_node;
+    if      (total == 4*crse_cell) { kind = crse_node; }
+    else if (total == 4*fine_cell) { kind = fine_node; }
+    nmsk(i,j,k) = kind;
+}
+
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+void mlndlap_set_dirichlet_mask (Box const& bx, Array4<int> const& dmsk,
+                                 Array4<int const> const& omsk, Box const& dom,
+                                 GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bclo,
+                                 GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bchi) noexcept
+{
+    // Step 1: each node of bx (plane 0) whose mask is still zero becomes 1 when
+    // any of the four omsk entries at (i-1..i, j-1..j) equals 1, else stays 0.
+    // Step 2: nodes on a face of bx that coincides with the matching face of dom
+    // are forced to 1 when that face's boundary type is Dirichlet.
+    const auto blo = amrex::lbound(bx);
+    const auto bhi = amrex::ubound(bx);
+
+    for (int jy = blo.y; jy <= bhi.y; ++jy) {
+        AMREX_PRAGMA_SIMD
+        for (int ix = blo.x; ix <= bhi.x; ++ix) {
+            if (dmsk(ix,jy,0) == 0) {
+                const bool touched = (omsk(ix-1,jy-1,0) == 1) || (omsk(ix,jy-1,0) == 1)
+                                  || (omsk(ix-1,jy  ,0) == 1) || (omsk(ix,jy  ,0) == 1);
+                dmsk(ix,jy,0) = touched ? 1 : 0;
+            }
+        }
+    }
+
+    const auto dlo = amrex::lbound(dom);
+    const auto dhi = amrex::ubound(dom);
+    const bool pin_xlo = (blo.x == dlo.x) && (bclo[0] == LinOpBCType::Dirichlet);
+    const bool pin_xhi = (bhi.x == dhi.x) && (bchi[0] == LinOpBCType::Dirichlet);
+    const bool pin_ylo = (blo.y == dlo.y) && (bclo[1] == LinOpBCType::Dirichlet);
+    const bool pin_yhi = (bhi.y == dhi.y) && (bchi[1] == LinOpBCType::Dirichlet);
+    // Faces normal to x.
+    if (pin_xlo || pin_xhi) {
+        for (int jy = blo.y; jy <= bhi.y; ++jy) {
+            if (pin_xlo) { dmsk(blo.x,jy,0) = 1; }
+            if (pin_xhi) { dmsk(bhi.x,jy,0) = 1; }
+        }
+    }
+    // Faces normal to y.
+    if (pin_ylo || pin_yhi) {
+        AMREX_PRAGMA_SIMD
+        for (int ix = blo.x; ix <= bhi.x; ++ix) {
+            if (pin_ylo) { dmsk(ix,blo.y,0) = 1; }
+            if (pin_yhi) { dmsk(ix,bhi.y,0) = 1; }
+        }
+    }
+}
+
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+void mlndlap_set_dot_mask (Box const& bx, Array4<Real> const& dmsk,
+                           Array4<int const> const& omsk, Box const& dom,
+                           GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bclo,
+                           GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bchi) noexcept
+{
+    // Fill dmsk over bx (plane 0) with a Real copy of omsk, then halve the
+    // entries on every face of bx that coincides with the matching face of dom
+    // and whose boundary type is Neumann or inflow.
+    // An entry lying on two such faces (a corner) is halved once per face,
+    // as is an entry of a one-node-wide box whose low and high faces both
+    // qualify.
+    const auto blo = amrex::lbound(bx);
+    const auto bhi = amrex::ubound(bx);
+
+    // Copy pass.
+    for (int jy = blo.y; jy <= bhi.y; ++jy) {
+        AMREX_PRAGMA_SIMD
+        for (int ix = blo.x; ix <= bhi.x; ++ix) {
+            dmsk(ix,jy,0) = static_cast<Real>(omsk(ix,jy,0));
+        }
+    }
+
+    const auto dlo = amrex::lbound(dom);
+    const auto dhi = amrex::ubound(dom);
+    const bool half_xlo = (blo.x == dlo.x) &&
+        (bclo[0] == LinOpBCType::Neumann || bclo[0] == LinOpBCType::inflow);
+    const bool half_xhi = (bhi.x == dhi.x) &&
+        (bchi[0] == LinOpBCType::Neumann || bchi[0] == LinOpBCType::inflow);
+    const bool half_ylo = (blo.y == dlo.y) &&
+        (bclo[1] == LinOpBCType::Neumann || bclo[1] == LinOpBCType::inflow);
+    const bool half_yhi = (bhi.y == dhi.y) &&
+        (bchi[1] == LinOpBCType::Neumann || bchi[1] == LinOpBCType::inflow);
+
+    // Faces normal to x.
+    if (half_xlo || half_xhi) {
+        for (int jy = blo.y; jy <= bhi.y; ++jy) {
+            if (half_xlo) { dmsk(blo.x,jy,0) *= Real(0.5); }
+            if (half_xhi) { dmsk(bhi.x,jy,0) *= Real(0.5); }
+        }
+    }
+
+    // Faces normal to y.
+    if (half_ylo || half_yhi) {
+        AMREX_PRAGMA_SIMD
+        for (int ix = blo.x; ix <= bhi.x; ++ix) {
+            if (half_ylo) { dmsk(ix,blo.y,0) *= Real(0.5); }
+            if (half_yhi) { dmsk(ix,bhi.y,0) *= Real(0.5); }
+        }
+    }
+}
+
+}
+
+#endif
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_3D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_3D_K.H
new file mode 100644
index 0000000000..976a16c7aa
--- /dev/null
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_3D_K.H
@@ -0,0 +1,810 @@
+#ifndef AMREX_ML_NODE_LINOP_3D_K_H_
+#define AMREX_ML_NODE_LINOP_3D_K_H_
+#include <AMReX_Config.H>
+
+namespace amrex {
+
+template <typename T>
+inline void mlndlap_bc_doit (Box const& vbx, Array4<T> const& a, Box const& domain,
+                             GpuArray<bool,AMREX_SPACEDIM> const& bflo,
+                             GpuArray<bool,AMREX_SPACEDIM> const& bfhi) noexcept
+{
+    Box gdomain = domain;
+    for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+        if (! bflo[idim]) { gdomain.growLo(idim,1); }
+        if (! bfhi[idim]) { gdomain.growHi(idim,1); }
+    }
+
+    if (gdomain.strictly_contains(vbx)) { return; }
+
+    const int offset = domain.cellCentered() ? 0 : 1;
+
+    const auto dlo = amrex::lbound(domain);
+    const auto dhi = amrex::ubound(domain);
+
+    Box const& sbox = amrex::grow(vbx,1);
+    AMREX_HOST_DEVICE_FOR_3D(sbox, i, j, k,
+    {
+        if (! gdomain.contains(IntVect(i,j,k))) {
+            // xlo & ylo & zlo
+            if (i == dlo.x-1 && j == dlo.y-1 && k == dlo.z-1 && (bflo[0] || bflo[1] || bflo[2]))
+            {
+                if (bflo[0] && bflo[1] && bflo[2])
+                {
+                    a(i,j,k) = a(i+1+offset, j+1+offset, k+1+offset);
+                }
+                else if (bflo[0] && bflo[1])
+                {
+                    a(i,j,k) = a(i+1+offset, j+1+offset, k);
+                }
+                else if (bflo[0] && bflo[2])
+                {
+                    a(i,j,k) = a(i+1+offset, j, k+1+offset);
+                }
+                else if (bflo[1] && bflo[2])
+                {
+                    a(i,j,k) = a(i, j+1+offset, k+1+offset);
+                }
+                else if (bflo[0])
+                {
+                    a(i,j,k) = a(i+1+offset, j, k);
+                }
+                else if (bflo[1])
+                {
+                    a(i,j,k) = a(i, j+1+offset, k);
+                }
+                else if (bflo[2])
+                {
+                    a(i,j,k) = a(i, j, k+1+offset);
+                }
+            }
+            // xhi & ylo & zlo
+            else if (i == dhi.x+1 && j == dlo.y-1 && k == dlo.z-1 && (bfhi[0] || bflo[1] || bflo[2]))
+            {
+                if (bfhi[0] && bflo[1] && bflo[2])
+                {
+                    a(i,j,k) = a(i-1-offset, j+1+offset, k+1+offset);
+                }
+                else if (bfhi[0] && bflo[1])
+                {
+                    a(i,j,k) = a(i-1-offset, j+1+offset, k);
+                }
+                else if (bfhi[0] && bflo[2])
+                {
+                    a(i,j,k) = a(i-1-offset, j, k+1+offset);
+                }
+                else if (bflo[1] && bflo[2])
+                {
+                    a(i,j,k) = a(i, j+1+offset, k+1+offset);
+                }
+                else if (bfhi[0])
+                {
+                    a(i,j,k) = a(i-1-offset, j, k);
+                }
+                else if (bflo[1])
+                {
+                    a(i,j,k) = a(i, j+1+offset, k);
+                }
+                else if (bflo[2])
+                {
+                    a(i,j,k) = a(i, j, k+1+offset);
+                }
+            }
+            // xlo & yhi & zlo
+            else if (i == dlo.x-1 && j == dhi.y+1 && k == dlo.z-1 && (bflo[0] || bfhi[1] || bflo[2]))
+            {
+                if (bflo[0] && bfhi[1] && bflo[2])
+                {
+                    a(i,j,k) = a(i+1+offset, j-1-offset, k+1+offset);
+                }
+                else if (bflo[0] && bfhi[1])
+                {
+                    a(i,j,k) = a(i+1+offset, j-1-offset, k);
+                }
+                else if (bflo[0] && bflo[2])
+                {
+                    a(i,j,k) = a(i+1+offset, j, k+1+offset);
+                }
+                else if (bfhi[1] && bflo[2])
+                {
+                    a(i,j,k) = a(i, j-1-offset, k+1+offset);
+                }
+                else if (bflo[0])
+                {
+                    a(i,j,k) = a(i+1+offset, j, k);
+                }
+                else if (bfhi[1])
+                {
+                    a(i,j,k) = a(i, j-1-offset, k);
+                }
+                else if (bflo[2])
+                {
+                    a(i,j,k) = a(i, j, k+1+offset);
+                }
+            }
+            // xhi & yhi & zlo
+            else if (i == dhi.x+1 && j == dhi.y+1 && k == dlo.z-1 && (bfhi[0] || bfhi[1] || bflo[2]))
+            {
+                if (bfhi[0] && bfhi[1] && bflo[2])
+                {
+                    a(i,j,k) = a(i-1-offset, j-1-offset, k+1+offset);
+                }
+                else if (bfhi[0] && bfhi[1])
+                {
+                    a(i,j,k) = a(i-1-offset, j-1-offset, k);
+                }
+                else if (bfhi[0] && bflo[2])
+                {
+                    a(i,j,k) = a(i-1-offset, j, k+1+offset);
+                }
+                else if (bfhi[1] && bflo[2])
+                {
+                    a(i,j,k) = a(i, j-1-offset, k+1+offset);
+                }
+                else if (bfhi[0])
+                {
+                    a(i,j,k) = a(i-1-offset, j, k);
+                }
+                else if (bfhi[1])
+                {
+                    a(i,j,k) = a(i, j-1-offset, k);
+                }
+                else if (bflo[2])
+                {
+                    a(i,j,k) = a(i, j, k+1+offset);
+                }
+            }
+            // xlo & ylo & zhi
+            else if (i == dlo.x-1 && j == dlo.y-1 && k == dhi.z+1 && (bflo[0] || bflo[1] || bfhi[2]))
+            {
+                if (bflo[0] && bflo[1] && bfhi[2])
+                {
+                    a(i,j,k) = a(i+1+offset, j+1+offset, k-1-offset);
+                }
+                else if (bflo[0] && bflo[1])
+                {
+                    a(i,j,k) = a(i+1+offset, j+1+offset, k);
+                }
+                else if (bflo[0] && bfhi[2])
+                {
+                    a(i,j,k) = a(i+1+offset, j, k-1-offset);
+                }
+                else if (bflo[1] && bfhi[2])
+                {
+                    a(i,j,k) = a(i, j+1+offset, k-1-offset);
+                }
+                else if (bflo[0])
+                {
+                    a(i,j,k) = a(i+1+offset, j, k);
+                }
+                else if (bflo[1])
+                {
+                    a(i,j,k) = a(i, j+1+offset, k);
+                }
+                else if (bfhi[2])
+                {
+                    a(i,j,k) = a(i, j, k-1-offset);
+                }
+            }
+            // xhi & ylo & zhi
+            else if (i == dhi.x+1 && j == dlo.y-1 && k == dhi.z+1 && (bfhi[0] || bflo[1] || bfhi[2]))
+            {
+                if (bfhi[0] && bflo[1] && bfhi[2])
+                {
+                    a(i,j,k) = a(i-1-offset, j+1+offset, k-1-offset);
+                }
+                else if (bfhi[0] && bflo[1])
+                {
+                    a(i,j,k) = a(i-1-offset, j+1+offset, k);
+                }
+                else if (bfhi[0] && bfhi[2])
+                {
+                    a(i,j,k) = a(i-1-offset, j, k-1-offset);
+                }
+                else if (bflo[1] && bfhi[2])
+                {
+                    a(i,j,k) = a(i, j+1+offset, k-1-offset);
+                }
+                else if (bfhi[0])
+                {
+                    a(i,j,k) = a(i-1-offset, j, k);
+                }
+                else if (bflo[1])
+                {
+                    a(i,j,k) = a(i, j+1+offset, k);
+                }
+                else if (bfhi[2])
+                {
+                    a(i,j,k) = a(i, j, k-1-offset);
+                }
+            }
+            // xlo & yhi & zhi
+            else if (i == dlo.x-1 && j == dhi.y+1 && k == dhi.z+1 && (bflo[0] || bfhi[1] || bfhi[2]))
+            {
+                if (bflo[0] && bfhi[1] && bfhi[2])
+                {
+                    a(i,j,k) = a(i+1+offset, j-1-offset, k-1-offset);
+                }
+                else if (bflo[0] && bfhi[1])
+                {
+                    a(i,j,k) = a(i+1+offset, j-1-offset, k);
+                }
+                else if (bflo[0] && bfhi[2])
+                {
+                    a(i,j,k) = a(i+1+offset, j, k-1-offset);
+                }
+                else if (bfhi[1] && bfhi[2])
+                {
+                    a(i,j,k) = a(i, j-1-offset, k-1-offset);
+                }
+                else if (bflo[0])
+                {
+                    a(i,j,k) = a(i+1+offset, j, k);
+                }
+                else if (bfhi[1])
+                {
+                    a(i,j,k) = a(i, j-1-offset, k);
+                }
+                else if (bfhi[2])
+                {
+                    a(i,j,k) = a(i, j, k-1-offset);
+                }
+            }
+            // xhi & yhi & zhi
+            else if (i == dhi.x+1 && j == dhi.y+1 && k == dhi.z+1 && (bfhi[0] || bfhi[1] || bfhi[2]))
+            {
+                if (bfhi[0] && bfhi[1] && bfhi[2])
+                {
+                    a(i,j,k) = a(i-1-offset, j-1-offset, k-1-offset);
+                }
+                else if (bfhi[0] && bfhi[1])
+                {
+                    a(i,j,k) = a(i-1-offset, j-1-offset, k);
+                }
+                else if (bfhi[0] && bfhi[2])
+                {
+                    a(i,j,k) = a(i-1-offset, j, k-1-offset);
+                }
+                else if (bfhi[1] && bfhi[2])
+                {
+                    a(i,j,k) = a(i, j-1-offset, k-1-offset);
+                }
+                else if (bfhi[0])
+                {
+                    a(i,j,k) = a(i-1-offset, j, k);
+                }
+                else if (bfhi[1])
+                {
+                    a(i,j,k) = a(i, j-1-offset, k);
+                }
+                else if (bfhi[2])
+                {
+                    a(i,j,k) = a(i, j, k-1-offset);
+                }
+            }
+            // xlo & ylo
+            else if (i == dlo.x-1 && j == dlo.y-1 && (bflo[0] || bflo[1]))
+            {
+                if (bflo[0] && bflo[1])
+                {
+                    a(i,j,k) = a(i+1+offset, j+1+offset, k);
+                }
+                else if (bflo[0])
+                {
+                    a(i,j,k) = a(i+1+offset, j, k);
+                }
+                else if (bflo[1])
+                {
+                    a(i,j,k) = a(i, j+1+offset, k);
+                }
+            }
+            // xhi & ylo
+            else if (i == dhi.x+1 && j == dlo.y-1 && (bfhi[0] || bflo[1]))
+            {
+                if (bfhi[0] && bflo[1])
+                {
+                    a(i,j,k) = a(i-1-offset, j+1+offset, k);
+                }
+                else if (bfhi[0])
+                {
+                    a(i,j,k) = a(i-1-offset, j, k);
+                }
+                else if (bflo[1])
+                {
+                    a(i,j,k) = a(i, j+1+offset, k);
+                }
+            }
+            // xlo & yhi
+            else if (i == dlo.x-1 && j == dhi.y+1 && (bflo[0] || bfhi[1]))
+            {
+                if (bflo[0] && bfhi[1])
+                {
+                    a(i,j,k) = a(i+1+offset, j-1-offset, k);
+                }
+                else if (bflo[0])
+                {
+                    a(i,j,k) = a(i+1+offset, j, k);
+                }
+                else if (bfhi[1])
+                {
+                    a(i,j,k) = a(i, j-1-offset, k);
+                }
+            }
+            // xhi & yhi
+            else if (i == dhi.x+1 && j == dhi.y+1 && (bfhi[0] || bfhi[1]))
+            {
+                if (bfhi[0] && bfhi[1])
+                {
+                    a(i,j,k) = a(i-1-offset, j-1-offset, k);
+                }
+                else if (bfhi[0])
+                {
+                    a(i,j,k) = a(i-1-offset, j, k);
+                }
+                else if (bfhi[1])
+                {
+                    a(i,j,k) = a(i, j-1-offset, k);
+                }
+            }
+            // xlo & zlo
+            else if (i == dlo.x-1 && k == dlo.z-1 && (bflo[0] || bflo[2]))
+            {
+                if (bflo[0] && bflo[2])
+                {
+                    a(i,j,k) = a(i+1+offset, j, k+1+offset);
+                }
+                else if (bflo[0])
+                {
+                    a(i,j,k) = a(i+1+offset, j, k);
+                }
+                else if (bflo[2])
+                {
+                    a(i,j,k) = a(i, j, k+1+offset);
+                }
+            }
+            // xhi & zlo
+            else if (i == dhi.x+1 && k == dlo.z-1 && (bfhi[0] || bflo[2]))
+            {
+                if (bfhi[0] && bflo[2])
+                {
+                    a(i,j,k) = a(i-1-offset, j, k+1+offset);
+                }
+                else if (bfhi[0])
+                {
+                    a(i,j,k) = a(i-1-offset, j, k);
+                }
+                else if (bflo[2])
+                {
+                    a(i,j,k) = a(i, j, k+1+offset);
+                }
+            }
+            // xlo & zhi
+            else if (i == dlo.x-1 && k == dhi.z+1 && (bflo[0] || bfhi[2]))
+            {
+                if (bflo[0] && bfhi[2])
+                {
+                    a(i,j,k) = a(i+1+offset, j, k-1-offset);
+                }
+                else if (bflo[0])
+                {
+                    a(i,j,k) = a(i+1+offset, j, k);
+                }
+                else if (bfhi[2])
+                {
+                    a(i,j,k) = a(i, j, k-1-offset);
+                }
+            }
+            // xhi & zhi
+            else if (i == dhi.x+1 && k == dhi.z+1 && (bfhi[0] || bfhi[2]))
+            {
+                if (bfhi[0] && bfhi[2])
+                {
+                    a(i,j,k) = a(i-1-offset, j, k-1-offset);
+                }
+                else if (bfhi[0])
+                {
+                    a(i,j,k) = a(i-1-offset, j, k);
+                }
+                else if (bfhi[2])
+                {
+                    a(i,j,k) = a(i, j, k-1-offset);
+                }
+            }
+            // ylo & zlo
+            else if (j == dlo.y-1 && k == dlo.z-1 && (bflo[1] || bflo[2]))
+            {
+                if (bflo[1] && bflo[2])
+                {
+                    a(i,j,k) = a(i, j+1+offset, k+1+offset);
+                }
+                else if (bflo[1])
+                {
+                    a(i,j,k) = a(i, j+1+offset, k);
+                }
+                else if (bflo[2])
+                {
+                    a(i,j,k) = a(i, j, k+1+offset);
+                }
+            }
+            // yhi & zlo
+            else if (j == dhi.y+1 && k == dlo.z-1 && (bfhi[1] || bflo[2]))
+            {
+                if (bfhi[1] && bflo[2])
+                {
+                    a(i,j,k) = a(i, j-1-offset, k+1+offset);
+                }
+                else if (bfhi[1])
+                {
+                    a(i,j,k) = a(i, j-1-offset, k);
+                }
+                else if (bflo[2])
+                {
+                    a(i,j,k) = a(i, j, k+1+offset);
+                }
+            }
+            // ylo & zhi
+            else if (j == dlo.y-1 && k == dhi.z+1 && (bflo[1] || bfhi[2]))
+            {
+                if (bflo[1] && bfhi[2])
+                {
+                    a(i,j,k) = a(i, j+1+offset, k-1-offset);
+                }
+                else if (bflo[1])
+                {
+                    a(i,j,k) = a(i, j+1+offset, k);
+                }
+                else if (bfhi[2])
+                {
+                    a(i,j,k) = a(i, j, k-1-offset);
+                }
+            }
+            // yhi & zhi
+            else if (j == dhi.y+1 && k == dhi.z+1 && (bfhi[1] || bfhi[2]))
+            {
+                if (bfhi[1] && bfhi[2])
+                {
+                    a(i,j,k) = a(i, j-1-offset, k-1-offset);
+                }
+                else if (bfhi[1])
+                {
+                    a(i,j,k) = a(i, j-1-offset, k);
+                }
+                else if (bfhi[2])
+                {
+                    a(i,j,k) = a(i, j, k-1-offset);
+                }
+            }
+            else if (i == dlo.x-1 && bflo[0])
+            {
+                a(i,j,k) = a(i+1+offset, j, k);
+            }
+            else if (i == dhi.x+1 && bfhi[0])
+            {
+                a(i,j,k) = a(i-1-offset, j, k);
+            }
+            else if (j == dlo.y-1 && bflo[1])
+            {
+                a(i,j,k) = a(i, j+1+offset, k);
+            }
+            else if (j == dhi.y+1 && bfhi[1])
+            {
+                a(i,j,k) = a(i, j-1-offset, k);
+            }
+            else if (k == dlo.z-1 && bflo[2])
+            {
+                a(i,j,k) = a(i, j, k+1+offset);
+            }
+            else if (k == dhi.z+1 && bfhi[2])
+            {
+                a(i,j,k) = a(i, j, k-1-offset);
+            }
+        }
+    });
+}
+
+//
+// restriction
+//
+
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE // 27-point weighted restriction of `fine` onto coarse point (i,j,k), index ratio 2
+void mlndlap_restriction (int i, int j, int k, Array4<Real> const& crse,
+                          Array4<Real const> const& fine, Array4<int const> const& msk) noexcept
+{
+    int ii = i*2; // fine index that coincides with coarse index (i,j,k)
+    int jj = j*2;
+    int kk = k*2;
+    if (msk(ii,jj,kk)) { // nonzero mask at the coincident fine point: coarse value is set to zero
+        crse(i,j,k) = Real(0.0);
+    } else {
+        crse(i,j,k) = Real(1./64.)*(fine(ii-1,jj-1,kk-1)+fine(ii+1,jj-1,kk-1) // 8 corner neighbours, weight 1/64 each
+                                   +fine(ii-1,jj+1,kk-1)+fine(ii+1,jj+1,kk-1)
+                                   +fine(ii-1,jj-1,kk+1)+fine(ii+1,jj-1,kk+1)
+                                   +fine(ii-1,jj+1,kk+1)+fine(ii+1,jj+1,kk+1))
+                    + Real(1./32.)*(fine(ii  ,jj-1,kk-1)+fine(ii  ,jj+1,kk-1) // 12 edge neighbours, weight 1/32 each
+                                   +fine(ii  ,jj-1,kk+1)+fine(ii  ,jj+1,kk+1)
+                                   +fine(ii-1,jj  ,kk-1)+fine(ii+1,jj  ,kk-1)
+                                   +fine(ii-1,jj  ,kk+1)+fine(ii+1,jj  ,kk+1)
+                                   +fine(ii-1,jj-1,kk  )+fine(ii+1,jj-1,kk  )
+                                   +fine(ii-1,jj+1,kk  )+fine(ii+1,jj+1,kk  ))
+                    + Real(1./16.)*(fine(ii-1,jj,kk)+fine(ii+1,jj,kk) // 6 face neighbours, weight 1/16 each
+                                   +fine(ii,jj-1,kk)+fine(ii,jj+1,kk)
+                                   +fine(ii,jj,kk-1)+fine(ii,jj,kk+1))
+                      + Real(1./8.)*fine(ii,jj,kk); // centre point, weight 1/8; the 27 weights sum to 1
+    }
+}
+
+template <int rr> // rr is the coarse-to-fine index ratio (ii = i*rr below)
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE // weighted restriction for ratio rr, mirroring stencil points that fall outside fdom on Neumann/inflow sides
+void mlndlap_restriction (int i, int j, int k, Array4<Real> const& crse,
+                          Array4<Real const> const& fine, Array4<int const> const& msk,
+                          Box const& fdom,
+                          GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bclo,
+                          GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bchi) noexcept
+{
+    const int ii = i*rr; // fine index that coincides with coarse index (i,j,k)
+    const int jj = j*rr;
+    const int kk = k*rr;
+    if (msk(ii,jj,kk)) { // nonzero mask at the coincident fine point: coarse value is set to zero
+        crse(i,j,k) = Real(0.0);
+    } else {
+        const auto ndlo = amrex::lbound(fdom); // bounds of fdom used for the outside-domain tests below
+        const auto ndhi = amrex::ubound(fdom);
+        Real tmp = Real(0.0); // accumulates the un-normalised weighted sum
+        for (int koff = -rr+1; koff <= rr-1; ++koff) {
+            Real wz = rr - std::abs(koff); // per-axis weight decreases linearly from rr at the centre to 1 at offset rr-1
+            for (int joff = -rr+1; joff <= rr-1; ++joff) {
+                Real wy = rr - std::abs(joff);
+                for (int ioff = -rr+1; ioff <= rr-1; ++ioff) {
+                    Real wx = rr - std::abs(ioff);
+                    int itmp = ii + ioff;
+                    int jtmp = jj + joff;
+                    int ktmp = kk + koff;
+                    if ((itmp < ndlo.x && (bclo[0] == LinOpBCType::Neumann ||
+                                           bclo[0] == LinOpBCType::inflow)) ||
+                        (itmp > ndhi.x && (bchi[0] == LinOpBCType::Neumann ||
+                                           bchi[0] == LinOpBCType::inflow))) {
+                        itmp = ii - ioff; // outside fdom on a Neumann/inflow x-side: use the point mirrored about ii instead
+                    }
+                    if ((jtmp < ndlo.y && (bclo[1] == LinOpBCType::Neumann ||
+                                           bclo[1] == LinOpBCType::inflow)) ||
+                        (jtmp > ndhi.y && (bchi[1] == LinOpBCType::Neumann ||
+                                           bchi[1] == LinOpBCType::inflow))) {
+                        jtmp = jj - joff; // same mirroring in y
+                    }
+                    if ((ktmp < ndlo.z && (bclo[2] == LinOpBCType::Neumann ||
+                                           bclo[2] == LinOpBCType::inflow)) ||
+                        (ktmp > ndhi.z && (bchi[2] == LinOpBCType::Neumann ||
+                                           bchi[2] == LinOpBCType::inflow))) {
+                        ktmp = kk - koff; // same mirroring in z
+                    }
+                    tmp += wx*wy*wz*fine(itmp,jtmp,ktmp);
+                }
+            }
+        }
+        crse(i,j,k) = tmp/Real(rr*rr*rr*rr*rr*rr); // per-axis weights sum to rr^2, so the 3-D weights sum to rr^6
+    }
+}
+
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE // restriction that coarsens by 2 in two directions only; idir is the direction left un-coarsened
+void mlndlap_semi_restriction (int i, int j, int k, Array4<Real> const& crse,
+                               Array4<Real const> const& fine, Array4<int const> const& msk, int idir) noexcept
+{
+    if (idir == 2) // z is not coarsened: 9-point stencil in the x-y plane
+    {
+        int ii = i*2;
+        int jj = j*2;
+        int kk = k; // same index on both levels in the un-coarsened direction
+        if (msk(ii,jj,kk)) { // nonzero mask at the coincident fine point: coarse value is set to zero
+            crse(i,j,k) = Real(0.0);
+        } else { // use 2-D formula
+            crse(i,j,k) = Real(1./16.)*(fine(ii-1,jj-1,kk) + Real(2.)*fine(ii  ,jj-1,kk) +          fine(ii+1,jj-1,kk)
+                             + Real(2.)*fine(ii-1,jj  ,kk) + Real(4.)*fine(ii  ,jj  ,kk) + Real(2.)*fine(ii+1,jj  ,kk)
+                                      + fine(ii-1,jj+1,kk) + Real(2.)*fine(ii  ,jj+1,kk) +          fine(ii+1,jj+1,kk));
+        }
+    }
+    else if (idir == 1) // y is not coarsened: 9-point stencil in the x-z plane
+    {
+        int ii = i*2;
+        int jj = j; // same index on both levels in the un-coarsened direction
+        int kk = k*2;
+        if (msk(ii,jj,kk)) {
+            crse(i,j,k) = Real(0.0);
+        } else { // use 2-D formula
+            crse(i,j,k) = Real(1./16.)*(fine(ii-1,jj,kk-1) + Real(2.)*fine(ii  ,jj,kk-1) +          fine(ii+1,jj,kk-1)
+                             + Real(2.)*fine(ii-1,jj  ,kk) + Real(4.)*fine(ii  ,jj,kk  ) + Real(2.)*fine(ii+1,jj,kk  )
+                                      + fine(ii-1,jj,kk+1) + Real(2.)*fine(ii  ,jj,kk+1) +          fine(ii+1,jj,kk+1));
+        }
+    }
+    else if (idir == 0) // x is not coarsened: 9-point stencil in the y-z plane
+    {
+        int ii = i; // same index on both levels in the un-coarsened direction
+        int jj = j*2;
+        int kk = k*2;
+        if (msk(ii,jj,kk)) {
+            crse(i,j,k) = Real(0.0);
+        } else { // use 2-D formula
+            crse(i,j,k) = Real(1./16.)*(fine(ii,jj-1,kk-1) + Real(2.)*fine(ii  ,jj,kk-1) +          fine(ii,jj+1,kk-1)
+                             + Real(2.)*fine(ii,jj-1  ,kk) + Real(4.)*fine(ii  ,jj,kk  ) + Real(2.)*fine(ii,jj+1,kk  )
+                                      + fine(ii,jj-1,kk+1) + Real(2.)*fine(ii  ,jj,kk+1) +          fine(ii,jj+1,kk+1));
+        }
+    }
+    else // any other idir value is rejected
+    {
+        amrex::Abort("mlndlap_semi_restriction semi direction wrong semi-direction. ");
+    }
+}
+
+//
+// masks
+//
+
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE // classify point (i,j,k) of nmsk from the eight cmsk entries at offsets -1/0 in each direction
+void mlndlap_set_nodal_mask (int i, int j, int k, Array4<int> const& nmsk,
+                             Array4<int const> const& cmsk) noexcept
+{
+    using namespace nodelap_detail; // provides crse_cell, fine_cell, crse_node, fine_node, crse_fine_node
+
+    int s = cmsk(i-1,j-1,k-1) + cmsk(i  ,j-1,k-1) // sum of the eight surrounding cmsk values
+        +   cmsk(i-1,j  ,k-1) + cmsk(i  ,j  ,k-1)
+        +   cmsk(i-1,j-1,k  ) + cmsk(i  ,j-1,k  )
+        +   cmsk(i-1,j  ,k  ) + cmsk(i  ,j  ,k  );
+    if (s == 8*crse_cell) { // sum matches eight crse_cell values
+        nmsk(i,j,k) = crse_node;
+    }
+    else if (s == 8*fine_cell) { // sum matches eight fine_cell values
+        nmsk(i,j,k) = fine_node;
+    } else { // any other sum: treated as a coarse/fine boundary node
+        nmsk(i,j,k) = crse_fine_node;
+    }
+}
+
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE // fill dmsk on bx: 1 where any of the 8 adjacent omsk entries is 1, and 1 on Dirichlet domain faces
+void mlndlap_set_dirichlet_mask (Box const& bx, Array4<int> const& dmsk,
+                                 Array4<int const> const& omsk, Box const& dom,
+                                 GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bclo,
+                                 GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bchi) noexcept
+{
+    const auto lo = amrex::lbound(bx);
+    const auto hi = amrex::ubound(bx);
+    for (int k = lo.z; k <= hi.z; ++k) {
+    for (int j = lo.y; j <= hi.y; ++j) {
+    AMREX_PRAGMA_SIMD
+    for (int i = lo.x; i <= hi.x; ++i) {
+        if (!dmsk(i,j,k)) { // entries that are already nonzero are left untouched
+            dmsk(i,j,k) = (omsk(i-1,j-1,k-1) == 1 || omsk(i,j-1,k-1) == 1 || // bool result is stored as 0 or 1
+                           omsk(i-1,j  ,k-1) == 1 || omsk(i,j  ,k-1) == 1 ||
+                           omsk(i-1,j-1,k  ) == 1 || omsk(i,j-1,k  ) == 1 ||
+                           omsk(i-1,j  ,k  ) == 1 || omsk(i,j  ,k  ) == 1);
+        }
+    }}}
+
+    const auto domlo = amrex::lbound(dom); // bounds of dom, compared against bx's faces below
+    const auto domhi = amrex::ubound(dom);
+
+    if (bclo[0] == LinOpBCType::Dirichlet && lo.x == domlo.x) { // low-x face of bx lies on a Dirichlet domain face
+        for (int k = lo.z; k <= hi.z; ++k) {
+        for (int j = lo.y; j <= hi.y; ++j) {
+            dmsk(lo.x,j,k) = 1;
+        }}
+    }
+
+    if (bchi[0] == LinOpBCType::Dirichlet && hi.x == domhi.x) { // high-x face
+        for (int k = lo.z; k <= hi.z; ++k) {
+        for (int j = lo.y; j <= hi.y; ++j) {
+            dmsk(hi.x,j,k) = 1;
+        }}
+    }
+
+    if (bclo[1] == LinOpBCType::Dirichlet && lo.y == domlo.y) { // low-y face
+        for (int k = lo.z; k <= hi.z; ++k) {
+        AMREX_PRAGMA_SIMD
+        for (int i = lo.x; i <= hi.x; ++i) {
+            dmsk(i,lo.y,k) = 1;
+        }}
+    }
+
+    if (bchi[1] == LinOpBCType::Dirichlet && hi.y == domhi.y) { // high-y face
+        for (int k = lo.z; k <= hi.z; ++k) {
+        AMREX_PRAGMA_SIMD
+        for (int i = lo.x; i <= hi.x; ++i) {
+            dmsk(i,hi.y,k) = 1;
+        }}
+    }
+
+    if (bclo[2] == LinOpBCType::Dirichlet && lo.z == domlo.z) { // low-z face
+        for (int j = lo.y; j <= hi.y; ++j) {
+        AMREX_PRAGMA_SIMD
+        for (int i = lo.x; i <= hi.x; ++i) {
+            dmsk(i,j,lo.z) = 1;
+        }}
+    }
+
+    if (bchi[2] == LinOpBCType::Dirichlet && hi.z == domhi.z) { // high-z face
+        for (int j = lo.y; j <= hi.y; ++j) {
+        AMREX_PRAGMA_SIMD
+        for (int i = lo.x; i <= hi.x; ++i) {
+            dmsk(i,j,hi.z) = 1;
+        }}
+    }
+}
+
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE // fill Real mask dmsk on bx from omsk, then halve it on Neumann/inflow domain faces
+void mlndlap_set_dot_mask (Box const& bx, Array4<Real> const& dmsk,
+                           Array4<int const> const& omsk, Box const& dom,
+                           GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bclo,
+                           GpuArray<LinOpBCType, AMREX_SPACEDIM> const& bchi) noexcept
+{
+    const auto lo = amrex::lbound(bx);
+    const auto hi = amrex::ubound(bx);
+    for (int k = lo.z; k <= hi.z; ++k) {
+    for (int j = lo.y; j <= hi.y; ++j) {
+    AMREX_PRAGMA_SIMD
+    for (int i = lo.x; i <= hi.x; ++i) {
+        dmsk(i,j,k) = static_cast<Real>(omsk(i,j,k)); // start from the integer mask converted to Real
+    }}}
+
+    const auto domlo = amrex::lbound(dom); // bounds of dom, compared against bx's faces below
+    const auto domhi = amrex::ubound(dom);
+
+    if ((bclo[0] == LinOpBCType::Neumann || bclo[0] == LinOpBCType::inflow)
+        && lo.x == domlo.x) // low-x face of bx lies on a Neumann/inflow domain face
+    {
+        for (int k = lo.z; k <= hi.z; ++k) {
+        for (int j = lo.y; j <= hi.y; ++j) {
+            dmsk(lo.x,j,k) *= Real(0.5); // each face is handled independently, so points shared by several such faces are halved once per face
+        }}
+    }
+
+    if ((bchi[0] == LinOpBCType::Neumann || bchi[0] == LinOpBCType::inflow)
+        && hi.x == domhi.x) // high-x face
+    {
+        for (int k = lo.z; k <= hi.z; ++k) {
+        for (int j = lo.y; j <= hi.y; ++j) {
+            dmsk(hi.x,j,k) *= Real(0.5);
+        }}
+    }
+
+    if ((bclo[1] == LinOpBCType::Neumann || bclo[1] == LinOpBCType::inflow)
+        && lo.y == domlo.y) // low-y face
+    {
+        for (int k = lo.z; k <= hi.z; ++k) {
+        AMREX_PRAGMA_SIMD
+        for (int i = lo.x; i <= hi.x; ++i) {
+            dmsk(i,lo.y,k) *= Real(0.5);
+        }}
+    }
+
+    if ((bchi[1] == LinOpBCType::Neumann || bchi[1] == LinOpBCType::inflow)
+        && hi.y == domhi.y) // high-y face
+    {
+        for (int k = lo.z; k <= hi.z; ++k) {
+        AMREX_PRAGMA_SIMD
+        for (int i = lo.x; i <= hi.x; ++i) {
+            dmsk(i,hi.y,k) *= Real(0.5);
+        }}
+    }
+
+    if ((bclo[2] == LinOpBCType::Neumann || bclo[2] == LinOpBCType::inflow)
+        && lo.z == domlo.z) // low-z face
+    {
+        for (int j = lo.y; j <= hi.y; ++j) {
+        AMREX_PRAGMA_SIMD
+        for (int i = lo.x; i <= hi.x; ++i) {
+            dmsk(i,j,lo.z) *= Real(0.5);
+        }}
+    }
+
+    if ((bchi[2] == LinOpBCType::Neumann || bchi[2] == LinOpBCType::inflow)
+        && hi.z == domhi.z) // high-z face
+    {
+        for (int j = lo.y; j <= hi.y; ++j) {
+        AMREX_PRAGMA_SIMD
+        for (int i = lo.x; i <= hi.x; ++i) {
+            dmsk(i,j,hi.z) *= Real(0.5);
+        }}
+    }
+}
+
+}
+
+#endif
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_K.H
new file mode 100644
index 0000000000..2c98d32a00
--- /dev/null
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_K.H
@@ -0,0 +1,107 @@
+#ifndef AMREX_ML_NODE_LINOP_K_H_
+#define AMREX_ML_NODE_LINOP_K_H_
+#include <AMReX_Config.H>
+
+#include <AMReX_FArrayBox.H>
+
+namespace amrex::nodelap_detail {
+
+#ifdef AMREX_USE_HYPRE
+
+    struct GetNode { // splits a flat offset into a node index (returned) and a per-node stencil index (left in `offset`)
+        AMREX_GPU_DEVICE Dim3 operator() (Dim3 const& lo, Dim3 const& len, int& offset)
+        {
+            Dim3 node;
+            constexpr int nsten = AMREX_D_TERM(3,*3,*3); // presumably 3^AMREX_SPACEDIM entries per node -- confirm against AMREX_D_TERM
+            int icell = offset / nsten; // linear index of the node within the box described by lo/len
+            node.z =  icell /        (len.x*len.y); // decompose the linear index with x varying fastest, then y, then z
+            node.y = (icell - node.z*(len.x*len.y)) /        len.x;
+            node.x = (icell - node.z*(len.x*len.y)) - node.y*len.x;
+            node.x += lo.x; // shift from box-relative to absolute indices
+            node.y += lo.y;
+            node.z += lo.z;
+            offset -= icell*nsten; // in/out parameter: on return holds the remainder, i.e. the index within the node's stencil
+            return node;
+        }
+    };
+
+    struct GetNode2 { // maps a per-node stencil offset to the index of the corresponding neighbour of `node`
+        AMREX_GPU_DEVICE Dim3 operator() (int offset, Dim3 const& node)
+        {
+            // In 2D the offsets are
+            //   6 7 8
+            //   4 0 5
+            //   1 2 3
+            constexpr int nstenhalf = AMREX_SPACEDIM == 2 ? 4 : 13; // count of neighbours numbered before the centre's lexicographic slot
+            if (offset == 0) { // offset 0 denotes the centre node itself
+                return node;
+            } else {
+                if (offset <= nstenhalf) { --offset; } // offsets 1..nstenhalf map to slots 0..nstenhalf-1; larger offsets already equal their slot
+                Dim3 node2;
+                node2.z = offset / 9; // decode the slot as base-3 digits: x fastest, then y, then z
+                node2.y = (offset - node2.z*9) / 3;
+                node2.x = (offset - node2.z*9) - node2.y*3;
+                AMREX_D_TERM(node2.x += node.x-1;, // digits 0,1,2 become node-1, node, node+1 in each dimension listed here
+                             node2.y += node.y-1;,
+                             node2.z += node.z-1);
+                return node2;
+            }
+        }
+    };
+
+#endif /* AMREX_USE_HYPRE */
+
+    constexpr int crse_cell = 0; // Do NOT change the values
+    constexpr int fine_cell = 1; // cell-mask value; mlndlap_set_nodal_mask compares a sum of 8 cell masks against 8*fine_cell
+    constexpr int crse_node = 0; // node-mask value assigned when the 8-cell sum equals 8*crse_cell
+    constexpr int crse_fine_node = 1; // node-mask value assigned when the 8-cell sum matches neither pure case
+    constexpr int fine_node = 2; // node-mask value assigned when the 8-cell sum equals 8*fine_cell
+}
+
+namespace amrex {
+
+template <typename T> // T is the element type of the Array4 being filled
+void mlndlap_fillbc_cc (Box const& vbx, Array4<T> const& sigma, Box const& domain, // applies mlndlap_bc_doit to sigma on every non-periodic side
+                        GpuArray<LinOpBCType, AMREX_SPACEDIM> bclo,
+                        GpuArray<LinOpBCType, AMREX_SPACEDIM> bchi) noexcept
+{
+    GpuArray<bool,AMREX_SPACEDIM> bflo{{AMREX_D_DECL(bclo[0] != LinOpBCType::Periodic,
+                                                     bclo[1] != LinOpBCType::Periodic,
+                                                     bclo[2] != LinOpBCType::Periodic)}}; // low-side flags: true for any BC type other than Periodic
+    GpuArray<bool,AMREX_SPACEDIM> bfhi{{AMREX_D_DECL(bchi[0] != LinOpBCType::Periodic,
+                                                     bchi[1] != LinOpBCType::Periodic,
+                                                     bchi[2] != LinOpBCType::Periodic)}}; // high-side flags, same rule
+    mlndlap_bc_doit(vbx, sigma, domain, bflo, bfhi); // NOTE(review): presumably defined in the dimension-specific header included at the end of this file -- confirm
+}
+
+template <typename T> // T is the element type of the Array4 being filled
+void mlndlap_applybc (Box const& vbx, Array4<T> const& phi, Box const& domain, // applies mlndlap_bc_doit to phi on Neumann/inflow sides only
+                      GpuArray<LinOpBCType, AMREX_SPACEDIM> bclo,
+                      GpuArray<LinOpBCType, AMREX_SPACEDIM> bchi) noexcept
+{
+    GpuArray<bool,AMREX_SPACEDIM> bflo{{AMREX_D_DECL(bclo[0] == LinOpBCType::Neumann ||
+                                                     bclo[0] == LinOpBCType::inflow,
+                                                     bclo[1] == LinOpBCType::Neumann ||
+                                                     bclo[1] == LinOpBCType::inflow,
+                                                     bclo[2] == LinOpBCType::Neumann ||
+                                                     bclo[2] == LinOpBCType::inflow)}}; // low-side flags: true only for Neumann or inflow
+    GpuArray<bool,AMREX_SPACEDIM> bfhi{{AMREX_D_DECL(bchi[0] == LinOpBCType::Neumann ||
+                                                     bchi[0] == LinOpBCType::inflow,
+                                                     bchi[1] == LinOpBCType::Neumann ||
+                                                     bchi[1] == LinOpBCType::inflow,
+                                                     bchi[2] == LinOpBCType::Neumann ||
+                                                     bchi[2] == LinOpBCType::inflow)}}; // high-side flags, same rule
+    mlndlap_bc_doit(vbx, phi, domain, bflo, bfhi); // NOTE(review): presumably defined in the dimension-specific header included at the end of this file -- confirm
+}
+
+}
+
+#if (AMREX_SPACEDIM == 1)
+#include <AMReX_MLNodeLinOp_1D_K.H>
+#elif (AMREX_SPACEDIM == 2)
+#include <AMReX_MLNodeLinOp_2D_K.H>
+#else
+#include <AMReX_MLNodeLinOp_3D_K.H>
+#endif
+
+#endif
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp
index 59718d7624..e8135aeba1 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp
@@ -1,5 +1,5 @@
 #include <AMReX_MLNodeTensorLaplacian.H>
-#include <AMReX_MLNodeLap_K.H>
+#include <AMReX_MLNodeLinOp_K.H>
 #include <AMReX_MLNodeTensorLap_K.H>
 #include <AMReX_MultiFabUtil.H>
 
diff --git a/Src/LinearSolvers/MLMG/Make.package b/Src/LinearSolvers/MLMG/Make.package
index 3609164c91..8e8d9b3ac1 100644
--- a/Src/LinearSolvers/MLMG/Make.package
+++ b/Src/LinearSolvers/MLMG/Make.package
@@ -1,6 +1,9 @@
 ifndef AMREX_MLMG_MAKE
        AMREX_MLMG_MAKE := 1
 
+USE_LINEAR_SOLVERS_INCFLO ?= TRUE
+USE_LINEAR_SOLVERS_EM ?= TRUE
+
 CEXE_sources += AMReX_MLMG.cpp
 
 CEXE_headers   += AMReX_MLMG.H
@@ -16,7 +19,7 @@ CEXE_headers   += AMReX_MLLinOp_K.H
 
 CEXE_headers   += AMReX_MLCellLinOp.H
 
-CEXE_headers   += AMReX_MLNodeLinOp.H
+CEXE_headers   += AMReX_MLNodeLinOp.H AMReX_MLNodeLinOp_K.H AMReX_MLNodeLinOp_$(DIM)D_K.H
 CEXE_sources   += AMReX_MLNodeLinOp.cpp
 
 CEXE_headers   += AMReX_MLCellABecLap.H
@@ -39,60 +42,69 @@ ifeq ($(DIM),3)
 CEXE_headers   += AMReX_MLPoisson_2D_K.H
 endif
 
-CEXE_headers   += AMReX_MLNodeLaplacian.H
-CEXE_sources   += AMReX_MLNodeLaplacian.cpp
-CEXE_sources   += AMReX_MLNodeLaplacian_sync.cpp
-CEXE_sources   += AMReX_MLNodeLaplacian_sten.cpp
-CEXE_sources   += AMReX_MLNodeLaplacian_misc.cpp
-CEXE_headers   += AMReX_MLNodeLap_K.H AMReX_MLNodeLap_$(DIM)D_K.H
-ifeq ($(USE_EB),TRUE)
-  CEXE_sources   += AMReX_MLNodeLaplacian_eb.cpp
+ifneq ($(BL_NO_FORT),TRUE)
+  CEXE_headers += AMReX_MLLinOp_F.H
+  F90EXE_sources += AMReX_MLLinOp_nd.F90
 endif
-ifeq ($(USE_HYPRE),TRUE)
-  CEXE_sources   += AMReX_MLNodeLaplacian_hypre.cpp
+
+ifeq ($(USE_EB),TRUE)
+  CEXE_headers += AMReX_MLEBABecLap.H
+  CEXE_sources += AMReX_MLEBABecLap.cpp
+  CEXE_sources += AMReX_MLEBABecLap_F.cpp
+  CEXE_headers += AMReX_MLEBABecLap_K.H
+  CEXE_headers += AMReX_MLEBABecLap_$(DIM)D_K.H
 endif
 
-CEXE_headers   += AMReX_MLNodeABecLaplacian.H
-CEXE_sources   += AMReX_MLNodeABecLaplacian.cpp
-CEXE_headers   += AMReX_MLNodeABecLap_K.H AMReX_MLNodeABecLap_$(DIM)D_K.H
+ifneq ($(USE_LINEAR_SOLVERS_INCFLO),FALSE)
 
-CEXE_headers   += AMReX_MLNodeTensorLaplacian.H
-CEXE_sources   += AMReX_MLNodeTensorLaplacian.cpp
-CEXE_headers   += AMReX_MLNodeTensorLap_K.H AMReX_MLNodeTensorLap_$(DIM)D_K.H
+  CEXE_headers += AMReX_MLNodeABecLaplacian.H
+  CEXE_sources += AMReX_MLNodeABecLaplacian.cpp
+  CEXE_headers += AMReX_MLNodeABecLap_K.H AMReX_MLNodeABecLap_$(DIM)D_K.H
 
-CEXE_headers   += AMReX_MLTensorOp.H
-CEXE_sources   += AMReX_MLTensorOp.cpp AMReX_MLTensorOp_grad.cpp
-CEXE_headers   += AMReX_MLTensor_K.H AMReX_MLTensor_$(DIM)D_K.H
+  CEXE_headers += AMReX_MLNodeLaplacian.H
+  CEXE_sources += AMReX_MLNodeLaplacian.cpp
+  CEXE_sources += AMReX_MLNodeLaplacian_sync.cpp
+  CEXE_sources += AMReX_MLNodeLaplacian_sten.cpp
+  CEXE_sources += AMReX_MLNodeLaplacian_misc.cpp
+  CEXE_headers += AMReX_MLNodeLap_K.H AMReX_MLNodeLap_$(DIM)D_K.H
+ifeq ($(USE_EB),TRUE)
+  CEXE_sources += AMReX_MLNodeLaplacian_eb.cpp
+endif
+ifeq ($(USE_HYPRE),TRUE)
+  CEXE_sources += AMReX_MLNodeLaplacian_hypre.cpp
+endif
 
-CEXE_headers   += AMReX_MLEBNodeFDLaplacian.H
-CEXE_sources   += AMReX_MLEBNodeFDLaplacian.cpp
-CEXE_headers   += AMReX_MLEBNodeFDLap_K.H
-CEXE_headers   += AMReX_MLEBNodeFDLap_$(DIM)D_K.H
+  CEXE_headers += AMReX_MLTensorOp.H
+  CEXE_sources += AMReX_MLTensorOp.cpp AMReX_MLTensorOp_grad.cpp
+  CEXE_headers += AMReX_MLTensor_K.H AMReX_MLTensor_$(DIM)D_K.H
 
 ifeq ($(USE_EB),TRUE)
-CEXE_headers   += AMReX_MLEBABecLap.H
-CEXE_sources   += AMReX_MLEBABecLap.cpp
-CEXE_sources   += AMReX_MLEBABecLap_F.cpp
-CEXE_headers   += AMReX_MLEBABecLap_K.H
-CEXE_headers   += AMReX_MLEBABecLap_$(DIM)D_K.H
-
-CEXE_headers   += AMReX_MLEBTensorOp.H
-CEXE_sources   += AMReX_MLEBTensorOp.cpp
-CEXE_sources   += AMReX_MLEBTensorOp_bc.cpp
-CEXE_headers   += AMReX_MLEBTensor_K.H AMReX_MLEBTensor_$(DIM)D_K.H
+  CEXE_headers += AMReX_MLEBTensorOp.H
+  CEXE_sources += AMReX_MLEBTensorOp.cpp
+  CEXE_sources += AMReX_MLEBTensorOp_bc.cpp
+  CEXE_headers += AMReX_MLEBTensor_K.H AMReX_MLEBTensor_$(DIM)D_K.H
 endif
 
-ifneq ($(BL_NO_FORT),TRUE)
-  CEXE_headers += AMReX_MLLinOp_F.H
-  F90EXE_sources += AMReX_MLLinOp_nd.F90
-endif
+endif # ifneq ($(USE_LINEAR_SOLVERS_INCFLO),FALSE)
 
+ifneq ($(USE_LINEAR_SOLVERS_EM),FALSE)
 ifneq ($(DIM),1)
   CEXE_headers += AMReX_MLCurlCurl.H
   CEXE_sources += AMReX_MLCurlCurl.cpp
   CEXE_headers += AMReX_MLCurlCurl_K.H
 endif
 
+  CEXE_headers += AMReX_MLEBNodeFDLaplacian.H
+  CEXE_sources += AMReX_MLEBNodeFDLaplacian.cpp
+  CEXE_headers += AMReX_MLEBNodeFDLap_K.H
+  CEXE_headers += AMReX_MLEBNodeFDLap_$(DIM)D_K.H
+
+  CEXE_headers += AMReX_MLNodeTensorLaplacian.H
+  CEXE_sources += AMReX_MLNodeTensorLaplacian.cpp
+  CEXE_headers += AMReX_MLNodeTensorLap_K.H AMReX_MLNodeTensorLap_$(DIM)D_K.H
+
+endif # ifneq ($(USE_LINEAR_SOLVERS_EM),FALSE)
+
 VPATH_LOCATIONS += $(AMREX_HOME)/Src/LinearSolvers/MLMG
 INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/LinearSolvers/MLMG
 
diff --git a/Src/Particle/AMReX_NeighborParticles.H b/Src/Particle/AMReX_NeighborParticles.H
index 5f814805a2..1b7d01d7a9 100644
--- a/Src/Particle/AMReX_NeighborParticles.H
+++ b/Src/Particle/AMReX_NeighborParticles.H
@@ -467,7 +467,7 @@ protected:
     ParticleCopyPlan neighbor_copy_plan;
 
     amrex::PODVector<char, PolymorphicArenaAllocator<char> > snd_buffer;
-    amrex::Gpu::DeviceVector<char> rcv_buffer;
+    amrex::PODVector<char, PolymorphicArenaAllocator<char> > rcv_buffer;
 
     Gpu::PinnedVector<char> pinned_snd_buffer;
     Gpu::PinnedVector<char> pinned_rcv_buffer;
diff --git a/Src/Particle/AMReX_NeighborParticlesGPUImpl.H b/Src/Particle/AMReX_NeighborParticlesGPUImpl.H
index 4cc533d901..b4e50bef7b 100644
--- a/Src/Particle/AMReX_NeighborParticlesGPUImpl.H
+++ b/Src/Particle/AMReX_NeighborParticlesGPUImpl.H
@@ -255,6 +255,12 @@ updateNeighborsGPU (bool boundary_neighbors_only)
     }
 
     clearNeighbors();
+
+    if (ParallelDescriptor::UseGpuAwareMpi()) {
+        snd_buffer.setArena(The_Comms_Arena());
+        rcv_buffer.setArena(The_Comms_Arena());
+    }
+
     packBuffer(*this, neighbor_copy_op, neighbor_copy_plan, snd_buffer);
     if (ParallelDescriptor::UseGpuAwareMpi())
     {
diff --git a/Src/Particle/AMReX_ParticleContainer.H b/Src/Particle/AMReX_ParticleContainer.H
index 03a2254a10..4b347d283b 100644
--- a/Src/Particle/AMReX_ParticleContainer.H
+++ b/Src/Particle/AMReX_ParticleContainer.H
@@ -60,6 +60,7 @@
 #include <memory>
 #include <numeric>
 #include <random>
+#include <string>
 #include <tuple>
 #include <type_traits>
 #include <utility>
@@ -1144,7 +1145,8 @@ public:
      */
     ParticleTileType& DefineAndReturnParticleTile (int lev, int grid, int tile)
     {
-        m_particles[lev][std::make_pair(grid, tile)].define(NumRuntimeRealComps(), NumRuntimeIntComps());
+        m_particles[lev][std::make_pair(grid, tile)].define(NumRuntimeRealComps(), NumRuntimeIntComps(), &m_soa_rdata_names, &m_soa_idata_names); // also hands the tile pointers to this container's SoA name vectors
+
         return ParticlesAt(lev, grid, tile);
     }
 
@@ -1247,10 +1249,10 @@ public:
 
     Long superParticleSize() const { return superparticle_size; }
 
-    template <typename T,
-              std::enable_if_t<std::is_same_v<T,bool>,int> = 0>
-    void AddRealComp (T communicate=true)
+    void AddRealComp (std::string const & name, int communicate=1)
     {
+        m_soa_rdata_names.push_back(name);
+
         m_runtime_comps_defined = true;
         m_num_runtime_real++;
         h_redistribute_real_comp.push_back(communicate);
@@ -1270,10 +1272,15 @@ public:
         }
     }
 
-    template <typename T,
-              std::enable_if_t<std::is_same_v<T,bool>,int> = 0>
-    void AddIntComp (T communicate=true)
+    void AddRealComp (int communicate=1) // unnamed overload: forwards to the named overload with a generated default name
     {
+        AddRealComp(getDefaultCompNameReal<ParticleType>(NArrayReal+m_num_runtime_real), communicate); // default name is requested for index NArrayReal + current runtime Real count
+    }
+
+    void AddIntComp (std::string const & name, int communicate=1)
+    {
+        m_soa_idata_names.push_back(name);
+
         m_runtime_comps_defined = true;
         m_num_runtime_int++;
         h_redistribute_int_comp.push_back(communicate);
@@ -1293,6 +1300,11 @@ public:
         }
     }
 
+    void AddIntComp (int communicate=1) // unnamed overload: forwards to the named overload with a generated default name
+    {
+        AddIntComp(getDefaultCompNameInt<ParticleType>(NArrayInt+m_num_runtime_int), communicate); // default name is requested for index NArrayInt + current runtime Int count
+    }
+
     int NumRuntimeRealComps () const { return m_num_runtime_real; }
     int NumRuntimeIntComps  () const { return m_num_runtime_int;  }
 
@@ -1403,6 +1415,15 @@ public:
 #include "AMReX_ParticlesHDF5.H"
 #endif
 
+    /** Overwrite the default names for the compile-time SoA components */
+    void SetSoACompileTimeNames (std::vector<std::string> const & rdata_name, std::vector<std::string> const & idata_name);
+
+    /** Get the names for the real SoA components (returned by value, i.e. as a copy of the stored list) **/
+    std::vector<std::string> GetRealSoANames () const {return m_soa_rdata_names;}
+
+    /** Get the names for the int SoA components (returned by value, i.e. as a copy of the stored list) **/
+    std::vector<std::string> GetIntSoANames () const {return m_soa_idata_names;}
+
 protected:
 
     template <class RTYPE>
@@ -1435,6 +1456,10 @@ private:
     size_t particle_size, superparticle_size;
     int num_real_comm_comps, num_int_comm_comps;
     Vector<ParticleLevel> m_particles;
+
+    // names of both compile-time and runtime Real and Int SoA data
+    std::vector<std::string> m_soa_rdata_names;
+    std::vector<std::string> m_soa_idata_names;
 };
 
 template <int T_NStructReal, int T_NStructInt, int T_NArrayReal, int T_NArrayInt, template<class> class Allocator, class CellAssignor>
diff --git a/Src/Particle/AMReX_ParticleContainerI.H b/Src/Particle/AMReX_ParticleContainerI.H
index 74e65b792f..d42d2d5b4b 100644
--- a/Src/Particle/AMReX_ParticleContainerI.H
+++ b/Src/Particle/AMReX_ParticleContainerI.H
@@ -1,6 +1,10 @@
-#include <type_traits>
 #include <AMReX_MakeParticle.H>
 
+#include <string>
+#include <type_traits>
+#include <vector>
+
+
 template <typename ParticleType, int NArrayReal, int NArrayInt,
           template<class> class Allocator, class CellAssignor>
 void
@@ -60,10 +64,40 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
         pp.query("do_unlink", doUnlink);
         pp.queryAdd("do_mem_efficient_sort", memEfficientSort);
 
+        // add default names for SoA Real and Int compile-time arguments
+        for (int i=0; i<NArrayReal; ++i)
+        {
+            m_soa_rdata_names.push_back(getDefaultCompNameReal<ParticleType>(i));
+        }
+        for (int i=0; i<NArrayInt; ++i)
+        {
+            m_soa_idata_names.push_back(getDefaultCompNameInt<ParticleType>(i));
+        }
+
         initialized = true;
     }
 }
 
+template <typename ParticleType, int NArrayReal, int NArrayInt,
+        template<class> class Allocator, class CellAssignor>
+void
+ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssignor> :: SetSoACompileTimeNames (
+    std::vector<std::string> const & rdata_name, std::vector<std::string> const & idata_name
+)
+{
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(rdata_name.size() == NArrayReal, "rdata_name must be equal to NArrayReal");
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(idata_name.size() == NArrayInt, "idata_name must be equal to NArrayInt");
+
+    // Overwrite only the leading compile-time entries; later entries are left untouched.
+    // at() is used on both sides so that an out-of-range index throws.
+    for (int ireal = 0; ireal < NArrayReal; ++ireal) {
+        m_soa_rdata_names.at(ireal) = rdata_name.at(ireal);
+    }
+    for (int iint = 0; iint < NArrayInt; ++iint) {
+        m_soa_idata_names.at(iint) = idata_name.at(iint);
+    }
+}
+
 template <typename ParticleType, int NArrayReal, int NArrayInt,
           template<class> class Allocator, class CellAssignor>
 template <typename P, typename Assignor>
@@ -1161,7 +1195,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
         }
     } else {
         ParticleTileType ptile_tmp;
-        ptile_tmp.define(m_num_runtime_real, m_num_runtime_int);
+        ptile_tmp.define(m_num_runtime_real, m_num_runtime_int, &m_soa_rdata_names, &m_soa_idata_names);
         ptile_tmp.resize(np_total);
         // copy re-ordered particles
         gatherParticles(ptile_tmp, ptile, np, permutations);
@@ -1354,8 +1388,14 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
     plan.build(*this, op, h_redistribute_int_comp,
                h_redistribute_real_comp, local);
 
+    // by default, this uses The_Arena();
     amrex::PODVector<char, PolymorphicArenaAllocator<char> > snd_buffer;
-    Gpu::DeviceVector<char> rcv_buffer;
+    amrex::PODVector<char, PolymorphicArenaAllocator<char> > rcv_buffer;
+
+    if (ParallelDescriptor::UseGpuAwareMpi()) {
+        snd_buffer.setArena(The_Comms_Arena());
+        rcv_buffer.setArena(The_Comms_Arena());
+    }
 
     packBuffer(*this, op, plan, snd_buffer);
 
@@ -1498,7 +1538,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
             tmp_local[lev][index].resize(num_threads);
             soa_local[lev][index].resize(num_threads);
             for (int t = 0; t < num_threads; ++t) {
-                soa_local[lev][index][t].define(m_num_runtime_real, m_num_runtime_int);
+                soa_local[lev][index][t].define(m_num_runtime_real, m_num_runtime_int, &m_soa_rdata_names, &m_soa_idata_names);
             }
         }
     }
diff --git a/Src/Particle/AMReX_ParticleIO.H b/Src/Particle/AMReX_ParticleIO.H
index 01ab0ded86..601b7417d1 100644
--- a/Src/Particle/AMReX_ParticleIO.H
+++ b/Src/Particle/AMReX_ParticleIO.H
@@ -51,20 +51,18 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
 {
     Vector<int> write_real_comp;
     Vector<std::string> tmp_real_comp_names;
-    int nrc = ParticleType::is_soa_particle ? NStructReal + NumRealComps() - AMREX_SPACEDIM : NStructReal + NumRealComps();
 
-    for (int i = 0; i < nrc; ++i )
+    int first_rcomp = ParticleType::is_soa_particle ? AMREX_SPACEDIM : 0;
+    for (int i = first_rcomp; i < NStructReal + NumRealComps(); ++i )
     {
         write_real_comp.push_back(1);
         if (real_comp_names.empty())
         {
-            std::stringstream ss;
-            ss << "real_comp" << i;
-            tmp_real_comp_names.push_back(ss.str());
+            tmp_real_comp_names.push_back(getDefaultCompNameReal<ParticleType>(i));
         }
         else
         {
-            tmp_real_comp_names.push_back(real_comp_names[i]);
+            tmp_real_comp_names.push_back(real_comp_names[i-first_rcomp]);
         }
     }
 
@@ -75,9 +73,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
         write_int_comp.push_back(1);
         if (int_comp_names.empty())
         {
-            std::stringstream ss;
-            ss << "int_comp" << i;
-            tmp_int_comp_names.push_back(ss.str());
+            tmp_int_comp_names.push_back(getDefaultCompNameInt<ParticleType>(i));
         }
         else
         {
@@ -98,14 +94,12 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
 {
     Vector<int> write_real_comp;
     Vector<std::string> real_comp_names;
-    int nrc = ParticleType::is_soa_particle ? NStructReal + NumRealComps() - AMREX_SPACEDIM : NStructReal + NumRealComps();
 
-    for (int i = 0; i < nrc; ++i )
+    int first_rcomp = ParticleType::is_soa_particle ? AMREX_SPACEDIM : 0;
+    for (int i = first_rcomp; i < NStructReal + NumRealComps(); ++i )
     {
         write_real_comp.push_back(1);
-        std::stringstream ss;
-        ss << "real_comp" << i;
-        real_comp_names.push_back(ss.str());
+        real_comp_names.push_back(getDefaultCompNameReal<ParticleType>(i));
     }
 
     Vector<int> write_int_comp;
@@ -113,9 +107,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
     for (int i = 0; i < NStructInt + NumIntComps(); ++i )
     {
         write_int_comp.push_back(1);
-        std::stringstream ss;
-        ss << "int_comp" << i;
-        int_comp_names.push_back(ss.str());
+        int_comp_names.push_back(getDefaultCompNameInt<ParticleType>(i));
     }
 
     WriteBinaryParticleData(dir, name, write_real_comp, write_int_comp,
@@ -182,9 +174,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
     Vector<std::string> int_comp_names;
     for (int i = 0; i < NStructInt + NumIntComps(); ++i )
     {
-        std::stringstream ss;
-        ss << "int_comp" << i;
-        int_comp_names.push_back(ss.str());
+        int_comp_names.push_back(getDefaultCompNameInt<ParticleType>(i));
     }
 
     WriteBinaryParticleData(dir, name,
@@ -211,20 +201,16 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
     AMREX_ASSERT(write_int_comp.size()  == NStructInt  + NArrayInt );
 
     Vector<std::string> real_comp_names;
-    int nrc = ParticleType::is_soa_particle ? NStructReal + NumRealComps() - AMREX_SPACEDIM : NStructReal + NumRealComps();
-    for (int i = 0; i < nrc; ++i )
+    int first_rcomp = ParticleType::is_soa_particle ? AMREX_SPACEDIM : 0;
+    for (int i = first_rcomp; i < NStructReal + NumRealComps(); ++i )
     {
-        std::stringstream ss;
-        ss << "real_comp" << i;
-        real_comp_names.push_back(ss.str());
+        real_comp_names.push_back(getDefaultCompNameReal<ParticleType>(i));
     }
 
     Vector<std::string> int_comp_names;
     for (int i = 0; i < NStructInt + NumIntComps(); ++i )
     {
-        std::stringstream ss;
-        ss << "int_comp" << i;
-        int_comp_names.push_back(ss.str());
+        int_comp_names.push_back(getDefaultCompNameInt<ParticleType>(i));
     }
 
     WriteBinaryParticleData(dir, name, write_real_comp, write_int_comp,
@@ -259,14 +245,12 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
 {
     Vector<int> write_real_comp;
     Vector<std::string> real_comp_names;
-    int nrc = ParticleType::is_soa_particle ? NStructReal + NumRealComps() - AMREX_SPACEDIM : NStructReal + NumRealComps();
 
-    for (int i = 0; i < nrc; ++i )
+    int first_rcomp = ParticleType::is_soa_particle ? AMREX_SPACEDIM : 0;
+    for (int i = first_rcomp; i < NStructReal + NumRealComps(); ++i )
     {
         write_real_comp.push_back(1);
-        std::stringstream ss;
-        ss << "real_comp" << i;
-        real_comp_names.push_back(ss.str());
+        real_comp_names.push_back(getDefaultCompNameReal<ParticleType>(i));
     }
 
     Vector<int> write_int_comp;
@@ -274,9 +258,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
     for (int i = 0; i < NStructInt + NumIntComps(); ++i )
     {
         write_int_comp.push_back(1);
-        std::stringstream ss;
-        ss << "int_comp" << i;
-        int_comp_names.push_back(ss.str());
+        int_comp_names.push_back(getDefaultCompNameInt<ParticleType>(i));
     }
 
     WriteBinaryParticleData(dir, name, write_real_comp, write_int_comp,
@@ -345,9 +327,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
     Vector<std::string> int_comp_names;
     for (int i = 0; i < NStructInt + NumIntComps(); ++i )
     {
-        std::stringstream ss;
-        ss << "int_comp" << i;
-        int_comp_names.push_back(ss.str());
+        int_comp_names.push_back(getDefaultCompNameInt<ParticleType>(i));
     }
 
     WriteBinaryParticleData(dir, name,
@@ -374,20 +354,16 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
     AMREX_ASSERT(write_int_comp.size()  == NStructInt  + NumIntComps() );
 
     Vector<std::string> real_comp_names;
-    int nrc = ParticleType::is_soa_particle ? NStructReal + NumRealComps() - AMREX_SPACEDIM : NStructReal + NumRealComps();
-    for (int i = 0; i < nrc; ++i )
+    int first_rcomp = ParticleType::is_soa_particle ? AMREX_SPACEDIM : 0;
+    for (int i = first_rcomp; i < NStructReal + NumRealComps(); ++i )
     {
-        std::stringstream ss;
-        ss << "real_comp" << i;
-        real_comp_names.push_back(ss.str());
+        real_comp_names.push_back(getDefaultCompNameReal<ParticleType>(i));
     }
 
     Vector<std::string> int_comp_names;
     for (int i = 0; i < NStructInt + NumIntComps(); ++i )
     {
-        std::stringstream ss;
-        ss << "int_comp" << i;
-        int_comp_names.push_back(ss.str());
+        int_comp_names.push_back(getDefaultCompNameInt<ParticleType>(i));
     }
 
     WriteBinaryParticleData(dir, name, write_real_comp, write_int_comp,
diff --git a/Src/Particle/AMReX_ParticleTile.H b/Src/Particle/AMReX_ParticleTile.H
index a645330e04..7546ff8a21 100644
--- a/Src/Particle/AMReX_ParticleTile.H
+++ b/Src/Particle/AMReX_ParticleTile.H
@@ -11,7 +11,10 @@
 #include <AMReX_RealVect.H>
 
 #include <array>
+#include <string>
 #include <type_traits>
+#include <vector>
+
 
 namespace amrex {
 
@@ -730,10 +733,15 @@ struct ParticleTile
     ParticleTile& operator= (ParticleTile &&) noexcept = default;
 #endif
 
-    void define (int a_num_runtime_real, int a_num_runtime_int)
+    void define (
+        int a_num_runtime_real,
+        int a_num_runtime_int,
+        std::vector<std::string>* soa_rdata_names=nullptr,
+        std::vector<std::string>* soa_idata_names=nullptr
+    )
     {
         m_defined = true;
-        GetStructOfArrays().define(a_num_runtime_real, a_num_runtime_int);
+        GetStructOfArrays().define(a_num_runtime_real, a_num_runtime_int, soa_rdata_names, soa_idata_names);
         m_runtime_r_ptrs.resize(a_num_runtime_real);
         m_runtime_i_ptrs.resize(a_num_runtime_int);
         m_runtime_r_cptrs.resize(a_num_runtime_real);
diff --git a/Src/Particle/AMReX_ParticleUtil.H b/Src/Particle/AMReX_ParticleUtil.H
index b09f1d3583..f6f2506ec7 100644
--- a/Src/Particle/AMReX_ParticleUtil.H
+++ b/Src/Particle/AMReX_ParticleUtil.H
@@ -883,6 +883,26 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n
         });
 }
 
+template <typename P>
+std::string getDefaultCompNameReal (const int i) {
+    // SoA particles: the first AMREX_SPACEDIM names are single letters from "x";
+    // every other component is "real_comp<N>", with N counted from zero.
+    int n_lettered = 0;
+    if constexpr (P::is_soa_particle) {
+        constexpr int x_in_ascii = 120;
+        if (i < AMREX_SPACEDIM) {
+            return std::string{char(x_in_ascii+i)};
+        }
+        n_lettered = AMREX_SPACEDIM;
+    }
+    return "real_comp" + std::to_string(i-n_lettered);
+}
+
+template <typename P>
+std::string getDefaultCompNameInt (const int i) {
+    // P does not affect the result: Int components are always "int_comp<i>".
+    return "int_comp" + std::to_string(i);
+}
 
 #ifdef AMREX_USE_HDF5_ASYNC
 void async_vol_es_wait_particle();
diff --git a/Src/Particle/AMReX_StructOfArrays.H b/Src/Particle/AMReX_StructOfArrays.H
index 4de35e085c..46d18a1715 100644
--- a/Src/Particle/AMReX_StructOfArrays.H
+++ b/Src/Particle/AMReX_StructOfArrays.H
@@ -6,7 +6,11 @@
 #include <AMReX_Vector.H>
 #include <AMReX_GpuContainers.H>
 
+#include <algorithm>
 #include <array>
+#include <string>
+#include <vector>
+
 
 namespace amrex {
 
@@ -19,11 +23,18 @@ struct StructOfArrays {
     using RealVector = amrex::PODVector<ParticleReal, Allocator<ParticleReal> >;
     using IntVector = amrex::PODVector<int, Allocator<int> >;
 
-    void define (int a_num_runtime_real, int a_num_runtime_int)
+    void define (
+        int a_num_runtime_real,
+        int a_num_runtime_int,
+        std::vector<std::string>* soa_rdata_names=nullptr,
+        std::vector<std::string>* soa_idata_names=nullptr
+    )
     {
         m_defined = true;
         m_runtime_rdata.resize(a_num_runtime_real);
         m_runtime_idata.resize(a_num_runtime_int );
+        m_rdata_names = soa_rdata_names;
+        m_idata_names = soa_idata_names;
     }
 
     [[nodiscard]] int NumRealComps () const noexcept { return NReal + m_runtime_rdata.size(); }
@@ -41,6 +52,28 @@ struct StructOfArrays {
     /** Get access to the particle Int Arrays (only compile-time components) */
     [[nodiscard]] const std::array< IntVector,  NInt>& GetIntData  () const { return m_idata; }
 
+    /** Get a copy of the names for the real SoA components
+     *  (an empty list if no names are attached to this object) **/
+    [[nodiscard]] std::vector<std::string> GetRealNames () const
+    {
+        if (m_rdata_names == nullptr) {
+            // no name list has been attached
+            return {};
+        }
+        return *m_rdata_names;
+    }
+
+    /** Get a copy of the names for the int SoA components
+     *  (an empty list if no names are attached to this object) **/
+    [[nodiscard]] std::vector<std::string> GetIntNames () const
+    {
+        if (m_idata_names == nullptr) {
+            // no name list has been attached
+            return {};
+        }
+        return *m_idata_names;
+    }
+
     /** Get access to a particle Real component Array (compile-time and runtime component)
      *
      * @param index component with 0...NReal-1 compile-time and NReal... runtime arguments
@@ -79,6 +112,32 @@ struct StructOfArrays {
         }
     }
 
+    /** Get access to a particle Real component Array (compile-time or runtime component) by name
+     *
+     * @param name name of the component; it is asserted to be present in the list of Real names
+     */
+    [[nodiscard]] RealVector& GetRealData (std::string const & name) {
+        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_rdata_names != nullptr, "SoA Real names were not defined.");
+        auto const pos = std::find(m_rdata_names->begin(), m_rdata_names->end(), name);
+        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(pos != m_rdata_names->end(), "Soa Real name='" + name + "' was not found components");
+
+        int const index = std::distance(m_rdata_names->begin(), pos);
+        return GetRealData(index);
+    }
+
+    /** Get read-only access to a particle Real component Array (compile-time or runtime component) by name
+     *
+     * @param name name of the component; it is asserted to be present in the list of Real names
+     */
+    [[nodiscard]] const RealVector& GetRealData (std::string const & name) const {
+        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_rdata_names != nullptr, "SoA Real names were not defined.");
+        auto const pos = std::find(m_rdata_names->begin(), m_rdata_names->end(), name);
+        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(pos != m_rdata_names->end(), "Soa Real name='" + name + "' was not found components");
+
+        int const index = std::distance(m_rdata_names->begin(), pos);
+        return GetRealData(index);
+    }
+
     /** Get access to a particle Int component Array (compile-time and runtime component)
      *
      * @param index component with 0...NInt-1 compile-time and NInt... runtime arguments
@@ -118,6 +177,34 @@ struct StructOfArrays {
         }
     }
 
+    /** Get access to a particle Int component Array (compile-time or runtime component) by name
+     *
+     * @param name name of the component; it is asserted to be present in the list of Int names
+     * @return the Int array whose index equals the position of name in that list
+     */
+    [[nodiscard]] IntVector& GetIntData (std::string const & name) {
+        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_idata_names != nullptr, "SoA Int names were not defined.");
+        auto const pos = std::find(m_idata_names->begin(), m_idata_names->end(), name);
+        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(pos != m_idata_names->end(), "Soa Int name='" + name + "' was not found components");
+
+        int const index = std::distance(m_idata_names->begin(), pos);
+        return GetIntData(index);
+    }
+
+    /** Get read-only access to a particle Int component Array (compile-time or runtime component) by name
+     *
+     * @param name name of the component; it is asserted to be present in the list of Int names
+     * @return the Int array whose index equals the position of name in that list
+     */
+    [[nodiscard]] const IntVector& GetIntData (std::string const & name) const {
+        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_idata_names != nullptr, "SoA Int names were not defined.");
+        auto const pos = std::find(m_idata_names->begin(), m_idata_names->end(), name);
+        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(pos != m_idata_names->end(), "Soa Int name='" + name + "' was not found components");
+
+        int const index = std::distance(m_idata_names->begin(), pos);
+        return GetIntData(index);
+    }
+
     /**
     * \brief Returns the total number of particles (real and neighbor)
     *
@@ -226,13 +313,20 @@ struct StructOfArrays {
     int m_num_neighbor_particles{0};
 
 private:
+    // compile-time data
     IdCPU m_idcpu;
     std::array<RealVector, NReal> m_rdata;
     std::array< IntVector,  NInt> m_idata;
 
+    // runtime data
     std::vector<RealVector> m_runtime_rdata;
     std::vector<IntVector > m_runtime_idata;
 
+    // names of both compile-time and runtime Real and Int data
+    std::vector<std::string>* m_rdata_names = nullptr;
+    std::vector<std::string>* m_idata_names = nullptr;
+
+    //! whether the runtime components are sized correctly
     bool m_defined{false};
 };
 
diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt
index a5ec9e1cf4..9d6afba857 100644
--- a/Tests/CMakeLists.txt
+++ b/Tests/CMakeLists.txt
@@ -117,13 +117,17 @@ if (AMReX_TEST_TYPE STREQUAL "Small")
       add_subdirectory("LinearSolvers/ABecLaplacian_C")
    endif()
 
+   if (AMReX_FFT)
+      add_subdirectory("FFT/Poisson")
+   endif()
+
 else()
    #
    # List of subdirectories to search for CMakeLists.
    #
    set( AMREX_TESTS_SUBDIRS Amr AsyncOut CLZ CTOParFor DeviceGlobal Enum
                             MultiBlock MultiPeriod ParmParse Parser Parser2 Reinit
-                            RoundoffDomain)
+                            RoundoffDomain SmallMatrix)
 
    if (AMReX_PARTICLES)
      list(APPEND AMREX_TESTS_SUBDIRS Particles)
@@ -137,6 +141,10 @@ else()
       list(APPEND AMREX_TESTS_SUBDIRS LinearSolvers)
    endif ()
 
+   if (AMReX_FFT)
+      list(APPEND AMREX_TESTS_SUBDIRS FFT)
+   endif ()
+
    if (AMReX_HDF5)
       list(APPEND AMREX_TESTS_SUBDIRS HDF5Benchmark)
    endif ()
diff --git a/Tests/FFT/Poisson/CMakeLists.txt b/Tests/FFT/Poisson/CMakeLists.txt
new file mode 100644
index 0000000000..21a9d3b268
--- /dev/null
+++ b/Tests/FFT/Poisson/CMakeLists.txt
@@ -0,0 +1,10 @@
+foreach(D IN LISTS AMReX_SPACEDIM)
+    set(_sources  main.cpp)
+
+    set(_input_files)
+
+    setup_test(${D} _sources _input_files)
+
+    unset(_sources)
+    unset(_input_files)
+endforeach()
diff --git a/Tests/FFT/Poisson/GNUmakefile b/Tests/FFT/Poisson/GNUmakefile
new file mode 100644
index 0000000000..93376f4485
--- /dev/null
+++ b/Tests/FFT/Poisson/GNUmakefile
@@ -0,0 +1,26 @@
+AMREX_HOME := ../../..
+
+DEBUG	= FALSE
+
+DIM	= 3
+
+COMP    = gcc
+
+USE_MPI   = TRUE
+USE_OMP   = FALSE
+USE_CUDA  = FALSE
+USE_HIP   = FALSE
+USE_SYCL  = FALSE
+
+USE_FFT = TRUE
+
+BL_NO_FORT = TRUE
+
+TINY_PROFILE = FALSE
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.defs
+
+include ./Make.package
+include $(AMREX_HOME)/Src/Base/Make.package
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.rules
diff --git a/Tests/FFT/Poisson/Make.package b/Tests/FFT/Poisson/Make.package
new file mode 100644
index 0000000000..6b4b865e8f
--- /dev/null
+++ b/Tests/FFT/Poisson/Make.package
@@ -0,0 +1 @@
+CEXE_sources += main.cpp
diff --git a/Tests/FFT/Poisson/main.cpp b/Tests/FFT/Poisson/main.cpp
new file mode 100644
index 0000000000..1286d80dad
--- /dev/null
+++ b/Tests/FFT/Poisson/main.cpp
@@ -0,0 +1,148 @@
+#include <AMReX_FFT_Poisson.H> // Put this at the top for testing
+
+#include <AMReX.H>
+#include <AMReX_MultiFab.H>
+#include <AMReX_ParmParse.H>
+#include <AMReX_PlotFileUtil.H>
+
+using namespace amrex;
+
+int main (int argc, char* argv[])
+{
+    amrex::Initialize(argc, argv);
+    {
+        BL_PROFILE("main");
+
+        AMREX_D_TERM(int n_cell_x = 64;,
+                     int n_cell_y = 32;,
+                     int n_cell_z = 128);
+
+        AMREX_D_TERM(int max_grid_size_x = 32;,
+                     int max_grid_size_y = 32;,
+                     int max_grid_size_z = 32);
+
+        AMREX_D_TERM(Real prob_lo_x = 0.;,
+                     Real prob_lo_y = 0.;,
+                     Real prob_lo_z = 0.);
+        AMREX_D_TERM(Real prob_hi_x = 1.;,
+                     Real prob_hi_y = 1.;,
+                     Real prob_hi_z = 1.);
+
+        {
+            ParmParse pp;
+            AMREX_D_TERM(pp.query("n_cell_x", n_cell_x);,
+                         pp.query("n_cell_y", n_cell_y);,
+                         pp.query("n_cell_z", n_cell_z));
+            AMREX_D_TERM(pp.query("max_grid_size_x", max_grid_size_x);,
+                         pp.query("max_grid_size_y", max_grid_size_y);,
+                         pp.query("max_grid_size_z", max_grid_size_z));
+        }
+
+        Box domain(IntVect(0),IntVect(AMREX_D_DECL(n_cell_x-1,n_cell_y-1,n_cell_z-1)));
+        BoxArray ba(domain);
+        ba.maxSize(IntVect(AMREX_D_DECL(max_grid_size_x,
+                                        max_grid_size_y,
+                                        max_grid_size_z)));
+        DistributionMapping dm(ba);
+
+        Geometry geom;
+        {
+            geom.define(domain,
+                        RealBox(AMREX_D_DECL(prob_lo_x,prob_lo_y,prob_lo_z),
+                                AMREX_D_DECL(prob_hi_x,prob_hi_y,prob_hi_z)),
+                        CoordSys::cartesian, {AMREX_D_DECL(1,1,1)});
+        }
+        auto const& dx = geom.CellSizeArray();
+
+        MultiFab rhs(ba,dm,1,0);
+        MultiFab soln(ba,dm,1,0);
+        auto const& rhsma = rhs.arrays();
+        ParallelFor(rhs, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k)
+        {
+            AMREX_D_TERM(Real x = (i+0.5_rt) * dx[0] - 0.5_rt;,
+                         Real y = (j+0.5_rt) * dx[1] - 0.5_rt;,
+                         Real z = (k+0.5_rt) * dx[2] - 0.5_rt);
+            rhsma[b](i,j,k) = std::exp(-10._rt*
+                (AMREX_D_TERM(x*x*1.05_rt, + y*y*0.90_rt, + z*z)));
+        });
+
+        // Shift rhs so that its sum is zero.
+        auto rhosum = rhs.sum(0);
+        rhs.plus(-rhosum/geom.Domain().d_numPts(), 0, 1);
+
+#if (AMREX_SPACEDIM == 3)
+        Array<int,2> solvers{0,1}; // 0: FFT::Poisson, 1: FFT::PoissonHybrid
+#else
+        Array<int,1> solvers{0}; // one entry only: a 2-element array is zero-padded and would run solver 0 twice
+#endif
+
+        for (int solver_type : solvers) {
+            double tsetup, tsolve;
+            if (solver_type == 0) {
+                auto t0 = amrex::second();
+                FFT::Poisson<MultiFab> fft_poisson(geom);
+                auto t1 = amrex::second();
+                tsetup = t1-t0;
+
+                for (int n = 0; n < 2; ++n) {
+                    auto ta = amrex::second();
+                    fft_poisson.solve(soln, rhs);
+                    auto tb = amrex::second();
+                    tsolve = tb-ta;
+                }
+            } else {
+                auto t0 = amrex::second();
+                FFT::PoissonHybrid<MultiFab> fft_poisson(geom);
+                auto t1 = amrex::second();
+                tsetup = t1-t0;
+
+                for (int n = 0; n < 2; ++n) {
+                    auto ta = amrex::second();
+                    fft_poisson.solve(soln, rhs);
+                    auto tb = amrex::second();
+                    tsolve = tb-ta;
+                }
+            }
+
+            amrex::Print() << "  AMReX FFT setup time: " << tsetup
+                           << ", solve time " << tsolve << "\n";
+
+            MultiFab phi(soln.boxArray(), soln.DistributionMap(), 1, 1);
+            MultiFab res(soln.boxArray(), soln.DistributionMap(), 1, 0);
+            MultiFab::Copy(phi, soln, 0, 0, 1, 0);
+            phi.FillBoundary(geom.periodicity());
+            auto const& res_ma = res.arrays();
+            auto const& phi_ma = phi.const_arrays();
+            auto const& rhs_ma = rhs.const_arrays();
+            ParallelFor(res, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k)
+            {
+                auto const& phia = phi_ma[b];
+                auto lap = (phia(i-1,j,k)-2._rt*phia(i,j,k)+phia(i+1,j,k)) / (dx[0]*dx[0]);
+#if (AMREX_SPACEDIM >= 2)
+                lap += (phia(i,j-1,k)-2._rt*phia(i,j,k)+phia(i,j+1,k)) / (dx[1]*dx[1]);
+#endif
+#if (AMREX_SPACEDIM == 3)
+                if ((solver_type == 1) && (k == 0)) { // Neumann
+                    lap += (-phia(i,j,k)+phia(i,j,k+1)) / (dx[2]*dx[2]);
+                } else if ((solver_type == 1) && ((k+1) == n_cell_z)) { // Neumann
+                    lap += (phia(i,j,k-1)-phia(i,j,k)) / (dx[2]*dx[2]);
+                } else {
+                    lap += (phia(i,j,k-1)-2._rt*phia(i,j,k)+phia(i,j,k+1)) / (dx[2]*dx[2]);
+                }
+#endif
+                res_ma[b](i,j,k) = rhs_ma[b](i,j,k) - lap;
+            });
+            auto bnorm = rhs.norminf();
+            auto rnorm = res.norminf();
+            amrex::Print() << "  rhs inf norm " << bnorm << "\n"
+                           << "  res inf norm " << rnorm << "\n";
+#ifdef AMREX_USE_FLOAT
+            auto eps = 2.e-3f;
+#else
+            auto eps = 1.e-11;
+#endif
+            AMREX_ALWAYS_ASSERT(rnorm < eps*bnorm);
+        }
+    }
+    amrex::Finalize();
+}
diff --git a/Tests/FFT/R2C/CMakeLists.txt b/Tests/FFT/R2C/CMakeLists.txt
new file mode 100644
index 0000000000..21a9d3b268
--- /dev/null
+++ b/Tests/FFT/R2C/CMakeLists.txt
@@ -0,0 +1,10 @@
+foreach(D IN LISTS AMReX_SPACEDIM)
+    set(_sources  main.cpp)
+
+    set(_input_files)
+
+    setup_test(${D} _sources _input_files)
+
+    unset(_sources)
+    unset(_input_files)
+endforeach()
diff --git a/Tests/FFT/R2C/GNUmakefile b/Tests/FFT/R2C/GNUmakefile
new file mode 100644
index 0000000000..93376f4485
--- /dev/null
+++ b/Tests/FFT/R2C/GNUmakefile
@@ -0,0 +1,26 @@
+AMREX_HOME := ../../..
+
+DEBUG	= FALSE
+
+DIM	= 3
+
+COMP    = gcc
+
+USE_MPI   = TRUE
+USE_OMP   = FALSE
+USE_CUDA  = FALSE
+USE_HIP   = FALSE
+USE_SYCL  = FALSE
+
+USE_FFT = TRUE
+
+BL_NO_FORT = TRUE
+
+TINY_PROFILE = FALSE
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.defs
+
+include ./Make.package
+include $(AMREX_HOME)/Src/Base/Make.package
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.rules
diff --git a/Tests/FFT/R2C/Make.package b/Tests/FFT/R2C/Make.package
new file mode 100644
index 0000000000..6b4b865e8f
--- /dev/null
+++ b/Tests/FFT/R2C/Make.package
@@ -0,0 +1 @@
+CEXE_sources += main.cpp
diff --git a/Tests/FFT/R2C/main.cpp b/Tests/FFT/R2C/main.cpp
new file mode 100644
index 0000000000..7103038575
--- /dev/null
+++ b/Tests/FFT/R2C/main.cpp
@@ -0,0 +1,126 @@
+#include <AMReX_FFT.H> // Put this at the top for testing
+
+#include <AMReX.H>
+#include <AMReX_MultiFab.H>
+#include <AMReX_ParmParse.H>
+#include <AMReX_PlotFileUtil.H>
+
+using namespace amrex;
+
+int main (int argc, char* argv[])
+{
+    amrex::Initialize(argc, argv);
+    {
+        BL_PROFILE("main");
+
+        AMREX_D_TERM(int n_cell_x = 128;,
+                     int n_cell_y = 32;,
+                     int n_cell_z = 64);
+
+        AMREX_D_TERM(int max_grid_size_x = 32;,
+                     int max_grid_size_y = 32;,
+                     int max_grid_size_z = 32);
+
+        AMREX_D_TERM(Real prob_lo_x = 0.;,
+                     Real prob_lo_y = 0.;,
+                     Real prob_lo_z = 0.);
+        AMREX_D_TERM(Real prob_hi_x = 1.;,
+                     Real prob_hi_y = 1.;,
+                     Real prob_hi_z = 1.);
+
+        {
+            ParmParse pp;
+            AMREX_D_TERM(pp.query("n_cell_x", n_cell_x);,
+                         pp.query("n_cell_y", n_cell_y);,
+                         pp.query("n_cell_z", n_cell_z));
+            AMREX_D_TERM(pp.query("max_grid_size_x", max_grid_size_x);,
+                         pp.query("max_grid_size_y", max_grid_size_y);,
+                         pp.query("max_grid_size_z", max_grid_size_z));
+        }
+
+        Box domain(IntVect(0),IntVect(AMREX_D_DECL(n_cell_x-1,n_cell_y-1,n_cell_z-1)));
+        BoxArray ba(domain);
+        ba.maxSize(IntVect(AMREX_D_DECL(max_grid_size_x,
+                                        max_grid_size_y,
+                                        max_grid_size_z)));
+        DistributionMapping dm(ba);
+
+        Geometry geom;
+        {
+            geom.define(domain,
+                        RealBox(AMREX_D_DECL(prob_lo_x,prob_lo_y,prob_lo_z),
+                                AMREX_D_DECL(prob_hi_x,prob_hi_y,prob_hi_z)),
+                        CoordSys::cartesian, {AMREX_D_DECL(1,1,1)});
+        }
+        auto const& dx = geom.CellSizeArray();
+
+        MultiFab mf(ba,dm,1,0);
+        auto const& ma = mf.arrays();
+        ParallelFor(mf, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k)
+        {
+            AMREX_D_TERM(Real x = (i+0.5_rt) * dx[0] - 0.5_rt;,
+                         Real y = (j+0.5_rt) * dx[1] - 0.5_rt;,
+                         Real z = (k+0.5_rt) * dx[2] - 0.5_rt);
+            ma[b](i,j,k) = std::exp(-10._rt*
+                (AMREX_D_TERM(x*x*1.05_rt, + y*y*0.90_rt, + z*z)));
+        });
+
+        MultiFab mf2(ba,dm,1,0);
+
+        auto scaling = Real(1) / Real(geom.Domain().d_numPts());
+
+        {
+            cMultiFab cmf(ba,dm,1,0);
+
+            // forward
+            {
+                FFT::R2C<Real,FFT::Direction::forward> r2c(geom.Domain());
+                r2c.forward(mf,cmf);
+            }
+
+            // backward
+            {
+                FFT::R2C<Real,FFT::Direction::backward> r2c(geom.Domain());
+                r2c.backward(cmf,mf2);
+            }
+
+            auto const& ma2 = mf2.arrays();
+            ParallelFor(mf2, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k)
+            {
+                ma2[b](i,j,k) = ma[b](i,j,k) - ma2[b](i,j,k)*scaling;
+            });
+
+            auto error = mf2.norminf();
+            amrex::Print() << "  Expected to be close to zero: " << error << "\n";
+#ifdef AMREX_USE_FLOAT
+            auto eps = 1.e-6f;
+#else
+            auto eps = 1.e-13;
+#endif
+            AMREX_ALWAYS_ASSERT(error < eps);
+        }
+
+        mf2.setVal(std::numeric_limits<Real>::max());
+
+        { // forward and backward
+            FFT::R2C<Real,FFT::Direction::both> r2c(geom.Domain());
+            r2c.forwardThenBackward(mf, mf2,
+                                    [=] AMREX_GPU_DEVICE (int, int, int, auto& sp)
+            {
+                sp *= scaling;
+            });
+
+            MultiFab::Subtract(mf2, mf, 0, 0, 1, 0);
+
+            auto error = mf2.norminf();
+            amrex::Print() << "  Expected to be close to zero: " << error << "\n";
+#ifdef AMREX_USE_FLOAT
+            auto eps = 1.e-6f;
+#else
+            auto eps = 1.e-13;
+#endif
+            AMREX_ALWAYS_ASSERT(error < eps);
+        }
+    }
+    amrex::Finalize();
+}
diff --git a/Tests/LinearSolvers/CurlCurl/CMakeLists.txt b/Tests/LinearSolvers/CurlCurl/CMakeLists.txt
index 9dacdeb2fe..d7b1a912ed 100644
--- a/Tests/LinearSolvers/CurlCurl/CMakeLists.txt
+++ b/Tests/LinearSolvers/CurlCurl/CMakeLists.txt
@@ -1,5 +1,5 @@
 foreach(D IN LISTS AMReX_SPACEDIM)
-    if (D EQUAL 1)
+    if (D EQUAL 1 OR NOT AMReX_LINEAR_SOLVERS_EM)
        return()
     endif ()
 
diff --git a/Tests/LinearSolvers/NodalPoisson/CMakeLists.txt b/Tests/LinearSolvers/NodalPoisson/CMakeLists.txt
index d15b7d8e64..f42bd1fecc 100644
--- a/Tests/LinearSolvers/NodalPoisson/CMakeLists.txt
+++ b/Tests/LinearSolvers/NodalPoisson/CMakeLists.txt
@@ -1,5 +1,5 @@
 foreach(D IN LISTS AMReX_SPACEDIM)
-    if(D EQUAL 1)
+    if(D EQUAL 1 OR NOT AMReX_LINEAR_SOLVERS_INCFLO)
        continue()
     endif()
 
diff --git a/Tests/LinearSolvers/Nodal_Projection_EB/CMakeLists.txt b/Tests/LinearSolvers/Nodal_Projection_EB/CMakeLists.txt
index 3a8b331e45..d244b7573a 100644
--- a/Tests/LinearSolvers/Nodal_Projection_EB/CMakeLists.txt
+++ b/Tests/LinearSolvers/Nodal_Projection_EB/CMakeLists.txt
@@ -1,4 +1,4 @@
-if ( (NOT AMReX_EB) OR NOT (3 IN_LIST AMReX_SPACEDIM))
+if ( (NOT AMReX_EB) OR (NOT AMReX_LINEAR_SOLVERS_INCFLO) OR NOT (3 IN_LIST AMReX_SPACEDIM))
    return()
 endif ()
 
diff --git a/Tests/LinearSolvers/NodeTensorLap/CMakeLists.txt b/Tests/LinearSolvers/NodeTensorLap/CMakeLists.txt
index 956ea25072..4d40669a0c 100644
--- a/Tests/LinearSolvers/NodeTensorLap/CMakeLists.txt
+++ b/Tests/LinearSolvers/NodeTensorLap/CMakeLists.txt
@@ -1,6 +1,6 @@
 if (AMReX_GPU_BACKEND STREQUAL NONE)
     foreach(D IN LISTS AMReX_SPACEDIM)
-        if(D EQUAL 1)
+        if(D EQUAL 1 OR NOT AMReX_LINEAR_SOLVERS_EM)
             continue()
         endif()
 
diff --git a/Tests/Particles/NamedSoAComponents/CMakeLists.txt b/Tests/Particles/NamedSoAComponents/CMakeLists.txt
new file mode 100644
index 0000000000..e14ddd6897
--- /dev/null
+++ b/Tests/Particles/NamedSoAComponents/CMakeLists.txt
@@ -0,0 +1,10 @@
+foreach(D IN LISTS AMReX_SPACEDIM)
+    set(_sources     main.cpp)
+    #set(_input_files)
+    #set(_input_files inputs)
+
+    setup_test(${D} _sources _input_files NTHREADS 2)
+
+    unset(_sources)
+    unset(_input_files)
+endforeach()
diff --git a/Tests/Particles/NamedSoAComponents/GNUmakefile b/Tests/Particles/NamedSoAComponents/GNUmakefile
new file mode 100644
index 0000000000..9f49d3ec02
--- /dev/null
+++ b/Tests/Particles/NamedSoAComponents/GNUmakefile
@@ -0,0 +1,22 @@
+AMREX_HOME = ../../../
+
+DEBUG	= FALSE
+
+DIM	= 3
+
+COMP    = gcc
+
+USE_MPI   = TRUE
+USE_OMP   = FALSE
+USE_CUDA  = FALSE
+
+#TINY_PROFILE = TRUE
+USE_PARTICLES = TRUE
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.defs
+
+include ./Make.package
+include $(AMREX_HOME)/Src/Base/Make.package
+include $(AMREX_HOME)/Src/Particle/Make.package
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.rules
diff --git a/Tests/Particles/NamedSoAComponents/Make.package b/Tests/Particles/NamedSoAComponents/Make.package
new file mode 100644
index 0000000000..6b4b865e8f
--- /dev/null
+++ b/Tests/Particles/NamedSoAComponents/Make.package
@@ -0,0 +1 @@
+CEXE_sources += main.cpp
diff --git a/Tests/Particles/NamedSoAComponents/main.cpp b/Tests/Particles/NamedSoAComponents/main.cpp
new file mode 100644
index 0000000000..39b6a70ff5
--- /dev/null
+++ b/Tests/Particles/NamedSoAComponents/main.cpp
@@ -0,0 +1,139 @@
+#include <AMReX.H>
+#include <AMReX_Particle.H>
+#include <AMReX_ParticleContainer.H>
+#include <AMReX_ParticleTile.H>
+#include <AMReX_ParIter.H>
+#include <AMReX_REAL.H>
+#include <AMReX_Vector.H>
+#include <AMReX_GpuContainers.H>
+
+#include <array>
+
+using namespace amrex;
+
+void addParticles ()
+{
+    using PC = ParticleContainerPureSoA<AMREX_SPACEDIM+1, 2>;
+    int is_per[AMREX_SPACEDIM];
+    for (int & d : is_per) {
+        d = 1;
+    }
+
+    RealBox real_box;
+    for (int n = 0; n < AMREX_SPACEDIM; n++)
+    {
+        real_box.setLo(n, 0.0);
+        real_box.setHi(n, 100.0);
+    }
+
+    IntVect domain_lo(AMREX_D_DECL(0, 0, 0));
+    IntVect domain_hi(AMREX_D_DECL(127, 127, 127));
+    const Box base_domain(domain_lo, domain_hi);
+
+    Geometry geom(base_domain, &real_box, CoordSys::cartesian, is_per);
+    BoxArray ba(base_domain);
+    ba.maxSize(64);
+
+    DistributionMapping dm(ba);
+
+    PC pc(geom, dm, ba);
+
+    amrex::Print() << "Original Real SoA component names are: ";
+    for (auto& n : pc.GetRealSoANames()) {
+        amrex::Print() << n << ", ";
+    }
+    amrex::Print() << "\n";
+
+    amrex::Print() << "Original Int SoA component names are: ";
+    for (auto& n : pc.GetIntSoANames()) {
+        amrex::Print() << n << ", ";
+    }
+    amrex::Print() << "\n";
+
+    amrex::Print() << "Adding runtime comps. \n";
+    pc.AddRealComp("real_comp1");
+    pc.AddRealComp(); // without name - should be real_comp2
+    pc.AddIntComp(); // without name - should be int_comp0
+
+    amrex::Print() << "New Real SoA component names are: ";
+    for (auto& n : pc.GetRealSoANames()) {
+        amrex::Print() << n << ", ";
+    }
+    amrex::Print() << "\n";
+
+    amrex::Print() << "New Int SoA component names are: ";
+    for (auto& n : pc.GetIntSoANames()) {
+        amrex::Print() << n << ", ";
+    }
+    amrex::Print() << "\n";
+
+    amrex::Print() << "Reset compile-time SoA names \n";
+    pc.SetSoACompileTimeNames({AMREX_D_DECL("x", "y", "z"), "w"}, {"i1", "i2"});
+
+    amrex::Print() << "New Real SoA component names are: ";
+    for (auto& n : pc.GetRealSoANames()) {
+        amrex::Print() << n << ", ";
+    }
+    amrex::Print() << "\n";
+
+    amrex::Print() << "New Int SoA component names are: ";
+    for (auto& n : pc.GetIntSoANames()) {
+        amrex::Print() << n << ", ";
+    }
+    amrex::Print() << "\n";
+
+    int const NArrayReal = PC::NArrayReal;
+    int const NArrayInt = PC::NArrayInt;
+    using ParticleType = typename PC::ParticleType;
+
+    const int add_num_particles = 5;
+    auto& ptile1 = pc.DefineAndReturnParticleTile(0, 0, 0);
+    ptile1.resize(add_num_particles);
+
+    for (int i = 0; i < add_num_particles; ++i)
+    {
+        for (int d = 0; d < AMREX_SPACEDIM; d++) {
+            ptile1.pos(i, d) = 12.0;
+        }
+        ptile1.getParticleTileData().rdata(AMREX_SPACEDIM)[i] = 1.2;  // w
+
+        ptile1.push_back_int(0, int(ParticleType::NextID()));
+        ptile1.push_back_int(1, amrex::ParallelDescriptor::MyProc());
+    }
+
+    int lev=0;
+    using MyParIter = ParIter_impl<ParticleType, NArrayReal, NArrayInt>;
+    for (MyParIter pti(pc, lev); pti.isValid(); ++pti) {
+        auto& soa = pti.GetStructOfArrays();
+        AMREX_D_TERM(
+                     auto *xp = soa.GetRealData("x").data();,
+                     auto *yp = soa.GetRealData("y").data();,
+                     auto *zp = soa.GetRealData("z").data();
+                     );
+        auto *wp = soa.GetRealData("w").data();
+
+        const int np = pti.numParticles();
+        ParallelFor( np, [=] AMREX_GPU_DEVICE (long ip)
+        {
+            AMREX_D_TERM(
+                         AMREX_ALWAYS_ASSERT_WITH_MESSAGE(xp[ip] == 12_prt,
+                                                          "pos attribute expected to be 12");,
+                         AMREX_ALWAYS_ASSERT_WITH_MESSAGE(yp[ip] == 12_prt,
+                                                          "pos attribute expected to be 12");,
+                         AMREX_ALWAYS_ASSERT_WITH_MESSAGE(zp[ip] == 12_prt,
+                                                          "pos attribute expected to be 12");
+                         );
+            AMREX_ALWAYS_ASSERT_WITH_MESSAGE(wp[ip] == 1.2_prt,
+                                             "pos attribute expected to be 1.2");
+        });
+    }
+}
+
+int main (int argc, char* argv[])
+ {
+    amrex::Initialize(argc,argv);
+    {
+        addParticles();
+    }
+    amrex::Finalize();
+ }
diff --git a/Tests/SmallMatrix/CMakeLists.txt b/Tests/SmallMatrix/CMakeLists.txt
new file mode 100644
index 0000000000..224c4563c8
--- /dev/null
+++ b/Tests/SmallMatrix/CMakeLists.txt
@@ -0,0 +1,9 @@
+foreach(D IN LISTS AMReX_SPACEDIM)
+    set(_sources     main.cpp)
+    set(_input_files)
+
+    setup_test(${D} _sources _input_files)
+
+    unset(_sources)
+    unset(_input_files)
+endforeach()
diff --git a/Tests/SmallMatrix/GNUmakefile b/Tests/SmallMatrix/GNUmakefile
new file mode 100644
index 0000000000..d0d895ff52
--- /dev/null
+++ b/Tests/SmallMatrix/GNUmakefile
@@ -0,0 +1,24 @@
+AMREX_HOME := ../..
+
+DEBUG	= FALSE
+
+DIM	= 3
+
+COMP    = gcc
+
+USE_MPI   = FALSE
+USE_OMP   = FALSE
+USE_CUDA  = FALSE
+USE_HIP   = FALSE
+USE_SYCL  = FALSE
+
+BL_NO_FORT = TRUE
+
+TINY_PROFILE = FALSE
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.defs
+
+include ./Make.package
+include $(AMREX_HOME)/Src/Base/Make.package
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.rules
diff --git a/Tests/SmallMatrix/Make.package b/Tests/SmallMatrix/Make.package
new file mode 100644
index 0000000000..6b4b865e8f
--- /dev/null
+++ b/Tests/SmallMatrix/Make.package
@@ -0,0 +1 @@
+CEXE_sources += main.cpp
diff --git a/Tests/SmallMatrix/main.cpp b/Tests/SmallMatrix/main.cpp
new file mode 100644
index 0000000000..505ae50bd6
--- /dev/null
+++ b/Tests/SmallMatrix/main.cpp
@@ -0,0 +1,306 @@
+#include <AMReX.H>
+#include <AMReX_Math.H>
+#include <AMReX_Print.H>
+#include <AMReX_REAL.H>
+#include <AMReX_SmallMatrix.H>
+
+using namespace amrex;
+
+int main (int argc, char* argv[])
+{
+    static_assert(Order::C == Order::RowMajor &&
+                  Order::F == Order::ColumnMajor);
+
+    amrex::Initialize(argc, argv);
+    // 0-based indexing
+    {
+        SmallMatrix<Real,3,4> m34{};
+        for (int j = 0; j < 4; ++j) {
+            for (int i = 0; i < 3; ++i) {
+                AMREX_ALWAYS_ASSERT(m34(i,j) == 0.0_rt);
+            }
+        }
+    }
+    {
+        SmallVector<Real,3> cv{};
+        SmallRowVector<Real,3> rv{};
+        SmallVector<int,3> cv2{1,2,3};
+        SmallRowVector<int,3> rv2{0,10,20};
+        SmallVector<int,5> cv3{0,1,2};
+        for (int j = 0; j < 3; ++j) {
+            AMREX_ALWAYS_ASSERT(cv(j) == 0.0_rt &&
+                                rv(j) == 0.0_rt &&
+                                cv2(j) == j+1 &&
+                                rv2(j) == j*10 &&
+                                cv3(j) == j);
+        }
+        AMREX_ALWAYS_ASSERT(cv3(3) == 0 && cv3(4) == 0);
+    }
+    {
+        SmallMatrix<int,3,4> m34{{0,3,6,9},
+                                 {1,4,7,10},
+                                 {2,5,8,11}};
+        int v = 0;
+        for (int j = 0; j < 4; ++j) {
+            for (int i = 0; i < 3; ++i) {
+                AMREX_ALWAYS_ASSERT(m34(i,j) == v++);
+            }
+        }
+        std::cout << m34;
+    }
+    {
+        SmallMatrix<int,3,4,Order::C> m34{{0,1,2,3},
+                                          {4,5,6,7},
+                                          {8,9,10,11}};
+        int v = 0;
+        for (int i = 0; i < 3; ++i) {
+            for (int j = 0; j < 4; ++j) {
+                AMREX_ALWAYS_ASSERT(m34(i,j) == v++);
+            }
+        }
+    }
+    {
+        auto v3 = SmallVector<double,3>::Zero();
+        v3[0] = 1.;
+        v3(1) = 2.;
+        v3[2] = 3.;
+        auto m33 = SmallMatrix<double,3,3>::Identity();
+        auto r = m33*v3;
+        AMREX_ALWAYS_ASSERT(almostEqual(r[0],v3[0]) &&
+                            almostEqual(r[1],v3[1]) &&
+                            almostEqual(r[2],v3[2]));
+    }
+    {
+        SmallMatrix<int,4,3,Order::C> A{{1, 0, 1},
+                                        {2, 1, 1},
+                                        {0, 1, 1},
+                                        {1, 1, 2}};
+        SmallMatrix<int,3,3,Order::C> B{{1, 2, 1},
+                                        {2, 3, 1},
+                                        {4, 2, 2}};
+        SmallMatrix<int,3,1,Order::C> C{10, 8, 6};
+        auto ABC = A*B*C;
+        AMREX_ALWAYS_ASSERT(ABC(0,0) == 100 &&
+                            ABC(1,0) == 182 &&
+                            ABC(2,0) == 118 &&
+                            ABC(3,0) == 218);
+    }
+    {
+        SmallMatrix<int,3,4,Order::F> A{{1, 2, 0, 1},
+                                        {0, 1, 1, 1},
+                                        {1, 1, 1, 2}};
+        SmallMatrix<int,3,3,Order::F> B{{1, 2, 4},
+                                        {2, 3, 2},
+                                        {1, 1, 2}};
+        SmallMatrix<int,1,3,Order::F> C{10, 8, 6};
+        auto ABC = A.transpose()*B.transposeInPlace()*C.transpose();
+        AMREX_ALWAYS_ASSERT(ABC(0,0) == 100 &&
+                            ABC(1,0) == 182 &&
+                            ABC(2,0) == 118 &&
+                            ABC(3,0) == 218);
+    }
+    {
+        SmallMatrix<int, 3, 4> m;
+        m.setVal(2);
+        using M = decltype(m);
+        AMREX_ALWAYS_ASSERT(m.product() == Math::powi<M::row_size*M::column_size>(2));
+        AMREX_ALWAYS_ASSERT(m.sum() == 2*m.row_size*m.column_size);
+    }
+    {
+        SmallMatrix<double, 5, 5> m{{1.0, 3.4, 4.5, 5.6, 6.7},
+                                    {1.3, 2.0, 4.5, 5.6, 6.7},
+                                    {1.3, 1.0, 3.0, 5.6, 6.7},
+                                    {1.3, 1.4, 4.5, 4.0, 6.7},
+                                    {1.3, 1.0, 4.5, 5.6, 5.0}};
+        AMREX_ALWAYS_ASSERT(m.trace() == double(1+2+3+4+5));
+    }
+    {
+        SmallMatrix<int,2,3> a{{+1, +2, +3},
+                               {+7, +8, +9}};
+        SmallMatrix<int,2,3> b{{-1, -2, -3},
+                               {-7, -8, -9}};
+        auto c = a*2 + 2*b;
+        for (auto const& x : c) {
+            AMREX_ALWAYS_ASSERT(x == 0);
+        }
+    }
+    {
+        SmallMatrix<int,2,3> a{{+1, +2, +3},
+                               {+7, +8, +9}};
+        SmallMatrix<int,2,3> b{{-1, -2, -3},
+                               {-7, -8, -9}};
+        auto c = -a - b;
+        for (auto const& x : c) {
+            AMREX_ALWAYS_ASSERT(x == 0);
+        }
+    }
+    {
+        SmallMatrix<int,2,3> a{{+1, +2, +3},
+                               {+7, +8, +9}};
+        SmallMatrix<int,2,3> b;
+        b.setVal(-1);
+        AMREX_ALWAYS_ASSERT(a.dot(b) == -30);
+    }
+    {
+        SmallVector<int, 3> v{10,20,30};
+        auto const& [x,y,z] = v;
+        AMREX_ALWAYS_ASSERT(x == 10 && y == 20 && z == 30);
+
+        auto& [a,b,c] = v;
+        a = 100; b = 200; c = 300;
+        AMREX_ALWAYS_ASSERT(v[0] == 100 && v[1] == 200 && v[2] == 300);
+
+        auto const [i,j,k] = v;
+        AMREX_ALWAYS_ASSERT(i == 100 && j == 200 && k == 300);
+
+        auto [d,e,f] = v;
+        AMREX_ALWAYS_ASSERT(d == 100 && e == 200 && f == 300);
+    }
+
+    // 1-based indexing
+    {
+        SmallMatrix<Real,3,4,Order::F,1> m34{};
+        for (int j = 1; j <= 4; ++j) {
+            for (int i = 1; i <= 3; ++i) {
+                AMREX_ALWAYS_ASSERT(m34(i,j) == 0.0_rt);
+            }
+        }
+    }
+    {
+        SmallVector<Real,3,1> cv{};
+        SmallRowVector<Real,3,1> rv{};
+        SmallVector<int,3,1> cv2{1,2,3};
+        SmallRowVector<int,3,1> rv2{0,10,20};
+        SmallVector<int,5,1> cv3{0,1,2};
+        for (int j = 0; j < 3; ++j) {
+            AMREX_ALWAYS_ASSERT(cv(j+1) == 0.0_rt &&
+                                rv(j+1) == 0.0_rt &&
+                                cv2(j+1) == j+1 &&
+                                rv2(j+1) == j*10 &&
+                                cv3(j+1) == j);
+        }
+        AMREX_ALWAYS_ASSERT(cv3(4) == 0 && cv3(5) == 0);
+    }
+    {
+        SmallMatrix<int,3,4,Order::F,1> m34{{0,3,6,9},
+                                            {1,4,7,10},
+                                            {2,5,8,11}};
+        int v = 0;
+        for (int j = 1; j <= 4; ++j) {
+            for (int i = 1; i <= 3; ++i) {
+                AMREX_ALWAYS_ASSERT(m34(i,j) == v++);
+            }
+        }
+        std::cout << m34;
+    }
+    {
+        SmallMatrix<int,3,4,Order::C,1> m34{{0,1,2,3},
+                                            {4,5,6,7},
+                                            {8,9,10,11}};
+        int v = 0;
+        for (int i = 1; i <= 3; ++i) {
+            for (int j = 1; j <= 4; ++j) {
+                AMREX_ALWAYS_ASSERT(m34(i,j) == v++);
+            }
+        }
+    }
+    {
+        auto v3 = SmallVector<double,3,1>::Zero();
+        v3[1] = 1.;
+        v3(2) = 2.;
+        v3[3] = 3.;
+        auto m33 = SmallMatrix<double,3,3,Order::F,1>::Identity();
+        auto r = m33*v3;
+        AMREX_ALWAYS_ASSERT(almostEqual(r[1],v3[1]) &&
+                            almostEqual(r[2],v3[2]) &&
+                            almostEqual(r[3],v3[3]));
+    }
+    {
+        SmallMatrix<int,4,3,Order::C,1> A{{1, 0, 1},
+                                          {2, 1, 1},
+                                          {0, 1, 1},
+                                          {1, 1, 2}};
+        SmallMatrix<int,3,3,Order::C,1> B{{1, 2, 1},
+                                          {2, 3, 1},
+                                          {4, 2, 2}};
+        SmallMatrix<int,3,1,Order::C,1> C{10, 8, 6};
+        auto ABC = A*B*C;
+        AMREX_ALWAYS_ASSERT(ABC(1,1) == 100 &&
+                            ABC(2,1) == 182 &&
+                            ABC(3,1) == 118 &&
+                            ABC(4,1) == 218);
+    }
+    {
+        SmallMatrix<int,3,4,Order::F,1> A{{1, 2, 0, 1},
+                                          {0, 1, 1, 1},
+                                          {1, 1, 1, 2}};
+        SmallMatrix<int,3,3,Order::F,1> B{{1, 2, 4},
+                                          {2, 3, 2},
+                                          {1, 1, 2}};
+        SmallMatrix<int,1,3,Order::F,1> C{10, 8, 6};
+        auto ABC = A.transpose()*B.transposeInPlace()*C.transpose();
+        AMREX_ALWAYS_ASSERT(ABC(1,1) == 100 &&
+                            ABC(2,1) == 182 &&
+                            ABC(3,1) == 118 &&
+                            ABC(4,1) == 218);
+    }
+    {
+        SmallMatrix<int, 3, 4, Order::F, 1> m;
+        m.setVal(2);
+        using M = decltype(m);
+        AMREX_ALWAYS_ASSERT(m.product() == Math::powi<M::row_size*M::column_size>(2));
+        AMREX_ALWAYS_ASSERT(m.sum() == 2*m.row_size*m.column_size);
+    }
+    {
+        SmallMatrix<double, 5, 5, Order::F, 1> m{{1.0, 3.4, 4.5, 5.6, 6.7},
+                                                 {1.3, 2.0, 4.5, 5.6, 6.7},
+                                                 {1.3, 1.0, 3.0, 5.6, 6.7},
+                                                 {1.3, 1.4, 4.5, 4.0, 6.7},
+                                                 {1.3, 1.0, 4.5, 5.6, 5.0}};
+        AMREX_ALWAYS_ASSERT(m.trace() == double(1+2+3+4+5));
+    }
+    {
+        SmallMatrix<int,2,3,Order::F,1> a{{+1, +2, +3},
+                                          {+7, +8, +9}};
+        SmallMatrix<int,2,3,Order::F,1> b{{-1, -2, -3},
+                                          {-7, -8, -9}};
+        auto c = a*2 + 2*b;
+        for (auto const& x : c) {
+            AMREX_ALWAYS_ASSERT(x == 0);
+        }
+    }
+    {
+        SmallMatrix<int,2,3,Order::F,1> a{{+1, +2, +3},
+                                          {+7, +8, +9}};
+        SmallMatrix<int,2,3,Order::F,1> b{{-1, -2, -3},
+                                          {-7, -8, -9}};
+        auto c = -a - b;
+        for (auto const& x : c) {
+            AMREX_ALWAYS_ASSERT(x == 0);
+        }
+    }
+    {
+        SmallMatrix<int,2,3,Order::F,1> a{{+1, +2, +3},
+                                          {+7, +8, +9}};
+        SmallMatrix<int,2,3,Order::F,1> b;
+        b.setVal(-1);
+        AMREX_ALWAYS_ASSERT(a.dot(b) == -30);
+    }
+    {
+        SmallVector<int, 3, 1> v{10,20,30};
+        auto const& [x,y,z] = v;
+        AMREX_ALWAYS_ASSERT(x == 10 && y == 20 && z == 30);
+
+        auto& [a,b,c] = v;
+        a = 100; b = 200; c = 300;
+        AMREX_ALWAYS_ASSERT(v[1] == 100 && v[2] == 200 && v[3] == 300);
+
+        auto const [i,j,k] = v;
+        AMREX_ALWAYS_ASSERT(i == 100 && j == 200 && k == 300);
+
+        auto [d,e,f] = v;
+        AMREX_ALWAYS_ASSERT(d == 100 && e == 200 && f == 300);
+    }
+
+    amrex::Finalize();
+}
diff --git a/Tools/CMake/AMReXConfig.cmake.in b/Tools/CMake/AMReXConfig.cmake.in
index f5045b715c..96fb12cbf7 100644
--- a/Tools/CMake/AMReXConfig.cmake.in
+++ b/Tools/CMake/AMReXConfig.cmake.in
@@ -74,6 +74,9 @@ set(AMReX_AMRLEVEL_FOUND            @AMReX_AMRLEVEL@)
 set(AMReX_EB_FOUND                  @AMReX_EB@)
 set(AMReX_FINTERFACES_FOUND         @AMReX_FORTRAN_INTERFACES@)
 set(AMReX_LSOLVERS_FOUND            @AMReX_LINEAR_SOLVERS@)
+set(AMReX_LSOLVERS_INCFLO_FOUND     @AMReX_LINEAR_SOLVERS_INCFLO@)
+set(AMReX_LSOLVERS_EM_FOUND         @AMReX_LINEAR_SOLVERS_EM@)
+set(AMReX_FFT_FOUND                 @AMReX_FFT@)
 set(AMReX_AMRDATA_FOUND             @AMReX_AMRDATA@)
 set(AMReX_PARTICLES_FOUND           @AMReX_PARTICLES@)
 set(AMReX_P@AMReX_PARTICLES_PRECISION@_FOUND ON)
@@ -129,6 +132,9 @@ set(AMReX_AMRLEVEL                  @AMReX_AMRLEVEL@)
 set(AMReX_EB                        @AMReX_EB@)
 set(AMReX_FINTERFACES               @AMReX_FORTRAN_INTERFACES@)
 set(AMReX_LSOLVERS                  @AMReX_LINEAR_SOLVERS@)
+set(AMReX_LSOLVERS_INCFLO           @AMReX_LINEAR_SOLVERS_INCFLO@)
+set(AMReX_LSOLVERS_EM               @AMReX_LINEAR_SOLVERS_EM@)
+set(AMReX_FFT                       @AMReX_FFT@)
 set(AMReX_AMRDATA                   @AMReX_AMRDATA@)
 set(AMReX_PARTICLES                 @AMReX_PARTICLES@)
 set(AMReX_PARTICLES_PRECISION       @AMReX_PARTICLES_PRECISION@)
@@ -212,6 +218,12 @@ if (@AMReX_CONDUIT@)
    find_dependency(Conduit REQUIRED)
 endif ()
 
+if (@AMReX_FFT@)
+    if (@AMReX_GPU_BACKEND@ STREQUAL NONE)
+        find_dependency(AMReXFFTW REQUIRED)
+    endif()
+endif()
+
 if (@AMReX_HDF5@)
     find_dependency(HDF5 REQUIRED)
 endif ()
diff --git a/Tools/CMake/AMReXOptions.cmake b/Tools/CMake/AMReXOptions.cmake
index 3e5d4c8bdb..a7863f125e 100644
--- a/Tools/CMake/AMReXOptions.cmake
+++ b/Tools/CMake/AMReXOptions.cmake
@@ -284,6 +284,19 @@ print_option(AMReX_FORTRAN_INTERFACES)
 option( AMReX_LINEAR_SOLVERS  "Build AMReX Linear solvers" ON )
 print_option( AMReX_LINEAR_SOLVERS )
 
+cmake_dependent_option( AMReX_LINEAR_SOLVERS_INCFLO
+    "Build AMReX Linear solvers useful for incompressible flow codes" ON
+    "AMReX_LINEAR_SOLVERS" OFF)
+print_option( AMReX_LINEAR_SOLVERS_INCFLO )
+
+cmake_dependent_option( AMReX_LINEAR_SOLVERS_EM
+    "Build AMReX Linear solvers useful for electromagnetic codes" ON
+    "AMReX_LINEAR_SOLVERS" OFF)
+print_option( AMReX_LINEAR_SOLVERS_EM )
+
+option( AMReX_FFT  "Build AMReX FFT" OFF )
+print_option( AMReX_FFT )
+
 option( AMReX_AMRDATA "Build data services" OFF )
 print_option( AMReX_AMRDATA )
 
diff --git a/Tools/CMake/AMReXThirdPartyLibraries.cmake b/Tools/CMake/AMReXThirdPartyLibraries.cmake
index abe62a2ebc..b8ad503e83 100644
--- a/Tools/CMake/AMReXThirdPartyLibraries.cmake
+++ b/Tools/CMake/AMReXThirdPartyLibraries.cmake
@@ -1,3 +1,27 @@
+#
+# FFT
+#
+if (AMReX_FFT)
+    if (AMReX_CUDA)
+        find_package(CUDAToolkit REQUIRED)
+        foreach(D IN LISTS AMReX_SPACEDIM)
+            target_link_libraries(amrex_${D}d PUBLIC CUDA::cufft)
+        endforeach()
+    elseif (AMReX_HIP)
+        find_package(rocfft REQUIRED)
+        foreach(D IN LISTS AMReX_SPACEDIM)
+            target_link_libraries(amrex_${D}d PUBLIC roc::rocfft)
+        endforeach()
+    elseif (AMReX_SYCL)
+        # nothing to do
+    else()
+        find_package(AMReXFFTW REQUIRED)
+        foreach(D IN LISTS AMReX_SPACEDIM)
+            target_link_libraries(amrex_${D}d PUBLIC AMReX::FFTW)
+        endforeach()
+    endif()
+endif()
+
 #
 # HDF5 -- here it would be best to create an imported target
 #
diff --git a/Tools/CMake/AMReX_Config_ND.H.in b/Tools/CMake/AMReX_Config_ND.H.in
index 3296a403ff..07e3b7fd63 100644
--- a/Tools/CMake/AMReX_Config_ND.H.in
+++ b/Tools/CMake/AMReX_Config_ND.H.in
@@ -39,6 +39,7 @@
 #cmakedefine BL_FORT_USE_LOWERCASE
 #cmakedefine BL_FORT_USE_UPPERCASE
 #cmakedefine BL_NO_FORT
+#cmakedefine AMREX_USE_FFT
 #cmakedefine AMREX_USE_SENSEI_INSITU
 #cmakedefine AMREX_NO_SENSEI_AMR_INST
 #cmakedefine AMREX_USE_CONDUIT
diff --git a/Tools/CMake/FindAMReXFFTW.cmake b/Tools/CMake/FindAMReXFFTW.cmake
new file mode 100644
index 0000000000..678743a08b
--- /dev/null
+++ b/Tools/CMake/FindAMReXFFTW.cmake
@@ -0,0 +1,51 @@
+#[=======================================================================[:
+FindAMReXFFTW
+-------------
+
+Finds the FFTW library.
+
+Imported Targets
+^^^^^^^^^^^^^^^^
+
+This module provides the following imported target, if found:
+
+``AMReX::FFTW``
+  The FFTW library
+
+Result Variables
+^^^^^^^^^^^^^^^^
+
+This will define the following variables:
+
+``AMReXFFTW_FOUND``
+  True if the FFTW library has been found.
+``FFTW_INCLUDES``
+  Include directories needed to use FFTW.
+``FFTW_LIBRARIES``
+  Libraries needed to link to FFTW.
+
+This will also create an imported target, AMReX::FFTW.
+#]=======================================================================]
+
+if (NOT FFTW_INCLUDES)
+    find_path(FFTW_INCLUDES NAMES "fftw3.h" HINTS ${FFTW_ROOT}/include)
+endif()
+
+if (NOT FFTW_LIBRARIES)
+    find_library(FFTW_LIBRARY NAMES "fftw3" HINTS ${FFTW_ROOT}/lib)
+    find_library(FFTWF_LIBRARY NAMES "fftw3f" HINTS ${FFTW_ROOT}/lib)
+    set(FFTW_LIBRARIES ${FFTW_LIBRARY} ${FFTWF_LIBRARY})
+endif()
+
+include(FindPackageHandleStandardArgs)
+
+find_package_handle_standard_args(AMReXFFTW
+    REQUIRED_VARS FFTW_LIBRARIES FFTW_INCLUDES)
+
+mark_as_advanced(FFTW_LIBRARIES FFTW_INCLUDES)
+
+if (AMReXFFTW_FOUND AND NOT TARGET AMReX::FFTW)  # create the imported target only once, and only if FFTW was found
+    add_library(AMReX::FFTW INTERFACE IMPORTED GLOBAL)
+    target_link_libraries(AMReX::FFTW INTERFACE ${FFTW_LIBRARIES})
+    set_target_properties(AMReX::FFTW PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${FFTW_INCLUDES}")
+endif()
diff --git a/Tools/F_scripts/dep.py b/Tools/F_scripts/dep.py
index 151d1a9f9a..e5eb74cb40 100755
--- a/Tools/F_scripts/dep.py
+++ b/Tools/F_scripts/dep.py
@@ -28,7 +28,7 @@
 import preprocess
 
 # modules to ignore in the dependencies
-IGNORES = ["iso_c_binding", "iso_fortran_env", "omp_lib", "mpi", "cudafor", "openacc", "hdf"]
+IGNORES = ["iso_c_binding", "iso_fortran_env", "omp_lib", "mpi", "cudafor", "openacc", "hdf", "hdf5"]
 
 # regular expression for "{}module{}name", where {} can be any number
 # of spaces.  We use 4 groups here, denoted by (), so the name of the
diff --git a/Tools/GNUMake/Make.defs b/Tools/GNUMake/Make.defs
index f33911ed3a..66de8ec6a5 100644
--- a/Tools/GNUMake/Make.defs
+++ b/Tools/GNUMake/Make.defs
@@ -112,6 +112,12 @@ else
   DEBUG := FALSE
 endif
 
+ifdef USE_FFT
+  USE_FFT := $(strip $(USE_FFT))
+else
+  USE_FFT := FALSE
+endif
+
 ifdef PROFILE
   PROFILE := $(strip $(PROFILE))
 else
@@ -604,6 +610,28 @@ else
     DebugSuffix :=
 endif
 
+ifeq ($(USE_FFT),TRUE)
+  include $(AMREX_HOME)/Src/FFT/Make.package
+  ifeq ($(USE_CUDA),TRUE)
+    LIBRARIES += -lcufft
+  else ifeq ($(USE_HIP),TRUE)
+    # Use rocFFT.  ROC_PATH is defined in hip.mak
+    SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/rocfft/include
+    LIBRARY_LOCATIONS += $(ROC_PATH)/rocfft/lib
+    LIBRARIES += -Wl,--rpath=$(ROC_PATH)/rocfft/lib -lrocfft
+  else ifeq ($(USE_SYCL),TRUE)
+    # nothing
+  else
+    FFTW_HOME ?= NOT_SET
+    ifneq ($(FFTW_HOME),NOT_SET)
+      SYSTEM_INCLUDE_LOCATIONS += $(FFTW_HOME)/include
+      LIBRARY_LOCATIONS += $(FFTW_HOME)/lib
+      LIBRARIES += -Wl,--rpath=$(FFTW_HOME)/lib
+    endif
+    LIBRARIES += -lfftw3f -lfftw3
+  endif
+endif
+
 ifeq ($(USE_PROFPARSER),TRUE)
   PROFILE := TRUE
   TRACE_PROFILE := TRUE
@@ -760,10 +788,6 @@ ifeq ($(USE_HIP),TRUE)
 
     GPUSuffix := .HIP
 
-    ifeq ($(HIP_INDIRECT_FUNCTION),TRUE)
-      DEFINES += -DAMREX_HIP_INDIRECT_FUNCTION
-    endif
-
     ifeq ($(USE_MPI),TRUE)
       # Make sure that the C/C++ MPI
       # wrappers are calling hipcc to compile the code.
@@ -918,6 +942,10 @@ ifeq ($(USE_PARTICLES),TRUE)
   DEFINES += -DAMREX_PARTICLES
 endif
 
+ifeq ($(USE_FFT),TRUE)
+    DEFINES += -DAMREX_USE_FFT
+endif
+
 ifeq ($(USE_EB),TRUE)
     DEFINES += -DAMREX_USE_EB
 endif
diff --git a/Tools/GNUMake/comps/hip.mak b/Tools/GNUMake/comps/hip.mak
index 87bb3e93f5..26dff7f94f 100644
--- a/Tools/GNUMake/comps/hip.mak
+++ b/Tools/GNUMake/comps/hip.mak
@@ -119,8 +119,8 @@ ifeq ($(HIP_COMPILER),clang)
   endif
 
   # Generic HIP info
-  ROC_PATH=$(realpath $(dir $(HIP_PATH)))
-  SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include $(HIP_PATH)/include
+  ROC_PATH=$(realpath $(HIP_PATH))
+  SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include
 
   # rocRand
   SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include/hiprand $(ROC_PATH)/include/rocrand
diff --git a/Tools/libamrex/configure.py b/Tools/libamrex/configure.py
index 1545f86dfb..873b575fe4 100755
--- a/Tools/libamrex/configure.py
+++ b/Tools/libamrex/configure.py
@@ -57,10 +57,22 @@ def configure(argv):
                         help="Enable AMReX Fortran API [default=yes]",
                         choices=["yes","no"],
                         default="yes")
+    parser.add_argument("--enable-fft",
+                        help="Enable AMReX FFT [default=no]",
+                        choices=["yes","no"],
+                        default="no")
     parser.add_argument("--enable-linear-solver",
                         help="Enable AMReX linear solvers [default=yes]",
                         choices=["yes","no"],
                         default="yes")
+    parser.add_argument("--enable-linear-solver-incflo",
+                        help="Enable AMReX linear solvers for incompressible flow codes [default=yes]",
+                        choices=["yes","no"],
+                        default="yes")
+    parser.add_argument("--enable-linear-solver-em",
+                        help="Enable AMReX linear solvers for electromagnetic codes [default=yes]",
+                        choices=["yes","no"],
+                        default="yes")
     parser.add_argument("--enable-hypre",
                         help="Enable Hypre as an option for bottom solver of AMReX linear solvers [default=no]",
                         choices=["yes","no"],
@@ -143,7 +155,10 @@ def configure(argv):
     f.write("DEBUG = {}\n".format("TRUE" if args.debug == "yes" else "FALSE"))
     f.write("USE_PARTICLES = {}\n".format("FALSE" if args.enable_particle == "no" else "TRUE"))
     f.write("USE_FORTRAN_INTERFACE = {}\n".format("FALSE" if args.enable_fortran_api == "no" else "TRUE"))
+    f.write("USE_FFT = {}\n".format("TRUE" if args.enable_fft == "yes" else "FALSE"))
     f.write("USE_LINEAR_SOLVERS = {}\n".format("FALSE" if args.enable_linear_solver == "no" else "TRUE"))
+    f.write("USE_LINEAR_SOLVERS_INCFLO = {}\n".format("FALSE" if args.enable_linear_solver_incflo == "no" else "TRUE"))
+    f.write("USE_LINEAR_SOLVERS_EM = {}\n".format("FALSE" if args.enable_linear_solver_em == "no" else "TRUE"))
     f.write("USE_HYPRE = {}\n".format("TRUE" if args.enable_hypre == "yes" else "FALSE"))
     f.write("USE_PETSC = {}\n".format("TRUE" if args.enable_petsc == "yes" else "FALSE"))
     f.write("USE_EB = {}\n".format("TRUE" if args.enable_eb == "yes" else "FALSE"))