diff --git a/.codespellrc b/.codespellrc
index 7fa3560731..715dc816bd 100644
--- a/.codespellrc
+++ b/.codespellrc
@@ -1,3 +1,3 @@
 [codespell]
-skip = .git,*.ipynb,*.bib,*.ps,*.patch,*~,CHANGES,*/Extern/SWFFT,*/Extern/hpgmg,./tmp_install_dir,./installdir,*/build,*/tmp_build_dir
+skip = .git,*.ipynb,*.bib,*.ps,*.patch,*~,CHANGES,./tmp_install_dir,./installdir,*/build,*/tmp_build_dir
 ignore-words = .codespell-ignore-words
diff --git a/.github/workflows/apps.yml b/.github/workflows/apps.yml
index 19a3c4699a..042b0d7c9d 100644
--- a/.github/workflows/apps.yml
+++ b/.github/workflows/apps.yml
@@ -7,9 +7,14 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  check_changes:
+    uses: ./.github/workflows/check_changes.yml
+
   castro:
     name: Castro
     runs-on: ubuntu-latest
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Get Latest Release Tag
@@ -62,6 +67,8 @@ jobs:
   warpx:
     name: WarpX
     runs-on: ubuntu-latest
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Download WarpX
@@ -105,6 +112,8 @@ jobs:
   pyamrex:
     name: pyamrex
     runs-on: ubuntu-latest
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Checkout pyamrex
diff --git a/.github/workflows/bittree.yml b/.github/workflows/bittree.yml
index 299fb025d8..f07999a54a 100644
--- a/.github/workflows/bittree.yml
+++ b/.github/workflows/bittree.yml
@@ -7,9 +7,14 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  check_changes:
+    uses: ./.github/workflows/check_changes.yml
+
   bittree-2d:
     name: Bittree 2D
     runs-on: ubuntu-latest
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -62,6 +67,8 @@ jobs:
   bittree-3d:
     name: Bittree 3D
     runs-on: ubuntu-latest
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
diff --git a/.github/workflows/catalyst.yml b/.github/workflows/catalyst.yml
index 108b76dff1..e33d81f45c 100644
--- a/.github/workflows/catalyst.yml
+++ b/.github/workflows/catalyst.yml
@@ -7,10 +7,14 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  check_changes:
+    uses: ./.github/workflows/check_changes.yml
+
   catalyst:
     name: Catalyst
     runs-on: ubuntu-22.04
-    if: github.event.pull_request.draft == false
+    needs: check_changes
+    if: github.event.pull_request.draft == false && needs.check_changes.outputs.has_non_docs_changes == 'true'
     env:
       CXX: g++
       CC: gcc
diff --git a/.github/workflows/check_changes.yml b/.github/workflows/check_changes.yml
new file mode 100644
index 0000000000..35c0582cc8
--- /dev/null
+++ b/.github/workflows/check_changes.yml
@@ -0,0 +1,35 @@
+# Reusable workflow: reports whether the triggering change touches files
+# under Docs/ and/or files outside of Docs/, so callers can skip expensive
+# jobs for documentation-only changes.
+name: Check Changes
+
+on:
+  workflow_call:
+    outputs:
+      has_docs_changes:
+        value: ${{ jobs.check.outputs.has_docs_changes }}
+      has_non_docs_changes:
+        value: ${{ jobs.check.outputs.has_non_docs_changes }}
+
+jobs:
+  check:
+    runs-on: ubuntu-latest
+    # Every output exposed through workflow_call above must be declared here,
+    # otherwise it evaluates to an empty string in the caller.
+    outputs:
+      has_docs_changes: ${{ steps.set-output.outputs.has_docs_changes }}
+      has_non_docs_changes: ${{ steps.set-output.outputs.has_non_docs_changes }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: dorny/paths-filter@v3
+        id: changes
+        with:
+          filters: |
+            docs:
+              - 'Docs/**'
+            others:
+              - '!Docs/**'
+      - id: set-output
+        run: |
+          echo "has_docs_changes=${{ steps.changes.outputs.docs }}" >> "$GITHUB_OUTPUT"
+          echo "has_non_docs_changes=${{ steps.changes.outputs.others }}" >> "$GITHUB_OUTPUT"
diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml
index 1ed5240164..bbbc08723e 100644
--- a/.github/workflows/clang.yml
+++ b/.github/workflows/clang.yml
@@ -1,5 +1,3 @@
-# -Wno-c++17-extensions: Clang complains about nodiscard if the standard is not set to c++17.
-
 name: LinuxClang
 
 on: [push, pull_request]
@@ -9,11 +7,16 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  check_changes:
+    uses: ./.github/workflows/check_changes.yml
+
   # Build and install libamrex as AMReX CMake project
   # Note: this is an intentional "minimal" build that does not enable (many) options
   library_clang:
     name: Clang@7.0 C++17 SP NOMPI Debug [lib]
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -75,6 +78,8 @@ jobs:
   tests_clang:
     name: Clang@14.0 C++17 SP Particles DP Mesh Debug [tests]
     runs-on: ubuntu-22.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -131,6 +136,8 @@ jobs:
   tests_cxx20:
     name: Clang C++20 [tests]
     runs-on: ubuntu-22.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -180,6 +187,8 @@ jobs:
   configure-2d:
     name: Clang NOMPI Release [configure 2D]
     runs-on: ubuntu-22.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index c9c34bf34f..99ebc63114 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -13,8 +13,13 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  check_changes:
+    uses: ./.github/workflows/check_changes.yml
+
   analyze:
-    if: ${{ github.repository == 'AMReX-Codes/amrex' || github.event_name != 'schedule' }}
+    needs: check_changes
+    # Keep the whole condition inside a single ${{ }}: any text outside it turns the value into an always-truthy string.
+    if: ${{ (github.repository == 'AMReX-Codes/amrex' || github.event_name != 'schedule') && needs.check_changes.outputs.has_non_docs_changes == 'true' }}
     name: Analyze
     runs-on: ubuntu-latest
     permissions:
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 927e99ded4..4b23db20e9 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -7,10 +7,15 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  check_changes:
+    uses: ./.github/workflows/check_changes.yml
+
   # Build libamrex and all tests with CUDA 11.2
   tests-cuda11:
     name: CUDA@11.2 GNU@9.3.0 C++17 Release [tests]
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -62,6 +67,8 @@ jobs:
   tests-cuda11-clang:
     name: Clang@15 CUDA@11.7 C++17 Release [tests]
     runs-on: ubuntu-22.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     env:
       CC: clang-15
       CXX: clang++-15
@@ -115,6 +122,8 @@ jobs:
   tests-nvhpc-nvcc:
     name: NVHPC NVCC/NVC++ C++17 Release [tests]
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -178,6 +187,8 @@ jobs:
   configure-3d-cuda:
     name: CUDA@11.2 GNU@9.3.0 [configure 3D]
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
diff --git a/.github/workflows/dependencies/dependencies_gcc.sh b/.github/workflows/dependencies/dependencies_gcc.sh
index 93d9aa27ec..47a7d07245 100755
--- a/.github/workflows/dependencies/dependencies_gcc.sh
+++ b/.github/workflows/dependencies/dependencies_gcc.sh
@@ -12,7 +12,6 @@ set -eu -o pipefail
 #   failed files the given number of times.
 echo 'Acquire::Retries "3";' | sudo tee /etc/apt/apt.conf.d/80-retries
 
-sudo add-apt-repository ppa:ubuntu-toolchain-r/test
 sudo apt-get update
 
 sudo apt-get install -y --no-install-recommends \
diff --git a/.github/workflows/dependencies/dependencies_hip.sh b/.github/workflows/dependencies/dependencies_hip.sh
index df4f274ef3..6b69c5433a 100755
--- a/.github/workflows/dependencies/dependencies_hip.sh
+++ b/.github/workflows/dependencies/dependencies_hip.sh
@@ -40,6 +40,7 @@ echo 'export PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/r
 
 # we should not need to export HIP_PATH=/opt/rocm/hip with those installs
 
+sudo apt-get clean
 sudo apt-get update
 
 # Ref.: https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html#installing-development-packages-for-cross-compilation
diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml
index 88fe47c988..3b9bddac8f 100644
--- a/.github/workflows/gcc.yml
+++ b/.github/workflows/gcc.yml
@@ -10,11 +10,16 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  check_changes:
+    uses: ./.github/workflows/check_changes.yml
+
   # Build and install libamrex as AMReX CMake project
   # Note: this is an intentional "minimal" build that does not enable (many) options
   library:
     name: GNU@8.4 C++17 Release [lib]
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -72,6 +77,8 @@ jobs:
   tests_build_3D:
     name: GNU@13 C++17 3D Debug Fortran [tests]
     runs-on: ubuntu-24.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -121,6 +128,8 @@ jobs:
   tests_build_2D:
     name: GNU@9.3 C++17 2D Debug Fortran [tests]
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -170,6 +179,8 @@ jobs:
   tests_build_1D:
     name: GNU@9.3 C++17 1D Debug Fortran [tests]
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -221,6 +232,8 @@ jobs:
   tests_cxx20:
     name: GNU@10.1 C++20 OMP [tests]
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -281,6 +294,8 @@ jobs:
   tests-nonmpi:
     name: GNU@8.4 C++17 NOMPI [tests]
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -340,6 +355,8 @@ jobs:
   tests-nofortran:
     name: GNU@12 C++17 w/o Fortran [tests]
     runs-on: ubuntu-22.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -401,6 +418,8 @@ jobs:
   configure-1d:
     name: GNU@9.3 Release [configure 1D]
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -441,6 +460,8 @@ jobs:
   configure-3d:
     name: GNU@11.2 Release [configure 3D]
     runs-on: ubuntu-22.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -481,6 +502,8 @@ jobs:
   configure-3d-single-tprof:
     name: GNU@9.3 Release [configure 3D]
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -522,6 +545,8 @@ jobs:
   configure-3d-omp-debug:
     name: GNU@9.3 OMP Debug [configure 3D]
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -562,6 +587,8 @@ jobs:
   plotfile-tools:
     name: GNU Plotfile Tools [tools]
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -601,6 +628,8 @@ jobs:
   tests_run:
     name: GNU@13 C++17 [tests]
     runs-on: ubuntu-24.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -651,6 +680,8 @@ jobs:
   test_hdf5:
     name: GNU@9.3 HDF5 I/O Test [tests]
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     env:
       CXX: h5pcc
       CC: h5cc
diff --git a/.github/workflows/hip.yml b/.github/workflows/hip.yml
index 22154d6b01..7f88fe5557 100644
--- a/.github/workflows/hip.yml
+++ b/.github/workflows/hip.yml
@@ -7,9 +7,14 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  check_changes:
+    uses: ./.github/workflows/check_changes.yml
+
   tests-hip:
     name: HIP ROCm Flang C++17 [tests]
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -69,6 +74,8 @@ jobs:
   tests-hip-wrapper:
     name: HIP ROCm GFortran@9.3 C++17 [tests-hipcc]
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -127,6 +134,8 @@ jobs:
   configure-2d-single-hip:
     name: HIP EB [configure 2D]
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -160,6 +169,8 @@ jobs:
   hip-3d-eb-gmake:
     name: HIP EB 3D GMake
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
diff --git a/.github/workflows/hypre.yml b/.github/workflows/hypre.yml
index c6f2ee20ed..79fb36bdf1 100644
--- a/.github/workflows/hypre.yml
+++ b/.github/workflows/hypre.yml
@@ -7,9 +7,14 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  check_changes:
+    uses: ./.github/workflows/check_changes.yml
+
   compile-hypre-cuda-eb-2d:
     name: CUDA EB 2D Hypre@2.26.0
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     env:
       AMREX_HYPRE_HOME: ${HOME}/.cache/hypre-2.26.0-cuda
     steps:
@@ -61,6 +66,8 @@ jobs:
   test-hypre-cpu-3d:
     name: GCC 3D Hypre@2.21.0
     runs-on: ubuntu-latest
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -110,6 +117,8 @@ jobs:
   test-hypre-solver-cpu-eb-2d:
     name: GCC EB 2D Hypre@2.28.0
     runs-on: ubuntu-latest
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml
index 15c7bbda58..c0ecf42e3b 100644
--- a/.github/workflows/intel.yml
+++ b/.github/workflows/intel.yml
@@ -7,9 +7,14 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  check_changes:
+    uses: ./.github/workflows/check_changes.yml
+
   tests-oneapi-sycl:
     name: oneAPI SYCL [tests]
     runs-on: ubuntu-latest
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -24,11 +29,8 @@ jobs:
         restore-keys: |
              ccache-${{ github.workflow }}-${{ github.job }}-git-
     - name: Build & Install
-      # /tmp/icpx-2d34de0e47/global_vars-header-4390fb.h:25:36: error: zero size arrays are an extension [-Werror,-Wzero-length-array]
-      #    25 | const char* const kernel_names[] = {
-      #       |                                    ^
-      # 1 error generated.
-      env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor -Wno-zero-length-array"}
+      # Warnings in 2025.0: unused-variable, shadow
+      env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor -Wno-unused-variable -Wno-shadow"}
       run: |
         export CCACHE_COMPRESS=1
         export CCACHE_COMPRESSLEVEL=10
@@ -59,6 +61,8 @@ jobs:
   tests-oneapi-sycl-eb:
     name: oneAPI SYCL [tests w/ EB]
     runs-on: ubuntu-latest
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -73,11 +77,8 @@ jobs:
         restore-keys: |
              ccache-${{ github.workflow }}-${{ github.job }}-git-
     - name: Build & Install
-      # /tmp/icpx-2d34de0e47/global_vars-header-4390fb.h:25:36: error: zero size arrays are an extension [-Werror,-Wzero-length-array]
-      #    25 | const char* const kernel_names[] = {
-      #       |                                    ^
-      # 1 error generated.
-      env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor -Wno-zero-length-array"}
+      # Warnings in 2025.0: unused-variable, shadow
+      env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor -Wno-unused-variable -Wno-shadow"}
       run: |
         export CCACHE_COMPRESS=1
         export CCACHE_COMPRESSLEVEL=10
@@ -107,6 +108,8 @@ jobs:
   tests-oneapi-sycl-eb-nvidia:
     name: oneAPI SYCL for Nvidia GPUs [tests w/ EB]
     runs-on: ubuntu-latest
+    needs: check_changes
+    if: false && needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -124,7 +127,8 @@ jobs:
         restore-keys: |
              ccache-${{ github.workflow }}-${{ github.job }}-git-
     - name: Build & Install
-      env: {CXXFLAGS: "-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --offload-arch=sm_80 -fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor"}
+      # Warnings in 2025.0: unused-variable, shadow
+      env: {CXXFLAGS: "-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --offload-arch=sm_80 -fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor -Wno-unused-variable -Wno-shadow"}
       run: |
         export CCACHE_COMPRESS=1
         export CCACHE_COMPRESSLEVEL=10
@@ -156,6 +160,8 @@ jobs:
   no-tests-oneapi-sycl-amd:
     name: oneAPI SYCL for AMD GPUs
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: false && needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -173,7 +179,8 @@ jobs:
         restore-keys: |
              ccache-${{ github.workflow }}-${{ github.job }}-git-
     - name: Build & Install
-      env: {CXXFLAGS: "-fsycl -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx90a -fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor"}
+      # Warnings in 2025.0: unused-variable, shadow
+      env: {CXXFLAGS: "-fsycl -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx90a -fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor -Wno-unused-variable -Wno-shadow"}
       run: |
         export CCACHE_COMPRESS=1
         export CCACHE_COMPRESSLEVEL=10
@@ -203,6 +210,8 @@ jobs:
   tests-icc:
     name: ICC [tests]
     runs-on: ubuntu-latest
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml
index 76aba4cef6..06034a471a 100644
--- a/.github/workflows/macos.yml
+++ b/.github/workflows/macos.yml
@@ -7,10 +7,15 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  check_changes:
+    uses: ./.github/workflows/check_changes.yml
+
   # Build libamrex and all tests
   tests-macos-universal-nompi:
     name: AppleClang Universal w/o MPI [tests-universal]
     runs-on: macos-latest
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
@@ -55,6 +60,8 @@ jobs:
   tests-macos:
     name: AppleClang@11.0 GFortran@9.3 [tests]
     runs-on: macos-latest
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
diff --git a/.github/workflows/petsc.yml b/.github/workflows/petsc.yml
index efde21b89a..89784a9831 100644
--- a/.github/workflows/petsc.yml
+++ b/.github/workflows/petsc.yml
@@ -7,9 +7,14 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  check_changes:
+    uses: ./.github/workflows/check_changes.yml
+
   test-petsc-cpu-2d:
     name: GCC 2D EB PETSc@3.18.1
     runs-on: ubuntu-latest
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
diff --git a/.github/workflows/sensei.yml b/.github/workflows/sensei.yml
index 52f8e418a6..d1fcbaea67 100644
--- a/.github/workflows/sensei.yml
+++ b/.github/workflows/sensei.yml
@@ -9,9 +9,14 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  check_changes:
+    uses: ./.github/workflows/check_changes.yml
+
   test_sensei:
     name: SENSEI Adaptor [test]
     runs-on: ubuntu-20.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     env:
       CXX: clang++
       CC: clang
diff --git a/.github/workflows/smoke.yml b/.github/workflows/smoke.yml
index d56e52c63e..dab1cbe7e7 100644
--- a/.github/workflows/smoke.yml
+++ b/.github/workflows/smoke.yml
@@ -7,9 +7,14 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  check_changes:
+    uses: ./.github/workflows/check_changes.yml
+
   multid:
     name: GNU Multi-D
     runs-on: ubuntu-latest
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - name: Dependencies
diff --git a/.github/workflows/sundials.yml b/.github/workflows/sundials.yml
index e17234ba28..645ac13ae4 100644
--- a/.github/workflows/sundials.yml
+++ b/.github/workflows/sundials.yml
@@ -7,9 +7,14 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  check_changes:
+    uses: ./.github/workflows/check_changes.yml
+
   sundials-cpu:
     name: GCC SUNDIALS@6.5.0
     runs-on: ubuntu-22.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     env:
        CCACHE_COMPRESS: 1
        CCACHE_COMPRESSLEVEL: 10
@@ -70,6 +75,8 @@ jobs:
   sundials-cuda:
     name: CUDA SUNDIALS@7.0.0
     runs-on: ubuntu-22.04
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     env:
        CCACHE_COMPRESS: 1
        CCACHE_COMPRESSLEVEL: 10
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 4dd9a3f391..0c894e4686 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -7,10 +7,15 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  check_changes:
+    uses: ./.github/workflows/check_changes.yml
+
   # Build libamrex and all tests
   tests_msvc:
     name: MSVC C++17 w/o Fortran w/o MPI
     runs-on: windows-latest
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     #- name: Set Up Cache
@@ -57,6 +62,8 @@ jobs:
   test_msvc_static:
     name: MSVC C++17 w/o Fortran w/o MPI static
     runs-on: windows-latest
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     #- name: Set Up Cache
@@ -105,6 +112,8 @@ jobs:
   tests_clang:
     name: MSVC Clang C++17 w/o Fortran w/o MPI
     runs-on: windows-latest
+    needs: check_changes
+    if: needs.check_changes.outputs.has_non_docs_changes == 'true'
     steps:
     - uses: actions/checkout@v4
     - uses: seanmiddleditch/gha-setup-ninja@master
diff --git a/CHANGES b/CHANGES
index 2f5d0e5373..e91f1bfd73 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,39 @@
+# 24.11
+
+  -- Add option to do stable redistribute with GPUs (#4200)
+
+  -- Remove HIP_INDIRECT_FUNCTION (#4199)
+
+  -- Use The_Comms_Arena in particle communication. (#4175)
+
+  -- Refactor grid-stride loop (#4190)
+
+  -- STL optimization: Bounding volume hierarchy (#4140)
+
+  -- Bounded sampling (#4195)
+
+  -- amrex::FFT (#4193)
+
+  -- SmallMatrix: Structured binding support (#4189)
+
+  -- SmallMatrix: Support 1-based indexing (#4188)
+
+  -- SoA: Public Getter for Names (#4187)
+
+  -- Named SoA Support (#4163)
+
+  -- Fix queryktharr() return value/behaviour. (#4186)
+
+  -- Add more build options for linear solvers (#4181)
+
+  -- New function for domain decomposition (#4183)
+
+  -- SmallMatrix: Matrix class with compile time size (#4176)
+
+  -- modify/remove the assertions about no hidden dimension (#4180)
+
+  -- Add comparison operator for boxarray and distromap. Add hdf5 to dep.py (#4173)
+
 # 24.10
 
   -- TinyProfiler: Remove unnecessary precision() call (#4174)
diff --git a/Docs/Doxygen/doxygen.conf b/Docs/Doxygen/doxygen.conf
index 4594d5ca65..20371a2036 100644
--- a/Docs/Doxygen/doxygen.conf
+++ b/Docs/Doxygen/doxygen.conf
@@ -2046,7 +2046,6 @@ INCLUDE_PATH           = ../../Src/Base \
                          ../../Src/Boundary \
                          ../../Src/Extern \
                          ../../Src/Extern/HYPRE \
-                         ../../Src/Extern/hpgmg \
                          ../../Src/Extern/HDF5 \
                          ../../Src/Extern/PETSc \
                          ../../Src/Extern/amrdata \
@@ -2054,7 +2053,6 @@ INCLUDE_PATH           = ../../Src/Base \
                          ../../Src/Extern/SUNDIALS \
                          ../../Src/Extern/ProfParser \
                          ../../Src/Extern/SENSEI \
-                         ../../Src/Extern/SWFFT \
                          ../../Src/AmrCore \
                          ../../Src/LinearSolvers \
                          ../../Src/LinearSolvers/MLMG \
diff --git a/Docs/sphinx_documentation/source/External_Frameworks_Chapter.rst b/Docs/sphinx_documentation/source/External_Frameworks_Chapter.rst
index 3d15c08303..c3f74cd7bc 100644
--- a/Docs/sphinx_documentation/source/External_Frameworks_Chapter.rst
+++ b/Docs/sphinx_documentation/source/External_Frameworks_Chapter.rst
@@ -7,4 +7,3 @@ External Frameworks
    :maxdepth: 1
 
    SUNDIALS_top
-   SWFFT
diff --git a/Docs/sphinx_documentation/source/FFT.rst b/Docs/sphinx_documentation/source/FFT.rst
index 3fc24fcab8..2a5957e40b 100644
--- a/Docs/sphinx_documentation/source/FFT.rst
+++ b/Docs/sphinx_documentation/source/FFT.rst
@@ -7,10 +7,10 @@ FFT::R2C Class
 ==============
 
 Class template `FFT::R2C` supports discrete Fourier transforms between real
-and complex data. The name R2C indicates that the forward transform converts
-real data to complex data, while the backward transform converts complex
-data to real data. It should be noted that both directions of transformation
-are supported, not just from real to complex.
+and complex data across MPI processes. The name R2C indicates that the
+forward transform converts real data to complex data, while the backward
+transform converts complex data to real data. It should be noted that both
+directions of transformation are supported, not just from real to complex.
 
 The implementation utilizes cuFFT, rocFFT, oneMKL and FFTW, for CUDA, HIP,
 SYCL and CPU builds, respectively. Because the parallel communication is
@@ -18,8 +18,7 @@ handled by AMReX, it does not need the parallel version of
 FFTW. Furthermore, there is no constraint on the domain decomposition such
 as one Box per process. This class performs parallel FFT on AMReX's parallel
 data containers (e.g., :cpp:`MultiFab` and
-:cpp:`FabArray<BaseFab<ComplexData<Real>>>`. For local FFT, the users can
-use FFTW, cuFFT, rocFFT, or oneMKL directly.
+:cpp:`FabArray<BaseFab<ComplexData<Real>>>`).
 
 Other than using column-majored order, AMReX follows the convention of
 FFTW. Applying the forward transform followed by the backward transform
@@ -28,7 +27,7 @@ complex data also follows the FFTW convention, where the complex Hermitian
 output array has `(nx/2+1,ny,nz)` elements. Here `nx`, `ny` and `nz` are the
 sizes of the real array and the division is rounded down.
 
-Below are examples of using :cpp:`FFT:R2C`.
+Below are examples of using :cpp:`FFT::R2C`.
 
 .. highlight:: c++
 
@@ -47,7 +46,9 @@ Below are examples of using :cpp:`FFT:R2C`.
             sp *= scaling;
         });
 
-    cMultiFab cmf(...);
+    // Use R2C provided spectral data layout.
+    auto const& [cba, cdm] = r2c.getSpectralDataLayout();
+    cMultiFab cmf(cba, cdm, 1, 0);
     FFT::R2C<Real,FFT::Direction::forward> r2c_forward(geom.Domain());
     r2c_forward(mfin, cmf);
 
@@ -56,16 +57,109 @@ Below are examples of using :cpp:`FFT:R2C`.
 
 Note that using :cpp:`forwardThenBackward` is expected to be more efficient
 than separate calls to :cpp:`forward` and :cpp:`backward` because some
-parallel communication can be avoided. It should also be noted that a lot of
+parallel communication can be avoided. For the spectral data, the example
+above builds a :cpp:`cMultiFab` using the layout provided by :cpp:`FFT::R2C`.
+You can also use your own :cpp:`BoxArray` and :cpp:`DistributionMapping`, but
+it might result in extra communication. It should also be noted that a lot of
 preparation works are done in the construction of an :cpp:`FFT::R2C`
-object. Therefore, one should cache it for reuse if possible.
+object. Therefore, one should cache it for reuse if possible. Although
+:cpp:`FFT::R2C` does not have a default constructor, one could always use
+:cpp:`std::unique_ptr<FFT::R2C<Real>>` to store an object in one's class.
+
+
+.. _sec:FFT:localr2c:
+
+FFT::LocalR2C Class
+===================
+
+Class template :cpp:`FFT::LocalR2C` supports local discrete Fourier transforms
+between real and complex data. The name R2C indicates that the forward
+transform converts real data to complex data, while the backward transform
+converts complex data to real data. It should be noted that both directions
+of transformation are supported, not just from real to complex.
+
+Below is an example of using :cpp:`FFT::LocalR2C`.
+
+.. highlight:: c++
+
+::
+
+    MultiFab mf(...);
+    BaseFab<GpuComplex<T>> cfab;
+    for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
+        FFT::LocalR2C fft(mfi.fabbox().length());
+        cfab.resize(IntVect(0), fft.spectralSize()-1);
+        fft.forward(mf[mfi].dataPtr(), cfab.dataPtr());
+    }
 
 
 Poisson Solver
 ==============
 
-AMReX provides FFT based Poisson solvers. :cpp:`FFT::Poisson` supports all
-periodic boundaries using purely FFT. :cpp:`FFT::PoissonHybrid` is a 3D only
-solver that supports periodic boundaries in the first two dimensions and
-Neumann boundary in the last dimension. Similar to :cpp:`FFT::R2C`, the
-Poisson solvers should be cached for reuse.
+AMReX provides FFT based Poisson solvers. Here, Poisson's equation is
+
+.. math::
+
+  \nabla^2 \phi = \rho.
+
+:cpp:`FFT::Poisson` supports periodic (:cpp:`FFT::Boundary::periodic`),
+homogeneous Neumann (:cpp:`FFT::Boundary::even`), and homogeneous Dirichlet
+(:cpp:`FFT::Boundary::odd`) boundaries using FFT. Below is an example of
+using the solver.
+
+.. highlight:: c++
+
+::
+
+    Geometry geom(...);
+    MultiFab soln(...);
+    MultiFab rhs(...);
+
+    Array<std::pair<FFT::Boundary,FFT::Boundary>,AMREX_SPACEDIM>
+            fft_bc{...};
+
+    bool has_dirichlet = false;
+    for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+        has_dirichlet = has_dirichlet ||
+            fft_bc[idim].first == FFT::Boundary::odd ||
+            fft_bc[idim].second == FFT::Boundary::odd;
+    }
+    if (! has_dirichlet) {
+        // Shift rhs so that its sum is zero.
+        auto rhosum = rhs.sum(0);
+        rhs.plus(-rhosum/geom.Domain().d_numPts(), 0, 1);
+    }
+
+    FFT::Poisson fft_poisson(geom, fft_bc);
+    fft_poisson.solve(soln, rhs);
+
+:cpp:`FFT::PoissonOpenBC` is a 3D only solver that supports open
+boundaries. Its implementation utilizes :cpp:`FFT::OpenBCSolver`, which can
+be used for implementing convolution based solvers with a user provided
+Green's function. If users want to extend the open BC solver to 2D or other
+types of Green's functions, they could use :cpp:`FFT::PoissonOpenBC` as an
+example. Below is an example of solving Poisson's equation with open
+boundaries.
+
+.. highlight:: c++
+
+::
+
+    Geometry geom(...);
+    MultiFab soln(...); // soln can be either nodal or cell-centered.
+    MultiFab rhs(...);  // rhs must have the same index type as soln.
+
+    int ng = ...; // ng can be non-zero, if we want to compute potential
+                  // outside the domain.
+    FFT::PoissonOpenBC openbc_solver(geom, soln.ixType(), IntVect(ng));
+    openbc_solver.solve(soln, rhs);
+
+:cpp:`FFT::PoissonHybrid` is a 3D only solver that supports periodic
+boundaries in the first two dimensions and Neumann boundary in the last
+dimension. The last dimension is solved with a tridiagonal solver that can
+support non-uniform cell size in the z-direction. For most applications,
+:cpp:`FFT::Poisson` should be used.
+
+Similar to :cpp:`FFT::R2C`, the Poisson solvers should be cached for reuse,
+and one might need to use :cpp:`std::unique_ptr<FFT::Poisson<MultiFab>>`
+because there is no default constructor.
diff --git a/Docs/sphinx_documentation/source/LinearSolvers.rst b/Docs/sphinx_documentation/source/LinearSolvers.rst
index ee3ebe0efc..27d6e5ca3a 100644
--- a/Docs/sphinx_documentation/source/LinearSolvers.rst
+++ b/Docs/sphinx_documentation/source/LinearSolvers.rst
@@ -699,7 +699,7 @@ the following cross-terms are evaluated separately using the ``MLTensorOp`` and
 
     (\eta u_y)_x + ( (\kappa - \frac{2}{3} \eta) (u_x + w_z) )_y  + (\eta w_y)_z
 
-    (\eta u_z)_x + (\eta v_z)_y - ( (\kappa - \frac{2}{3} \eta) (u_x + v_y) )_z
+    (\eta u_z)_x + (\eta v_z)_y + ( (\kappa - \frac{2}{3} \eta) (u_x + v_y) )_z
 
 The code below is an example of how to set up the solver to compute the
 viscous term `divtau` explicitly:
diff --git a/Docs/sphinx_documentation/source/SWFFT.rst b/Docs/sphinx_documentation/source/SWFFT.rst
deleted file mode 100644
index 9e6192ff04..0000000000
--- a/Docs/sphinx_documentation/source/SWFFT.rst
+++ /dev/null
@@ -1,107 +0,0 @@
-.. role:: cpp(code)
-   :language: c++
-
-.. role:: fortran(code)
-   :language: fortran
-
-.. role:: underline
-    :class: underline
-
-.. _swfftdoc:
-
-SWFFT
-=======
-
-``hacc/SWFFT``, developed by Adrian Pope et al. at Argonne National Lab, provides the functionality to perform forward and reverse Fast Fourier Transforms (FFT) within a fully parallelized framework built in C++ and F90. In the words of HACC's developers, SWFFT is a "distributed-memory, pencil-decomposed, parallel 3D FFT." [1]_ The SWFFT source code is also contained in the following directory within AMReX: ``amrex/Src/Extern/SWFFT``. [2]_
-
-Pencil Redistribution
---------------------------------
-
-As input, SWFFT takes three-dimensional arrays of data distributed across block-structured grids, and redistributes the data into "pencil" grids in :math:`z, x,` and then :math:`y`, belonging to different MPI processes. After each pencil conversion, a 1D FFT is performed on the data along the pencil direction using calls to the FFTW [3]_ library. The ``README`` files in the tutorial directories specify the relationship between the number of grids and the number of MPI processes that should be used. The ``hacc/SWFFT`` ``README`` document by Adrian Pope et al. explains restrictions on grid dimensions in relation to the number of MPI processes [1]_  [2]_:
-
-      [...] A rule of thumb is that [SWFFT] generally works when the number of vertices along
-      one side of the global 3D grid ("ng") can be factored into small primes, and
-      when the number of MPI ranks can also be factored into small primes.
-      I believe that all of the unique prime factors of the number of MPI ranks
-      must be present in the set of prime factors of the grid, eg. if you have
-      20 MPI ranks then ng must be a multiple of 5 and 2. The ``CheckDecomposition``
-      utility is provided to check (on one rank) whether a proposed grid size and
-      number of MPI ranks will work, which can be done before submitting a large
-      test with ``TestDfft/TestFDfft``.
-
-The relationship between the number of processes versus global grid dimensions is determined by how the total number of grids can be factored from a three dimensional grid structure (block structured grids) into a two dimensional structure (pencil arrays), as shown in the figures below.
-
-The following figures illustrate how data is distributed from block structured grids to pencil arrays within SWFFT, where the colors of each box indicate which MPI rank it belongs to:
-
-.. |a| image:: ./SWFFT/figs/grid_4x4x4.png
-       :width: 100%
-
-.. |b| image:: ./SWFFT/figs/grid_8x8x1.png
-       :width: 100%
-
-.. |c| image:: ./SWFFT/figs/grid_2x2x2.png
-       :width: 100%
-
-.. |d| image:: ./SWFFT/figs/grid_4x2x1.png
-       :width: 100%
-
-.. |e| image:: ./SWFFT/figs/grid_1x4x2.png
-       :width: 100%
-
-.. |f| image:: ./SWFFT/figs/grid_4x1x2.png
-       :width: 100%
-
-.. table:: SWFFT Redistribution from :math:`4 \times 4 \times 4` Box Array into Pencils
-   :align: center
-
-   +---------------------------------------------------------+------------------------------------------------------+
-   |                        |a|                              |                        |b|                           |
-   +---------------------------------------------------------+------------------------------------------------------+
-   | | (a) Block structured grids: :math:`N_x=4,N_y=4,N_z=4` | | (b) Z-pencils: :math:`N_x=8,N_y=8,N_z=1`           |
-   +---------------------------------------------------------+------------------------------------------------------+
-
-
-.. table:: SWFFT Redistribution from :math:`2 \times 2 \times 2` Box Array into Pencils
-   :align: center
-
-   +---------------------------------------------------------+------------------------------------------------------+
-   |                        |c|                              |                        |d|                           |
-   +---------------------------------------------------------+------------------------------------------------------+
-   | | (a) Block structured grids: :math:`N_x=2,N_y=2,N_z=2` | | (b) Z-pencils: :math:`N_x=4,N_y=2,N_z=1`           |
-   +---------------------------------------------------------+------------------------------------------------------+
-   |                        |e|                              |                        |f|                           |
-   +---------------------------------------------------------+------------------------------------------------------+
-   | | (c) X-pencils: :math:`N_x=1,N_y=4,N_z=2`              | | (d) Y-pencils: :math:`N_x=4,N_y=1,N_z=2`           |
-   +---------------------------------------------------------+------------------------------------------------------+
-
-Using the same number of AMReX grids as processes has been verified to work in the `SWFFT Poisson`_ and `SWFFT Simple`_ tutorials. This can be illustrated by the following equation for the total number of grids, :math:`N_{b}`, in a regularly structured domain:
-
-.. math:: N_{b} = m_{bi} m_{bj} = n_{bi} n_{bj} n_{bk},
-
-where :math:`n_{bi}, n_{bj},` and  :math:`n_{bk}` are the number of grids, or boxes, in the :math:`x, y,` and :math:`z` dimensions of the block-structured grid. Analogously, for pencil distributions, :math:`m_{bi}` and :math:`m_{bj}` are the number of grids along the remaining dimensions if pencils are taken in the :math:`k` direction. There are many possible ways of redistributing the data, for example :math:`m_{bi} = n_{bi}n_{bk}` & :math:`m_{bj} = n_{bj}` is one possible simple configuration. However, it is evident from the figures above that the SWFFT redistribution algorithm has a more sophisticated method for finding the prime factors of the grid.
-
-Tutorials
---------------------------------
-
-AMReX contains two SWFFT tutorials, `SWFFT Poisson`_ and `SWFFT Simple`_:
-
-- `SWFFT Poisson`_ solves a Poisson equation with periodic boundary conditions.  In it, both a forward FFT
-  and reverse FFT are called to solve the equation, however, no reordering of the DFT data in k-space is performed.
-
-- `SWFFT Simple`_ is useful if the objective is to simply take a forward FFT of data,
-  and the DFT's ordering in k-space matters to the user. This tutorial initializes a 3D or 2D :cpp:`MultiFab`,
-  takes a forward FFT, and then redistributes the data in k-space back to the "correct," 0 to :math:`2\pi`, ordering.
-  The results are written to a plot file.
-
-.. _`SWFFT Poisson`: https://amrex-codes.github.io/amrex/tutorials_html/SWFFT_Tutorial.html#swfft-poisson
-
-.. _`SWFFT Simple`: https://amrex-codes.github.io/amrex/tutorials_html/SWFFT_Tutorial.html#swfft-simple
-
-.. [1]
-   https://git.cels.anl.gov/hacc/SWFFT
-
-.. [2]
-   SWFFT source code directory in AMReX: amrex/Src/Extern/SWFFT
-
-.. [3]
-   http://www.fftw.org/
diff --git a/Docs/sphinx_documentation/source/SWFFT/figs/grid_1x4x2.png b/Docs/sphinx_documentation/source/SWFFT/figs/grid_1x4x2.png
deleted file mode 100644
index 4b6c6fad83..0000000000
Binary files a/Docs/sphinx_documentation/source/SWFFT/figs/grid_1x4x2.png and /dev/null differ
diff --git a/Docs/sphinx_documentation/source/SWFFT/figs/grid_2x2x2.png b/Docs/sphinx_documentation/source/SWFFT/figs/grid_2x2x2.png
deleted file mode 100644
index dd94593eb9..0000000000
Binary files a/Docs/sphinx_documentation/source/SWFFT/figs/grid_2x2x2.png and /dev/null differ
diff --git a/Docs/sphinx_documentation/source/SWFFT/figs/grid_4x1x2.png b/Docs/sphinx_documentation/source/SWFFT/figs/grid_4x1x2.png
deleted file mode 100644
index b511415a28..0000000000
Binary files a/Docs/sphinx_documentation/source/SWFFT/figs/grid_4x1x2.png and /dev/null differ
diff --git a/Docs/sphinx_documentation/source/SWFFT/figs/grid_4x2x1.png b/Docs/sphinx_documentation/source/SWFFT/figs/grid_4x2x1.png
deleted file mode 100644
index 2dd94b0242..0000000000
Binary files a/Docs/sphinx_documentation/source/SWFFT/figs/grid_4x2x1.png and /dev/null differ
diff --git a/Docs/sphinx_documentation/source/SWFFT/figs/grid_4x4x4.png b/Docs/sphinx_documentation/source/SWFFT/figs/grid_4x4x4.png
deleted file mode 100644
index ca557d3c48..0000000000
Binary files a/Docs/sphinx_documentation/source/SWFFT/figs/grid_4x4x4.png and /dev/null differ
diff --git a/Docs/sphinx_documentation/source/SWFFT/figs/grid_8x8x1.png b/Docs/sphinx_documentation/source/SWFFT/figs/grid_8x8x1.png
deleted file mode 100644
index 9cf287aa13..0000000000
Binary files a/Docs/sphinx_documentation/source/SWFFT/figs/grid_8x8x1.png and /dev/null differ
diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp
index 1efc1e5e87..8f69b54a6a 100644
--- a/Src/AmrCore/AMReX_TagBox.cpp
+++ b/Src/AmrCore/AMReX_TagBox.cpp
@@ -472,7 +472,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
         {
             int bid = blockIdx.x;
             int tid = threadIdx.x;
-            int icell = blockDim.x*blockIdx.x+threadIdx.x;
+            int icell = block_size*blockIdx.x+threadIdx.x;
 
             int t = 0;
             if (icell < ncells && tags[icell] != TagBox::CLEAR) {
@@ -558,7 +558,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
             {
                 int bid = blockIdx.x;
                 int tid = threadIdx.x;
-                int icell = blockDim.x*blockIdx.x+threadIdx.x;
+                int icell = block_size*blockIdx.x+threadIdx.x;
 
                 Gpu::SharedMemory<unsigned int> gsm;
                 unsigned int * shared_counter = gsm.dataPtr();
diff --git a/Src/Base/AMReX.H b/Src/Base/AMReX.H
index 4094d4f2ff..a1124ce69d 100644
--- a/Src/Base/AMReX.H
+++ b/Src/Base/AMReX.H
@@ -102,8 +102,8 @@ namespace amrex
     * classes to clean up any "global" state that they maintain when we're
     * exiting from AMReX.
     */
-    void ExecOnFinalize (PTR_TO_VOID_FUNC);
-    void ExecOnInitialize (PTR_TO_VOID_FUNC);
+    void ExecOnFinalize (std::function<void()>);
+    void ExecOnInitialize (std::function<void()>);
 
     //! This shuts up the compiler about unused variables
     template <class... Ts>
diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp
index b4aa8f4490..9d9edeaeba 100644
--- a/Src/Base/AMReX.cpp
+++ b/Src/Base/AMReX.cpp
@@ -14,6 +14,10 @@
 #include <AMReX_Geometry.H>
 #include <AMReX_Gpu.H>
 
+#ifdef AMREX_USE_FFT
+#include <AMReX_FFT.H>
+#endif
+
 #ifdef AMREX_USE_HYPRE
 #include <_hypre_utilities.h>
 #ifdef AMREX_USE_CUDA
@@ -302,20 +306,20 @@ amrex::Assert_host (const char* EX, const char* file, int line, const char* msg)
 
 namespace
 {
-    std::stack<amrex::PTR_TO_VOID_FUNC> The_Finalize_Function_Stack;
-    std::stack<amrex::PTR_TO_VOID_FUNC> The_Initialize_Function_Stack;
+    std::stack<std::function<void()>> The_Finalize_Function_Stack;
+    std::stack<std::function<void()>> The_Initialize_Function_Stack;
 }
 
 void
-amrex::ExecOnFinalize (PTR_TO_VOID_FUNC fp)
+amrex::ExecOnFinalize (std::function<void()> f)
 {
-    The_Finalize_Function_Stack.push(fp);
+    The_Finalize_Function_Stack.push(std::move(f));
 }
 
 void
-amrex::ExecOnInitialize (PTR_TO_VOID_FUNC fp)
+amrex::ExecOnInitialize (std::function<void()> f)
 {
-    The_Initialize_Function_Stack.push(fp);
+    The_Initialize_Function_Stack.push(std::move(f));
 }
 
 amrex::AMReX*
@@ -391,7 +395,7 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse,
         //
         // Call the registered function.
         //
-        (*The_Initialize_Function_Stack.top())();
+        The_Initialize_Function_Stack.top()();
         //
         // And then remove it from the stack.
         //
@@ -655,6 +659,10 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse,
     AsyncOut::Initialize();
     VectorGrowthStrategy::Initialize();
 
+#ifdef AMREX_USE_FFT
+    FFT::Initialize();
+#endif
+
 #ifdef AMREX_USE_EB
     EB2::Initialize();
 #endif
@@ -756,7 +764,7 @@ amrex::Finalize (amrex::AMReX* pamrex)
         //
         // Call the registered function.
         //
-        (*The_Finalize_Function_Stack.top())();
+        The_Finalize_Function_Stack.top()();
         //
         // And then remove it from the stack.
         //
diff --git a/Src/Base/AMReX_BoxArray.H b/Src/Base/AMReX_BoxArray.H
index e85946872c..066e3b073f 100644
--- a/Src/Base/AMReX_BoxArray.H
+++ b/Src/Base/AMReX_BoxArray.H
@@ -69,7 +69,8 @@ namespace amrex
      */
     [[nodiscard]] BoxArray decompose (Box const& domain, int nboxes,
                                       Array<bool,AMREX_SPACEDIM> const& decomp
-                                      = {AMREX_D_DECL(true,true,true)});
+                                      = {AMREX_D_DECL(true,true,true)},
+                                      bool no_overlap = false);
 
 struct BARef
 {
diff --git a/Src/Base/AMReX_BoxArray.cpp b/Src/Base/AMReX_BoxArray.cpp
index 576d4cb870..fbb137f5e8 100644
--- a/Src/Base/AMReX_BoxArray.cpp
+++ b/Src/Base/AMReX_BoxArray.cpp
@@ -1891,7 +1891,7 @@ bool match (const BoxArray& x, const BoxArray& y)
 }
 
 BoxArray decompose (Box const& domain, int nboxes,
-                    Array<bool,AMREX_SPACEDIM> const& decomp)
+                    Array<bool,AMREX_SPACEDIM> const& decomp, bool no_overlap)
 {
     auto ndecomp = std::count(decomp.begin(), decomp.end(), true);
 
@@ -2048,9 +2048,24 @@ BoxArray decompose (Box const& domain, int nboxes,
                 ilo += domlo[0];
                 ihi += domlo[0];
                 Box b{IntVect(AMREX_D_DECL(ilo,jlo,klo)),
-                      IntVect(AMREX_D_DECL(ihi,jhi,khi))};
+                      IntVect(AMREX_D_DECL(ihi,jhi,khi)), ixtyp};
                 if (b.ok()) {
-                    bl.push_back(b.convert(ixtyp));
+                    if (no_overlap) {
+                        for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+                            if (ixtyp.nodeCentered(idim) &&
+                                b.bigEnd(idim) == ccdomain.bigEnd(idim))
+                            {
+                                b.growHi(idim, 1);
+                            }
+                        }
+                    } else {
+                        for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+                            if (ixtyp.nodeCentered(idim)) {
+                                b.growHi(idim, 1);
+                            }
+                        }
+                    }
+                    bl.push_back(b);
                 }
     AMREX_D_TERM(},},})
 
diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
index 56a95dbc5b..84067ed181 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsG.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -806,7 +806,7 @@ ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
             if (tid < nleft) {
                 detail::call_f_scalar_handler(f, tid+start_idx,
                     Gpu::Handler(amrex::min((std::uint64_t(nleft-tid)+(std::uint64_t)threadIdx.x),
-                    (std::uint64_t)blockDim.x)));
+                    (std::uint64_t)MT)));
             }
         });
     }
@@ -829,7 +829,7 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
                 auto iv = indexer.intVect(icell);
                 detail::call_f_intvect_handler(f, iv,
                     Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
-                    (std::uint64_t)blockDim.x)));
+                    (std::uint64_t)MT)));
             }
         });
     }
@@ -852,7 +852,7 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f)
                 auto iv = indexer.intVect(icell);
                 detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
                     Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
-                    (std::uint64_t)blockDim.x)));
+                    (std::uint64_t)MT)));
             }
         });
     }
@@ -870,9 +870,9 @@ ParallelForRNG (T n, L const& f) noexcept
                         amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                         ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
-        Long tid = Long(blockDim.x)*blockIdx.x+threadIdx.x;
+        Long tid = Long(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
         RandomEngine engine{&(rand_state[tid])};
-        for (Long i = tid, stride = Long(blockDim.x)*gridDim.x; i < Long(n); i += stride) {
+        for (Long i = tid, stride = Long(AMREX_GPU_MAX_THREADS)*gridDim.x; i < Long(n); i += stride) {
             f(T(i),engine);
         }
     });
@@ -892,9 +892,9 @@ ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
                         amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                         ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
-        auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
+        auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
         RandomEngine engine{&(rand_state[tid])};
-        for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) {
+        for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
             auto iv = indexer.intVect(icell);
             detail::call_f_intvect_engine(f, iv, engine);
         }
@@ -915,9 +915,9 @@ ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
                         amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                         ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
-        auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
+        auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
         RandomEngine engine{&(rand_state[tid])};
-        for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) {
+        for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
             auto iv = indexer.intVect(icell);
             detail::call_f_intvect_ncomp_engine(f, iv, ncomp, engine);
         }
@@ -938,7 +938,7 @@ ParallelFor (Gpu::KernelInfo const&,
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
-        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
+        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
              icell < ncells; icell += stride) {
             if (icell < indexer1.numPts()) {
                 auto iv = indexer1.intVect(icell);
@@ -967,7 +967,7 @@ ParallelFor (Gpu::KernelInfo const&,
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
-        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
+        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
              icell < ncells; icell += stride) {
             if (icell < indexer1.numPts()) {
                 auto iv = indexer1.intVect(icell);
@@ -1001,7 +1001,7 @@ ParallelFor (Gpu::KernelInfo const&,
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
-        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
+        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
              icell < ncells; icell += stride) {
             if (icell < indexer1.numPts()) {
                 auto iv = indexer1.intVect(icell);
@@ -1034,7 +1034,7 @@ ParallelFor (Gpu::KernelInfo const&,
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
-        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
+        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
              icell < ncells; icell += stride) {
             if (icell < indexer1.numPts()) {
                 auto iv = indexer1.intVect(icell);
diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp
index 721919f509..98e14650c3 100644
--- a/Src/Base/AMReX_MultiFabUtil.cpp
+++ b/Src/Base/AMReX_MultiFabUtil.cpp
@@ -855,10 +855,10 @@ namespace amrex
                 {
 #ifdef AMREX_USE_SYCL
                     int i1d = h.blockIdx() / n2dblocks;
-                    int i2d = h.threadIdx() + h.blockDim()*(h.blockIdx()-i1d*n2dblocks);
+                    int i2d = h.threadIdx() + AMREX_GPU_MAX_THREADS*(h.blockIdx()-i1d*n2dblocks);
 #else
                     int i1d = blockIdx.x / n2dblocks;
-                    int i2d = threadIdx.x + blockDim.x*(blockIdx.x-i1d*n2dblocks);
+                    int i2d = threadIdx.x + AMREX_GPU_MAX_THREADS*(blockIdx.x-i1d*n2dblocks);
 #endif
                     int i2dy = i2d / n2dx;
                     int i2dx = i2d - i2dy*n2dx;
diff --git a/Src/Base/AMReX_REAL.H b/Src/Base/AMReX_REAL.H
index 54815fa3da..246762b298 100644
--- a/Src/Base/AMReX_REAL.H
+++ b/Src/Base/AMReX_REAL.H
@@ -100,25 +100,25 @@ inline namespace literals {
       ```
     */
     constexpr Real
-    operator"" _rt( long double x )
+    operator""_rt( long double x )
     {
         return Real( x );
     }
 
     constexpr Real
-    operator"" _rt( unsigned long long int x )
+    operator""_rt( unsigned long long int x )
     {
         return Real( x );
     }
 
     constexpr ParticleReal
-    operator"" _prt( long double x )
+    operator""_prt( long double x )
     {
         return ParticleReal( x );
     }
 
     constexpr ParticleReal
-    operator"" _prt( unsigned long long int x )
+    operator""_prt( unsigned long long int x )
     {
         return ParticleReal( x );
     }
diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H
index 1060b91f31..f6ed403a2b 100644
--- a/Src/Base/AMReX_Reduce.H
+++ b/Src/Base/AMReX_Reduce.H
@@ -516,7 +516,6 @@ public:
         {
             Dim1 blockIdx {gh.blockIdx()};
             Dim1 threadIdx{gh.threadIdx()};
-            Dim1 blockDim {gh.blockDim()};
             Dim1 gridDim  {gh.gridDim()};
 #else
         amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
@@ -529,7 +528,7 @@ public:
             if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                 dst = r;
             }
-            for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                  icell < ncells; icell += stride) {
                 int k =  icell /   lenxy;
                 int j = (icell - k*lenxy) /   lenx;
@@ -575,7 +574,6 @@ public:
         {
             Dim1 blockIdx {gh.blockIdx()};
             Dim1 threadIdx{gh.threadIdx()};
-            Dim1 blockDim {gh.blockDim()};
             Dim1 gridDim  {gh.gridDim()};
 #else
         amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
@@ -588,7 +586,7 @@ public:
             if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                 dst = r;
             }
-            for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                  icell < ncells; icell += stride) {
                 int k =  icell /   lenxy;
                 int j = (icell - k*lenxy) /   lenx;
@@ -632,7 +630,6 @@ public:
         {
             Dim1 blockIdx {gh.blockIdx()};
             Dim1 threadIdx{gh.threadIdx()};
-            Dim1 blockDim {gh.blockDim()};
             Dim1 gridDim  {gh.gridDim()};
 #else
         amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
@@ -645,7 +642,7 @@ public:
             if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                 dst = r;
             }
-            for (N i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+            for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                  i < n; i += stride) {
                 auto pr = f(i);
                 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
@@ -728,7 +725,7 @@ public:
                 ReduceTuple dst = r;
                 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
                     auto dp_stream = dp+istream*maxblocks;
-                    for (int i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+                    for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                          i < nblocks[istream]; i += stride) {
                         Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
                     }
@@ -871,7 +868,7 @@ bool AnyOf (N n, T const* v, P const& pred)
         if (!(*has_any))
         {
             int r = false;
-            for (N i = gh.blockDim()*gh.blockIdx()+gh.threadIdx(), stride = gh.blockDim()*gh.gridDim();
+            for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
                  i < n && !r; i += stride)
             {
                 r = pred(v[i]) ? 1 : 0;
@@ -892,7 +889,7 @@ bool AnyOf (N n, T const* v, P const& pred)
         if (!has_any)
         {
             int r = false;
-            for (N i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+            for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                  i < n && !r; i += stride)
             {
                 r = pred(v[i]) ? 1 : 0;
@@ -932,7 +929,7 @@ bool AnyOf (Box const& box, P const& pred)
         if (!(*has_any))
         {
             int r = false;
-            for (int icell = gh.blockDim()*gh.blockIdx()+gh.threadIdx(), stride = gh.blockDim()*gh.gridDim();
+            for (int icell = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
                  icell < ncells && !r; icell += stride) {
                 int k =  icell /   lenxy;
                 int j = (icell - k*lenxy) /   lenx;
@@ -958,7 +955,7 @@ bool AnyOf (Box const& box, P const& pred)
         if (!has_any)
         {
             int r = false;
-            for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                  icell < ncells && !r; icell += stride) {
                 int k =  icell /   lenxy;
                 int j = (icell - k*lenxy) /   lenx;
diff --git a/Src/Base/AMReX_Scan.H b/Src/Base/AMReX_Scan.H
index e819b9e84b..a26afb8782 100644
--- a/Src/Base/AMReX_Scan.H
+++ b/Src/Base/AMReX_Scan.H
@@ -676,7 +676,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
     {
         auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
         auto& scan_bid = const_cast<OrderedBlockId&>(ordered_block_id);
-        const unsigned int gid = blockIdx.x*blockDim.x + threadIdx.x;
+        const unsigned int gid = blockIdx.x*nthreads + threadIdx.x;
         if (gid == 0) { scan_bid.reset(); }
         scan_tile_state.initialize_prefix(gid, nblocks);
     });
@@ -755,7 +755,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
                                            rocprim::plus<T>());
             }
             if (totalsum_p) {
-                if (iend == n && threadIdx.x == blockDim.x-1) { // last thread of last block
+                if (iend == n && threadIdx.x == nthreads-1) { // last thread of last block
                     T tsum = data[nelms_per_thread-1];
                     AMREX_IF_CONSTEXPR(is_exclusive) { tsum += last; }
                     *totalsum_p = tsum;
@@ -768,7 +768,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
         BlockExchange().blocked_to_striped(data, data, temp_storage.exchange);
 
         for (int i = 0; i < nelms_per_thread; ++i) {
-            N offset = ibegin + i*blockDim.x + threadIdx.x;
+            N offset = ibegin + i*nthreads + threadIdx.x;
             if (offset < iend) { fout(offset, data[i]); }
         }
     });
@@ -888,7 +888,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
                 BlockScan(temp_storage.scan_storeage.scan).InclusiveSum(data, data, prefix_op);
             }
             if (totalsum_p) {
-                if (iend == n && threadIdx.x == blockDim.x-1) { // last thread of last block
+                if (iend == n && threadIdx.x == nthreads-1) { // last thread of last block
                     T tsum = data[nelms_per_thread-1];
                     AMREX_IF_CONSTEXPR(is_exclusive) { tsum += last; }
                     *totalsum_p = tsum;
@@ -901,7 +901,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
         BlockExchange(temp_storage.exchange).BlockedToStriped(data);
 
         for (int i = 0; i < nelms_per_thread; ++i) {
-            N offset = ibegin + i*blockDim.x + threadIdx.x;
+            N offset = ibegin + i*nthreads + threadIdx.x;
             if (offset < iend) { fout(offset, data[i]); }
         }
     });
@@ -962,7 +962,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
     {
         int lane = threadIdx.x % Gpu::Device::warp_size;
         int warp = threadIdx.x / Gpu::Device::warp_size;
-        int nwarps = blockDim.x / Gpu::Device::warp_size;
+        int nwarps = nthreads / Gpu::Device::warp_size;
 
         amrex::Gpu::SharedMemory<T> gsm;
         T* shared = gsm.dataPtr();
@@ -999,7 +999,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
         T sum_prev_chunk = 0; // inclusive sum from previous chunks.
         T tmp_out[nchunks]; // block-wide inclusive sum for chunks
         for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
-            N offset = ibegin + ichunk*blockDim.x;
+            N offset = ibegin + ichunk*nthreads;
             if (offset >= iend) { break; }
 
             offset += threadIdx.x;
@@ -1074,7 +1074,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
 
         if (virtual_block_id == 0) {
             for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
-                N offset = ibegin + ichunk*blockDim.x + threadIdx.x;
+                N offset = ibegin + ichunk*nthreads + threadIdx.x;
                 if (offset >= iend) { break; }
                 fout(offset, tmp_out[ichunk]);
                 if (offset == n-1) {
@@ -1136,7 +1136,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
             T exclusive_prefix = shared[0];
 
             for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
-                N offset = ibegin + ichunk*blockDim.x + threadIdx.x;
+                N offset = ibegin + ichunk*nthreads + threadIdx.x;
                 if (offset >= iend) { break; }
                 T t = tmp_out[ichunk] + exclusive_prefix;
                 fout(offset, t);
diff --git a/Src/EB/AMReX_EB2_Level.cpp b/Src/EB/AMReX_EB2_Level.cpp
index f51bc80ad3..b0fec6e389 100644
--- a/Src/EB/AMReX_EB2_Level.cpp
+++ b/Src/EB/AMReX_EB2_Level.cpp
@@ -880,7 +880,7 @@ void
 Level::fillLevelSet (MultiFab& levelset, const Geometry& geom) const
 {
     levelset.setVal(-1.0);
-    levelset.ParallelCopy(m_levelset,0,0,1,0,0);
+    levelset.ParallelCopy(m_levelset,0,0,1,IntVect(0),levelset.nGrowVect(),geom.periodicity());
 
     const std::vector<IntVect>& pshifts = geom.periodicity().shiftIntVect();
 
diff --git a/Src/EB/AMReX_EB_STL_utils.cpp b/Src/EB/AMReX_EB_STL_utils.cpp
index 3a3070f188..e4aea5a1eb 100644
--- a/Src/EB/AMReX_EB_STL_utils.cpp
+++ b/Src/EB/AMReX_EB_STL_utils.cpp
@@ -631,6 +631,11 @@ STLtools::fill (MultiFab& mf, IntVect const& nghost, Geometry const& geom,
     const auto plo = geom.ProbLoArray();
     const auto dx  = geom.CellSizeArray();
 
+    auto ixt = mf.ixType();
+    RealVect offset(AMREX_D_DECL(ixt.cellCentered(0) ? 0.5_rt : 0.0_rt,
+                                 ixt.cellCentered(1) ? 0.5_rt : 0.0_rt,
+                                 ixt.cellCentered(2) ? 0.5_rt : 0.0_rt));
+
     const Triangle* tri_pts = m_tri_pts_d.data();
     XDim3 ptmin = m_ptmin;
     XDim3 ptmax = m_ptmax;
@@ -650,12 +655,12 @@ STLtools::fill (MultiFab& mf, IntVect const& nghost, Geometry const& geom,
            [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, auto control) noexcept
     {
         Real coords[3];
-        coords[0]=plo[0]+static_cast<Real>(i)*dx[0];
-        coords[1]=plo[1]+static_cast<Real>(j)*dx[1];
+        coords[0]=plo[0]+(static_cast<Real>(i)+offset[0])*dx[0];
+        coords[1]=plo[1]+(static_cast<Real>(j)+offset[1])*dx[1];
 #if (AMREX_SPACEDIM == 2)
         coords[2]=Real(0.);
 #else
-        coords[2]=plo[2]+static_cast<Real>(k)*dx[2];
+        coords[2]=plo[2]+(static_cast<Real>(k)+offset[2])*dx[2];
 #endif
         int num_intersects=0;
         if (coords[0] >= ptmin.x && coords[0] <= ptmax.x &&
diff --git a/Src/Extern/HDF5/Make.package b/Src/Extern/HDF5/Make.package
index 76e2233904..05c003bac1 100644
--- a/Src/Extern/HDF5/Make.package
+++ b/Src/Extern/HDF5/Make.package
@@ -1,10 +1,11 @@
-#
-# HDF5 Blueprint Support
-#
 
 CEXE_sources += AMReX_PlotFileUtilHDF5.cpp
 
-CEXE_headers += AMReX_PlotFileUtilHDF5.H AMReX_ParticleHDF5.H AMReX_WriteBinaryParticleDataHDF5.H AMReX_ParticlesHDF5.H
+CEXE_headers += AMReX_ParticleHDF5.H
+CEXE_headers += AMReX_ParticlesHDF5.H
+CEXE_headers += AMReX_PlotFileUtilHDF5.H
+CEXE_headers += AMReX_ParticleUtilHDF5.H
+CEXE_headers += AMReX_WriteBinaryParticleDataHDF5.H
 
 VPATH_LOCATIONS += $(AMREX_HOME)/Src/Extern/HDF5
 INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/Extern/HDF5
diff --git a/Src/Extern/SWFFT/AlignedAllocator.h b/Src/Extern/SWFFT/AlignedAllocator.h
deleted file mode 100644
index 2d16a84375..0000000000
--- a/Src/Extern/SWFFT/AlignedAllocator.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- *                 Copyright (C) 2017, UChicago Argonne, LLC
- *                            All Rights Reserved
- *
- *           Hardware/Hybrid Cosmology Code (HACC), Version 1.0
- *
- * Salman Habib, Adrian Pope, Hal Finkel, Nicholas Frontiere, Katrin Heitmann,
- *      Vitali Morozov, Jeffrey Emberson, Thomas Uram, Esteban Rangel
- *                        (Argonne National Laboratory)
- *
- *  David Daniel, Patricia Fasel, Chung-Hsing Hsu, Zarija Lukic, James Ahrens
- *                      (Los Alamos National Laboratory)
- *
- *                               George Zagaris
- *                                 (Kitware)
- *
- *                            OPEN SOURCE LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *   1. Redistributions of source code must retain the above copyright notice,
- *      this list of conditions and the following disclaimer. Software changes,
- *      modifications, or derivative works, should be noted with comments and
- *      the author and organization's name.
- *
- *   2. Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *
- *   3. Neither the names of UChicago Argonne, LLC or the Department of Energy
- *      nor the names of its contributors may be used to endorse or promote
- *      products derived from this software without specific prior written
- *      permission.
- *
- *   4. The software and the end-user documentation included with the
- *      redistribution, if any, must include the following acknowledgment:
- *
- *     "This product includes software produced by UChicago Argonne, LLC under
- *      Contract No. DE-AC02-06CH11357 with the Department of Energy."
- *
- * *****************************************************************************
- *                                DISCLAIMER
- * THE SOFTWARE IS SUPPLIED "AS IS" WITHOUT WARRANTY OF ANY KIND. NEITHER THE
- * UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT OF ENERGY, NOR
- * UCHICAGO ARGONNE, LLC, NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY,
- * EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE
- * ACCURARY, COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, DATA, APPARATUS,
- * PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
- * PRIVATELY OWNED RIGHTS.
- *
- * *****************************************************************************
- */
-
-#ifndef HACC_ALIGNEDALLOCATOR_H
-#define HACC_ALIGNEDALLOCATOR_H
-
-#include <stddef.h>
-#include <stdlib.h>
-
-#include <new>
-
-namespace hacc {
-template <typename T, size_t N>
-class AlignedAllocator
-{
-public:
-  typedef T value_type;
-  typedef T *pointer;
-  typedef T &reference;
-  typedef const T *const_pointer;
-  typedef const T &const_reference;
-  typedef size_t size_type;
-  typedef ptrdiff_t difference_type;
-
-  template <typename U>
-  struct rebind {
-      typedef AlignedAllocator<U, N> other;
-  };
-
-public:
-  AlignedAllocator() throw() {};
-  AlignedAllocator(const AlignedAllocator&) throw() {};
-
-  template <typename U, size_t M>
-  AlignedAllocator(const AlignedAllocator<U, M>&) throw() {};
-
-public:
-  ~AlignedAllocator() throw () {};
-
-public:
-  pointer address(reference x) const { return &x; }
-  const_pointer address (const_reference x) const { return &x; }
-
-  size_type max_size() const throw() { return size_t(-1) / sizeof(T); }
-
-  void construct(pointer p, const_reference val) { ::new ((void*)p) T(val); }
-  void destroy(pointer p) { ((T*)p)->~T(); }
-
-public:
-  pointer allocate(size_type n,
-                   const void * /*hint*/ = 0)
-  {
-    pointer p;
-    if (posix_memalign((void **) &p, N, n*sizeof(T)) != 0) {
-      throw std::bad_alloc();
-    }
-
-    return p;
-  }
-
-  void deallocate(pointer p, size_type n)
-  {
-    free((void *) p);
-  }
-};
-} // namespace hacc
-
-#endif // HACC_ALIGNEDALLOCATOR_H
-
diff --git a/Src/Extern/SWFFT/COPYING b/Src/Extern/SWFFT/COPYING
deleted file mode 100644
index 08b68c8770..0000000000
--- a/Src/Extern/SWFFT/COPYING
+++ /dev/null
@@ -1,50 +0,0 @@
-                   Copyright (C) 2017, UChicago Argonne, LLC
-                              All Rights Reserved
-
-             Hardware/Hybrid Cosmology Code (HACC), Version 1.0
-
-  Salman Habib, Adrian Pope, Hal Finkel, Nicholas Frontiere, Katrin Heitmann,
-       Vitali Morozov, Jeffrey Emberson, Thomas Uram, Esteban Rangel
-                         (Argonne National Laboratory)
-
-   David Daniel, Patricia Fasel, Chung-Hsing Hsu, Zarija Lukic, James Ahrens
-                       (Los Alamos National Laboratory)
-
-                                George Zagaris
-                                  (Kitware)
-
-                             OPEN SOURCE LICENSE
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-  1. Redistributions of source code must retain the above copyright notice,
-     this list of conditions and the following disclaimer. Software changes,
-     modifications, or derivative works, should be noted with comments and the
-     author and organization's name.
-
-  2. Redistributions in binary form must reproduce the above copyright notice,
-     this list of conditions and the following disclaimer in the documentation
-     and/or other materials provided with the distribution.
-
-  3. Neither the names of UChicago Argonne, LLC or the Department of Energy nor
-     the names of its contributors may be used to endorse or promote products
-     derived from this software without specific prior written permission.
-
-  4. The software and the end-user documentation included with the
-     redistribution, if any, must include the following acknowledgment:
-
-    "This product includes software produced by UChicago Argonne, LLC under
-     Contract No. DE-AC02-06CH11357 with the Department of Energy."
-
-********************************************************************************
-                                 DISCLAIMER
-THE SOFTWARE IS SUPPLIED "AS IS" WITHOUT WARRANTY OF ANY KIND. NEITHER THE
-UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT OF ENERGY, NOR 
-UCHICAGO ARGONNE, LLC, NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS
-OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURARY,
-COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, DATA, APPARATUS, PRODUCT, OR
-PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE PRIVATELY
-OWNED RIGHTS.
-
-********************************************************************************
diff --git a/Src/Extern/SWFFT/CheckDecomposition.c b/Src/Extern/SWFFT/CheckDecomposition.c
deleted file mode 100644
index 2b38d4916c..0000000000
--- a/Src/Extern/SWFFT/CheckDecomposition.c
+++ /dev/null
@@ -1,646 +0,0 @@
-/*
- *                 Copyright (C) 2017, UChicago Argonne, LLC
- *                            All Rights Reserved
- *
- *           Hardware/Hybrid Cosmology Code (HACC), Version 1.0
- *
- * Salman Habib, Adrian Pope, Hal Finkel, Nicholas Frontiere, Katrin Heitmann,
- *      Vitali Morozov, Jeffrey Emberson, Thomas Uram, Esteban Rangel
- *                        (Argonne National Laboratory)
- *
- *  David Daniel, Patricia Fasel, Chung-Hsing Hsu, Zarija Lukic, James Ahrens
- *                      (Los Alamos National Laboratory)
- *
- *                               George Zagaris
- *                                 (Kitware)
- *
- *                            OPEN SOURCE LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *   1. Redistributions of source code must retain the above copyright notice,
- *      this list of conditions and the following disclaimer. Software changes,
- *      modifications, or derivative works, should be noted with comments and
- *      the author and organization's name.
- *
- *   2. Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *
- *   3. Neither the names of UChicago Argonne, LLC or the Department of Energy
- *      nor the names of its contributors may be used to endorse or promote
- *      products derived from this software without specific prior written
- *      permission.
- *
- *   4. The software and the end-user documentation included with the
- *      redistribution, if any, must include the following acknowledgment:
- *
- *     "This product includes software produced by UChicago Argonne, LLC under
- *      Contract No. DE-AC02-06CH11357 with the Department of Energy."
- *
- * *****************************************************************************
- *                                DISCLAIMER
- * THE SOFTWARE IS SUPPLIED "AS IS" WITHOUT WARRANTY OF ANY KIND. NEITHER THE
- * UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT OF ENERGY, NOR
- * UCHICAGO ARGONNE, LLC, NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY,
- * EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE
- * ACCURARY, COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, DATA, APPARATUS,
- * PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
- * PRIVATELY OWNED RIGHTS.
- *
- * *****************************************************************************
- */
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <stdbool.h>
-
-#include <mpi.h>
-
-#include "distribution.h"
-
-static inline const char *separator(int i, int n)
-{
-  return i == (n - 1) ? "." : ", ";
-}
-
-int main(int argc, char *argv[]) {
-
-  MPI_Init(&argc, &argv);
-
-  int ndim = 3, period[3], self = 0, nproc, n[3], debug = 1;
-  int explicit3d = 0, np3d = 0;
-
-  distribution_t dist;
-  distribution_t *d = &dist;
-  d->debug = 1;
-
-  if(argc < 5) {
-    fprintf(stderr,"\n");
-    fprintf(stderr,"USAGE: %s <ngx> <ngy> <ngz> <Nproc> [nx ny nz]\n",argv[0]);
-    fprintf(stderr,"\n");
-    fprintf(stderr,"Required: ng? = number of global grid vertices in each dimension\n");
-    fprintf(stderr,"Required: Nproc = total number of MPI ranks\n");
-    fprintf(stderr,"Optional: n? = number of MPI ranks in each dimension for 3D Cartesian communicator if setting explicitly; Nproc = nx*ny*nz enforced\n");
-    fprintf(stderr,"\n");
-    exit(-1);
-  }
-
-  for(int i=0; i<ndim; i++) {
-    n[i] = atoi(argv[i+1]);;
-    d->n[i] = n[i];
-    //d->padding[i] = 0;
-  }
-
-  nproc = atoi(argv[4]);
-
-  if(argc >= 8) {
-    explicit3d = 1;
-    np3d = 1;
-    for(int i=0; i<ndim; i++) {
-      d->process_topology_3.nproc[i] = atoi(argv[i+5]);
-      np3d *= d->process_topology_3.nproc[i];
-    }
-    if(np3d != nproc) {
-      fprintf(stderr,"ERROR: %d * %d * %d = %d != %d\n",
-              d->process_topology_3.nproc[0],
-              d->process_topology_3.nproc[1],
-              d->process_topology_3.nproc[2],
-              np3d, nproc);
-      exit(-1);
-    }
-  }
-
-
-
-  // set up process grid with 1d decomposition (SLABs)
-  d->process_topology_1.nproc[0] = 0;
-  d->process_topology_1.nproc[1] = 1; // don't distribute outer dimensions
-  d->process_topology_1.nproc[2] = 1; // don't distribute outer dimensions
-  period[0] = period[1] = period[2] = 1;
-  //process_topology_1.nproc is filled with number of processors in each dim
-  MPI_Dims_create(nproc, ndim, d->process_topology_1.nproc);
-
-  if(self == 0) {
-    printf("distribution 1D: [%d:%d:%d]\n",
-           d->process_topology_1.nproc[0],
-           d->process_topology_1.nproc[1],
-           d->process_topology_1.nproc[2]);
-    fflush(stdout);
-  }
-
-  //calculates the local dimensions (number of points in each dimension)
-  d->process_topology_1.n[0] = n[0] / d->process_topology_1.nproc[0];
-  d->process_topology_1.n[1] = n[1] / d->process_topology_1.nproc[1];
-  d->process_topology_1.n[2] = n[2] / d->process_topology_1.nproc[2];
-
-
-
-  // set up process grid with 3d decomposition (CUBE)
-  if(!explicit3d) {
-    d->process_topology_3.nproc[0] = 0;
-    d->process_topology_3.nproc[1] = 0;
-    d->process_topology_3.nproc[2] = 0;
-    period[0] = period[1] = period[2] = 1;
-    MPI_Dims_create(nproc, ndim, d->process_topology_3.nproc);
-  }
-
-  if(self == 0) {
-    printf("distribution 3D: [%d:%d:%d]\n",
-           d->process_topology_3.nproc[0],
-           d->process_topology_3.nproc[1],
-           d->process_topology_3.nproc[2]);
-    fflush(stdout);
-  }
-
-  assert(n[0]%d->process_topology_3.nproc[0] == 0);
-  assert(n[0]%d->process_topology_3.nproc[1] == 0);
-  assert(n[0]%d->process_topology_3.nproc[2] == 0);
-
-  //set local dimensions
-  d->process_topology_3.n[0] = n[0] / d->process_topology_3.nproc[0];
-  d->process_topology_3.n[1] = n[1] / d->process_topology_3.nproc[1];
-  d->process_topology_3.n[2] = n[2] / d->process_topology_3.nproc[2];
-
-
-
-  // set up process grid with 2d decomposition (z_PENCILs )
-  d->process_topology_2_z.nproc[0] = 0;
-  d->process_topology_2_z.nproc[1] = 0;
-  d->process_topology_2_z.nproc[2] = 1; // don't distribute outer dimension
-  period[0] = period[1] = period[2] = 1;
-  MPI_Dims_create(nproc, ndim, d->process_topology_2_z.nproc);
-  d->process_topology_2_z.n[0] = n[0] / d->process_topology_2_z.nproc[0];
-  d->process_topology_2_z.n[1] = n[1] / d->process_topology_2_z.nproc[1];
-  d->process_topology_2_z.n[2] = n[2] / d->process_topology_2_z.nproc[2];
-  //variable used to ensure that pencils created fit inside the cuboids,
-  //if not the code will assert out.
-  bool check_z_dims=false;
-  if(d->process_topology_2_z.n[0] != 0
-     && d->process_topology_2_z.n[1] != 0
-     && d->process_topology_2_z.n[2] != 0)
-  {// protects from dividing by zero.
-    check_z_dims = ((d->process_topology_3.n[0]) % (d->process_topology_2_z.n[0]) == 0)
-      && ((d->process_topology_3.n[1]) % (d->process_topology_2_z.n[1]) == 0)
-      && (n[0] % (d->process_topology_2_z.nproc[0]) == 0)
-      && (n[0] % (d->process_topology_2_z.nproc[1]) == 0);
-
-    if(self==0 && debug && !check_z_dims)
-      fprintf(stderr,"Need to fix Z PENCILS z_procs(%d,%d,%d) 3d.ns(%d,%d,%d) 2d_z.ns(%d,%d,%d)\n",
-              d->process_topology_2_z.nproc[0],
-              d->process_topology_2_z.nproc[1],
-              d->process_topology_2_z.nproc[2],
-              d->process_topology_3.n[0],
-              d->process_topology_3.n[1],
-              d->process_topology_3.n[2],
-              d->process_topology_2_z.n[0],
-              d->process_topology_2_z.n[1],
-              d->process_topology_2_z.n[2]);
-
-    //try swapping pencil dimensions if current setup pencil dimensions dont
-    //fit inside the cubes.
-    if(!(check_z_dims)
-       && ((d->process_topology_3.n[0]) % (d->process_topology_2_z.n[1]) == 0)
-       && ((d->process_topology_3.n[1]) % (d->process_topology_2_z.n[0]) == 0))
-    {
-
-      if(self==0 && debug)
-        fprintf(stderr,"Swapping Z pencils in initialization  (%d,%d,%d)\n",
-                d->process_topology_2_z.nproc[0],
-                d->process_topology_2_z.nproc[1],
-                d->process_topology_2_z.nproc[2]);
-      int temp=d->process_topology_2_z.nproc[0];
-      d->process_topology_2_z.nproc[0] = d->process_topology_2_z.nproc[1];
-      d->process_topology_2_z.nproc[1] = temp;
-      d->process_topology_2_z.nproc[2] = d->process_topology_2_z.nproc[2];
-
-      d->process_topology_2_z.n[0] = n[0] / d->process_topology_2_z.nproc[0];
-      d->process_topology_2_z.n[1] = n[1] / d->process_topology_2_z.nproc[1];
-      d->process_topology_2_z.n[2] = n[2] / d->process_topology_2_z.nproc[2];
-      check_z_dims = ((d->process_topology_3.n[0]) % (d->process_topology_2_z.n[0]) == 0)
-        && ((d->process_topology_3.n[1]) % (d->process_topology_2_z.n[1]) == 0)
-        && (n[0] % (d->process_topology_2_z.nproc[0]) == 0)
-        && (n[0] % (d->process_topology_2_z.nproc[1]) == 0);
-    }
-  } else {
-    check_z_dims=false;
-  }
-//  if that did not work, make a pencil that does if inside the 3d cuboids by
-//  taking the cuboids dimensions (np1,np2,np3) and making pencils
-//  (np1,np2*np3,1), or (np1*np3,np2,1) on the most evenly distributed
-//  dimensions
-  if(!check_z_dims){
-    if(self==0 && debug)
-      fprintf(stderr,"MAKING Z PENCILS FIT zprocs(%d,%d,%d) z.ns(%d,%d,%d)\n",
-              d->process_topology_2_z.nproc[0],
-              d->process_topology_2_z.nproc[1],
-              d->process_topology_2_z.nproc[2],
-              d->process_topology_2_z.n[0],
-              d->process_topology_2_z.n[1],
-              d->process_topology_2_z.n[2]);
-
-    d->process_topology_2_z.nproc[2]=1;
-    if(d->process_topology_3.n[0]>d->process_topology_3.n[1])
-    {
-      d->process_topology_2_z.nproc[1]=d->process_topology_3.nproc[1]*d->process_topology_3.nproc[2];
-      d->process_topology_2_z.nproc[0]=d->process_topology_3.nproc[0];
-      if((n[0] % (d->process_topology_2_z.nproc[0]) != 0)
-         || (n[0] % (d->process_topology_2_z.nproc[1]) != 0))
-      {
-        d->process_topology_2_z.nproc[0]=d->process_topology_3.nproc[0]*d->process_topology_3.nproc[2];
-        d->process_topology_2_z.nproc[1]=d->process_topology_3.nproc[1];
-      }
-    } else {
-      d->process_topology_2_z.nproc[0]=d->process_topology_3.nproc[0]*d->process_topology_3.nproc[2];
-      d->process_topology_2_z.nproc[1]=d->process_topology_3.nproc[1];
-      if((n[0] % (d->process_topology_2_z.nproc[0]) != 0)
-         || (n[0] % (d->process_topology_2_z.nproc[1]) != 0))
-      {
-        d->process_topology_2_z.nproc[1]=d->process_topology_3.nproc[1]*d->process_topology_3.nproc[2];
-        d->process_topology_2_z.nproc[0]=d->process_topology_3.nproc[0];
-      }
-    }
-    d->process_topology_2_z.n[0] = n[0] / d->process_topology_2_z.nproc[0];
-    d->process_topology_2_z.n[1] = n[1] / d->process_topology_2_z.nproc[1];
-    d->process_topology_2_z.n[2] = n[2] / d->process_topology_2_z.nproc[2];
-    if(self==0 && debug)
-      fprintf(stderr,"MAKING Z PENCILS FIT AFTER zprocs(%d,%d,%d) z.ns(%d,%d,%d)\n",
-              d->process_topology_2_z.nproc[0],
-              d->process_topology_2_z.nproc[1],
-              d->process_topology_2_z.nproc[2],
-              d->process_topology_2_z.n[0],
-              d->process_topology_2_z.n[1],
-              d->process_topology_2_z.n[2]);
-    if(d->process_topology_2_z.n[0] != 0
-       && d->process_topology_2_z.n[1] != 0
-       && d->process_topology_2_z.n[2] != 0)
-    {// protects from dividing by zero.
-      check_z_dims=((d->process_topology_3.n[0]) % (d->process_topology_2_z.n[0]) == 0)
-        && ((d->process_topology_3.n[1]) % (d->process_topology_2_z.n[1]) == 0)
-        && (n[0] % (d->process_topology_2_z.nproc[0]) == 0)
-        && (n[0] % (d->process_topology_2_z.nproc[1]) == 0);
-    } else {
-      check_z_dims=false;
-    }
-  }
-
-  if (d->debug && 0 == self) {
-    fprintf(stderr, "  2d_z: ");
-    for (int i = 0; i < ndim; ++i) {
-      fprintf(stderr, "%d%s",
-              d->process_topology_2_z.nproc[i],
-              separator(i, ndim));
-    }
-    fprintf(stderr, "\n");
-  }
-  if(!check_z_dims && debug && (self==0)){
-    FILE * outfile;
-    outfile= fopen("error.data","a");
-    fprintf(outfile,"Z DIMS FAILS:(%d,%d,%d) (%d,%d,%d) \n",
-            d->process_topology_2_z.nproc[0],
-            d->process_topology_2_z.nproc[1],
-            d->process_topology_2_z.nproc[2],
-            d->process_topology_3.nproc[0],
-            d->process_topology_3.nproc[1],
-            d->process_topology_3.nproc[2]);
-  }
-  //assert(check_z_dims);
-  if(!check_z_dims)
-    fprintf(stderr,"assert(check_z_dims) would have failed.\n");
-
-//  if this happens, it is because the dimensions were chosen incorrectly.
-//  Either to many processors for the number of points in one dimension (could
-//  not do at least 1 point per processor), or the methods above could
-//  not make a distribution of pencils that fit in the cubiods, which would
-//  happen if the user gave numbers that wouldent work (we require the number
-//  of processors in each dimension of the cuboid must be modulo the number of
-//  points in that dimension, otherwise, this error will happen).
-
-  if(self == 0) {
-    printf("distribution 2z: [%d:%d:%d]\n",
-           d->process_topology_2_z.nproc[0],
-           d->process_topology_2_z.nproc[1],
-           d->process_topology_2_z.nproc[2]);
-    fflush(stdout);
-  }
-
-
-
-  // set up process grid with 2d decomposition (x_PENCILs)
-  d->process_topology_2_x.nproc[0] = 1; // don't distribute outer dimension
-  d->process_topology_2_x.nproc[1] = 0;
-  d->process_topology_2_x.nproc[2] = 0;
-  period[0] = period[1] = period[2] = 1;
-  MPI_Dims_create(nproc, ndim, d->process_topology_2_x.nproc);
-  d->process_topology_2_x.n[0] = n[0] / d->process_topology_2_x.nproc[0];
-  d->process_topology_2_x.n[1] = n[1] / d->process_topology_2_x.nproc[1];
-  d->process_topology_2_x.n[2] = n[2] / d->process_topology_2_x.nproc[2];
-  //variable used to ensure that pencils created fit inside the cuboids,
-  //if not the code will assert out.
-  bool check_x_dims = false;
-  if(d->process_topology_2_x.n[0] != 0
-     && d->process_topology_2_x.n[1] != 0
-     && d->process_topology_2_x.n[2] != 0)
-  {// protects from dividing by zero.
-    check_x_dims = ((d->process_topology_3.n[2]) % (d->process_topology_2_x.n[2]) == 0)
-      && ((d->process_topology_3.n[1]) % (d->process_topology_2_x.n[1]) == 0)
-      && (n[0] % (d->process_topology_2_x.nproc[2]) == 0)
-      && (n[0] % (d->process_topology_2_x.nproc[0]) == 0);
-    if(self==0 && debug && !check_x_dims)
-      fprintf(stderr,"Need to fix X PENCILS x_procs(%d,%d,%d) 3d.ns(%d,%d,%d) 2d_x.ns(%d,%d,%d)...\n",
-              d->process_topology_2_x.nproc[0],
-              d->process_topology_2_x.nproc[1],
-              d->process_topology_2_x.nproc[2],
-              d->process_topology_3.n[0],
-              d->process_topology_3.n[1],
-              d->process_topology_3.n[2],
-              d->process_topology_2_x.n[0],
-              d->process_topology_2_x.n[1],
-              d->process_topology_2_x.n[2]);
-
-    //try swapping pencil dimensions if current setup does not have pencils
-    //that fit inside cubes.
-    if(!(check_x_dims)
-       && ((d->process_topology_3.n[2]) % (d->process_topology_2_x.n[1]) == 0)
-       && ((d->process_topology_3.n[1]) % (d->process_topology_2_x.n[2]) == 0))
-    {
-      if(self==0 && debug)
-        fprintf(stderr,"Swapping X pencils in initialization .... \n");
-      d->process_topology_2_x.nproc[0] = d->process_topology_2_x.nproc[0];
-      int temp = d->process_topology_2_x.nproc[1];
-      d->process_topology_2_x.nproc[1] = d->process_topology_2_x.nproc[2];
-      d->process_topology_2_x.nproc[2] = temp;
-
-      d->process_topology_2_x.n[0] = n[0] / d->process_topology_2_x.nproc[0];
-      d->process_topology_2_x.n[1] = n[1] / d->process_topology_2_x.nproc[1];
-      d->process_topology_2_x.n[2] = n[2] / d->process_topology_2_x.nproc[2];
-      check_x_dims = ((d->process_topology_3.n[2]) % (d->process_topology_2_x.n[2]) == 0)
-        && ((d->process_topology_3.n[1]) % (d->process_topology_2_x.n[1]) == 0)
-        && (n[0] % (d->process_topology_2_x.nproc[2]) == 0)
-        && (n[0] % (d->process_topology_2_x.nproc[0]) == 0);
-    }
-  } else{
-    check_x_dims=false;
-  }
-//    if that did not work, make a pencil that does by taking the cuboid
-//    (np1,np2,np3) and making pencils of the form (1,np2*np1,np3) or
-//    (1,np2*np1,np3) depending on the most even distribution it can.
-  if(!check_x_dims){
-    if(self==0 && debug)
-      fprintf(stderr,"MAKING X PENCILS FIT xprocs(%d,%d,%d) x.ns(%d,%d,%d)...\n",
-              d->process_topology_2_x.nproc[0],
-              d->process_topology_2_x.nproc[1],
-              d->process_topology_2_x.nproc[2],
-              d->process_topology_2_x.n[0],
-              d->process_topology_2_x.n[1],
-              d->process_topology_2_x.n[2]);
-
-    d->process_topology_2_x.nproc[0] = 1;
-    if(d->process_topology_3.nproc[2] > d->process_topology_3.nproc[1])
-    {
-      d->process_topology_2_x.nproc[1] = d->process_topology_3.nproc[1]*d->process_topology_3.nproc[0];
-      d->process_topology_2_x.nproc[2] = d->process_topology_3.nproc[2];
-      if((n[0] % (d->process_topology_2_x.nproc[2]) != 0)
-         || (n[0] % (d->process_topology_2_x.nproc[0]) != 0))
-      {
-        d->process_topology_2_x.nproc[2]=d->process_topology_3.nproc[2]*d->process_topology_3.nproc[0];
-        d->process_topology_2_x.nproc[1]=d->process_topology_3.nproc[1];
-      }
-
-    } else {
-      d->process_topology_2_x.nproc[2] = d->process_topology_3.nproc[2]*d->process_topology_3.nproc[0];
-      d->process_topology_2_x.nproc[1] = d->process_topology_3.nproc[1];
-      if((n[0] % (d->process_topology_2_x.nproc[2]) != 0)
-         || (n[0] % (d->process_topology_2_x.nproc[0]) != 0))
-      {
-        d->process_topology_2_x.nproc[1]=d->process_topology_3.nproc[1]*d->process_topology_3.nproc[0];
-        d->process_topology_2_x.nproc[2]=d->process_topology_3.nproc[2];
-      }
-    }
-    d->process_topology_2_x.n[0] = n[0] / d->process_topology_2_x.nproc[0];
-    d->process_topology_2_x.n[1] = n[1] / d->process_topology_2_x.nproc[1];
-    d->process_topology_2_x.n[2] = n[2] / d->process_topology_2_x.nproc[2];
-    if(self==0 && debug)
-      fprintf(stderr,"MAKING X PENCILS FIT AFTER xprocs(%d,%d,%d) x.ns(%d,%d,%d)...\n",
-              d->process_topology_2_x.nproc[0],
-              d->process_topology_2_x.nproc[1],
-              d->process_topology_2_x.nproc[2],
-              d->process_topology_2_x.n[0],
-              d->process_topology_2_x.n[1],
-              d->process_topology_2_x.n[2]);
-    if(d->process_topology_2_x.n[0] != 0
-       && d->process_topology_2_x.n[1] != 0
-       && d->process_topology_2_x.n[2] != 0)
-    {// protects from dividing by zero.
-      check_x_dims = ((d->process_topology_3.n[2]) % (d->process_topology_2_x.n[2]) == 0)
-        && ((d->process_topology_3.n[1]) % (d->process_topology_2_x.n[1]) == 0)
-        && (n[0] % (d->process_topology_2_x.nproc[2]) == 0)
-        && (n[0] % (d->process_topology_2_x.nproc[0]) == 0);
-    } else {
-      check_x_dims=false;
-    }
-  }
-
-  if (d->debug && 0 == self) {
-    fprintf(stderr, "  2d_x: ");
-    for (int i = 0; i < ndim; ++i) {
-      fprintf(stderr, "%d%s",
-              d->process_topology_2_x.nproc[i],
-              separator(i, ndim));
-    }
-    fprintf(stderr, "\n");
-  }
-  if(!check_x_dims && debug && (self==0)){
-    FILE * outfile;
-    outfile= fopen("error.data","a");
-    fprintf(outfile,"X DIMS FAILS:(%d,%d,%d) (%d,%d,%d) \n",
-            d->process_topology_2_x.nproc[0],
-            d->process_topology_2_x.nproc[1],
-            d->process_topology_2_x.nproc[2],
-            d->process_topology_3.nproc[0],
-            d->process_topology_3.nproc[1],
-            d->process_topology_3.nproc[2]);
-  }
-  //assert(check_x_dims);
-  if(!check_x_dims)
-    fprintf(stderr,"assert(check_x_dims) would have failed.\n");
-//  if this happens, it is because the dimensions were chosen incorrectly.
-//  Either to many processors for the number of points in one dimension (could
-//  not do at least 1 point per processor), or the methods above could not make
-//  a distribution of pencils that fit in the cubiods, which would happen if the
-//  user gave numbers that wouldent work (we require the number of processors in
-//  each dimension of the cuboid must be modulo the number of points in that
-//  dimension, otherwise, this error will happen).
-
-  if(self == 0) {
-    printf("distribution 2x: [%d:%d:%d]\n",
-           d->process_topology_2_x.nproc[0],
-           d->process_topology_2_x.nproc[1],
-           d->process_topology_2_x.nproc[2]);
-    fflush(stdout);
-  }
-
-
-
-  // set up process grid with 2d decomposition (y_PENCILs)
-  d->process_topology_2_y.nproc[0] = 0;
-  d->process_topology_2_y.nproc[1] = 1; // don't distribute outer dimension
-  d->process_topology_2_y.nproc[2] = 0;
-  period[0] = period[1] = period[2] = 1;
-  MPI_Dims_create(nproc, ndim, d->process_topology_2_y.nproc);
-  d->process_topology_2_y.n[0] = n[0] / d->process_topology_2_y.nproc[0];
-  d->process_topology_2_y.n[1] = n[1] / d->process_topology_2_y.nproc[1];
-  d->process_topology_2_y.n[2] = n[2] / d->process_topology_2_y.nproc[2];
-  //variable used to ensure that pencils created fit inside the cuboids,
-  //if not the code will assert out.
-  bool check_y_dims=false;
-  if(d->process_topology_2_y.n[0] != 0
-     && d->process_topology_2_y.n[1] != 0
-     && d->process_topology_2_y.n[2] != 0)
-  {// protects from dividing by zero.
-    check_y_dims = (((d->process_topology_3.n[2]) % (d->process_topology_2_y.n[2]) == 0)
-                    && ((d->process_topology_3.n[0]) % (d->process_topology_2_y.n[0]) == 0)
-                    && (n[0] % (d->process_topology_2_y.nproc[2]) == 0)
-                    && (n[0] % (d->process_topology_2_y.nproc[0]) == 0));
-    if(self==0 && debug && !check_y_dims)
-      fprintf(stderr,"Need to fix Y PENCILS y_procs(%d,%d,%d) 3d.ns(%d,%d,%d) 2d_y.ns(%d,%d,%d)...\n",
-              d->process_topology_2_y.nproc[0],
-              d->process_topology_2_y.nproc[1],
-              d->process_topology_2_y.nproc[2],
-              d->process_topology_3.n[0],
-              d->process_topology_3.n[1],
-              d->process_topology_3.n[2],
-              d->process_topology_2_y.n[0],
-              d->process_topology_2_y.n[1],
-              d->process_topology_2_y.n[2]);
-    //try swapping pencil dimensions if the current dimension of the pencils
-    //does not fit inside the cubes.
-    if(!(check_y_dims)
-       && ((d->process_topology_3.n[2]) % (d->process_topology_2_y.n[0]) == 0)
-       && ((d->process_topology_3.n[0]) % (d->process_topology_2_y.n[2]) == 0))
-    {
-      if(self==0 && debug)
-        fprintf(stderr,"Swapping Y pencils in initialization .... \n");
-
-      int temp = d->process_topology_2_y.nproc[0];
-      d->process_topology_2_y.nproc[0] = d->process_topology_2_y.nproc[2];
-      d->process_topology_2_y.nproc[2] = temp;
-      d->process_topology_2_y.nproc[1] = d->process_topology_2_y.nproc[1];
-
-      d->process_topology_2_y.n[0] = n[0] / d->process_topology_2_y.nproc[0];
-      d->process_topology_2_y.n[1] = n[1] / d->process_topology_2_y.nproc[1];
-      d->process_topology_2_y.n[2] = n[2] / d->process_topology_2_y.nproc[2];
-      check_y_dims = (((d->process_topology_3.n[2]) % (d->process_topology_2_y.n[2]) == 0)
-                      && ((d->process_topology_3.n[0]) % (d->process_topology_2_y.n[0]) == 0)
-                      && (n[0] % (d->process_topology_2_y.nproc[2]) == 0)
-                      && (n[0] % (d->process_topology_2_y.nproc[0]) == 0));
-    }
-  } else {
-    check_y_dims = false;
-  }
-//  if that did not work, make a pencil that does by taking the cuboid
-//  (np1,np2,np3) and making pencils of the form (np1,1,np3*np2) or
-//  (np1*np2,1,np3) depending on the most even distribution it can.
-  if(!check_y_dims){
-    if(self==0 && debug)
-      fprintf(stderr,"MAKING Y PENCILS FIT yprocs(%d,%d,%d) y.ns(%d,%d,%d)...\n",
-              d->process_topology_2_y.nproc[0],
-              d->process_topology_2_y.nproc[1],
-              d->process_topology_2_y.nproc[2],
-              d->process_topology_2_y.n[0],
-              d->process_topology_2_y.n[1],
-              d->process_topology_2_y.n[2]);
-
-    d->process_topology_2_y.nproc[1]=1;
-    if(d->process_topology_3.nproc[2] > d->process_topology_3.nproc[0])
-    {
-      d->process_topology_2_y.nproc[0] = d->process_topology_3.nproc[0]*d->process_topology_3.nproc[1];
-      d->process_topology_2_y.nproc[2] = d->process_topology_3.nproc[2];
-      if((n[0] % (d->process_topology_2_y.nproc[2]) != 0)
-         || (n[0] % (d->process_topology_2_y.nproc[0]) != 0))
-      {
-        d->process_topology_2_y.nproc[2] = d->process_topology_3.nproc[2]*d->process_topology_3.nproc[1];
-        d->process_topology_2_y.nproc[0] = d->process_topology_3.nproc[0];
-      }
-    } else {
-      d->process_topology_2_y.nproc[2] = d->process_topology_3.nproc[2]*d->process_topology_3.nproc[1];
-      d->process_topology_2_y.nproc[0] = d->process_topology_3.nproc[0];
-      if((n[0] % (d->process_topology_2_y.nproc[2]) != 0)
-         || (n[0] % (d->process_topology_2_y.nproc[0]) != 0))
-      {
-        d->process_topology_2_y.nproc[0] = d->process_topology_3.nproc[0]*d->process_topology_3.nproc[1];
-        d->process_topology_2_y.nproc[2] = d->process_topology_3.nproc[2];
-      }
-    }
-
-    d->process_topology_2_y.n[0] = n[0] / d->process_topology_2_y.nproc[0];
-    d->process_topology_2_y.n[1] = n[1] / d->process_topology_2_y.nproc[1];
-    d->process_topology_2_y.n[2] = n[2] / d->process_topology_2_y.nproc[2];
-    if(self==0 && debug)
-      fprintf(stderr,"MAKING Y PENCILS FIT AFTER yprocs(%d,%d,%d) y.ns(%d,%d,%d)...\n",
-              d->process_topology_2_y.nproc[0],
-              d->process_topology_2_y.nproc[1],
-              d->process_topology_2_y.nproc[2],
-              d->process_topology_2_y.n[0],
-              d->process_topology_2_y.n[1],
-              d->process_topology_2_y.n[2]);
-    if(d->process_topology_2_y.n[0] != 0 && d->process_topology_2_y.n[1] != 0
-       && d->process_topology_2_y.n[2] != 0)
-    {// protects from dividing by zero.
-      check_y_dims = (((d->process_topology_3.n[2]) % (d->process_topology_2_y.n[2]) == 0)
-                      && ((d->process_topology_3.n[0]) % (d->process_topology_2_y.n[0]) == 0)
-                      && (n[0] % (d->process_topology_2_y.nproc[2]) == 0)
-                      && (n[0] % (d->process_topology_2_y.nproc[0]) == 0));
-    } else {
-      check_y_dims=false;
-    }
-  }
-
-  if (d->debug && 0 == self) {
-    fprintf(stderr, "  2d_y: ");
-    for (int i = 0; i < ndim; ++i) {
-      fprintf(stderr, "%d%s",
-              d->process_topology_2_y.nproc[i],
-              separator(i, ndim));
-    }
-    fprintf(stderr, "\n");
-  }
-  if(!check_y_dims && debug && (self==0)){
-    FILE * outfile;
-    outfile = fopen("error.data","a");
-    fprintf(outfile,"Y DIMS FAILS:(%d,%d,%d) (%d,%d,%d) \n",
-            d->process_topology_2_y.nproc[0],
-            d->process_topology_2_y.nproc[1],
-            d->process_topology_2_y.nproc[2],
-            d->process_topology_3.nproc[0],
-            d->process_topology_3.nproc[1],
-            d->process_topology_3.nproc[2]);
-  }
-  //assert(check_y_dims);
-  if(!check_y_dims)
-    fprintf(stderr,"assert(check_y_dims) would have failed.\n");
-
-//  if this happens, it is because the dimensions were chosen incorrectly.
-//  Either to many processors for the number of points in one dimension (could
-//  not do at least 1 point per processor), or the methods above could
-//  not make a distribution of pencils that fit in the cubiods, which would
-//  happen if the user gave numbers that wouldent work (we require the number of
-//  processors in each dimension of the cuboid must be modulo the number of
-//  points in that dimension, otherwise, this error will happen).
-
-  if(self == 0) {
-    printf("distribution 2y: [%d:%d:%d]\n",
-           d->process_topology_2_y.nproc[0],
-           d->process_topology_2_y.nproc[1],
-           d->process_topology_2_y.nproc[2]);
-    fflush(stdout);
-  }
-
-
-
-  MPI_Finalize();
-  return 0;
-}
diff --git a/Src/Extern/SWFFT/Dfft.H b/Src/Extern/SWFFT/Dfft.H
deleted file mode 100644
index 9b85957681..0000000000
--- a/Src/Extern/SWFFT/Dfft.H
+++ /dev/null
@@ -1,631 +0,0 @@
-/*
- *                 Copyright (C) 2017, UChicago Argonne, LLC
- *                            All Rights Reserved
- *
- *           Hardware/Hybrid Cosmology Code (HACC), Version 1.0
- *
- * Salman Habib, Adrian Pope, Hal Finkel, Nicholas Frontiere, Katrin Heitmann,
- *      Vitali Morozov, Jeffrey Emberson, Thomas Uram, Esteban Rangel
- *                        (Argonne National Laboratory)
- *
- *  David Daniel, Patricia Fasel, Chung-Hsing Hsu, Zarija Lukic, James Ahrens
- *                      (Los Alamos National Laboratory)
- *
- *                               George Zagaris
- *                                 (Kitware)
- *
- *                            OPEN SOURCE LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *   1. Redistributions of source code must retain the above copyright notice,
- *      this list of conditions and the following disclaimer. Software changes,
- *      modifications, or derivative works, should be noted with comments and
- *      the author and organization's name.
- *
- *   2. Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *
- *   3. Neither the names of UChicago Argonne, LLC or the Department of Energy
- *      nor the names of its contributors may be used to endorse or promote
- *      products derived from this software without specific prior written
- *      permission.
- *
- *   4. The software and the end-user documentation included with the
- *      redistribution, if any, must include the following acknowledgment:
- *
- *     "This product includes software produced by UChicago Argonne, LLC under
- *      Contract No. DE-AC02-06CH11357 with the Department of Energy."
- *
- * *****************************************************************************
- *                                DISCLAIMER
- * THE SOFTWARE IS SUPPLIED "AS IS" WITHOUT WARRANTY OF ANY KIND. NEITHER THE
- * UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT OF ENERGY, NOR
- * UCHICAGO ARGONNE, LLC, NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY,
- * EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE
- * ACCURARY, COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, DATA, APPARATUS,
- * PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
- * PRIVATELY OWNED RIGHTS.
- *
- * *****************************************************************************
- */
-
-// I think this should work for global {ngx, ngy, ngz}
-
-#ifndef HACC_DFFT_HPP
-#define HACC_DFFT_HPP
-#include <AMReX_Config.H>
-
-#ifdef ESSL_FFTW
-#include <fftw3_essl.h>
-#else
-#include <fftw3.h>
-#endif
-
-#include "complex-type.h"
-#include "Distribution.H"
-#include "Error.h"
-
-// DFFT_TIMING
-// 0 = no info
-// 1 = summary info
-// 2 = fine grain info
-#ifndef DFFT_TIMING
-#define DFFT_TIMING 0
-#endif
-
-#if DFFT_TIMING
-#include "TimingStats.h"
-#endif
-
-#define FFTW_ADDR(X) reinterpret_cast<fftw_complex*>(&(X)[0])
-
-namespace hacc {
-
-class Dfft {
-
-public:
-
-  //
-  int global_ng(int i) const { return d.global_ng(i); }
-  int const (& global_ng() const)[3] { return d.global_ng(); }
-  size_t global_size() const { return d.global_size(); }
-  MPI_Comm parent_comm() const { return d.parent_comm(); }
-
-
-
-  //
-  size_t local_size() const { return d.local_size(); }
-
-
-
-  // rank location in r-space
-  int self_rspace(int i) const { return d.self_3d(i);}
-  int const (& self_rspace() const)[3] { return d.self_3d();}
-
-  // number of ranks in r-space
-  int nproc_rspace(int i) const { return d.nproc_3d(i);}
-  int const (& nproc_rspace() const)[3] { return d.nproc_3d();}
-
-  // local grid dimensions in r-space
-  int local_ng_rspace(int i) const { return d.local_ng_3d(i);}
-  int const (& local_ng_rspace() const)[3] { return d.local_ng_3d();}
-
-  // 3D cartesian communicator in r-space
-  MPI_Comm cartcomm_rspace() const { return d.cart_3d(); }
-
-
-
-  // rank location in k-space
-  int self_kspace(int i) const { return d.self_2d_z(i);}
-  int const (& self_kspace() const)[3] { return d.self_2d_z();}
-
-  // number of ranks in r-space
-  int nproc_kspace(int i) const { return d.nproc_2d_z(i);}
-  int const (& nproc_kspace() const)[3] { return d.nproc_2d_z();}
-
-  // local grid dimensions in k-space
-  int local_ng_kspace(int i) const { return d.local_ng_2d_z(i);}
-  int const (& local_ng_kspace() const)[3] { return d.local_ng_2d_z();}
-
-  // 3D cartesian communicator in k-space
-  MPI_Comm cartcomm_kspace() const { return d.cart_2d_z(); }
-
-
-
-  void forward(complex_t const *in) {
-
-    if (PlansMade != true) Error() << "Dfft buffers not set";
-
-#if DFFT_TIMING
-    double start, stop;
-    double tdist=0.0, tdfft=0.0;
-    start = MPI_Wtime();
-#endif
-
-    distribution_3_to_2(&in[0], &m_fs[0], &d.m_d, 0);   // in --> fs
-
-#if DFFT_TIMING
-    stop = MPI_Wtime();
-    tdist += stop-start;
-#if DFFT_TIMING > 1
-    printTimingStats(d.parent_comm(), "DFFT  fd32", stop-start);
-#endif
-    MPI_Barrier(d.parent_comm());
-    start = MPI_Wtime();
-#endif
-
-    fftw_execute(m_plan_f_x);                           // fs --> fo
-
-#if DFFT_TIMING
-    stop = MPI_Wtime();
-    tdfft += stop-start;
-#if DFFT_TIMING > 1
-    printTimingStats(d.parent_comm(), "DFFT  f1dx", stop-start);
-#endif
-    MPI_Barrier(d.parent_comm());
-    start = MPI_Wtime();
-#endif
-
-    distribution_2_to_3(&m_fo[0], &m_fs[0], &d.m_d, 0); // fo --> fs
-
-#if DFFT_TIMING
-    stop = MPI_Wtime();
-    tdist += stop-start;
-#if DFFT_TIMING > 1
-    printTimingStats(d.parent_comm(), "DFFT  fd23", stop-start);
-#endif
-    MPI_Barrier(d.parent_comm());
-    start = MPI_Wtime();
-#endif
-
-    distribution_3_to_2(&m_fs[0], &m_fo[0], &d.m_d, 1); // fs --> fo
-
-#if DFFT_TIMING
-    stop = MPI_Wtime();
-    tdist += stop-start;
-#if DFFT_TIMING > 1
-    printTimingStats(d.parent_comm(), "DFFT  fd32", stop-start);
-#endif
-    MPI_Barrier(d.parent_comm());
-    start = MPI_Wtime();
-#endif
-
-    fftw_execute(m_plan_f_y);                           // fo --> fs
-
-#if DFFT_TIMING
-    stop = MPI_Wtime();
-    tdfft += stop-start;
-#if DFFT_TIMING > 1
-    printTimingStats(d.parent_comm(), "DFFT  f1dy", stop-start);
-#endif
-    MPI_Barrier(d.parent_comm());
-    start = MPI_Wtime();
-#endif
-
-    distribution_2_to_3(&m_fs[0], &m_fo[0], &d.m_d, 1); // fs --> fo
-
-#if DFFT_TIMING
-    stop = MPI_Wtime();
-    tdist += stop-start;
-#if DFFT_TIMING > 1
-    printTimingStats(d.parent_comm(), "DFFT  fd23", stop-start);
-#endif
-    MPI_Barrier(d.parent_comm());
-    start = MPI_Wtime();
-#endif
-
-    distribution_3_to_2(&m_fo[0], &m_fs[0], &d.m_d, 2); // fo --> fs
-
-#if DFFT_TIMING
-    stop = MPI_Wtime();
-    tdist += stop-start;
-#if DFFT_TIMING > 1
-    printTimingStats(d.parent_comm(), "DFFT  fd32", stop-start);
-#endif
-    MPI_Barrier(d.parent_comm());
-    start = MPI_Wtime();
-#endif
-
-    fftw_execute(m_plan_f_z);                           // fs --> fo
-
-#if DFFT_TIMING
-    stop = MPI_Wtime();
-    tdfft += stop-start;
-#if DFFT_TIMING > 1
-    printTimingStats(d.parent_comm(), "DFFT  f1dz", stop-start);
-#endif
-    printTimingStats(d.parent_comm(), "DFFT fdist", tdist);
-    printTimingStats(d.parent_comm(), "DFFT fdfft", tdfft);
-#endif
-
-  }
-
-
-
-  void forward(float const *in, size_t ghost0, size_t ghost1) {
-
-    if (PlansMade != true) Error() << "Dfft buffers not set";
-
-#if DFFT_TIMING
-    double start = MPI_Wtime();
-#endif
-
-    // copy from overloaded in to m_fo
-    const int *local_dim_r = local_ng_rspace();
-    int indexc = 0;
-    int indexf = ghost0*(ghost0 + local_dim_r[2] + ghost1)*(ghost0 + local_dim_r[1] + ghost1);
-    for(int local_r0=0; local_r0 < local_dim_r[0]; ++local_r0) {
-      indexf += ghost0*(ghost0 + local_dim_r[2] + ghost1);
-      for(int local_r1=0; local_r1 < local_dim_r[1]; ++local_r1) {
-        indexf += ghost0;
-        for(int local_r2=0; local_r2 < local_dim_r[2]; ++local_r2) {
-          m_fo[indexc] = in[indexf];
-          indexf++;
-          indexc++;
-        } // 2
-        indexf += ghost1;
-      } // 1
-      indexf += ghost1*(ghost0 + local_dim_r[2] + ghost1);
-    } // 0
-
-#if DFFT_TIMING
-    double stop = MPI_Wtime();
-    printTimingStats(d.parent_comm(), "DFFT fcopy", stop-start);
-#endif
-
-    // forward FFT from m_fo into m_fo
-    forward(&m_fo[0]);
-  }
-
-
-
-  void forward(float const *in, size_t ghost) {
-    return forward(in, ghost, ghost+1);
-  }
-
-
-
-  void backward(complex_t *out) {
-
-    if (PlansMade != true) Error() << "Dfft buffers not set";
-
-#if DFFT_TIMING
-    double start, stop;
-    double tdist=0.0, tdfft=0.0;
-    start = MPI_Wtime();
-#endif
-
-    fftw_execute(m_plan_b_z);                           // bi --> bs
-
-#if DFFT_TIMING
-    stop = MPI_Wtime();
-    tdfft += stop-start;
-#if DFFT_TIMING > 1
-    printTimingStats(d.parent_comm(), "DFFT  b1dz", stop-start);
-#endif
-    MPI_Barrier(d.parent_comm());
-    start = MPI_Wtime();
-#endif
-
-    distribution_2_to_3(&m_bs[0], &m_bi[0], &d.m_d, 2); // bs --> bi
-
-#if DFFT_TIMING
-    stop = MPI_Wtime();
-    tdist += stop-start;
-#if DFFT_TIMING > 1
-    printTimingStats(d.parent_comm(), "DFFT  bd23", stop-start);
-#endif
-    MPI_Barrier(d.parent_comm());
-    start = MPI_Wtime();
-#endif
-
-    distribution_3_to_2(&m_bi[0], &m_bs[0], &d.m_d, 1); // bi --> bs
-
-#if DFFT_TIMING
-    stop = MPI_Wtime();
-    tdist += stop-start;
-#if DFFT_TIMING > 1
-    printTimingStats(d.parent_comm(), "DFFT  bd32", stop-start);
-#endif
-    MPI_Barrier(d.parent_comm());
-    start = MPI_Wtime();
-#endif
-
-    fftw_execute(m_plan_b_y);                           // bs --> bi
-
-#if DFFT_TIMING
-    stop = MPI_Wtime();
-    tdfft += stop-start;
-#if DFFT_TIMING > 1
-    printTimingStats(d.parent_comm(), "DFFT  b1dy", stop-start);
-#endif
-    MPI_Barrier(d.parent_comm());
-    start = MPI_Wtime();
-#endif
-
-    distribution_2_to_3(&m_bi[0], &m_bs[0], &d.m_d, 1); // bi --> bs
-
-#if DFFT_TIMING
-    stop = MPI_Wtime();
-    tdist += stop-start;
-#if DFFT_TIMING > 1
-    printTimingStats(d.parent_comm(), "DFFT  bd23", stop-start);
-#endif
-    MPI_Barrier(d.parent_comm());
-    start = MPI_Wtime();
-#endif
-
-    distribution_3_to_2(&m_bs[0], &m_bi[0], &d.m_d, 0); // bs --> bi
-
-#if DFFT_TIMING
-    stop = MPI_Wtime();
-    tdist += stop-start;
-#if DFFT_TIMING > 1
-    printTimingStats(d.parent_comm(), "DFFT  bd32", stop-start);
-#endif
-    MPI_Barrier(d.parent_comm());
-    start = MPI_Wtime();
-#endif
-
-    fftw_execute(m_plan_b_x);                           // bi --> bs
-
-#if DFFT_TIMING
-    stop = MPI_Wtime();
-    tdfft += stop-start;
-#if DFFT_TIMING > 1
-    printTimingStats(d.parent_comm(), "DFFT  b1dx", stop-start);
-#endif
-    MPI_Barrier(d.parent_comm());
-    start = MPI_Wtime();
-#endif
-
-    distribution_2_to_3(&m_bs[0], &out[0], &d.m_d, 0);  // bs --> out
-
-#if DFFT_TIMING
-    stop = MPI_Wtime();
-    tdist += stop-start;
-#if DFFT_TIMING > 1
-    printTimingStats(d.parent_comm(), "DFFT  bd23", stop-start);
-#endif
-    printTimingStats(d.parent_comm(), "DFFT bdist", tdist);
-    printTimingStats(d.parent_comm(), "DFFT bdfft", tdfft);
-#endif
-
-  }
-
-
-
-  void backward(float *out, size_t ghost0, size_t ghost1) {
-
-    // backward FFT from m_bi into m_bi
-    backward(&m_bi[0]);
-
-#if DFFT_TIMING
-    double start = MPI_Wtime();
-#endif
-
-    // copy from m_bi to overloaded out
-    const int *local_dim_r = local_ng_rspace();
-    int indexc = 0;
-    int indexf = ghost0*(ghost0 + local_dim_r[2] + ghost1)*(ghost0 + local_dim_r[1] + ghost1);
-    for(int local_r0=0; local_r0 < local_dim_r[0]; ++local_r0) {
-      indexf += ghost0*(ghost0 + local_dim_r[2] + ghost1);
-      for(int local_r1=0; local_r1 < local_dim_r[1]; ++local_r1) {
-        indexf += ghost0;
-        for(int local_r2=0; local_r2 < local_dim_r[2]; ++local_r2) {
-          out[indexf] = std::real(m_bi[indexc]);
-          indexf++;
-          indexc++;
-        } // 2
-        indexf += ghost1;
-      } // 1
-      indexf += ghost1*(ghost0 + local_dim_r[2] + ghost1);
-    } // 0
-
-#if DFFT_TIMING
-    double stop = MPI_Wtime();
-    printTimingStats(d.parent_comm(), "DFFT bcopy", stop-start);
-#endif
-
-  }
-
-
-
-  void backward(float *out, size_t ghost) {
-    return backward(out, ghost, ghost+1);
-  }
-
-
-
-  Dfft(Distribution &dist)
-    : d(dist), PlansMade(false)
-  {}
-
-
-
-  // arrays given at class initialization are hard-wired to the fft plans
-  Dfft(Distribution &dist,
-       complex_t *forward_output,
-       complex_t *forward_scratch,
-       complex_t *backward_input,
-       complex_t *backward_scratch,
-       unsigned int flags = FFTW_MEASURE)
-    : d(dist), PlansMade(false)
-  {
-    makePlans(forward_output,
-              forward_scratch,
-              backward_input,
-              backward_scratch,
-              flags);
-  }
-
-
-
-  // array rules:
-  // forward/backward method calls over-write these arrays
-  // forward method has separate input array argument
-  // backward method has separate output array argument
-  // forward_scratch != forward_output && backward_scratch != backward_input
-  // no required relationship between forward and backward arrays
-
-  // actually, can these transforms be done in-place?
-  // for now leave as-is, but test later
-
-  // FFTW_MEASURE = 0
-  void makePlans(complex_t *forward_output,
-                 complex_t *forward_scratch,
-                 complex_t *backward_input,
-                 complex_t *backward_scratch,
-                 unsigned int flags = FFTW_MEASURE)
-  {
-    if(forward_output == forward_scratch)
-      Error() << "Dfft::setBuffers() forward_output == forward_scratch";
-    if(backward_input == backward_scratch)
-      Error() << "Dfft::setBuffers() backward_input == backward_scratch";
-
-    m_fo = forward_output;
-    m_fs = forward_scratch;
-    m_bi = backward_input;
-    m_bs = backward_scratch;
-
-#if DFFT_TIMING
-    double start = MPI_Wtime();
-#endif
-
-    // fs --> fo
-    m_plan_f_x = fftw_plan_many_dft(1, // rank
-                                    &(d.m_d.process_topology_2_x.n[0]), // const int *n,
-                                    d.m_d.process_topology_2_x.n[1] * d.m_d.process_topology_2_x.n[2], // howmany
-                                    FFTW_ADDR(m_fs),
-                                    NULL, // const int *inembed,
-                                    1, // int istride,
-                                    d.m_d.process_topology_2_x.n[0], // int idist,
-                                    FFTW_ADDR(m_fo),
-                                    NULL, // const int *onembed,
-                                    1, // int ostride,
-                                    d.m_d.process_topology_2_x.n[0], // int odist,
-                                    FFTW_FORWARD, // int sign,
-                                    flags); // unsigned flags
-
-    // fo --> fs
-    m_plan_f_y = fftw_plan_many_dft(1, // rank
-                                    &(d.m_d.process_topology_2_y.n[1]), // const int *n,
-                                    d.m_d.process_topology_2_y.n[0] * d.m_d.process_topology_2_y.n[2], // howmany
-                                    FFTW_ADDR(m_fo),
-                                    NULL, // const int *inembed,
-                                    1, // int istride,
-                                    d.m_d.process_topology_2_y.n[1], // int idist,
-                                    FFTW_ADDR(m_fs),
-                                    NULL, // const int *onembed,
-                                    1, // int ostride,
-                                    d.m_d.process_topology_2_y.n[1], // int odist,
-                                    FFTW_FORWARD, // int sign,
-                                    flags); // unsigned flags
-
-    // fs --> fo
-    m_plan_f_z = fftw_plan_many_dft(1, // rank
-                                    &(d.m_d.process_topology_2_z.n[2]), // const int *n,
-                                    d.m_d.process_topology_2_z.n[1] * d.m_d.process_topology_2_z.n[0], // howmany
-                                    FFTW_ADDR(m_fs),
-                                    NULL, // const int *inembed,
-                                    1, // int istride,
-                                    d.m_d.process_topology_2_z.n[2], // int idist,
-                                    FFTW_ADDR(m_fo),
-                                    NULL, // const int *onembed,
-                                    1, // int ostride,
-                                    d.m_d.process_topology_2_z.n[2], // int odist,
-                                    FFTW_FORWARD, // int sign,
-                                    flags); // unsigned flags
-
-    // bi --> bs
-    m_plan_b_z = fftw_plan_many_dft(1, // rank
-                                    &(d.m_d.process_topology_2_z.n[2]), // const int *n,
-                                    d.m_d.process_topology_2_z.n[1] * d.m_d.process_topology_2_z.n[0], // howmany
-                                    FFTW_ADDR(m_bi),
-                                    NULL, // const int *inembed,
-                                    1, // int istride,
-                                    d.m_d.process_topology_2_z.n[2], // int idist,
-                                    FFTW_ADDR(m_bs),
-                                    NULL, // const int *onembed,
-                                    1, // int ostride,
-                                    d.m_d.process_topology_2_z.n[2], // int odist,
-                                    FFTW_BACKWARD, // int sign,
-                                    flags); // unsigned flags
-
-    // bs --> bi
-    m_plan_b_y = fftw_plan_many_dft(1, // rank
-                                    &(d.m_d.process_topology_2_y.n[1]), // const int *n,
-                                    d.m_d.process_topology_2_y.n[0] * d.m_d.process_topology_2_y.n[2], // howmany
-                                    FFTW_ADDR(m_bs),
-                                    NULL, // const int *inembed,
-                                    1, // int istride,
-                                    d.m_d.process_topology_2_y.n[1], // int idist,
-                                    FFTW_ADDR(m_bi),
-                                    NULL, // const int *onembed,
-                                    1, // int ostride,
-                                    d.m_d.process_topology_2_y.n[1], // int odist,
-                                    FFTW_BACKWARD, // int sign,
-                                    flags); // unsigned flags
-
-    // bi --> bs
-    m_plan_b_x = fftw_plan_many_dft(1, // rank
-                                    &(d.m_d.process_topology_2_x.n[0]), // const int *n,
-                                    d.m_d.process_topology_2_x.n[1] * d.m_d.process_topology_2_x.n[2], // howmany
-                                    FFTW_ADDR(m_bi),
-                                    NULL, // const int *inembed,
-                                    1, // int istride,
-                                    d.m_d.process_topology_2_x.n[0], // int idist,
-                                    FFTW_ADDR(m_bs),
-                                    NULL, // const int *onembed,
-                                    1, // int ostride,
-                                    d.m_d.process_topology_2_x.n[0], // int odist,
-                                    FFTW_BACKWARD, // int sign,
-                                    flags); // unsigned flags
-
-#if DFFT_TIMING
-    double stop = MPI_Wtime();
-    printTimingStats(d.parent_comm(), "DFFT  init", stop-start);
-#endif
-
-    PlansMade = true;
-  }
-
-
-
-  ~Dfft() {
-    if(PlansMade == true) {
-      fftw_destroy_plan(m_plan_f_x);
-      fftw_destroy_plan(m_plan_f_y);
-      fftw_destroy_plan(m_plan_f_z);
-      fftw_destroy_plan(m_plan_b_z);
-      fftw_destroy_plan(m_plan_b_y);
-      fftw_destroy_plan(m_plan_b_x);
-    }
-  }
-
-
-
-  Distribution & get_d() {return d;}
-
-protected:
-
-  Distribution &d;
-  bool PlansMade;
-  complex_t *m_fo;
-  complex_t *m_fs;
-  complex_t *m_bi;
-  complex_t *m_bs;
-  fftw_plan m_plan_f_x;
-  fftw_plan m_plan_f_y;
-  fftw_plan m_plan_f_z;
-  fftw_plan m_plan_b_z;
-  fftw_plan m_plan_b_y;
-  fftw_plan m_plan_b_x;
-};
-
-} // namespace hacc
-
-#endif // HACC_DFFT_HPP
diff --git a/Src/Extern/SWFFT/DfftC.cpp b/Src/Extern/SWFFT/DfftC.cpp
deleted file mode 100644
index 6576003e84..0000000000
--- a/Src/Extern/SWFFT/DfftC.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- *                 Copyright (C) 2017, UChicago Argonne, LLC
- *                            All Rights Reserved
- *
- *           Hardware/Hybrid Cosmology Code (HACC), Version 1.0
- *
- * Salman Habib, Adrian Pope, Hal Finkel, Nicholas Frontiere, Katrin Heitmann,
- *      Vitali Morozov, Jeffrey Emberson, Thomas Uram, Esteban Rangel
- *                        (Argonne National Laboratory)
- *
- *  David Daniel, Patricia Fasel, Chung-Hsing Hsu, Zarija Lukic, James Ahrens
- *                      (Los Alamos National Laboratory)
- *
- *                               George Zagaris
- *                                 (Kitware)
- *
- *                            OPEN SOURCE LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *   1. Redistributions of source code must retain the above copyright notice,
- *      this list of conditions and the following disclaimer. Software changes,
- *      modifications, or derivative works, should be noted with comments and
- *      the author and organization's name.
- *
- *   2. Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *
- *   3. Neither the names of UChicago Argonne, LLC or the Department of Energy
- *      nor the names of its contributors may be used to endorse or promote
- *      products derived from this software without specific prior written
- *      permission.
- *
- *   4. The software and the end-user documentation included with the
- *      redistribution, if any, must include the following acknowledgment:
- *
- *     "This product includes software produced by UChicago Argonne, LLC under
- *      Contract No. DE-AC02-06CH11357 with the Department of Energy."
- *
- * *****************************************************************************
- *                                DISCLAIMER
- * THE SOFTWARE IS SUPPLIED "AS IS" WITHOUT WARRANTY OF ANY KIND. NEITHER THE
- * UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT OF ENERGY, NOR
- * UCHICAGO ARGONNE, LLC, NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY,
- * EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE
- * ACCURARY, COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, DATA, APPARATUS,
- * PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
- * PRIVATELY OWNED RIGHTS.
- *
- * *****************************************************************************
- */
-
-///
-// Give C linkage to C++ Dfft class so that Fortran can access its functions.
-///
-
-#include "complex-type.h"
-#include "Distribution.hpp"
-#include "Dfft.hpp"
-
-extern "C" {
-
-  hacc::Dfft* Dfft__new(hacc::Distribution &dist) {
-    return new hacc::Dfft(dist);
-  }
-
-  void Dfft__makePlans(hacc::Dfft* This, complex_t *forward_output, complex_t *forward_scratch,
-                      complex_t *backward_input, complex_t *backward_scratch, unsigned int flags) {
-    This->makePlans(forward_output, forward_scratch, backward_input, backward_scratch, flags);
-  }
-
-  void Dfft__forward(hacc::Dfft* This, complex_t const *in) {
-    This->forward(in);
-  }
-
-  void Dfft__backward(hacc::Dfft* This, complex_t *out) {
-    This->backward(out);
-  }
-
-  size_t Dfft__global_size(hacc::Dfft* This) {
-    return This->global_size();
-  }
-
-  size_t Dfft__local_size(hacc::Dfft* This) {
-    return This->local_size();
-  }
-
-  void Dfft__global_ng(hacc::Dfft* This, int n[3]) {
-    for(size_t i = 0; i < 3; ++i) n[i] = This->global_ng(i);
-  }
-
-  void Dfft__self_rspace(hacc::Dfft* This, int n[3]) {
-    for(size_t i = 0; i < 3; ++i) n[i] = This->self_rspace(i);
-  }
-
-  void Dfft__nproc_rspace(hacc::Dfft* This, int n[3]) {
-    for(size_t i = 0; i < 3; ++i) n[i] = This->nproc_rspace(i);
-  }
-
-  void Dfft__local_ng_rspace(hacc::Dfft* This, int n[3]) {
-    for(size_t i = 0; i < 3; ++i) n[i] = This->local_ng_rspace(i);
-  }
-
-  void Dfft__self_kspace(hacc::Dfft* This, int n[3]) {
-    for(size_t i = 0; i < 3; ++i) n[i] = This->self_kspace(i);
-  }
-
-  void Dfft__nproc_kspace(hacc::Dfft* This, int n[3]) {
-    for(size_t i = 0; i < 3; ++i) n[i] = This->nproc_kspace(i);
-  }
-
-  void Dfft__local_ng_kspace(hacc::Dfft* This, int n[3]) {
-    for(size_t i = 0; i < 3; ++i) n[i] = This->local_ng_kspace(i);
-  }
-
-  MPI_Fint Dfft__parent_comm(hacc::Dfft* This) {
-    MPI_Comm comm = This->parent_comm();
-    return MPI_Comm_c2f(comm);
-  }
-
-  MPI_Fint Dfft__cartcomm_rspace(hacc::Dfft* This) {
-    MPI_Comm comm = This->cartcomm_rspace();
-    return MPI_Comm_c2f(comm);
-  }
-
-  MPI_Fint Dfft__cartcomm_kspace(hacc::Dfft* This) {
-    MPI_Comm comm = This->cartcomm_kspace();
-    return MPI_Comm_c2f(comm);
-  }
-
-  void Dfft__delete(hacc::Dfft* This) {
-    delete This;
-  }
-
-}
-
diff --git a/Src/Extern/SWFFT/Distribution.H b/Src/Extern/SWFFT/Distribution.H
deleted file mode 100644
index 7048f11b2d..0000000000
--- a/Src/Extern/SWFFT/Distribution.H
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- *                 Copyright (C) 2017, UChicago Argonne, LLC
- *                            All Rights Reserved
- *
- *           Hardware/Hybrid Cosmology Code (HACC), Version 1.0
- *
- * Salman Habib, Adrian Pope, Hal Finkel, Nicholas Frontiere, Katrin Heitmann,
- *      Vitali Morozov, Jeffrey Emberson, Thomas Uram, Esteban Rangel
- *                        (Argonne National Laboratory)
- *
- *  David Daniel, Patricia Fasel, Chung-Hsing Hsu, Zarija Lukic, James Ahrens
- *                      (Los Alamos National Laboratory)
- *
- *                               George Zagaris
- *                                 (Kitware)
- *
- *                            OPEN SOURCE LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *   1. Redistributions of source code must retain the above copyright notice,
- *      this list of conditions and the following disclaimer. Software changes,
- *      modifications, or derivative works, should be noted with comments and
- *      the author and organization's name.
- *
- *   2. Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *
- *   3. Neither the names of UChicago Argonne, LLC or the Department of Energy
- *      nor the names of its contributors may be used to endorse or promote
- *      products derived from this software without specific prior written
- *      permission.
- *
- *   4. The software and the end-user documentation included with the
- *      redistribution, if any, must include the following acknowledgment:
- *
- *     "This product includes software produced by UChicago Argonne, LLC under
- *      Contract No. DE-AC02-06CH11357 with the Department of Energy."
- *
- * *****************************************************************************
- *                                DISCLAIMER
- * THE SOFTWARE IS SUPPLIED "AS IS" WITHOUT WARRANTY OF ANY KIND. NEITHER THE
- * UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT OF ENERGY, NOR
- * UCHICAGO ARGONNE, LLC, NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY,
- * EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE
- * ACCURARY, COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, DATA, APPARATUS,
- * PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
- * PRIVATELY OWNED RIGHTS.
- *
- * *****************************************************************************
- */
-
-// I think this should work for global {ngx, ngy, ngz}
-
-#ifndef HACC_DISTRIBUTION_HPP
-#define HACC_DISTRIBUTION_HPP
-
-#include <vector>
-
-///
-// Distribution / partition / decomposition of data
-//
-// A C++ wrapper around distribution.h
-///
-
-#include "complex-type.h"
-#include "distribution_c.h"
-
-namespace hacc {
-
-class Distribution {
-
-public:
-
-
-  //
-  // constructors
-  //
-
-  // standard setup
-
-  Distribution(MPI_Comm comm,
-               int const n[],
-               bool debug = false)
-    : m_comm(comm), m_rmap(NULL), m_debug(debug)
-  {
-    int Ndims[3] = { 0, 0, 0 };
-    initialize(comm, n, Ndims);
-  }
-
-  Distribution(MPI_Comm comm,
-               int ng,
-               bool debug = false)
-    : m_comm(comm), m_rmap(NULL), m_debug(debug)
-  {
-    int n[3] = { ng, ng, ng };
-    int Ndims[3] = { 0, 0, 0 };
-    initialize(comm, n, Ndims);
-  }
-
-  // custom setup with 3d decomposition and grid to rank map
-
-  Distribution(MPI_Comm comm,
-           int const n[],
-           int const Ndims[],
-           int *rmap,
-           bool debug = false)
-    : m_comm(comm), m_rmap(rmap), m_debug(debug)
-  {
-    initialize(comm, n, Ndims);
-  }
-
-  Distribution(MPI_Comm comm,
-           int ng,
-           int const Ndims[],
-           int *rmap,
-           bool debug = false)
-    : m_comm(comm), m_rmap(rmap), m_debug(debug)
-  {
-    int n[3] = { ng, ng, ng };
-    initialize(comm, n, Ndims);
-  }
-
-  //
-  // destructor
-  //
-
-  virtual ~Distribution()
-  {
-    distribution_fini(&m_d);
-  }
-
-
-  // initialization
-
-  void initialize(MPI_Comm comm, int const n[], int const Ndims[]) {
-    int flag;
-    MPI_Initialized(&flag);
-    if (flag == 0) {
-      MPI_Init(0, 0);
-    }
-
-    distribution_init(comm, n, Ndims, &m_d, m_rmap, m_debug);
-  }
-
-
-  // redistribution
-
-  void redistribute_1_to_3(const complex_t *a, complex_t *b) {
-    distribution_1_to_3(a, b, &m_d);
-  }
-
-  void redistribute_3_to_1(const complex_t *a, complex_t *b) {
-    distribution_3_to_1(a, b, &m_d);
-  }
-
-  void redistribute_2_to_3(const complex_t *a, complex_t *b, int axis) {
-    distribution_2_to_3(a, b, &m_d, axis);
-  }
-
-  void redistribute_3_to_2(const complex_t *a, complex_t *b, int axis) {
-    distribution_3_to_2(a, b, &m_d, axis);
-  }
-
-
-  // grid sizes
-
-  size_t local_size() const {
-    size_t size = 1;
-    for (int i = 0; i < 3; ++i) {
-      size *= (m_d.n[i] / m_d.process_topology_3.nproc[i]);
-    }
-    return size;
-  }
-
-  size_t global_size() const {
-    size_t size = 1;
-    for (int i = 0; i < 3; ++i) {
-      size *= m_d.n[i];
-    }
-    return size;
-  }
-
-  int global_ng(int i) const { return m_d.n[i];}
-  int local_ng_1d(int i) const { return m_d.process_topology_1.n[i];}
-  int local_ng_2d_x(int i) const { return m_d.process_topology_2_x.n[i];}
-  int local_ng_2d_y(int i) const { return m_d.process_topology_2_y.n[i];}
-  int local_ng_2d_z(int i) const { return m_d.process_topology_2_z.n[i];}
-  int local_ng_3d(int i) const { return m_d.process_topology_3.n[i];}
-
-  int const (& global_ng() const)[3] { return m_d.n;}
-  int const (& local_ng_1d() const)[3] { return m_d.process_topology_1.n;}
-  int const (& local_ng_2d_x() const)[3] { return m_d.process_topology_2_x.n;}
-  int const (& local_ng_2d_y() const)[3] { return m_d.process_topology_2_y.n;}
-  int const (& local_ng_2d_z() const)[3] { return m_d.process_topology_2_z.n;}
-  int const (& local_ng_3d() const)[3] { return m_d.process_topology_3.n;}
-
-
-  // numbers of ranks
-
-  int nproc() const { return m_d.process_topology_1.nproc[0];}
-
-  int nproc_1d(int i) const { return m_d.process_topology_1.nproc[i];}
-  int nproc_2d_x(int i) const { return m_d.process_topology_2_x.nproc[i];}
-  int nproc_2d_y(int i) const { return m_d.process_topology_2_y.nproc[i];}
-  int nproc_2d_z(int i) const { return m_d.process_topology_2_z.nproc[i];}
-  int nproc_3d(int i) const { return m_d.process_topology_3.nproc[i];}
-
-  int const (& nproc_1d() const)[3] { return m_d.process_topology_1.nproc;}
-  int const (& nproc_2d_x() const)[3] { return m_d.process_topology_2_x.nproc;}
-  int const (& nproc_2d_y() const)[3] { return m_d.process_topology_2_y.nproc;}
-  int const (& nproc_2d_z() const)[3] { return m_d.process_topology_2_z.nproc;}
-  int const (& nproc_3d() const)[3] { return m_d.process_topology_3.nproc;}
-
-
-  // rank location
-
-  int self() const { return m_d.process_topology_1.self[0];}
-
-  int self_1d(int i) const { return m_d.process_topology_1.self[i];}
-  int self_2d_x(int i) const { return m_d.process_topology_2_x.self[i];}
-  int self_2d_y(int i) const { return m_d.process_topology_2_y.self[i];}
-  int self_2d_z(int i) const { return m_d.process_topology_2_z.self[i];}
-  int self_3d(int i) const { return m_d.process_topology_3.self[i];}
-
-  int const (& self_1d() const)[3] { return m_d.process_topology_1.self;}
-  int const (& self_2d_x() const)[3] { return m_d.process_topology_2_x.self;}
-  int const (& self_2d_y() const)[3] { return m_d.process_topology_2_y.self;}
-  int const (& self_2d_z() const)[3] { return m_d.process_topology_2_z.self;}
-  int const (& self_3d() const)[3] { return m_d.process_topology_3.self;}
-
-
-  // communicators
-
-  MPI_Comm cart_1d() const { return m_d.process_topology_1.cart;}
-  MPI_Comm cart_2d_x() const { return m_d.process_topology_2_x.cart;}
-  MPI_Comm cart_2d_y() const { return m_d.process_topology_2_y.cart;}
-  MPI_Comm cart_2d_z() const { return m_d.process_topology_2_z.cart;}
-  MPI_Comm cart_3d() const { return m_d.process_topology_3.cart;}
-
-  MPI_Comm parent_comm() const { return m_comm;}
-
-
-  //
-
-  int rank_2d_x(int c[]) {
-    int r;
-    Rank_x_pencils(&r, c, &m_d);
-    return r;
-  }
-
-  int rank_2d_y(int c[]) {
-    int r;
-    Rank_y_pencils(&r, c, &m_d);
-    return r;
-  }
-
-  int rank_2d_z(int c[]) {
-    int r;
-    Rank_z_pencils(&r, c, &m_d);
-    return r;
-  }
-
-
-  //
-
-  void coords_2d_x(int r, int c[]) { Coord_x_pencils(r, c, &m_d);}
-  void coords_2d_y(int r, int c[]) { Coord_y_pencils(r, c, &m_d);}
-  void coords_2d_z(int r, int c[]) { Coord_z_pencils(r, c, &m_d);}
-
-
-public:
-  // This is public for now until we refactor the Solver* classes
-  // to use the C++ interface.
-  distribution_t m_d;
-
-protected:
-  MPI_Comm m_comm;
-  int* m_rmap;
-  bool m_debug;
-};
-
-} // namespace hacc
-#endif // HACC_DISTRIBUTION_HPP
diff --git a/Src/Extern/SWFFT/DistributionC.cpp b/Src/Extern/SWFFT/DistributionC.cpp
deleted file mode 100644
index e768f08f0e..0000000000
--- a/Src/Extern/SWFFT/DistributionC.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- *                 Copyright (C) 2017, UChicago Argonne, LLC
- *                            All Rights Reserved
- *
- *           Hardware/Hybrid Cosmology Code (HACC), Version 1.0
- *
- * Salman Habib, Adrian Pope, Hal Finkel, Nicholas Frontiere, Katrin Heitmann,
- *      Vitali Morozov, Jeffrey Emberson, Thomas Uram, Esteban Rangel
- *                        (Argonne National Laboratory)
- *
- *  David Daniel, Patricia Fasel, Chung-Hsing Hsu, Zarija Lukic, James Ahrens
- *                      (Los Alamos National Laboratory)
- *
- *                               George Zagaris
- *                                 (Kitware)
- *
- *                            OPEN SOURCE LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *   1. Redistributions of source code must retain the above copyright notice,
- *      this list of conditions and the following disclaimer. Software changes,
- *      modifications, or derivative works, should be noted with comments and
- *      the author and organization's name.
- *
- *   2. Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *
- *   3. Neither the names of UChicago Argonne, LLC or the Department of Energy
- *      nor the names of its contributors may be used to endorse or promote
- *      products derived from this software without specific prior written
- *      permission.
- *
- *   4. The software and the end-user documentation included with the
- *      redistribution, if any, must include the following acknowledgment:
- *
- *     "This product includes software produced by UChicago Argonne, LLC under
- *      Contract No. DE-AC02-06CH11357 with the Department of Energy."
- *
- * *****************************************************************************
- *                                DISCLAIMER
- * THE SOFTWARE IS SUPPLIED "AS IS" WITHOUT WARRANTY OF ANY KIND. NEITHER THE
- * UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT OF ENERGY, NOR
- * UCHICAGO ARGONNE, LLC, NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY,
- * EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE
- * ACCURARY, COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, DATA, APPARATUS,
- * PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
- * PRIVATELY OWNED RIGHTS.
- *
- * *****************************************************************************
- */
-
-///
-// Distribution / partition / decomposition of data
-//
-// Give C linkage to C++ Distribution class so that Fortran can access its functions.
-///
-
-#include "Distribution.hpp"
-
-extern "C" {
-
-  hacc::Distribution* Distribution__new(MPI_Fint *fcomm, int const n[], bool debug) {
-    MPI_Comm comm = MPI_Comm_f2c(*fcomm);
-    return new hacc::Distribution(comm, n, debug);
-  }
-
-  MPI_Fint Distribution__Cart_3D(hacc::Distribution* This) {
-    MPI_Comm comm = This->cart_3d();
-    return MPI_Comm_c2f(comm);
-  }
-
-  void Distribution__delete(hacc::Distribution* This) {
-    delete This;
-  }
-
-}
-
diff --git a/Src/Extern/SWFFT/Error.h b/Src/Extern/SWFFT/Error.h
deleted file mode 100644
index e0607f7e24..0000000000
--- a/Src/Extern/SWFFT/Error.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- *                 Copyright (C) 2017, UChicago Argonne, LLC
- *                            All Rights Reserved
- *
- *           Hardware/Hybrid Cosmology Code (HACC), Version 1.0
- *
- * Salman Habib, Adrian Pope, Hal Finkel, Nicholas Frontiere, Katrin Heitmann,
- *      Vitali Morozov, Jeffrey Emberson, Thomas Uram, Esteban Rangel
- *                        (Argonne National Laboratory)
- *
- *  David Daniel, Patricia Fasel, Chung-Hsing Hsu, Zarija Lukic, James Ahrens
- *                      (Los Alamos National Laboratory)
- *
- *                               George Zagaris
- *                                 (Kitware)
- *
- *                            OPEN SOURCE LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *   1. Redistributions of source code must retain the above copyright notice,
- *      this list of conditions and the following disclaimer. Software changes,
- *      modifications, or derivative works, should be noted with comments and
- *      the author and organization's name.
- *
- *   2. Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *
- *   3. Neither the names of UChicago Argonne, LLC or the Department of Energy
- *      nor the names of its contributors may be used to endorse or promote
- *      products derived from this software without specific prior written
- *      permission.
- *
- *   4. The software and the end-user documentation included with the
- *      redistribution, if any, must include the following acknowledgment:
- *
- *     "This product includes software produced by UChicago Argonne, LLC under
- *      Contract No. DE-AC02-06CH11357 with the Department of Energy."
- *
- * *****************************************************************************
- *                                DISCLAIMER
- * THE SOFTWARE IS SUPPLIED "AS IS" WITHOUT WARRANTY OF ANY KIND. NEITHER THE
- * UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT OF ENERGY, NOR
- * UCHICAGO ARGONNE, LLC, NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY,
- * EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE
- * ACCURARY, COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, DATA, APPARATUS,
- * PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
- * PRIVATELY OWNED RIGHTS.
- *
- * *****************************************************************************
- */
-
-#ifndef HACC_ERROR_H
-#define HACC_ERROR_H
-
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <string>
-
-#include <errno.h>
-#include <string.h>
-
-namespace hacc {
-// Use this class to generate a fatal error, like this:
-// Error() << "Something bad happened: " << 5 << " is not " << 6;
-// which will format a string and throw a std::runtime_error.
-class Error {
-public:
-  Error(bool UseErrno = false) : SysError(0) {
-    if (UseErrno)
-      SysError = strerror(errno);
-  }
-
-  ~Error() {
-    if (SysError)
-      SS << ": " << SysError;
-
-    throw std::runtime_error(SS.str());
-  }
-
-  template <typename T>
-  Error &operator << (const T &Value) {
-    SS << Value;
-    return *this;
-  }
-
-protected:
-  std::stringstream SS;
-  const char *SysError;
-};
-} // namespace hacc
-
-#endif // HACC_ERROR_H
-
diff --git a/Src/Extern/SWFFT/Make.package b/Src/Extern/SWFFT/Make.package
deleted file mode 100644
index e9e16d44e7..0000000000
--- a/Src/Extern/SWFFT/Make.package
+++ /dev/null
@@ -1,11 +0,0 @@
-cEXE_headers +=  AlignedAllocator.h
-cEXE_headers +=  complex-type.h
-cEXE_headers +=  distribution_c.h
-cEXE_headers +=  TimingStats.h
-cEXE_headers +=  Error.h
-CEXE_headers += Distribution.H
-CEXE_headers += Dfft.H
-cEXE_sources += distribution.c
-
-cEXE_headers += verbosity.h
-CEXE_sources += verbosity.cpp
diff --git a/Src/Extern/SWFFT/README b/Src/Extern/SWFFT/README
deleted file mode 100644
index fe34f57d58..0000000000
--- a/Src/Extern/SWFFT/README
+++ /dev/null
@@ -1,125 +0,0 @@
-SWFFT (HACC)
-Adrian Pope (et al)
-apope@anl.gov
-2017-10-25
-
-========
-IMPORTANT
-========
-
-This code is available under BSD license from https://xgitlab.cels.anl.gov/hacc/SWFFT
-
-========
-Overview
-========
-
-This directory contains the source code to be called by the code in 
-amrex/Tutorials/SWFFT in order to run SWFFT, a 3D distributed 
-memory discrete fast Fourier transform.
-There is also a utility that checks grid sizes and 
-MPI rank layouts (CheckDecomposition).
-
-This code assumes that global grid will originally be distributed between
-MPI ranks using a 3D Cartesian communicator. That data needs to be
-re-distributed to three 2D pencil distributions in turn in order to compute
-the DFFTs along each dimension.
-
-Functionally, a Distribution object is instantiated based on a parent
-MPI_Comm, and that Distribution instance will create and track the Cartesian
-communicators for the initial 3D distribution and the three 2D pencil
-distributions. A Dfft object is then instantiated based on the Distribution
-object in order to coordinate the operations to actually execute the
-3D distributed memory DFFT. The Dfft instance also has convenience methods
-to access the communicators and geometric information for the MPI distribution
-in "real space" (initial 3D distribution) and "k space" (2D pencils in z).
-
-This code does not work for arbitrary grid sizes and number of MPI ranks.
-The specific constraints are difficult to enumerate in a compact way, but a
-rule of thumb is that it generally works when the number of vertices along
-one side of the global 3D grid ("ng")can be factored into small primes, and
-when the number of MPI ranks can also be factored into small primes.
-I believe that all of the unique prime factors of the number of MPI ranks
-must be present in the set of prime factors of the grid, eg. if you have
-20 MPI ranks then ng must be a multiple of 5 and 2. The "CheckDecomposition"
-utility is provided to check (on one rank) whether a proposed grid size and
-number of MPI ranks will work, which can be done before submitting a large
-test with TestDfft/TestFDfft.
-
-========
-Building
-========
-
--------------------
-System Requirements
--------------------
-
-MPI (known to work for 2.2 and newer, may work with some older versions)
-FFTW3 (double precision, OpenMP optional, does not use FFTW3's MPI interface)
-
-============================
-CheckDecomposition (Utility)
-============================
-
------
-Usage
------
-
-Though CheckDecomposition is built with MPI it is intended to run serially
-with the proposed number of MPI ranks as a command line argument, eg.
-
-$ ./CheckDecomposition <ngx> <ngy> <ngz> <Nproc> [nx ny nz]
-
-where <ngx>, <ngy>, and <ngz> are the number of vertices along each side
-of the global grid and <Nproc> is the number of MPI ranks. The user may
-optionally additionally supply the number of MPI ranks in each dimension
-for the 3D communicator as [nx ny nz], and though this option is not currently
-available in this version of TestDfft and the underlying Distribution code,
-it would be fairly easy to re-activate that functionality.
-
---------------
-Example Output
---------------
-
-Check whether a 10240^3 grid will work on 32768 MPI ranks:
-
-$ ./CheckDecomposition 10240 10240 10240 32768
-distribution 1D: [32768:1:1]
-distribution 3D: [32:32:32]
-  2d_z: 256, 128, 1.
-distribution 2z: [256:128:1]
-  2d_x: 1, 256, 128.
-distribution 2x: [1:256:128]
-  2d_y: 256, 1, 128.
-distribution 2y: [256:1:128]
-
-================
-Additional Notes
-================
-
--------------
-Integer Types
--------------
-
-The signature of many MPI functions requires 32-bit integers, so we use those
-where required. The underlying distribution.h/.c code also uses 32-bit
-integers to keep track of the number of grid vertices along the sides of
-the global grid, which likely does not present a practical limit on the
-size of 3D grids in the near future. I believe we always use 64-bit integers
-for iteration through the grids values themselves, so the total number of
-grid vertices locally and globally should not be limited by 32-bit integer
-size. This distribution code has been tested up to 16384^3 global grid and
-on >~10^6 MPI-ranks.
-
--------------------------------
-Fortran Multidimensional Arrays
--------------------------------
-
-The linear storage in memory of multidimensional arrays differs between
-that of C (row-major) and Fortran (column-major). The Fortran interface
-provided here implicitly assumes that the one-dimensional memory storage
-of the arrays to be transformed conforms with the C convention. The 
-returned transformed data is also arranged in row-major format. Hence, care
-must be taken to ensure that data is arranged in this way when interfacing
-with the Fortran wrappers. In general, this involves a transpose of data when 
-using a multidimensional Fortran array to store the 3D data in memory. 
-
diff --git a/Src/Extern/SWFFT/TimingStats.h b/Src/Extern/SWFFT/TimingStats.h
deleted file mode 100644
index f9d716d5a0..0000000000
--- a/Src/Extern/SWFFT/TimingStats.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- *                 Copyright (C) 2017, UChicago Argonne, LLC
- *                            All Rights Reserved
- *
- *           Hardware/Hybrid Cosmology Code (HACC), Version 1.0
- *
- * Salman Habib, Adrian Pope, Hal Finkel, Nicholas Frontiere, Katrin Heitmann,
- *      Vitali Morozov, Jeffrey Emberson, Thomas Uram, Esteban Rangel
- *                        (Argonne National Laboratory)
- *
- *  David Daniel, Patricia Fasel, Chung-Hsing Hsu, Zarija Lukic, James Ahrens
- *                      (Los Alamos National Laboratory)
- *
- *                               George Zagaris
- *                                 (Kitware)
- *
- *                            OPEN SOURCE LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *   1. Redistributions of source code must retain the above copyright notice,
- *      this list of conditions and the following disclaimer. Software changes,
- *      modifications, or derivative works, should be noted with comments and
- *      the author and organization's name.
- *
- *   2. Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *
- *   3. Neither the names of UChicago Argonne, LLC or the Department of Energy
- *      nor the names of its contributors may be used to endorse or promote
- *      products derived from this software without specific prior written
- *      permission.
- *
- *   4. The software and the end-user documentation included with the
- *      redistribution, if any, must include the following acknowledgment:
- *
- *     "This product includes software produced by UChicago Argonne, LLC under
- *      Contract No. DE-AC02-06CH11357 with the Department of Energy."
- *
- * *****************************************************************************
- *                                DISCLAIMER
- * THE SOFTWARE IS SUPPLIED "AS IS" WITHOUT WARRANTY OF ANY KIND. NEITHER THE
- * UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT OF ENERGY, NOR
- * UCHICAGO ARGONNE, LLC, NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY,
- * EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE
- * ACCURARY, COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, DATA, APPARATUS,
- * PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
- * PRIVATELY OWNED RIGHTS.
- *
- * *****************************************************************************
- */
-
-#ifndef HACC_TIMINGSTATS_H
-#define HACC_TIMINGSTATS_H
-
-#include <math.h>
-
-#include <mpi.h>
-
-#include "verbosity.h"
-
-// lightweight timing statistics from MPI_Wtime() calls
-// C header only, no static variables
-// prints maximum, average/mean, minimum, and stddev
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-inline
-void printTimingStats(MPI_Comm comm,        // comm for MPI_Allreduce()
-                      const char *preamble, // text at beginning of line
-                      double dt)            // delta t in seconds
-{
-  int myrank, nranks;
-  double max, min, sum, avg, var, stdev;
-
-  MPI_Comm_rank(comm, &myrank);
-  MPI_Comm_size(comm, &nranks);
-
-  MPI_Allreduce(&dt, &max, 1, MPI_DOUBLE, MPI_MAX, comm);
-  MPI_Allreduce(&dt, &min, 1, MPI_DOUBLE, MPI_MIN, comm);
-  MPI_Allreduce(&dt, &sum, 1, MPI_DOUBLE, MPI_SUM, comm);
-  avg = sum/nranks;
-
-  dt -= avg;
-  dt *= dt;
-  MPI_Allreduce(&dt, &var, 1, MPI_DOUBLE, MPI_SUM, comm);
-  var *= 1.0/nranks;
-  stdev = sqrt(var);
-
-  if(myrank==0 && verbosity() > 0) {
-    printf("%s  max %.3es  avg %.3es  min %.3es  dev %.3es\n",
-           preamble, max, avg, min, stdev);
-  }
-
-  MPI_Barrier(comm);
-
-  return;
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // HACC_TIMINGSTATS_H
diff --git a/Src/Extern/SWFFT/complex-type.h b/Src/Extern/SWFFT/complex-type.h
deleted file mode 100644
index 069f2393d9..0000000000
--- a/Src/Extern/SWFFT/complex-type.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- *                 Copyright (C) 2017, UChicago Argonne, LLC
- *                            All Rights Reserved
- *
- *           Hardware/Hybrid Cosmology Code (HACC), Version 1.0
- *
- * Salman Habib, Adrian Pope, Hal Finkel, Nicholas Frontiere, Katrin Heitmann,
- *      Vitali Morozov, Jeffrey Emberson, Thomas Uram, Esteban Rangel
- *                        (Argonne National Laboratory)
- *
- *  David Daniel, Patricia Fasel, Chung-Hsing Hsu, Zarija Lukic, James Ahrens
- *                      (Los Alamos National Laboratory)
- *
- *                               George Zagaris
- *                                 (Kitware)
- *
- *                            OPEN SOURCE LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *   1. Redistributions of source code must retain the above copyright notice,
- *      this list of conditions and the following disclaimer. Software changes,
- *      modifications, or derivative works, should be noted with comments and
- *      the author and organization's name.
- *
- *   2. Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *
- *   3. Neither the names of UChicago Argonne, LLC or the Department of Energy
- *      nor the names of its contributors may be used to endorse or promote
- *      products derived from this software without specific prior written
- *      permission.
- *
- *   4. The software and the end-user documentation included with the
- *      redistribution, if any, must include the following acknowledgment:
- *
- *     "This product includes software produced by UChicago Argonne, LLC under
- *      Contract No. DE-AC02-06CH11357 with the Department of Energy."
- *
- * *****************************************************************************
- *                                DISCLAIMER
- * THE SOFTWARE IS SUPPLIED "AS IS" WITHOUT WARRANTY OF ANY KIND. NEITHER THE
- * UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT OF ENERGY, NOR
- * UCHICAGO ARGONNE, LLC, NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY,
- * EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE
- * ACCURARY, COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, DATA, APPARATUS,
- * PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
- * PRIVATELY OWNED RIGHTS.
- *
- * *****************************************************************************
- */
-
-// Compatibility file for C99 and C++ complex.  This header
-// can be included by either C99 or ANSI C++ programs to
-// allow complex arithmetic to be written in a common subset.
-// Note that overloads for both the real and complex math
-// functions are available after this header has been
-// included.
-
-#ifndef HACC_COMPLEXTYPE_H
-#define HACC_COMPLEXTYPE_H
-
-#ifdef __cplusplus
-
-#include <cmath>
-#include <complex>
-
-typedef std::complex<double> complex_t;
-
-#define I complex_t(0.0, 1.0)
-
-#else
-
-#include <complex.h>
-#include <math.h>
-
-typedef double complex complex_t;
-
-#define complex_t(r,i) ((double)(r) + ((double)(i)) * I)
-
-#define real(x) creal(x)
-#define imag(x) cimag(x)
-#define abs(x) fabs(x)
-#define arg(x) carg(x)
-
-#endif  // #ifdef __cplusplus
-
-#endif  // HACC_COMPLEXTYPE_H
diff --git a/Src/Extern/SWFFT/distribution.c b/Src/Extern/SWFFT/distribution.c
deleted file mode 100644
index 6ae1ccd474..0000000000
--- a/Src/Extern/SWFFT/distribution.c
+++ /dev/null
@@ -1,2018 +0,0 @@
-/*
- *                 Copyright (C) 2017, UChicago Argonne, LLC
- *                            All Rights Reserved
- *
- *           Hardware/Hybrid Cosmology Code (HACC), Version 1.0
- *
- * Salman Habib, Adrian Pope, Hal Finkel, Nicholas Frontiere, Katrin Heitmann,
- *      Vitali Morozov, Jeffrey Emberson, Thomas Uram, Esteban Rangel
- *                        (Argonne National Laboratory)
- *
- *  David Daniel, Patricia Fasel, Chung-Hsing Hsu, Zarija Lukic, James Ahrens
- *                      (Los Alamos National Laboratory)
- *
- *                               George Zagaris
- *                                 (Kitware)
- *
- *                            OPEN SOURCE LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *   1. Redistributions of source code must retain the above copyright notice,
- *      this list of conditions and the following disclaimer. Software changes,
- *      modifications, or derivative works, should be noted with comments and
- *      the author and organization's name.
- *
- *   2. Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *
- *   3. Neither the names of UChicago Argonne, LLC or the Department of Energy
- *      nor the names of its contributors may be used to endorse or promote
- *      products derived from this software without specific prior written
- *      permission.
- *
- *   4. The software and the end-user documentation included with the
- *      redistribution, if any, must include the following acknowledgment:
- *
- *     "This product includes software produced by UChicago Argonne, LLC under
- *      Contract No. DE-AC02-06CH11357 with the Department of Energy."
- *
- * *****************************************************************************
- *                                DISCLAIMER
- * THE SOFTWARE IS SUPPLIED "AS IS" WITHOUT WARRANTY OF ANY KIND. NEITHER THE
- * UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT OF ENERGY, NOR
- * UCHICAGO ARGONNE, LLC, NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY,
- * EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE
- * ACCURARY, COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, DATA, APPARATUS,
- * PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
- * PRIVATELY OWNED RIGHTS.
- *
- * *****************************************************************************
- */
-
-#include <assert.h>
-#include <mpi.h>
-#include <stdbool.h>
-#include <stddef.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-
-#include "distribution_c.h"
-
-#include "verbosity.h"
-
-#ifndef USE_SLAB_WORKAROUND
-#define USE_SLAB_WORKAROUND 0
-#endif
-
-enum {
-  REDISTRIBUTE_1_TO_3,
-  REDISTRIBUTE_3_TO_1,
-  REDISTRIBUTE_2_TO_3,
-  REDISTRIBUTE_3_TO_2
-};
-
-//#define DEBUG_CONDITION (self == 0 || self == 1)
-#define DEBUG_CONDITION false
-
-// return comma or period depending on position in a list
-static inline char const *separator(int i, int n)
-{
-  return i == (n - 1) ? "." : ", ";
-}
-
-
-//Go from rank of processor to its cartesian coords, and vica versa.
-//Assumes the ranks increment in the z dimension then y then x.
-void Coord_cube(int myrank,
-                int coord[],
-                distribution_t *d)
-{
-  coord[0]=myrank/(d->process_topology_3.nproc[1]*d->process_topology_3.nproc[2]);
-  coord[1]=(myrank%(d->process_topology_3.nproc[1]*d->process_topology_3.nproc[2]))/(d->process_topology_3.nproc[2]);
-  coord[2]=(myrank%(d->process_topology_3.nproc[1]*d->process_topology_3.nproc[2]))%(d->process_topology_3.nproc[2]);
-  return;
-}
-
-void Rank_cube(int * myrank,
-               int coord[],
-               distribution_t *d)
-{
-  *myrank = coord[2] + (d->process_topology_3.nproc[2])*(coord[1] + d->process_topology_3.nproc[1]*coord[0]);
-  return;
-}
-
-
-/*
-  The subsequent member functions are used to look up and ranks of x,y, and z
-  pencils from their coordinates, and vica versa.
-  The ordering of the ranks is such that pencils will be going through cubes
-  with the same rank sequentially. (since the cubes ranks are already
-  determined and can not be changed, these routines figure out which ranks
-  the pencils should be assigned so that there is no communication hangs.)
-*/
-void Coord_x_pencils(int myrank,
-                     int coord[],
-                     distribution_t *d)
-{
-  // asserts only one processor in x_direction
-  assert(d->process_topology_2_x.nproc[0] == 1);
-  //since x_pencils only have one processor in the x_direction.
-  coord[0]=0;
-  int num_pen_in_cube_col=d->process_topology_2_x.nproc[1]/d->process_topology_3.nproc[1];
-  int num_pen_in_cube_row=d->process_topology_2_x.nproc[2]/d->process_topology_3.nproc[2];
-  int num_cubes=(d->process_topology_3.nproc[2]*d->process_topology_3.nproc[1]);
-
-/*
-  the x_pencil ranks increment in each cube sequentially, after reaching the
-  last cube the second slot in the first cube is the next rank, and then the
-  process repeats. num_repeats, is the number of times this repetition had to
-  have occurred to increment to the current rank.
-*/
-  int num_repeats=myrank/(num_cubes);
-
-  //now subtract the difference of how many repetitions, to find the lowest
-  //rank in the cube it resides.
-  int low_rank=myrank-num_repeats*num_cubes;
-
-  //find the y and z coords of the low_rank, then adjust coords for ranks
-  //that repeated around the cube.
-  coord[1] = (low_rank/d->process_topology_3.nproc[2])*num_pen_in_cube_col
-    + num_repeats%num_pen_in_cube_col;
-  coord[2] = (low_rank%d->process_topology_3.nproc[2])*num_pen_in_cube_row + num_repeats/num_pen_in_cube_col;
-
-  return;
-}
-
-void Rank_x_pencils(int * myrank,
-                    int coord[],
-                    distribution_t *d)
-{
-  int num_pen_in_cube_col=d->process_topology_2_x.nproc[1]/d->process_topology_3.nproc[1];
-  int num_pen_in_cube_row=d->process_topology_2_x.nproc[2]/d->process_topology_3.nproc[2];
-  if(num_pen_in_cube_col == 0 && verbosity() > 2)
-    fprintf(stderr,"num_cube_col%d ",
-            d->process_topology_2_x.nproc[1]/d->process_topology_3.nproc[1]);
-  if(num_pen_in_cube_row == 0 && verbosity() > 2)
-    fprintf(stderr,"num_cube_row%d ", d->process_topology_3.nproc[2]);
-  assert(num_pen_in_cube_col !=0 && num_pen_in_cube_row !=0);
-  int alpha = coord[1]%num_pen_in_cube_col;
-  int num_cubes = (d->process_topology_3.nproc[2]*d->process_topology_3.nproc[1]);
-  int beta = coord[2]%num_pen_in_cube_row;
-  *myrank = (alpha*num_cubes)
-    + ((coord[1]/num_pen_in_cube_col)*d->process_topology_3.nproc[2])
-    + (beta*(num_cubes)*num_pen_in_cube_col) + coord[2]/num_pen_in_cube_row;
-  return;
-}
-
-void Coord_y_pencils(int myrank,
-                     int coord[],
-                     distribution_t *d)
-{
-  // asserts only one processor in y_direction
-  assert(d->process_topology_2_y.nproc[1] == 1);
-  //since y_pencils only have one processor in the y_direction.
-  coord[1] = 0;
-  int num_pen_in_cube_row = d->process_topology_2_y.nproc[2]/d->process_topology_3.nproc[2];
-  int alpha = myrank%(d->process_topology_2_y.nproc[2]);
-  coord[0] = myrank/d->process_topology_2_y.nproc[2];
-
-  coord[2] = (alpha/d->process_topology_3.nproc[2])
-    + (alpha%d->process_topology_3.nproc[2])*num_pen_in_cube_row;
-
-  return;
-}
-
-void Rank_y_pencils(int * myrank,
-                    int coord[],
-                    distribution_t *d)
-{
-  int num_pen_in_cube_col = d->process_topology_2_y.nproc[0]/d->process_topology_3.nproc[0];
-  int num_pen_in_cube_row = d->process_topology_2_y.nproc[2]/d->process_topology_3.nproc[2];
-  //WHY ARE THESE COMMENTED OUT?
-  //if(num_pen_in_cube_col ==0)fprintf(stderr,"num_cube_col%d ", d->process_topology_2_y.nproc[1]/d->process_topology_3.nproc[1]);
-  //if(num_pen_in_cube_row ==0)fprintf(stderr,"num_cube_row%d ", d->process_topology_3.nproc[2]);
-  assert(num_pen_in_cube_col !=0 && num_pen_in_cube_row !=0);
-  int beta = coord[2]%num_pen_in_cube_row;
-  *myrank = coord[0]*d->process_topology_2_y.nproc[2]
-    + beta*d->process_topology_3.nproc[2]
-    + coord[2]/num_pen_in_cube_row;
-  return;
-}
-
-void Coord_z_pencils(int myrank,
-                     int coord[],
-                     distribution_t *d)
-{
-  // asserts only one processor in z_direction
-  assert(d->process_topology_2_z.nproc[2] == 1);
-  //since z_pencils only have one processor in the z_direction.
-  coord[2] = 0;
-  int num_pen_in_cube_col = d->process_topology_2_z.nproc[1]/d->process_topology_3.nproc[1];
-  int num_pen_in_cube_row = d->process_topology_2_z.nproc[0]/d->process_topology_3.nproc[0];
-  int num_pen_in_cube = d->process_topology_3.nproc[2];
-  int alpha = myrank/(d->process_topology_2_z.nproc[1]*num_pen_in_cube_row);
-  coord[0] = alpha*num_pen_in_cube_row + (myrank%num_pen_in_cube)/num_pen_in_cube_col;
-  coord[1] = ((myrank%(d->process_topology_2_z.nproc[1]*num_pen_in_cube_row))/num_pen_in_cube)*num_pen_in_cube_col + myrank%num_pen_in_cube_col;
-
-  return;
-}
-
-void Rank_z_pencils(int * myrank,
-                    int coord[],
-                    distribution_t *d)
-{
-  int num_pen_in_cube_col = d->process_topology_2_z.nproc[1]/d->process_topology_3.nproc[1];
-  int num_pen_in_cube_row = d->process_topology_2_z.nproc[0]/d->process_topology_3.nproc[0];
-  int num_pen_in_cube = d->process_topology_3.nproc[2];
-  if(num_pen_in_cube_col == 0 && verbosity() > 2)
-    fprintf(stderr,"num_cube_col%d ",
-            d->process_topology_2_z.nproc[1]/d->process_topology_3.nproc[1]);
-  if(num_pen_in_cube_row == 0 && verbosity() > 2)
-    fprintf(stderr,"num_cube_row%d ", d->process_topology_3.nproc[2]);
-  assert(num_pen_in_cube_col !=0 && num_pen_in_cube_row !=0);
-  int alpha = coord[1]%num_pen_in_cube_col;
-  int beta = coord[0]%num_pen_in_cube_row;
-  *myrank = alpha
-    + ((coord[1]/num_pen_in_cube_col)*num_pen_in_cube)
-    + (beta*num_pen_in_cube_col)
-    + (coord[0]/num_pen_in_cube_row)*d->process_topology_2_z.nproc[1]*num_pen_in_cube_row;
-  return;
-}
-
-
-// create 1-, 2- and 3-d cartesian data distributions comm MPI Communicator
-void distribution_init(MPI_Comm comm,
-                       const int n[],
-                       const int Ndims[],
-                       distribution_t *d,
-               const int* rmap,
-                       bool debug)
-{
-/*
-   As of 09/06/2011 The MPI function MPI_Dims_create is used to come up with
-   the most evenly distributed number of processors for the 3d distribution.
-   Since this can actually vary between machines, we should later write our own
-   prime factorization function that does that for us. For the 2d distribution
-   pencils, Dims_create is also used, but the code then checks if the pencils
-   it outputs fits inside the 3d cuboids that were created. If it does not, it
-   tries swapping the dimensions of the pencils, and if they still do not fit,
-   it takes the 3d cubes dimensions of processors (np1,np2,np3) and (for
-   Z-pencils for example) makes pencils of the form (np1*np3,np2,1), or
-   (np1,np2*np3,1) which fit inside the cubes. However, even then, it is not
-   ensured that this will work since for example, if np1*np3 is bigger then
-   the number of points in one dimension (Ng) then there are not enough points
-   for each processor to have at least one point in that dimension. So the code
-   checks this and asserts three variables check_x_dims check_y_dims, and
-   check_z_dims, which will assert if these kinda errors happen (as well as
-   checking errors coming from picking the total number of processors and Ng
-   in a way where the cubes will not fit for any orientation (like 100 procs
-   and Ng=101!)). Currently the fix to these errors is to pick better values
-   for Ng and the total number of processors that work, however when we do
-   have our own prime factorization method, then that method could also make
-   pencils that fit inside the proper distribution (and we would not need so
-   many checks). In the mean time, to pick these "better" values for Ng, the
-   user should pick values such that: Ng % np1, Ng % np2, and Ng % np3 all
-   equal zero, and that np1*np2, np2*np3, and np3*np1 are all less then Ng.
-   (in other words, the cubes created fit inside the number of grid points,
-   and the number of pencils created is not more then the number of points
-   in a dimension (Ng)).
-*/
-  d->parent = comm;
-
-  int nproc;//num_processors
-  int self; //rank
-  int ndim = 3;
-  int period[3];
-
-  MPI_Comm_rank(comm, &self);
-  MPI_Comm_size(comm, &nproc);
-
-  // Construct the rankmap[grid] --> rank map (converts grids to ranks) from the input
-  // (if none is provided then we assume the trivial map {0, 1, ..., nproc}).
-  d->rankmap  = (int *) malloc(sizeof(int)*nproc);
-  if(rmap) for(int i=0; i<nproc; i++) d->rankmap[i] = rmap[i];
-  else for(int i=0; i<nproc; i++) d->rankmap[i] = i;
-
-  // Construct the inverse gridmap[rank] --> grid map (converts ranks to grids)
-  d->gridmap = (int *) malloc(sizeof(int)*nproc);
-  for(int i=0; i<nproc; i++) {
-    for(int j=0; j<nproc; j++) {
-      if(i == d->rankmap[j]) { d->gridmap[i] = j; break; }
-    }
-  }
-
-  // Map this rank to the correct grid
-  self = d->gridmap[self];
-
-  if (!self && verbosity() > 2)
-    printf("Initializing redistribution using a %s layout on %d ranks.\n",
-#ifdef PENCIL
-           "pencil"
-#else
-           "slab"
-#endif
-           ,nproc);
-
-  d->debug = debug;
-  for (int i = 0; i < 3; ++i)
-    d->n[i] = n[i];
-
-  // set up process grid with 1d decomposition (SLABs)
-  d->process_topology_1.nproc[0] = 0;
-  d->process_topology_1.nproc[1] = 1; // don't distribute outer dimensions
-  d->process_topology_1.nproc[2] = 1; // don't distribute outer dimensions
-  period[0] = period[1] = period[2] = 1;
-  //process_topology_1.nproc is filled with number of processors in each dim
-  MPI_Dims_create(nproc, ndim, d->process_topology_1.nproc);
-
-  if(self == 0 && verbosity() > 2) {
-    printf("distribution 1D: [%d:%d:%d]\n",
-           d->process_topology_1.nproc[0],
-           d->process_topology_1.nproc[1],
-           d->process_topology_1.nproc[2]);
-    fflush(stdout);
-  }
-
-  if (d->debug && 0 == self && verbosity() > 2) {
-    fprintf(stderr, "Process grids:\n");
-    fprintf(stderr, "  1d: ");
-    for (int i = 0; i < ndim; ++i) {
-      fprintf(stderr, "%d%s",
-              d->process_topology_1.nproc[i],
-              separator(i, ndim));
-    }
-    fprintf(stderr, "\n");
-  }
-  //creates the new communicator
-  MPI_Cart_create(comm, ndim, d->process_topology_1.nproc, period, 0,
-                  &d->process_topology_1.cart);
-  //gets .self (is coordinate)
-  MPI_Cart_get(d->process_topology_1.cart, ndim, d->process_topology_1.nproc,
-               d->process_topology_1.period, d->process_topology_1.self);
-  //calculates the local dimensions (number of points in each dimension)
-  d->process_topology_1.n[0] = n[0] / d->process_topology_1.nproc[0];
-  d->process_topology_1.n[1] = n[1] / d->process_topology_1.nproc[1];
-  d->process_topology_1.n[2] = n[2] / d->process_topology_1.nproc[2];
-
-
-
-  // set up process grid with 3d decomposition (CUBE)
-  d->process_topology_3.nproc[0] = 0;
-  d->process_topology_3.nproc[1] = 0;
-  d->process_topology_3.nproc[2] = 0;
-  period[0] = period[1] = period[2] = 1;
-  Custom3D_Dims_create(Ndims, nproc, ndim, d->process_topology_3.nproc);
-
-  if(self == 0 && verbosity() > 2) {
-    printf("distribution 3D: [%d:%d:%d]\n",
-           d->process_topology_3.nproc[0],
-           d->process_topology_3.nproc[1],
-           d->process_topology_3.nproc[2]);
-    fflush(stdout);
-  }
-
-  if(d->debug && 0 == self && verbosity() > 2) {
-    fprintf(stderr, "  3d: ");
-    for (int i = 0; i < ndim; ++i) {
-      fprintf(stderr, "%d%s",
-              d->process_topology_3.nproc[i],
-              separator(i, ndim));
-    }
-    fprintf(stderr, "\n");
-  }
-
-  MPI_Cart_create(comm, ndim, d->process_topology_3.nproc, period, 0,
-                  &d->process_topology_3.cart);
-  //finds cartesian coordinate of this current rank
-  Coord_cube(self,d->process_topology_3.self,d);
-
-  if(debug){
-/*
-  this debug statement checks to see if the way coordinates found by
-  calculation matches MPI's coord system (MPI might differ between machines
-  so this is why the code calculates the coord system itself, however with
-  debug on, can check if it matches MPI(even tho it is not enforced to match
-  it.)).
-*/
-    int prev_coord[3];
-    prev_coord[0]=d->process_topology_3.self[0];
-    prev_coord[1]=d->process_topology_3.self[1];
-    prev_coord[2]=d->process_topology_3.self[2];
-    MPI_Cart_get(d->process_topology_3.cart, ndim,
-                 d->process_topology_3.nproc,
-                 d->process_topology_3.period,
-                 d->process_topology_3.self);
-    for(int i=0; i < 3; i++)
-      if(prev_coord[i] != d->process_topology_3.self[i])
-        abort();
-  }
-  assert(n[0]%d->process_topology_3.nproc[0] == 0);
-  assert(n[0]%d->process_topology_3.nproc[1] == 0);
-  assert(n[0]%d->process_topology_3.nproc[2] == 0);
-
-  //set local dimensions
-  d->process_topology_3.n[0] = n[0] / d->process_topology_3.nproc[0];
-  d->process_topology_3.n[1] = n[1] / d->process_topology_3.nproc[1];
-  d->process_topology_3.n[2] = n[2] / d->process_topology_3.nproc[2];
-
-
-
-  // set up process grid with 2d decomposition (z_PENCILs )
-  d->process_topology_2_z.nproc[0] = 0;
-  d->process_topology_2_z.nproc[1] = 0;
-  d->process_topology_2_z.nproc[2] = 1; // don't distribute outer dimension
-  period[0] = period[1] = period[2] = 1;
-  MPI_Dims_create(nproc, ndim, d->process_topology_2_z.nproc);
-  d->process_topology_2_z.n[0] = n[0] / d->process_topology_2_z.nproc[0];
-  d->process_topology_2_z.n[1] = n[1] / d->process_topology_2_z.nproc[1];
-  d->process_topology_2_z.n[2] = n[2] / d->process_topology_2_z.nproc[2];
-  //variable used to ensure that pencils created fit inside the cuboids,
-  //if not the code will assert out.
-  bool check_z_dims=false;
-  if(d->process_topology_2_z.n[0] != 0
-     && d->process_topology_2_z.n[1] != 0
-     && d->process_topology_2_z.n[2] != 0)
-  {// protects from dividing by zero.
-    check_z_dims = ((d->process_topology_3.n[0]) % (d->process_topology_2_z.n[0]) == 0)
-      && ((d->process_topology_3.n[1]) % (d->process_topology_2_z.n[1]) == 0)
-      && (n[0] % (d->process_topology_2_z.nproc[0]) == 0)
-      && (n[0] % (d->process_topology_2_z.nproc[1]) == 0);
-
-    if(self==0 && debug && !check_z_dims && verbosity() > 2)
-      fprintf(stderr,"Need to fix Z PENCILS z_procs(%d,%d,%d) 3d.ns(%d,%d,%d) 2d_z.ns(%d,%d,%d).... \n",
-              d->process_topology_2_z.nproc[0],
-              d->process_topology_2_z.nproc[1],
-              d->process_topology_2_z.nproc[2],
-              d->process_topology_3.n[0],
-              d->process_topology_3.n[1],
-              d->process_topology_3.n[2],
-              d->process_topology_2_z.n[0],
-              d->process_topology_2_z.n[1],
-              d->process_topology_2_z.n[2]);
-
-    //try swapping pencil dimensions if current setup pencil dimensions dont
-    //fit inside the cubes.
-    if(!(check_z_dims)
-       && ((d->process_topology_3.n[0]) % (d->process_topology_2_z.n[1]) == 0)
-       && ((d->process_topology_3.n[1]) % (d->process_topology_2_z.n[0]) == 0))
-    {
-
-      if(self==0 && debug && verbosity() > 2)
-        fprintf(stderr,"Swapping Z pencils in initialization  (%d,%d,%d).... \n",
-                d->process_topology_2_z.nproc[0],
-                d->process_topology_2_z.nproc[1],
-                d->process_topology_2_z.nproc[2]);
-      int temp=d->process_topology_2_z.nproc[0];
-      d->process_topology_2_z.nproc[0] = d->process_topology_2_z.nproc[1];
-      d->process_topology_2_z.nproc[1] = temp;
-      d->process_topology_2_z.nproc[2] = d->process_topology_2_z.nproc[2];
-
-      d->process_topology_2_z.n[0] = n[0] / d->process_topology_2_z.nproc[0];
-      d->process_topology_2_z.n[1] = n[1] / d->process_topology_2_z.nproc[1];
-      d->process_topology_2_z.n[2] = n[2] / d->process_topology_2_z.nproc[2];
-      check_z_dims = ((d->process_topology_3.n[0]) % (d->process_topology_2_z.n[0]) == 0)
-        && ((d->process_topology_3.n[1]) % (d->process_topology_2_z.n[1]) == 0)
-        && (n[0] % (d->process_topology_2_z.nproc[0]) == 0)
-        && (n[0] % (d->process_topology_2_z.nproc[1]) == 0);
-    }
-  } else {
-    check_z_dims=false;
-  }
-  /*
-    if that did not work, make a pencil that does if inside the 3d cuboids by
-    taking the cuboids dimensions (np1,np2,np3) and making pencils
-    (np1,np2*np3,1), or (np1*np3,np2,1) on the most evenly distributed
-    dimensions
-  */
-  if(!check_z_dims){
-    if(self==0 && debug && verbosity() > 2)
-      fprintf(stderr,"MAKING Z PENCILS FIT zprocs(%d,%d,%d) z.ns(%d,%d,%d).... \n",
-              d->process_topology_2_z.nproc[0],
-              d->process_topology_2_z.nproc[1],
-              d->process_topology_2_z.nproc[2],
-              d->process_topology_2_z.n[0],
-              d->process_topology_2_z.n[1],
-              d->process_topology_2_z.n[2]);
-
-    d->process_topology_2_z.nproc[2]=1;
-    if(d->process_topology_3.n[0]>d->process_topology_3.n[1])
-    {
-      d->process_topology_2_z.nproc[1]=d->process_topology_3.nproc[1]*d->process_topology_3.nproc[2];
-      d->process_topology_2_z.nproc[0]=d->process_topology_3.nproc[0];
-      if((n[0] % (d->process_topology_2_z.nproc[0]) != 0)
-         || (n[0] % (d->process_topology_2_z.nproc[1]) != 0))
-      {
-        d->process_topology_2_z.nproc[0]=d->process_topology_3.nproc[0]*d->process_topology_3.nproc[2];
-        d->process_topology_2_z.nproc[1]=d->process_topology_3.nproc[1];
-      }
-    } else {
-      d->process_topology_2_z.nproc[0]=d->process_topology_3.nproc[0]*d->process_topology_3.nproc[2];
-      d->process_topology_2_z.nproc[1]=d->process_topology_3.nproc[1];
-      if((n[0] % (d->process_topology_2_z.nproc[0]) != 0)
-         || (n[0] % (d->process_topology_2_z.nproc[1]) != 0))
-      {
-        d->process_topology_2_z.nproc[1]=d->process_topology_3.nproc[1]*d->process_topology_3.nproc[2];
-        d->process_topology_2_z.nproc[0]=d->process_topology_3.nproc[0];
-      }
-    }
-    d->process_topology_2_z.n[0] = n[0] / d->process_topology_2_z.nproc[0];
-    d->process_topology_2_z.n[1] = n[1] / d->process_topology_2_z.nproc[1];
-    d->process_topology_2_z.n[2] = n[2] / d->process_topology_2_z.nproc[2];
-    if(self==0 && debug && verbosity() > 2)
-      fprintf(stderr,"MAKING Z PENCILS FIT AFTER zprocs(%d,%d,%d) z.ns(%d,%d,%d)...\n",
-              d->process_topology_2_z.nproc[0],
-              d->process_topology_2_z.nproc[1],
-              d->process_topology_2_z.nproc[2],
-              d->process_topology_2_z.n[0],
-              d->process_topology_2_z.n[1],
-              d->process_topology_2_z.n[2]);
-    if(d->process_topology_2_z.n[0] != 0
-       && d->process_topology_2_z.n[1] != 0
-       && d->process_topology_2_z.n[2] != 0)
-    {// protects from dividing by zero.
-      check_z_dims=((d->process_topology_3.n[0]) % (d->process_topology_2_z.n[0]) == 0)
-        && ((d->process_topology_3.n[1]) % (d->process_topology_2_z.n[1]) == 0)
-        && (n[0] % (d->process_topology_2_z.nproc[0]) == 0)
-        && (n[0] % (d->process_topology_2_z.nproc[1]) == 0);
-    } else {
-      check_z_dims=false;
-    }
-  }
-
-  if (d->debug && 0 == self && verbosity() > 2) {
-    fprintf(stderr, "  2d_z: ");
-    for (int i = 0; i < ndim; ++i) {
-      fprintf(stderr, "%d%s",
-              d->process_topology_2_z.nproc[i],
-              separator(i, ndim));
-    }
-    fprintf(stderr, "\n");
-  }
-  if(!check_z_dims && debug && (self==0)){
-    FILE * outfile;
-    outfile= fopen("error.data","a");
-    fprintf(outfile,"Z DIMS FAILS:(%d,%d,%d) (%d,%d,%d) \n",
-            d->process_topology_2_z.nproc[0],
-            d->process_topology_2_z.nproc[1],
-            d->process_topology_2_z.nproc[2],
-            d->process_topology_3.nproc[0],
-            d->process_topology_3.nproc[1],
-            d->process_topology_3.nproc[2]);
-  }
-  assert(check_z_dims);
-/*
-  if this happends, it is because the dimensions were chosen incorrectly.
-  Either to many processors for the number of points in one dimension (could
-  not do at least 1 point per processor), or the methods above could
-  not make a distribution of pencils that fit in the cubiods, which would
-  happen if the user gave numbers that wouldent work (we require the number
-  of processors in each dimension of the cuboid must be modulo the number of
-  points in that dimension, otherwise, this error will happen).
-*/
-  MPI_Cart_create(comm,
-                  ndim,
-                  d->process_topology_2_z.nproc,
-                  period,
-                  0,
-                  &d->process_topology_2_z.cart);
-  //find the cartesian coord of the current rank (for the z_pencil)
-  Coord_z_pencils(self,d->process_topology_2_z.self,d);
-
-  if(self == 0 && verbosity() > 2) {
-    printf("distribution 2z: [%d:%d:%d]\n",
-           d->process_topology_2_z.nproc[0],
-           d->process_topology_2_z.nproc[1],
-           d->process_topology_2_z.nproc[2]);
-    fflush(stdout);
-  }
-
-
-
-  // set up process grid with 2d decomposition (x_PENCILs)
-  d->process_topology_2_x.nproc[0] = 1; // don't distribute outer dimension
-  d->process_topology_2_x.nproc[1] = 0;
-  d->process_topology_2_x.nproc[2] = 0;
-  period[0] = period[1] = period[2] = 1;
-  MPI_Dims_create(nproc, ndim, d->process_topology_2_x.nproc);
-  d->process_topology_2_x.n[0] = n[0] / d->process_topology_2_x.nproc[0];
-  d->process_topology_2_x.n[1] = n[1] / d->process_topology_2_x.nproc[1];
-  d->process_topology_2_x.n[2] = n[2] / d->process_topology_2_x.nproc[2];
-  //variable used to ensure that pencils created fit inside the cuboids,
-  //if not the code will assert out.
-  bool check_x_dims = false;
-  if(d->process_topology_2_x.n[0] != 0
-     && d->process_topology_2_x.n[1] != 0
-     && d->process_topology_2_x.n[2] != 0)
-  {// protects from dividing by zero.
-    check_x_dims = ((d->process_topology_3.n[2]) % (d->process_topology_2_x.n[2]) == 0)
-      && ((d->process_topology_3.n[1]) % (d->process_topology_2_x.n[1]) == 0)
-      && (n[0] % (d->process_topology_2_x.nproc[2]) == 0)
-      && (n[0] % (d->process_topology_2_x.nproc[1]) == 0);
-    if(self==0 && debug && !check_x_dims && verbosity() > 2)
-      fprintf(stderr,"Need to fix X PENCILS x_procs(%d,%d,%d) 3d.ns(%d,%d,%d) 2d_x.ns(%d,%d,%d)...\n",
-              d->process_topology_2_x.nproc[0],
-              d->process_topology_2_x.nproc[1],
-              d->process_topology_2_x.nproc[2],
-              d->process_topology_3.n[0],
-              d->process_topology_3.n[1],
-              d->process_topology_3.n[2],
-              d->process_topology_2_x.n[0],
-              d->process_topology_2_x.n[1],
-              d->process_topology_2_x.n[2]);
-
-    //try swapping pencil dimensions if current setup does not have pencils
-    //that fit inside cubes.
-    if(!(check_x_dims)
-       && ((d->process_topology_3.n[2]) % (d->process_topology_2_x.n[1]) == 0)
-       && ((d->process_topology_3.n[1]) % (d->process_topology_2_x.n[2]) == 0))
-    {
-      if(self==0 && debug && verbosity() > 2)
-        fprintf(stderr,"Swapping X pencils in initialization .... \n");
-      d->process_topology_2_x.nproc[0] = d->process_topology_2_x.nproc[0];
-      int temp = d->process_topology_2_x.nproc[1];
-      d->process_topology_2_x.nproc[1] = d->process_topology_2_x.nproc[2];
-      d->process_topology_2_x.nproc[2] = temp;
-
-      d->process_topology_2_x.n[0] = n[0] / d->process_topology_2_x.nproc[0];
-      d->process_topology_2_x.n[1] = n[1] / d->process_topology_2_x.nproc[1];
-      d->process_topology_2_x.n[2] = n[2] / d->process_topology_2_x.nproc[2];
-      check_x_dims = ((d->process_topology_3.n[2]) % (d->process_topology_2_x.n[2]) == 0)
-        && ((d->process_topology_3.n[1]) % (d->process_topology_2_x.n[1]) == 0)
-        && (n[0] % (d->process_topology_2_x.nproc[2]) == 0)
-        && (n[0] % (d->process_topology_2_x.nproc[1]) == 0);
-    }
-  } else{
-    check_x_dims=false;
-  }
-  /*
-    if that did not work, make a pencil that does by taking the cuboid
-    (np1,np2,np3) and making pencils of the form (1,np2*np1,np3) or
-    (1,np2*np1,np3) depending on the most even distribution it can.
-  */
-  if(!check_x_dims){
-    if(self==0 && debug && verbosity() > 2)
-      fprintf(stderr,"MAKING X PENCILS FIT xprocs(%d,%d,%d) x.ns(%d,%d,%d)...\n",
-              d->process_topology_2_x.nproc[0],
-              d->process_topology_2_x.nproc[1],
-              d->process_topology_2_x.nproc[2],
-              d->process_topology_2_x.n[0],
-              d->process_topology_2_x.n[1],
-              d->process_topology_2_x.n[2]);
-
-    d->process_topology_2_x.nproc[0] = 1;
-    if(d->process_topology_3.nproc[2] > d->process_topology_3.nproc[1])
-    {
-      d->process_topology_2_x.nproc[1] = d->process_topology_3.nproc[1]*d->process_topology_3.nproc[0];
-      d->process_topology_2_x.nproc[2] = d->process_topology_3.nproc[2];
-      if((n[0] % (d->process_topology_2_x.nproc[2]) != 0)
-         || (n[0] % (d->process_topology_2_x.nproc[0]) != 0))
-      {
-        d->process_topology_2_x.nproc[2]=d->process_topology_3.nproc[2]*d->process_topology_3.nproc[0];
-        d->process_topology_2_x.nproc[1]=d->process_topology_3.nproc[1];
-      }
-
-    } else {
-      d->process_topology_2_x.nproc[2] = d->process_topology_3.nproc[2]*d->process_topology_3.nproc[0];
-      d->process_topology_2_x.nproc[1] = d->process_topology_3.nproc[1];
-      if((n[0] % (d->process_topology_2_x.nproc[2]) != 0)
-         || (n[0] % (d->process_topology_2_x.nproc[0]) != 0))
-      {
-        d->process_topology_2_x.nproc[1]=d->process_topology_3.nproc[1]*d->process_topology_3.nproc[0];
-        d->process_topology_2_x.nproc[2]=d->process_topology_3.nproc[2];
-      }
-    }
-    d->process_topology_2_x.n[0] = n[0] / d->process_topology_2_x.nproc[0];
-    d->process_topology_2_x.n[1] = n[1] / d->process_topology_2_x.nproc[1];
-    d->process_topology_2_x.n[2] = n[2] / d->process_topology_2_x.nproc[2];
-    if(self==0 && debug && verbosity() > 2)
-      fprintf(stderr,"MAKING X PENCILS FIT AFTER xprocs(%d,%d,%d) x.ns(%d,%d,%d)...\n",
-              d->process_topology_2_x.nproc[0],
-              d->process_topology_2_x.nproc[1],
-              d->process_topology_2_x.nproc[2],
-              d->process_topology_2_x.n[0],
-              d->process_topology_2_x.n[1],
-              d->process_topology_2_x.n[2]);
-    if(d->process_topology_2_x.n[0] != 0
-       && d->process_topology_2_x.n[1] != 0
-       && d->process_topology_2_x.n[2] != 0)
-    {// protects from dividing by zero.
-      check_x_dims = ((d->process_topology_3.n[2]) % (d->process_topology_2_x.n[2]) == 0)
-        && ((d->process_topology_3.n[1]) % (d->process_topology_2_x.n[1]) == 0)
-        && (n[0] % (d->process_topology_2_x.nproc[2]) == 0)
-        && (n[0] % (d->process_topology_2_x.nproc[1]) == 0);
-    } else {
-      check_x_dims=false;
-    }
-  }
-
-  if (d->debug && 0 == self && verbosity() > 2) {
-    fprintf(stderr, "  2d_x: ");
-    for (int i = 0; i < ndim; ++i) {
-      fprintf(stderr, "%d%s",
-              d->process_topology_2_x.nproc[i],
-              separator(i, ndim));
-    }
-    fprintf(stderr, "\n");
-  }
-  if(!check_x_dims && debug && (self==0) && verbosity() > 2){
-    FILE * outfile;
-    outfile= fopen("error.data","a");
-    fprintf(outfile,"X DIMS FAILS:(%d,%d,%d) (%d,%d,%d) \n",
-            d->process_topology_2_x.nproc[0],
-            d->process_topology_2_x.nproc[1],
-            d->process_topology_2_x.nproc[2],
-            d->process_topology_3.nproc[0],
-            d->process_topology_3.nproc[1],
-            d->process_topology_3.nproc[2]);
-  }
-  assert(check_x_dims);
-/*
-  if this happends, it is because the dimensions were chosen incorrectly.
-  Either to many processors for the number of points in one dimension (could
-  not do at least 1 point per processor), or the methods above could not make
-  a distribution of pencils that fit in the cubiods, which would happen if the
-  user gave numbers that wouldent work (we require the number of processors in
-  each dimension of the cuboid must be modulo the number of points in that
-  dimension, otherwise, this error will happen).
-*/
-  MPI_Cart_create(comm,
-                  ndim,
-                  d->process_topology_2_x.nproc,
-                  period,
-                  0,
-                  &d->process_topology_2_x.cart);
-  Coord_x_pencils(self, d->process_topology_2_x.self, d);
-
-  if(self == 0 && verbosity() > 2) {
-    printf("distribution 2x: [%d:%d:%d]\n",
-           d->process_topology_2_x.nproc[0],
-           d->process_topology_2_x.nproc[1],
-           d->process_topology_2_x.nproc[2]);
-    fflush(stdout);
-  }
-
-
-
-  // set up process grid with 2d decomposition (y_PENCILs)
-  d->process_topology_2_y.nproc[0] = 0;
-  d->process_topology_2_y.nproc[1] = 1; // don't distribute outer dimension
-  d->process_topology_2_y.nproc[2] = 0;
-  period[0] = period[1] = period[2] = 1;
-  MPI_Dims_create(nproc, ndim, d->process_topology_2_y.nproc);
-  d->process_topology_2_y.n[0] = n[0] / d->process_topology_2_y.nproc[0];
-  d->process_topology_2_y.n[1] = n[1] / d->process_topology_2_y.nproc[1];
-  d->process_topology_2_y.n[2] = n[2] / d->process_topology_2_y.nproc[2];
-  //variable used to ensure that pencils created fit inside the cuboids,
-  //if not the code will assert out.
-  bool check_y_dims=false;
-  if(d->process_topology_2_y.n[0] != 0
-     && d->process_topology_2_y.n[1] != 0
-     && d->process_topology_2_y.n[2] != 0)
-  {// protects from dividing by zero.
-    check_y_dims = (((d->process_topology_3.n[2]) % (d->process_topology_2_y.n[2]) == 0)
-                    && ((d->process_topology_3.n[0]) % (d->process_topology_2_y.n[0]) == 0)
-                    && (n[0] % (d->process_topology_2_y.nproc[2]) == 0)
-                    && (n[0] % (d->process_topology_2_y.nproc[0]) == 0));
-    if(self==0 && debug && !check_y_dims && verbosity() > 2)
-      fprintf(stderr,"Need to fix Y PENCILS y_procs(%d,%d,%d) 3d.ns(%d,%d,%d) 2d_y.ns(%d,%d,%d)...\n",
-              d->process_topology_2_y.nproc[0],
-              d->process_topology_2_y.nproc[1],
-              d->process_topology_2_y.nproc[2],
-              d->process_topology_3.n[0],
-              d->process_topology_3.n[1],
-              d->process_topology_3.n[2],
-              d->process_topology_2_y.n[0],
-              d->process_topology_2_y.n[1],
-              d->process_topology_2_y.n[2]);
-    //try swapping pencil dimensions if the current dimension of the pencils
-    //does not fit inside the cubes.
-    if(!(check_y_dims)
-       && ((d->process_topology_3.n[2]) % (d->process_topology_2_y.n[0]) == 0)
-       && ((d->process_topology_3.n[0]) % (d->process_topology_2_y.n[2]) == 0))
-    {
-      if(self==0 && debug && verbosity() > 2)
-        fprintf(stderr,"Swapping Y pencils in initialization .... \n");
-
-      int temp = d->process_topology_2_y.nproc[0];
-      d->process_topology_2_y.nproc[0] = d->process_topology_2_y.nproc[2];
-      d->process_topology_2_y.nproc[2] = temp;
-      d->process_topology_2_y.nproc[1] = d->process_topology_2_y.nproc[1];
-
-      d->process_topology_2_y.n[0] = n[0] / d->process_topology_2_y.nproc[0];
-      d->process_topology_2_y.n[1] = n[1] / d->process_topology_2_y.nproc[1];
-      d->process_topology_2_y.n[2] = n[2] / d->process_topology_2_y.nproc[2];
-      check_y_dims = (((d->process_topology_3.n[2]) % (d->process_topology_2_y.n[2]) == 0)
-                      && ((d->process_topology_3.n[0]) % (d->process_topology_2_y.n[0]) == 0)
-                      && (n[0] % (d->process_topology_2_y.nproc[2]) == 0)
-                      && (n[0] % (d->process_topology_2_y.nproc[0]) == 0));
-    }
-  } else {
-    check_y_dims = false;
-  }
-/*
-  if that did not work, make a pencil that does by taking the cuboid
-  (np1,np2,np3) and making pencils of the form (np1,1,np3*np2) or
-  (np1*np2,1,np3) depending on the most even distribution it can.
-*/
-  if(!check_y_dims){
-    if(self==0 && debug && verbosity() > 2)
-      fprintf(stderr,"MAKING Y PENCILS FIT yprocs(%d,%d,%d) y.ns(%d,%d,%d)...\n",
-              d->process_topology_2_y.nproc[0],
-              d->process_topology_2_y.nproc[1],
-              d->process_topology_2_y.nproc[2],
-              d->process_topology_2_y.n[0],
-              d->process_topology_2_y.n[1],
-              d->process_topology_2_y.n[2]);
-
-    d->process_topology_2_y.nproc[1]=1;
-    if(d->process_topology_3.nproc[2] > d->process_topology_3.nproc[0])
-    {
-      d->process_topology_2_y.nproc[0] = d->process_topology_3.nproc[0]*d->process_topology_3.nproc[1];
-      d->process_topology_2_y.nproc[2] = d->process_topology_3.nproc[2];
-      if((n[0] % (d->process_topology_2_y.nproc[2]) != 0)
-         || (n[0] % (d->process_topology_2_y.nproc[0]) != 0))
-      {
-        d->process_topology_2_y.nproc[2] = d->process_topology_3.nproc[2]*d->process_topology_3.nproc[1];
-        d->process_topology_2_y.nproc[0] = d->process_topology_3.nproc[0];
-      }
-    } else {
-      d->process_topology_2_y.nproc[2] = d->process_topology_3.nproc[2]*d->process_topology_3.nproc[1];
-      d->process_topology_2_y.nproc[0] = d->process_topology_3.nproc[0];
-      if((n[0] % (d->process_topology_2_y.nproc[2]) != 0)
-         || (n[0] % (d->process_topology_2_y.nproc[0]) != 0))
-      {
-        d->process_topology_2_y.nproc[0] = d->process_topology_3.nproc[0]*d->process_topology_3.nproc[1];
-        d->process_topology_2_y.nproc[2] = d->process_topology_3.nproc[2];
-      }
-    }
-
-    d->process_topology_2_y.n[0] = n[0] / d->process_topology_2_y.nproc[0];
-    d->process_topology_2_y.n[1] = n[1] / d->process_topology_2_y.nproc[1];
-    d->process_topology_2_y.n[2] = n[2] / d->process_topology_2_y.nproc[2];
-    if(self==0 && debug && verbosity() > 2)
-      fprintf(stderr,"MAKING Y PENCILS FIT AFTER yprocs(%d,%d,%d) y.ns(%d,%d,%d)...\n",
-              d->process_topology_2_y.nproc[0],
-              d->process_topology_2_y.nproc[1],
-              d->process_topology_2_y.nproc[2],
-              d->process_topology_2_y.n[0],
-              d->process_topology_2_y.n[1],
-              d->process_topology_2_y.n[2]);
-    if(d->process_topology_2_y.n[0] != 0 && d->process_topology_2_y.n[1] != 0
-       && d->process_topology_2_y.n[2] != 0)
-    {// protects from dividing by zero.
-      check_y_dims = (((d->process_topology_3.n[2]) % (d->process_topology_2_y.n[2]) == 0)
-                      && ((d->process_topology_3.n[0]) % (d->process_topology_2_y.n[0]) == 0)
-                      && (n[0] % (d->process_topology_2_y.nproc[2]) == 0)
-                      && (n[0] % (d->process_topology_2_y.nproc[0]) == 0));
-    } else {
-      check_y_dims=false;
-    }
-  }
-
-  if (d->debug && 0 == self && verbosity() > 2) {
-    fprintf(stderr, "  2d_y: ");
-    for (int i = 0; i < ndim; ++i) {
-      fprintf(stderr, "%d%s",
-              d->process_topology_2_y.nproc[i],
-              separator(i, ndim));
-    }
-    fprintf(stderr, "\n");
-  }
-  if(!check_y_dims && debug && (self==0) && verbosity() > 2){
-    FILE * outfile;
-    outfile = fopen("error.data","a");
-    fprintf(outfile,"Y DIMS FAILS:(%d,%d,%d) (%d,%d,%d) \n",
-            d->process_topology_2_y.nproc[0],
-            d->process_topology_2_y.nproc[1],
-            d->process_topology_2_y.nproc[2],
-            d->process_topology_3.nproc[0],
-            d->process_topology_3.nproc[1],
-            d->process_topology_3.nproc[2]);
-  }
-  assert(check_y_dims);
-/*
-  if this happends, it is because the dimensions were chosen incorrectly.
-  Either to many processors for the number of points in one dimension (could
-  not do at least 1 point per processor), or the methods above could
-  not make a distribution of pencils that fit in the cubiods, which would
-  happen if the user gave numbers that wouldent work (we require the number of
-  processors in each dimension of the cuboid must be modulo the number of
-  points in that dimension, otherwise, this error will happen).
-*/
-  MPI_Cart_create(comm,
-                  ndim,
-                  d->process_topology_2_y.nproc,
-                  period,
-                  0,
-                  &d->process_topology_2_y.cart);
-  //find the cartesian coord of the current rank (for the y_pencil)
-  Coord_y_pencils(self,d->process_topology_2_y.self,d);
-
-  if(self == 0 && verbosity() > 2) {
-    printf("distribution 2y: [%d:%d:%d]\n",
-           d->process_topology_2_y.nproc[0],
-           d->process_topology_2_y.nproc[1],
-           d->process_topology_2_y.nproc[2]);
-    fflush(stdout);
-  }
-
-
-
-  if (d->debug) {
-    int myrank_cube;
-    Rank_cube(&myrank_cube,d->process_topology_3.self,d);
-    int myrank_x;
-    Rank_x_pencils(&myrank_x,d->process_topology_2_x.self,d);
-    int myrank_y;
-    Rank_y_pencils(&myrank_y,d->process_topology_2_y.self,d);
-    int myrank_z;
-    Rank_z_pencils(&myrank_z,d->process_topology_2_z.self,d);
-    if(myrank_z != self
-       || myrank_y != self
-       || myrank_x != self
-       || myrank_cube != self)
-      abort(); //means ranks were calculated wrong.
-    if (0 == self && verbosity() > 2) {
-      fprintf(stderr, "Process map:\n");
-    }
-    for (int p = 0; p < nproc; ++p) {
-      MPI_Barrier(comm);
-      if (p == self && verbosity() > 2) {
-        fprintf(stderr, "  %d: 1d = (%d, %d, %d), 2d_x = (%d, %d, %d) rank is= %d,2d_y = (%d, %d, %d) rank is= %d,2d_z = (%d, %d, %d) rank is= %d, 3d = (%d, %d, %d). rank is= %d\n",
-                self,
-                d->process_topology_1.self[0],
-                d->process_topology_1.self[1],
-                d->process_topology_1.self[2],
-                d->process_topology_2_x.self[0],
-                d->process_topology_2_x.self[1],
-                d->process_topology_2_x.self[2],
-                myrank_x,
-                d->process_topology_2_y.self[0],
-                d->process_topology_2_y.self[1],
-                d->process_topology_2_y.self[2],
-                myrank_y,
-                d->process_topology_2_z.self[0],
-                d->process_topology_2_z.self[1],
-                d->process_topology_2_z.self[2],
-                myrank_z,
-                d->process_topology_3.self[0],
-                d->process_topology_3.self[1],
-                d->process_topology_3.self[2],
-                myrank_cube);
-      }
-    }
-  }
-
-  //allocate size of buffers used to hold pencil chunks of data in the
-  //distribution routines for 3d to 1d and vica versa.
-  int buff_z_chunk = d->process_topology_2_z.n[0]*d->process_topology_2_z.n[1]*d->process_topology_3.n[2];
-  int buff_y_chunk = d->process_topology_2_y.n[0]*d->process_topology_2_y.n[2]*d->process_topology_3.n[1];
-  int buff_x_chunk = d->process_topology_2_x.n[1]*d->process_topology_2_x.n[2]*d->process_topology_3.n[0];
-  int buff_size = 0;
-  if(buff_z_chunk > buff_y_chunk){
-    buff_size=buff_z_chunk;
-  } else {
-    buff_size=buff_y_chunk;
-  }
-  if(buff_x_chunk > buff_size)
-    buff_size = buff_x_chunk;
-
-  d->d2_chunk=(complex_t *) malloc(sizeof(complex_t)*buff_size);
-  d->d3_chunk=(complex_t *) malloc(sizeof(complex_t)*buff_size);
-}
-
-// Use MPI_Dims_create to create a 3D decomposition, or use a user-provided decomposition
-// if it is appropriate (it has the required number of processors)
-void Custom3D_Dims_create(const int Ndims[], int nproc, int ndims, int dims[])
-{
-
-  int check = 1;
-  for(int i=0; i<ndims; i++) check *= Ndims[i];
-
-  if(check == nproc) {
-    for(int i=0; i<ndims; i++) dims[i] = Ndims[i];
-  }
-  else {
-    MPI_Dims_create(nproc, ndims, dims);
-  }
-
-}
-
-// create 1-, 2- and 3-d cartesian data distributions with explicitly
-// provided dimension lists
-void distribution_init_explicit(MPI_Comm comm,
-                                const int n[],
-                                int nproc_1d[],
-                                int nproc_2d_x[],
-                                int nproc_2d_y[],
-                                int nproc_2d_z[],
-                                int nproc_3d[],
-                                distribution_t *d,
-                                bool debug)
-{
-  d->parent = comm;
-
-  int nproc;
-  int self;
-  int ndim = 3;
-  int period[3];
-
-  MPI_Comm_rank(comm, &self);
-  MPI_Comm_size(comm, &nproc);
-
-  if (!self && verbosity() > 2) printf("Initializing redistribution using a %s layout on %d ranks.\n",
-#ifdef PENCIL
-                    "pencil"
-#else
-                    "slab"
-#endif
-                    ,nproc);
-
-  d->debug = debug;
-  for (int i = 0; i < 3; ++i)
-    d->n[i] = n[i];
-
-  // check supplied dimension lists are valid
-  assert(nproc_1d[0] == nproc);
-  assert(nproc_1d[1] == 1);
-  assert(nproc_1d[2] == 1);
-
-  assert(nproc_2d_x[1] * nproc_2d_x[2] == nproc);
-  assert(nproc_2d_x[0] == 1);
-
-  assert(nproc_2d_y[0] * nproc_2d_y[2] == nproc);
-  assert(nproc_2d_y[1] == 1);
-
-  assert(nproc_2d_z[0] * nproc_2d_z[1] == nproc);
-  assert(nproc_2d_z[2] == 1);
-
-  assert(nproc_3d[0] * nproc_3d[1] * nproc_3d[2]== nproc);
-
-  // set up process grid with 1d decomposition (SLABs)
-  period[0] = period[1] = period[2] = 1;
-  MPI_Cart_create(comm, ndim, nproc_1d, period, 0, &d->process_topology_1.cart);
-  MPI_Cart_get(d->process_topology_1.cart, ndim, d->process_topology_1.nproc, d->process_topology_1.period, d->process_topology_1.self);
-  d->process_topology_1.n[0] = n[0] / d->process_topology_1.nproc[0];
-  d->process_topology_1.n[1] = n[1] / d->process_topology_1.nproc[1];
-  d->process_topology_1.n[2] = n[2] / d->process_topology_1.nproc[2];
-  if (d->debug && 0 == self && verbosity() > 2) {
-    fprintf(stderr, "Process grids:\n");
-    fprintf(stderr, "  1d: ");
-    for (int i = 0; i < ndim; ++i) {
-      fprintf(stderr, "%d%s", d->process_topology_1.nproc[i], separator(i, ndim));
-    }
-    fprintf(stderr, "\n");
-  }
-
-  // set up process grid with 3d decomposition (CUBE)
-  period[0] = period[1] = period[2] = 1;
-  MPI_Cart_create(comm, ndim, nproc_3d, period, 0, &d->process_topology_3.cart);
-  Coord_cube(self,d->process_topology_3.self,d);
-  if (d->debug && 0 == self && verbosity() > 2) {
-    fprintf(stderr, "  3d: ");
-    for (int i = 0; i < ndim; ++i) {
-      fprintf(stderr, "%d%s", d->process_topology_3.nproc[i], separator(i, ndim));
-    }
-    fprintf(stderr, "\n");
-  }
-  if(debug){
-    int prev_coord[3];
-    prev_coord[0]=d->process_topology_3.self[0];
-    prev_coord[1]=d->process_topology_3.self[1];
-    prev_coord[2]=d->process_topology_3.self[2];
-    MPI_Cart_get(d->process_topology_3.cart, ndim, d->process_topology_3.nproc, d->process_topology_3.period, d->process_topology_3.self);
-    for(int i=0; i < 3; i++){
-      if(prev_coord[i] != d->process_topology_3.self[i])abort();//Cube coordinates calculated wrong!
-    }
-  }
-  d->process_topology_3.n[0] = n[0] / d->process_topology_3.nproc[0];
-  d->process_topology_3.n[1] = n[1] / d->process_topology_3.nproc[1];
-  d->process_topology_3.n[2] = n[2] / d->process_topology_3.nproc[2];
-
-  // set up process grid with 2d_x decomposition (X dim Pencils)
-  period[0] = period[1] = period[2] = 1;
-  MPI_Cart_create(comm, ndim, nproc_2d_x, period, 0, &d->process_topology_2_x.cart);
-  d->process_topology_2_x.nproc[0]=nproc_2d_x[0];
-  d->process_topology_2_x.nproc[1]=nproc_2d_x[1];
-  d->process_topology_2_x.nproc[2]=nproc_2d_x[2];
-  d->process_topology_2_x.n[0] = n[0] / d->process_topology_2_x.nproc[0];
-  d->process_topology_2_x.n[1] = n[1] / d->process_topology_2_x.nproc[1];
-  d->process_topology_2_x.n[2] = n[2] / d->process_topology_2_x.nproc[2];
-
-  bool check_x_dims=((d->process_topology_3.n[2]) % (d->process_topology_2_x.n[2]) == 0) && ((d->process_topology_3.n[1]) % (d->process_topology_2_x.n[1]) == 0) && (n[0] % (d->process_topology_2_x.nproc[2]) == 0) && (n[0] % (d->process_topology_2_x.nproc[0]) == 0);
-  if(!check_x_dims && debug && (self==0)){
-    FILE * outfile;
-    outfile= fopen("error.data","a");
-    fprintf(outfile,"X DIMS FAILS:(%d,%d,%d) (%d,%d,%d) \n",d->process_topology_2_x.nproc[0],d->process_topology_2_x.nproc[1],d->process_topology_2_x.nproc[2], d->process_topology_3.nproc[0],d->process_topology_3.nproc[1],d->process_topology_3.nproc[2]);
-  }
-  assert(check_x_dims);//if this happends, it is because the dimensions were chosen incorrectly. Either to many processors for the number of points in one dimension (could not do at least 1 point per processor), or the methods above could
-  //not make a distribution of pencils that fit in the cubiods, which would happen if the user gave numbers that wouldent work (we require the number of processors in each dimension of the cuboid must be modulo the number of points
-  //in that dimension, otherwise, this error will happen).
-  Coord_x_pencils(self,d->process_topology_2_x.self,d);
-
-  if (d->debug && 0 == self && verbosity() > 2) {
-    fprintf(stderr, "  2d_x: ");
-    for (int i = 0; i < ndim; ++i) {
-      fprintf(stderr, "%d%s", d->process_topology_2_x.nproc[i], separator(i, ndim));
-    }
-    fprintf(stderr, "\n");
-  }
-
-  // set up process grid with 2d_y decomposition (Y dim Pencils)
-  period[0] = period[1] = period[2] = 1;
-  MPI_Cart_create(comm, ndim, nproc_2d_y, period, 0, &d->process_topology_2_y.cart);
-  d->process_topology_2_y.nproc[0]=nproc_2d_y[0];
-  d->process_topology_2_y.nproc[1]=nproc_2d_y[1];
-  d->process_topology_2_y.nproc[2]=nproc_2d_y[2];
-  d->process_topology_2_y.n[0] = n[0] / d->process_topology_2_y.nproc[0];
-  d->process_topology_2_y.n[1] = n[1] / d->process_topology_2_y.nproc[1];
-  d->process_topology_2_y.n[2] = n[2] / d->process_topology_2_y.nproc[2];
-
-
-  bool check_y_dims=(((d->process_topology_3.n[2]) % (d->process_topology_2_y.n[2]) == 0) && ((d->process_topology_3.n[0]) % (d->process_topology_2_y.n[0]) == 0) && (n[0] % (d->process_topology_2_y.nproc[2]) == 0) && (n[0] % (d->process_topology_2_y.nproc[0]) == 0));
-  if(!check_y_dims && debug && (self==0)){
-    FILE * outfile;
-    outfile= fopen("error.data","a");
-    fprintf(outfile,"Y DIMS FAILS:(%d,%d,%d) (%d,%d,%d) \n",d->process_topology_2_y.nproc[0],d->process_topology_2_y.nproc[1],d->process_topology_2_y.nproc[2], d->process_topology_3.nproc[0],d->process_topology_3.nproc[1],d->process_topology_3.nproc[2]);
-  }
-  assert(check_y_dims);//if this happends, it is because the dimensions were chosen incorrectly. Either to many processors for the number of points in one dimension (could not do at least 1 point per processor), or the methods above could
-  //not make a distribution of pencils that fit in the cubiods, which would happen if the user gave numbers that wouldent work (we require the number of processors in each dimension of the cuboid must be modulo the number of points
-  //in that dimension, otherwise, this error will happen).
-  Coord_y_pencils(self,d->process_topology_2_y.self,d);
-
-  if (d->debug && 0 == self && verbosity() > 2) {
-    fprintf(stderr, "  2d_y: ");
-    for (int i = 0; i < ndim; ++i) {
-      fprintf(stderr, "%d%s", d->process_topology_2_y.nproc[i], separator(i, ndim));
-    }
-    fprintf(stderr, "\n");
-  }
-
-  // set up process grid with 2d_z decomposition (Z dim pencils)
-  period[0] = period[1] = period[2] = 1;
-  MPI_Cart_create(comm, ndim, nproc_2d_z, period, 0, &d->process_topology_2_z.cart);
-  d->process_topology_2_z.nproc[0]=nproc_2d_z[0];
-  d->process_topology_2_z.nproc[1]=nproc_2d_z[1];
-  d->process_topology_2_z.nproc[2]=nproc_2d_z[2];
-  d->process_topology_2_z.n[0] = n[0] / d->process_topology_2_z.nproc[0];
-  d->process_topology_2_z.n[1] = n[1] / d->process_topology_2_z.nproc[1];
-  d->process_topology_2_z.n[2] = n[2] / d->process_topology_2_z.nproc[2];
-
-
-  bool check_z_dims=((d->process_topology_3.n[0]) % (d->process_topology_2_z.n[0]) == 0) && ((d->process_topology_3.n[1]) % (d->process_topology_2_z.n[1]) == 0) && (n[0] % (d->process_topology_2_z.nproc[0]) == 0) && (n[0] % (d->process_topology_2_z.nproc[1]) == 0);
-  if(!check_z_dims && debug && (self==0)){
-    FILE * outfile;
-    outfile= fopen("error.data","a");
-    fprintf(outfile,"Z DIMS FAILS:(%d,%d,%d) (%d,%d,%d) \n",d->process_topology_2_z.nproc[0],d->process_topology_2_z.nproc[1],d->process_topology_2_z.nproc[2], d->process_topology_3.nproc[0],d->process_topology_3.nproc[1],d->process_topology_3.nproc[2]);
-  }
-  assert(check_z_dims);//if this happends, it is because the dimensions were chosen incorrectly. Either to many processors for the number of points in one dimension (could not do at least 1 point per processor), or the methods above could
-  //not make a distribution of pencils that fit in the cubiods, which would happen if the user gave numbers that wouldent work (we require the number of processors in each dimension of the cuboid must be modulo the number of points
-  //in that dimension, otherwise, this error will happen).
-  Coord_z_pencils(self,d->process_topology_2_z.self,d);
-
-  if (d->debug && 0 == self && verbosity() > 2) {
-    fprintf(stderr, "  2d_z: ");
-    for (int i = 0; i < ndim; ++i) {
-      fprintf(stderr, "%d%s", d->process_topology_2_z.nproc[i], separator(i, ndim));
-    }
-    fprintf(stderr, "\n");
-  }
-  //assert that all pencils fit in the cuboid.
-
-  if (d->debug) {
-    int myrank_cube;
-    Rank_cube(&myrank_cube,d->process_topology_3.self,d);
-    int myrank_z;
-    Rank_z_pencils(&myrank_z,d->process_topology_2_z.self,d);
-    int myrank_y;
-    Rank_y_pencils(&myrank_y,d->process_topology_2_y.self,d);
-    int myrank_x;
-    Rank_x_pencils(&myrank_x,d->process_topology_2_x.self,d);
-    if(myrank_z != self || myrank_y != self || myrank_x != self || myrank_cube != self)abort(); //means ranks were calculated wrong.
-    if (0 == self && verbosity() > 2) {
-      fprintf(stderr, "Process map:\n");
-    }
-    for (int p = 0; p < nproc; ++p) {
-      MPI_Barrier(comm);
-      if (p == self && verbosity() > 2) {
-        fprintf(stderr,
-                "  %d: 1d = (%d, %d, %d), 2d_x = (%d, %d, %d) rank (%d), 2d_y = (%d, %d, %d) rank (%d), 2d_z = (%d, %d, %d) rank (%d), 3d = (%d, %d, %d) rank (%d).\n",
-                self,
-                d->process_topology_1.self[0], d->process_topology_1.self[1], d->process_topology_1.self[2],
-                d->process_topology_2_x.self[0], d->process_topology_2_x.self[1], d->process_topology_2_x.self[2],myrank_x,
-                d->process_topology_2_y.self[0], d->process_topology_2_y.self[1], d->process_topology_2_y.self[2],myrank_y,
-                d->process_topology_2_z.self[0], d->process_topology_2_z.self[1], d->process_topology_2_z.self[2],myrank_z,
-                d->process_topology_3.self[0], d->process_topology_3.self[1], d->process_topology_3.self[2],myrank_cube);
-      }
-    }
-  }
-}
-
-
-
-
-///
-// clean up the data distribution
-//   d    distribution descriptor
-///
-void distribution_fini(distribution_t *d)
-{
-  MPI_Comm_free(&d->process_topology_1.cart);
-  MPI_Comm_free(&d->process_topology_2_x.cart);
-  MPI_Comm_free(&d->process_topology_2_y.cart);
-  MPI_Comm_free(&d->process_topology_2_z.cart);
-  MPI_Comm_free(&d->process_topology_3.cart);
-  free(d->d2_chunk);
-  free(d->d3_chunk);
-  free(d->gridmap);
-  free(d->rankmap);
-}
-
-
-///
-// check that the dimensions, n, of an array are commensurate with the
-// process grids of this distribution
-//   n    (global) grid dimensions
-//   d    distribution descriptor
-///
-void distribution_assert_commensurate(distribution_t *d)
-{
-  for (int i = 0; i < 3; ++i) {
-#if defined(PENCIL)
-    assert(0 == (d->n[i] % d->process_topology_2_x.nproc[i]));
-    assert(0 == (d->n[i] % d->process_topology_2_y.nproc[i]));
-    assert(0 == (d->n[i] % d->process_topology_2_z.nproc[i]));
-#else
-    assert(0 == (d->n[i] % d->process_topology_1.nproc[i]));
-#endif
-    assert(0 == (d->n[i] % d->process_topology_3.nproc[i]));
-  }
-}
-
-
-// forward declarations
-static void redistribute(const complex_t *, complex_t *, distribution_t *, int);
-static void redistribute_2_and_3(const complex_t *, complex_t *, distribution_t *, int, int);
-static void redistribute_slab(const complex_t *, complex_t *, distribution_t *, int);
-
-
-///
-// redistribute a 1-d to a 3-d data distribution
-//   a    input
-//   b    ouput
-//   d    distribution descriptor
-///
-void distribution_1_to_3(const complex_t *a,
-                         complex_t *b,
-                         distribution_t *d)
-{
-  if (USE_SLAB_WORKAROUND) {
-    redistribute_slab(a, b, d, REDISTRIBUTE_1_TO_3);
-  } else {
-    redistribute(a, b, d, REDISTRIBUTE_1_TO_3);
-  }
-}
-
-
-///
-// redistribute a 3-d to a 1-d data distribution
-//   a    input
-//   b    ouput
-//   d    distribution descriptor
-///
-void distribution_3_to_1(const complex_t *a,
-                         complex_t *b,
-                         distribution_t *d)
-{
-  if (USE_SLAB_WORKAROUND) {
-    redistribute_slab(a, b, d, REDISTRIBUTE_3_TO_1);
-  } else {
-    redistribute(a, b, d, REDISTRIBUTE_3_TO_1);
-  }
-}
-
-
-///
-// redistribute between 1- and 3-d distributions.
-//   a    input
-//   b    ouput
-//   d    distribution descriptor
-//   dir  direction of redistribution
-//
-// This actually does the work.
-///
-static void redistribute(const complex_t *a,
-                         complex_t *b,
-                         distribution_t *d,
-                         int direction)
-{
-  int remaining_dim[3];
-  MPI_Comm subgrid_cart;
-  int subgrid_self;
-  int subgrid_nproc;
-
-  // exchange data with processes in a 2-d slab of 3-d subdomains
-
-  remaining_dim[0] = 0;
-  remaining_dim[1] = 1;
-  remaining_dim[2] = 1;
-  MPI_Cart_sub(d->process_topology_3.cart, remaining_dim, &subgrid_cart);
-  MPI_Comm_rank(subgrid_cart, &subgrid_self);
-  MPI_Comm_size(subgrid_cart, &subgrid_nproc);
-
-  for (int p = 0; p < subgrid_nproc; ++p) {
-    int d1_peer = (subgrid_self + p) % subgrid_nproc;
-    int d3_peer = (subgrid_self - p + subgrid_nproc) % subgrid_nproc;
-    int coord[2];
-    int sizes[3];
-    int subsizes[3];
-    int starts[3];
-    MPI_Datatype d1_type;
-    MPI_Datatype d3_type;
-
-    MPI_Cart_coords(subgrid_cart, d1_peer, 2, coord);
-    if (0) {
-      int self;
-      MPI_Comm_rank(MPI_COMM_WORLD, &self);
-      fprintf(stderr, "%d: d1_peer, d1_coord, d3_peer = %d, (%d, %d), %d\n",
-              self, d1_peer, coord[0], coord[1], d3_peer);
-    }
-
-    // create dataypes representing a subarray in the 1- and 3-d distributions
-
-    sizes[0] = d->process_topology_1.n[0];
-    sizes[1] = d->process_topology_1.n[1];
-    sizes[2] = d->process_topology_1.n[2];
-    subsizes[0] = d->process_topology_1.n[0];
-    subsizes[1] = d->process_topology_3.n[1];
-    subsizes[2] = d->process_topology_3.n[2];
-    starts[0] = 0;
-    starts[1] = coord[0] * d->process_topology_3.n[1];
-    starts[2] = coord[1] * d->process_topology_3.n[2];
-    MPI_Type_create_subarray(3, sizes, subsizes, starts, MPI_ORDER_C, MPI_DOUBLE_COMPLEX, &d1_type);
-    MPI_Type_commit(&d1_type);
-
-    sizes[0] = d->process_topology_3.n[0];
-    sizes[1] = d->process_topology_3.n[1];
-    sizes[2] = d->process_topology_3.n[2];
-    subsizes[0] = d->process_topology_1.n[0];
-    subsizes[1] = d->process_topology_3.n[1];
-    subsizes[2] = d->process_topology_3.n[2];
-    starts[0] = d3_peer * d->process_topology_1.n[0];
-    starts[1] = 0;
-    starts[2] = 0;
-    MPI_Type_create_subarray(3, sizes, subsizes, starts, MPI_ORDER_C, MPI_DOUBLE_COMPLEX, &d3_type);
-    MPI_Type_commit(&d3_type);
-
-    // exchange data
-
-    if (direction == REDISTRIBUTE_3_TO_1) {
-      MPI_Sendrecv((char *) a, 1, d3_type, d3_peer, 0,
-                   (char *) b, 1, d1_type, d1_peer, 0,
-                   subgrid_cart, MPI_STATUS_IGNORE);
-    } else if (direction == REDISTRIBUTE_1_TO_3) {
-      MPI_Sendrecv((char *) a, 1, d1_type, d1_peer, 0,
-                   (char *) b, 1, d3_type, d3_peer, 0,
-                   subgrid_cart, MPI_STATUS_IGNORE);
-    } else {
-      abort();
-    }
-
-    // free datatypes
-
-    MPI_Type_free(&d1_type);
-    MPI_Type_free(&d3_type);
-  }
-
-  MPI_Comm_free(&subgrid_cart);
-}
-
-
-///
-// redistribute between 1- and 3-d distributions.
-//   a    input
-//   b    ouput
-//   d    distribution descriptor
-//   dir  direction of redistribution
-//
-// This actually does the work, using slabs of subarrays to work
-// around an issue in Open MPI with large non-contiguous datatypes.
-///
-static void redistribute_slab(const complex_t *a,
-                              complex_t *b,
-                              distribution_t *d,
-                              int direction)
-{
-  int remaining_dim[3];
-  MPI_Comm subgrid_cart;
-  int subgrid_self;
-  int subgrid_nproc;
-  ptrdiff_t d1_slice = d->process_topology_1.n[1] * d->process_topology_1.n[2] * sizeof(complex_t);
-  ptrdiff_t d3_slice = d->process_topology_3.n[1] * d->process_topology_3.n[2] * sizeof(complex_t);
-
-  // exchange data with processes in a 2-d slab of 3-d subdomains
-
-  remaining_dim[0] = 0;
-  remaining_dim[1] = 1;
-  remaining_dim[2] = 1;
-  MPI_Cart_sub(d->process_topology_3.cart, remaining_dim, &subgrid_cart);
-  MPI_Comm_rank(subgrid_cart, &subgrid_self);
-  MPI_Comm_size(subgrid_cart, &subgrid_nproc);
-
-  for (int p = 0; p < subgrid_nproc; ++p) {
-    int coord[2];
-    int d1_peer = (subgrid_self + p) % subgrid_nproc;
-    int d3_peer = (subgrid_self - p + subgrid_nproc) % subgrid_nproc;
-
-    MPI_Cart_coords(subgrid_cart, d1_peer, 2, coord);
-    if (0) {
-      int self;
-      MPI_Comm_rank(MPI_COMM_WORLD, &self);
-      fprintf(stderr, "%d: d1_peer, d1_coord, d3_peer = %d, (%d, %d), %d\n",
-              self, d1_peer, coord[0], coord[1], d3_peer);
-    }
-
-    for (int slice = 0; slice < d->process_topology_1.n[0]; ++slice) {
-      int sizes[2];
-      int subsizes[2];
-      int starts[2];
-      MPI_Datatype d1_type;
-      MPI_Datatype d3_type;
-      ptrdiff_t d1_offset = slice * d1_slice;
-      ptrdiff_t d3_offset = (slice + d3_peer * d->process_topology_1.n[0]) * d3_slice;
-
-      // create subarray dataypes representing the slice subarray in the 1- and 3-d distributions
-
-      sizes[0] = d->process_topology_1.n[1];
-      sizes[1] = d->process_topology_1.n[2];
-      subsizes[0] = d->process_topology_3.n[1];
-      subsizes[1] = d->process_topology_3.n[2];
-      starts[0] = coord[0] * d->process_topology_3.n[1];
-      starts[1] = coord[1] * d->process_topology_3.n[2];
-      MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_DOUBLE_COMPLEX, &d1_type);
-      MPI_Type_commit(&d1_type);
-
-      MPI_Type_contiguous(d->process_topology_3.n[1] * d->process_topology_3.n[2],
-                          MPI_DOUBLE_COMPLEX,
-                          &d3_type);
-      MPI_Type_commit(&d3_type);
-
-      // exchange data
-
-      if (direction == REDISTRIBUTE_3_TO_1) {
-        MPI_Sendrecv((char *) a + d3_offset, 1, d3_type, d3_peer, 0,
-                     (char *) b + d1_offset, 1, d1_type, d1_peer, 0,
-                     subgrid_cart, MPI_STATUS_IGNORE);
-      } else if (direction == REDISTRIBUTE_1_TO_3) {
-        MPI_Sendrecv((char *) a + d1_offset, 1, d1_type, d1_peer, 0,
-                     (char *) b + d3_offset, 1, d3_type, d3_peer, 0,
-                     subgrid_cart, MPI_STATUS_IGNORE);
-      } else {
-        abort();
-      }
-
-      // free datatypes
-
-      MPI_Type_free(&d1_type);
-      MPI_Type_free(&d3_type);
-    }
-  }
-
-  MPI_Comm_free(&subgrid_cart);
-}
-
-
-///
-// redistribute a 2-d to a 3-d data distribution
-//   a    input
-//   b    ouput
-//   d    distribution descriptor
-///
-void distribution_2_to_3(const complex_t *a,
-                         complex_t *b,
-                         distribution_t *d, int z_dim)
-{
-  redistribute_2_and_3(a, b, d, REDISTRIBUTE_2_TO_3, z_dim);
-}
-
-
-///
-// redistribute a 3-d to a 2-d data distribution
-//   a    input
-//   b    ouput
-//   d    distribution descriptor
-///
-void distribution_3_to_2(const complex_t *a,
-                         complex_t *b,
-                         distribution_t *d, int z_dim)
-{
-  redistribute_2_and_3(a, b, d, REDISTRIBUTE_3_TO_2, z_dim);
-}
-
-
-///
-// redistribute between 2- and 3-d distributions.
-//   a    input
-//   b    ouput
-//   d    distribution descriptor
-//   dir  direction of redistribution
-//
-// This actually does the work.
-///
-static void redistribute_2_and_3(const complex_t *a,
-                                 complex_t *b,
-                                 distribution_t *d,
-                                 int direction,
-                                 int z_dim)
-{
-  int self = d->process_topology_1.self[0];
-  int npeers;
-  int me=0;//determines which processor to print
-  bool print_me=false; //prints info on processor whose rank = me.
-  bool print_mess=false;//prints communication sends and receives without actually doing the comms(intended to debug comm hangs).
-  bool print_result=false /*true*/;//prints a line in a file called "passed.data" which happends if the code runs completely.
-  assert(z_dim==0||z_dim==1||z_dim==2);
-  int x_dim=0,y_dim=0;
-  //x_dim, y_dim and z_dim are the dimensions of the x,y,z axis of the pencil with respect to the original axis(where index 2 is into the grid, 1 is vertical translation and 0 is horizontal).
-  switch(z_dim){
-    case 0: x_dim=1; y_dim=2;
-      if((self == me) && print_me && verbosity() > 2) fprintf(stderr, "DOING X PENCILS!...\n"); break;
-    case 1: x_dim=2; y_dim=0;
-      if((self == me && print_me && verbosity() > 2)) fprintf(stderr, "DOING Y PENCILS!...\n"); break;
-    case 2: x_dim=0; y_dim=1;
-      if((self == me && print_me && verbosity() > 2)) fprintf(stderr, "DOING Z PENCILS!...\n"); break;
-    default: assert("incorrect inputted dimension");
-  }
-
-  // assuming dimensions are all commensurate, then the number of
-  // peers to exchange with is the number of processes in the z_dimension
-  // direction in the 3d distribution
-  npeers = d->process_topology_3.nproc[z_dim]; //picked last direction (lets say into the grid)
-
-  // book-keeping for the processor translation in the x-y plane
-  int p0 = 0;
-  int p1 = 0;
-  int p1max = 0;
-
-  MPI_Request req1=MPI_REQUEST_NULL;
-  MPI_Request req2=MPI_REQUEST_NULL;
-
-  int pencil_sizes[3];
-  int cube_sizes[3];
-  int subsizes[3];
-
-
-  cube_sizes[x_dim] = d->process_topology_3.n[x_dim];
-  cube_sizes[y_dim] = d->process_topology_3.n[y_dim];
-  cube_sizes[z_dim] = d->process_topology_3.n[z_dim];
-
-  //set variables used to calculate the subarrays of each pencil and cube.
-  switch(z_dim){
-    case 0:
-      p1max = d->process_topology_2_x.nproc[x_dim] / d->process_topology_3.nproc[x_dim] - 1;
-      //find out the size of the chunk you need to use (stored in subsizes), and set sizes to the local size of the pencil.
-      //The x and y dimensions of the subchunck will be the dimensions of the pencil (since the code asserts at the beginning that all pencils fit inside the 3d cuboid.)
-      //The z dimension will be the dimension of the cuboid, since this will always be <= to the z_dim of the pencil.
-      pencil_sizes[x_dim] = d->process_topology_2_x.n[x_dim];
-      pencil_sizes[y_dim] = d->process_topology_2_x.n[y_dim];
-      pencil_sizes[z_dim] = d->process_topology_2_x.n[z_dim];
-      subsizes[x_dim] = d->process_topology_2_x.n[x_dim];
-      subsizes[y_dim] = d->process_topology_2_x.n[y_dim];
-      break;
-    case 1:
-      p1max = d->process_topology_2_y.nproc[x_dim] / d->process_topology_3.nproc[x_dim] - 1;
-      pencil_sizes[x_dim] = d->process_topology_2_y.n[x_dim];
-      pencil_sizes[y_dim] = d->process_topology_2_y.n[y_dim];
-      pencil_sizes[z_dim] = d->process_topology_2_y.n[z_dim];
-      subsizes[x_dim] = d->process_topology_2_y.n[x_dim];
-      subsizes[y_dim] = d->process_topology_2_y.n[y_dim];
-      break;
-    case 2:
-      p1max = d->process_topology_2_z.nproc[y_dim] / d->process_topology_3.nproc[y_dim] - 1;
-      pencil_sizes[x_dim] = d->process_topology_2_z.n[x_dim];
-      pencil_sizes[y_dim] = d->process_topology_2_z.n[y_dim];
-      pencil_sizes[z_dim] = d->process_topology_2_z.n[z_dim];
-      subsizes[x_dim] = d->process_topology_2_z.n[x_dim];
-      subsizes[y_dim] = d->process_topology_2_z.n[y_dim];
-      break;
-  }
-  subsizes[z_dim] = d->process_topology_3.n[z_dim];
-  int chunk_size=subsizes[0]*subsizes[1]*subsizes[2];//size of data chunks that will be communicated between pencil and cube distributions.
-
-  //set variables that will be used to find pencils chunks
-  int pencil_dims[3]={0,0,0};// size of entire pencil in its local coord system
-  int local_sizes[3]={0,0,0}; //size of chunk in its local coord system.
-  if(z_dim==2){
-    local_sizes[0]=subsizes[0];
-    local_sizes[1]=subsizes[1];
-    local_sizes[2]=subsizes[2];
-    pencil_dims[0]=d->process_topology_2_z.n[0];//pencil dims in grid coord system (where index 2 is the z direction).
-    pencil_dims[1]=d->process_topology_2_z.n[1];
-    pencil_dims[2]=d->process_topology_2_z.n[2];
-  }
-  else if(z_dim==1){
-
-    local_sizes[0]=subsizes[0];
-    local_sizes[1]=subsizes[2];
-    local_sizes[2]=subsizes[1];
-    pencil_dims[0]=d->process_topology_2_y.n[0];
-    pencil_dims[1]=d->process_topology_2_y.n[2];
-    pencil_dims[2]=d->process_topology_2_y.n[1];
-  }
-  else if(z_dim==0){
-    local_sizes[0]=subsizes[2];
-    local_sizes[1]=subsizes[1];
-    local_sizes[2]=subsizes[0];
-    pencil_dims[0]=d->process_topology_2_x.n[2];
-    pencil_dims[1]=d->process_topology_2_x.n[1];
-    pencil_dims[2]=d->process_topology_2_x.n[0];
-  }
-
-  if((self == me) && print_me && verbosity() > 2) fprintf(stderr, "%d, %d, %d, %d Dimensions!...\n", x_dim,y_dim,z_dim, p1max);
-
-  // communicate with our peers
-  for (int p = 0; p < npeers; ++p) {
-    if((self == me) && print_me && verbosity() > 2) fprintf(stderr, "%d, %d, %d Made it beg-for!...\n", self,p, npeers);
-
-    int d2_coord[3];
-    int d2_peer;
-    int d2_peer_coord[3];
-    int d3_coord[3];
-    int d3_peer;
-    int d3_peer_coord[3];
-    int recv_peer;
-    int send_peer;
-    int d2_array_start[3];
-    int d3_array_start[3];
-    //turn the processor coordinate into one specified by the number of data points in each dimension.
-    for (int i = 0; i < 3; ++i) {
-      switch(z_dim){
-        case 0: d2_coord[i]  = d->process_topology_2_x.self[i] * d->process_topology_2_x.n[i]; break;
-        case 1: d2_coord[i]  = d->process_topology_2_y.self[i] * d->process_topology_2_y.n[i]; break;
-        case 2: d2_coord[i]  = d->process_topology_2_z.self[i] * d->process_topology_2_z.n[i]; break;
-      }
-    }
-    //over every iteration of the loop, transverse down the pencil (since it will be divided in chunks whose coordinates will only differ in the z_dimension.
-    d2_coord[z_dim] += p * d->process_topology_3.n[z_dim];
-
-
-    if((self == me) && print_me && verbosity() > 2) fprintf(stderr, "%d, %d, %d Coord!...\n", d2_coord[0],d2_coord[1],d2_coord[2]);
-
-
-    //d2_array_start is the starting index of the chunk in the pencils local coordinates.
-    d2_array_start[0] = d2_coord[x_dim] % pencil_sizes[x_dim];
-    d2_array_start[1] = d2_coord[y_dim] % pencil_sizes[y_dim];
-    d2_array_start[2] = d2_coord[z_dim] % pencil_sizes[z_dim];
-
-    if ((DEBUG_CONDITION || ((self== me) && print_me)) && verbosity() > 2) {
-      fprintf(stderr,
-              "%d: pencil_sizes=(%d,%d,%d), cube_sizes=(%d,%d,%d), subsizes=(%d,%d,%d),d2_coord=(%d,%d,%d), d2_array_start=(%d,%d,%d) \n",
-              self,
-              pencil_sizes[0], pencil_sizes[1], pencil_sizes[2],
-              cube_sizes[0], cube_sizes[1], cube_sizes[2],
-              subsizes[0], subsizes[1], subsizes[2],
-              d2_coord[0], d2_coord[1], d2_coord[2],
-              d2_array_start[0],d2_array_start[1],d2_array_start[2]);
-    }
-
-
-    //if making cuboids from pencils, right here we need to fill the d2_chunk array with the data that later needs to be sent to a cuboid.
-        //The array is a chunk of the pencil and is why we needed to calculate the starting index for the array in the local coordinates.
-    if(direction == REDISTRIBUTE_2_TO_3){
-      int64_t ch_indx=0;
-      int dims_size=pencil_dims[0]*pencil_dims[1]*pencil_dims[2];
-      for(int i0=d2_array_start[0];i0<d2_array_start[0]+local_sizes[0];i0++){
-        for(int i1=d2_array_start[1];i1<d2_array_start[1]+local_sizes[1];i1++){
-          for(int i2=d2_array_start[2];i2<d2_array_start[2]+local_sizes[2];i2++){
-            int64_t local_indx=pencil_dims[2]*(pencil_dims[1]*i0+i1) + i2;
-            assert(local_indx < dims_size);
-            assert(ch_indx <chunk_size && ch_indx >= 0 && local_indx>=0 && local_indx < dims_size);
-            d->d2_chunk[ch_indx]=a[local_indx];
-            ch_indx++;
-          }
-        }
-      }
-
-      if((self == me) && print_me && verbosity() > 2) fprintf(stderr, "%d, %d, %d, pencil_dims!...\n", pencil_dims[0],pencil_dims[1],pencil_dims[2]);
-    }
-
-    // what peer in the 3d distribution owns this subarray?
-    for (int i = 0; i < 3; ++i) {
-      d3_peer_coord[i] = d2_coord[i] / d->process_topology_3.n[i];
-    }
-    if((self == me) && print_me && verbosity() > 2) fprintf(stderr, "%d, %d, %d Cube that hits pencil coord!...\n",d3_peer_coord[0],d3_peer_coord[1],d3_peer_coord[2]);
-    //find the rank of this peer.
-    switch(z_dim){
-      case 0: MPI_Cart_rank(d->process_topology_3.cart, d3_peer_coord, &d3_peer); break;
-      case 1: MPI_Cart_rank(d->process_topology_3.cart, d3_peer_coord, &d3_peer); break;
-      case 2: MPI_Cart_rank(d->process_topology_3.cart, d3_peer_coord, &d3_peer); break;
-    }
-    if((self == me) && print_me && verbosity() > 2) fprintf(stderr, "%d, %d, Made it half way!...\n", self,p);
-    if((self == me) && print_me && verbosity() > 2) fprintf(stderr, "%d, %d, PEER!...\n", self,d3_peer);
-
-    //By here in the for loop, we have broken the pencil into a chunk and found which cuboid it resides; over every iteration, the for-loop will break up the pencil in the z_dimension.
-    //From here on we do the opposite. We divide the cuboid into chunks (that are the same size as the ones in the pencil), and determine which pencils own these chunks.
-
-
-    // what is the coordinate of my pth subarray in the 3d distribution?
-    for (int i = 0; i < 3; ++i) {
-      switch(z_dim){
-        case 0: d3_coord[i]  = d->process_topology_3.self[i] * d->process_topology_3.n[i]; break;
-        case 1: d3_coord[i]  = d->process_topology_3.self[i] * d->process_topology_3.n[i]; break;
-        case 2: d3_coord[i]  = d->process_topology_3.self[i] * d->process_topology_3.n[i]; break;
-      }
-    }
-
-    //now unlike above, we dont need to iterate in the z_dim, because for each processor its subarrays inward dimension is already set by the cubes z_dim.
-    //Instead, each iteration of the for-loop will look at different subarrays whose locations in the cuboid differ by local x and y coords.
-
-    switch(z_dim){
-      //p1 is a place holder for the first translation . The outside for-loop will increment the coord in that direction, say x_dim,
-      //and keep doing so until all of the chunks in that dimension are calculated. Then it will increment p0 in the other dimension (in this example the y)
-      //and repeat until all of the subchunks in the x and y dimensions are calculated.
-      //are found.
-      //Note: p0 and p1 will increment different dimensions depending of whether it is using the x y or z pencils, this is because the set up of the coordinate system for each
-      //pencil is different and to ensure that no communications hang up later, the directions coded below are unique for each type of pencil.
-      case 0:
-        d3_coord[y_dim] += p0 * d->process_topology_2_x.n[y_dim];
-        d3_coord[x_dim] += p1 * d->process_topology_2_x.n[x_dim];
-        break;
-      case 1:
-        d3_coord[y_dim] += p0 * d->process_topology_2_y.n[y_dim];
-        d3_coord[x_dim] += p1 * d->process_topology_2_y.n[x_dim];
-        break;
-      case 2:
-        d3_coord[x_dim] += p0 * d->process_topology_2_z.n[x_dim];
-        d3_coord[y_dim] += p1 * d->process_topology_2_z.n[y_dim];
-        break;
-    }
-    if (p1 == p1max) {
-      p0++;
-      p1 = 0;
-    } else {
-      p1++;
-    }
-    // create a dataype for my pth subrarray in the 3d distribution
-
-
-    //d3_array_start holds the starting index of the chunk in the cubes local coordinates(note the cubes local coord system is actually the same as the grids global coord system, by set up)
-
-    d3_array_start[x_dim] = d3_coord[x_dim] % cube_sizes[x_dim];
-    d3_array_start[y_dim] = d3_coord[y_dim] % cube_sizes[y_dim];
-    d3_array_start[z_dim] = d3_coord[z_dim] % cube_sizes[z_dim];
-
-    //make starting point so that it coincides with the starting point of the pencil from the pencils coordinate system. (for z_pencils nothing needs to be changed, since it already
-    //has the coordinate system of the grid, however, the x and y pencils have different starting points of the subchunk in their coord systems.)
-    if(z_dim==0 || z_dim ==1){
-      d3_array_start[2]=d3_array_start[2]+subsizes[2]-1;
-    }
-    if(print_me && (self==me) && verbosity() > 2) fprintf(stderr,"D3_array_start is (%d,%d,%d) and subsizes is (%d,%d,%d) \n",d3_array_start[0],d3_array_start[1],d3_array_start[2],subsizes[0],subsizes[1],subsizes[2]);
-
-
-    //If sending cube chunks to pencils, need to fill those chunks with data here. The chunks are filled in the order
-    //such that when the pencil receives the chunk, in its local array indexing, it assumes that the array is already
-    //filled such that it is contiguous. Therefore, complicated for-loops below fill the array in the cubes local indexing to match what the pencil will
-    //expect.
-    if(direction == REDISTRIBUTE_3_TO_2){
-      int64_t ch_indx=0;
-      int dims_size=cube_sizes[0]*cube_sizes[1]*cube_sizes[2];
-      if((self == me) && print_me && verbosity() > 2) fprintf(stderr, "%d, %d, MAKE 3D Chunk...\n", self,d3_peer);
-      switch(z_dim){
-        case 0:
-          for(int i2=d3_array_start[y_dim];i2>d3_array_start[y_dim]-subsizes[y_dim];i2--){//perhaps y_dim
-            for(int i1=d3_array_start[x_dim];i1<d3_array_start[x_dim]+subsizes[x_dim];i1++){//perhaps x_dim
-              for(int i0=d3_array_start[z_dim];i0<d3_array_start[z_dim]+subsizes[z_dim];i0++){//perhaps z_dim
-                int64_t local_indx=d->process_topology_3.n[2]*(d->process_topology_3.n[1]*i0+i1) + i2;
-                assert(local_indx < dims_size);
-                assert(ch_indx <chunk_size && ch_indx >= 0 && local_indx>=0 && local_indx < dims_size);
-                d->d3_chunk[ch_indx]=a[local_indx];
-                ch_indx++;
-              }
-            }
-          }
-          break;
-        case 1:
-          for(int i0=d3_array_start[y_dim];i0<d3_array_start[y_dim]+subsizes[y_dim];i0++){
-            for(int i2=d3_array_start[x_dim];i2>d3_array_start[x_dim]-subsizes[x_dim];i2--){
-              for(int i1=d3_array_start[z_dim];i1<d3_array_start[z_dim]+subsizes[z_dim];i1++){
-                int64_t local_indx=d->process_topology_3.n[2]*(d->process_topology_3.n[1]*i0+i1) + i2;
-                assert(local_indx < dims_size);
-                assert(ch_indx <chunk_size && ch_indx >= 0 && local_indx>=0 && local_indx < dims_size);
-                d->d3_chunk[ch_indx]=a[local_indx];
-                ch_indx++;
-              }
-            }
-          }
-
-          break;
-        case 2:
-          for(int i0=d3_array_start[x_dim];i0<d3_array_start[x_dim]+subsizes[x_dim];i0++){
-            for(int i1=d3_array_start[y_dim];i1<d3_array_start[y_dim]+subsizes[y_dim];i1++){
-              for(int i2=d3_array_start[z_dim];i2<d3_array_start[z_dim]+subsizes[z_dim];i2++){
-                int64_t local_indx=d->process_topology_3.n[2]*(d->process_topology_3.n[1]*i0+i1) + i2;
-                assert(local_indx < dims_size);
-                assert(ch_indx <chunk_size && ch_indx >= 0 && local_indx>=0 && local_indx < dims_size);
-                d->d3_chunk[ch_indx]=a[local_indx];
-                ch_indx++;
-              }
-            }
-          }
-
-          break;
-      }
-    }
-
-    if ((DEBUG_CONDITION || ((self == me) && print_me)) && verbosity() > 2) {
-      fprintf(stderr,
-              "%d: pencil_sizes=(%d,%d,%d), cube_sizes=(%d,%d,%d), subsizes=(%d,%d,%d), d3_coord=(%d,%d,%d), d3_array_start=(%d,%d,%d) \n",
-              self,
-              pencil_sizes[0], pencil_sizes[1], pencil_sizes[2],
-              cube_sizes[0], cube_sizes[1], cube_sizes[2],
-              subsizes[0], subsizes[1], subsizes[2],
-              d3_coord[0], d3_coord[1], d3_coord[2],
-              d3_array_start[0],d3_array_start[1],d3_array_start[2]);
-    }
-
-    // what peer in the 2d distribution owns this subarray?
-    for (int i = 0; i < 3; ++i) {
-      switch(z_dim){
-        case 0:
-          d2_peer_coord[i] = d3_coord[i] / d->process_topology_2_x.n[i];
-          break;
-        case 1:
-          d2_peer_coord[i] = d3_coord[i] / d->process_topology_2_y.n[i];
-          break;
-        case 2:
-          d2_peer_coord[i] = d3_coord[i] / d->process_topology_2_z.n[i];
-          break;
-      }
-    }
-    d2_peer_coord[z_dim] = 0;//since these are pencils, there is no two pencils in this direction.
-    if((self == me) && print_me && verbosity() > 2) fprintf(stderr, "%d, %d, %d PENCIL that hits chunk!...\n",d2_peer_coord[0],d2_peer_coord[1],d2_peer_coord[2]);
-    switch(z_dim){
-      //find its rank
-      case 0:
-        Rank_x_pencils(&d2_peer,d2_peer_coord,d);
-        break;
-      case 1:
-        Rank_y_pencils(&d2_peer,d2_peer_coord,d);
-        break;
-      case 2:
-        Rank_z_pencils(&d2_peer,d2_peer_coord,d);
-        break;
-    }
-    if((self == me) && print_me && verbosity() > 2) fprintf(stderr, "%d, %d, %d Made it before comm!...\n", self,p, npeers);
-
-    // record the communication to be done in a schedule. Make sure to map each grid to the correct rank
-    if (direction == REDISTRIBUTE_3_TO_2) {
-      recv_peer = d->rankmap[d3_peer];
-      send_peer = d->rankmap[d2_peer];
-    } else if (direction == REDISTRIBUTE_2_TO_3) {
-      recv_peer = d->rankmap[d2_peer];
-      send_peer = d->rankmap[d3_peer];
-    } else {
-      abort();
-    }
-    //communication of the chunks:
-    //if print_mess boolean is set to true, then the code runs without sending any messages, and is used to test which messages would be sent in the entire run.
-    //(designed to debug comm hangups, if they occur).
-
-    if(direction == REDISTRIBUTE_3_TO_2){
-
-      if((self == me) && print_mess && verbosity() > 2) fprintf(stderr, " I am %d, making request to receive from %d...\n", self,recv_peer);
-      if(!print_mess)MPI_Irecv((void *) d->d2_chunk, chunk_size, MPI_DOUBLE_COMPLEX, recv_peer, 0, d->process_topology_1.cart, &req1);
-
-      if((self == me) && print_mess && verbosity() > 2) fprintf(stderr, " I am %d, making request to send to %d...\n", self,send_peer);
-      if(!print_mess)MPI_Isend((void *) d->d3_chunk, chunk_size, MPI_DOUBLE_COMPLEX, send_peer, 0, d->process_topology_1.cart, &req2);
-
-      if((self == me) && print_mess && verbosity() > 2) fprintf(stderr, " I am %d, waiting to receive from %d...\n", self,recv_peer);
-      //fprintf(stderr, " I am %d, waiting to receive from %d...\n", self,recv_peer);
-      if(!print_mess)MPI_Wait(&req1,MPI_STATUS_IGNORE);
-
-      //if((self == me || self == 1 || self == 2 || self == 3) && print_me)fprintf(stderr, " I am %d, waiting to send to %d...\n", self,send_peer);
-      //fprintf(stderr, " I am %d, waiting to send to %d...\n", self,send_peer);
-      if(self==me && print_mess && verbosity() > 2) fprintf(stderr, " I am %d, waiting to send to %d...\n", self,send_peer);
-      if(!print_mess)MPI_Wait(&req2,MPI_STATUS_IGNORE);
-
-      //fill the local array with the received chunk.
-      int64_t ch_indx=0;
-      int dims_size=pencil_dims[0]*pencil_dims[1]*pencil_dims[2];
-      if(self==me && print_me && verbosity() > 2) fprintf(stderr,"REAL SUBSIZES (%d,%d,%d)\n",subsizes[x_dim],subsizes[y_dim],subsizes[z_dim]);
-      if(self==me && print_me && verbosity() > 2) fprintf(stderr,"PENCIL DIMENSION VS. local sizes (%d,%d,%d) vs (%d,%d,%d)\n",pencil_dims[0],pencil_dims[1],pencil_dims[2],local_sizes[0],local_sizes[1],local_sizes[2]);
-      if(self==me && print_me && verbosity() > 2) fprintf(stderr,"DIM_2_ARRAY_START (%d,%d,%d) \n",d2_array_start[0],d2_array_start[1],d2_array_start[2]);
-      for(int i0=d2_array_start[0];i0<d2_array_start[0]+local_sizes[0];i0++){
-        for(int i1=d2_array_start[1];i1<d2_array_start[1]+local_sizes[1];i1++){
-          for(int i2=d2_array_start[2];i2<d2_array_start[2]+local_sizes[2];i2++){
-            int64_t local_indx=pencil_dims[2]*(pencil_dims[1]*i0+i1) + i2;
-            //if(self==me)fprintf(stderr,"local_indx = %d ",local_indx);
-            //if(local_indx >= dims_size)fprintf(stderr,"WOW, in third for, dims is (%d), we are %d and my rank is %d",dims_size,local_indx,self);
-            assert(local_indx < dims_size);
-            assert(ch_indx <chunk_size && ch_indx >= 0 && local_indx>=0 && local_indx < dims_size);
-            b[local_indx]=d->d2_chunk[ch_indx];
-            //if((p==0 || p==1 || p==2 || p==3 || p==4 || p==5) && self==me)fprintf(stderr,"(%f,%f) ",real(d->d2_chunk[ch_indx]),imag(d->d2_chunk[ch_indx]));
-            ch_indx++;
-          }
-                                }
-      }
-      //     if((p==0 ||p==1 || p==2 || p==3 || p==4 || p==5) && self==me)fprintf(stderr,"P is %d \n",p);
-
-    }
-    else if (direction == REDISTRIBUTE_2_TO_3) {
-
-      if((self == me) && print_mess && verbosity() > 2) fprintf(stderr, " I am %d, making request to receive from %d...\n", self,recv_peer);
-      if(!print_mess)MPI_Irecv((void *) d->d3_chunk, chunk_size, MPI_DOUBLE_COMPLEX, recv_peer, 0, d->process_topology_1.cart, &req1);
-
-      if((self == me) && print_mess && verbosity() > 2) fprintf(stderr, " I am %d, making request to send to %d...\n", self,send_peer);
-      if(!print_mess)MPI_Isend((void *) d->d2_chunk, chunk_size, MPI_DOUBLE_COMPLEX, send_peer, 0, d->process_topology_1.cart, &req2);
-
-      if((self == me) && print_mess && verbosity() > 2) fprintf(stderr, " I am %d, waiting to receive from %d...\n", self,recv_peer);
-      if(!print_mess)MPI_Wait(&req1,MPI_STATUS_IGNORE);
-
-      if((self == me) && print_mess && verbosity() > 2) fprintf(stderr, " I am %d, waiting to send to %d...\n", self,send_peer);
-      if(!print_mess)MPI_Wait(&req2,MPI_STATUS_IGNORE);
-      int64_t ch_indx=0;
-      int dims_size=(d->process_topology_3.n[2])*(d->process_topology_3.n[1])*(d->process_topology_3.n[0]);
-      if(z_dim==0){
-        //fill the local array with the received chunk.
-
-        for(int i2=d3_array_start[y_dim];i2>d3_array_start[y_dim]-subsizes[y_dim];i2--){
-          for(int i1=d3_array_start[x_dim];i1<d3_array_start[x_dim]+subsizes[x_dim];i1++){
-            for(int i0=d3_array_start[z_dim];i0<d3_array_start[z_dim]+subsizes[z_dim];i0++){
-              int64_t local_indx=d->process_topology_3.n[2]*(d->process_topology_3.n[1]*i0+i1) + i2;
-              //if(local_indx >= dims_size)fprintf(stderr,"WOW, in fourth for, dims is (%d), we are %d and my rank is %d",dims_size,local_indx,self);
-              assert(local_indx < dims_size);
-              assert(ch_indx <chunk_size && ch_indx >= 0 && local_indx>=0 && local_indx < dims_size);
-              b[local_indx]=d->d3_chunk[ch_indx];
-              //                         if(p==3 && self==me)fprintf(stderr,"(%f,%f) ",real(d->d3_chunk[ch_indx]),imag(d->d3_chunk[ch_indx]));
-              ch_indx++;
-            }
-          }
-        }
-      }
-      else if(z_dim==1){
-        for(int i0=d3_array_start[y_dim];i0<d3_array_start[y_dim]+subsizes[y_dim];i0++){
-          for(int i2=d3_array_start[x_dim];i2>d3_array_start[x_dim]-subsizes[x_dim];i2--){
-            for(int i1=d3_array_start[z_dim];i1<d3_array_start[z_dim]+subsizes[z_dim];i1++){
-              int64_t local_indx=d->process_topology_3.n[2]*(d->process_topology_3.n[1]*i0+i1) + i2;
-              //if(local_indx >= dims_size)fprintf(stderr,"WOW, in fourth for, dims is (%d), we are %d and my rank is %d",dims_size,local_indx,self);
-              assert(local_indx < dims_size);
-              assert(ch_indx <chunk_size && ch_indx >= 0 && local_indx>=0 && local_indx < dims_size);
-              b[local_indx]=d->d3_chunk[ch_indx];
-              //                             if(p==0 && self==me)fprintf(stderr,"(%f,%f) ",real(d->d3_chunk[ch_indx]),imag(d->d3_chunk[ch_indx]));
-              ch_indx++;
-            }
-          }
-        }
-
-      }
-      else if(z_dim==2){
-        for(int i0=d3_array_start[x_dim];i0<d3_array_start[x_dim]+subsizes[x_dim];i0++){
-          for(int i1=d3_array_start[y_dim];i1<d3_array_start[y_dim]+subsizes[y_dim];i1++){
-            for(int i2=d3_array_start[z_dim];i2<d3_array_start[z_dim]+subsizes[z_dim];i2++){
-              int64_t local_indx=d->process_topology_3.n[2]*(d->process_topology_3.n[1]*i0+i1) + i2;
-              assert(local_indx < dims_size);
-              assert(ch_indx <chunk_size && ch_indx >= 0 && local_indx>=0 && local_indx < dims_size);
-              b[local_indx]=d->d3_chunk[ch_indx];
-              //                   if(p==1 && self==me)fprintf(stderr,"(%f,%f) ",real(d->d3_chunk[ch_indx]),imag(d->d3_chunk[ch_indx]));
-              ch_indx++;
-            }
-          }
-        }
-
-      }
-      else{
-        abort();
-      }
-    }
-
-    if (DEBUG_CONDITION && verbosity() > 2) {
-      fprintf(stderr,
-              "%d: npeers,p,p0,p1,p1max=(%d,%d,%d,%d,%d), "
-              "d3_coord=(%d,%d,%d), d2_peer_coord=(%d,%d,%d), "
-              "d2_coord=(%d,%d,%d), d3_peer_coord=(%d,%d,%d), "
-              "recv_peer=%d, send_peer=%d\n",
-              self,
-              npeers, p, p0, p1, p1max,
-              d3_coord[0], d3_coord[1], d3_coord[2],
-              d2_peer_coord[0], d2_peer_coord[1], d2_peer_coord[2],
-              d2_coord[0], d2_coord[1], d2_coord[2],
-              d3_peer_coord[0], d3_peer_coord[1], d3_peer_coord[2],
-              recv_peer, send_peer);
-    }
-
-    if((self == me) && print_me && verbosity() > 2) fprintf(stderr, "%d, %d, %d Made it end-for!...\n", self,p, npeers);
-  }
-
-  //if((self == me) && print_me)fprintf(outfile, "   Made it all the way! for z_dim =(%d) and num_proc = (%d)...\n", z_dim, d->process_topology_1.nproc[0]);
-  if((self == me) && print_result){
-    FILE * outfile;
-    outfile= fopen("passed.data","a");
-    if (outfile) fprintf(outfile, "   Made it all the way! for z_dim =(%d) and num_proc = (%d)...\n", z_dim, d->process_topology_1.nproc[0]);
-    if (outfile) fclose(outfile);
-  }
-//    fprintf(stderr, "%d, Made it all the way!...\n", self);
-}
diff --git a/Src/Extern/SWFFT/distribution_c.h b/Src/Extern/SWFFT/distribution_c.h
deleted file mode 100644
index 4f91573829..0000000000
--- a/Src/Extern/SWFFT/distribution_c.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- *                 Copyright (C) 2017, UChicago Argonne, LLC
- *                            All Rights Reserved
- *
- *           Hardware/Hybrid Cosmology Code (HACC), Version 1.0
- *
- * Salman Habib, Adrian Pope, Hal Finkel, Nicholas Frontiere, Katrin Heitmann,
- *      Vitali Morozov, Jeffrey Emberson, Thomas Uram, Esteban Rangel
- *                        (Argonne National Laboratory)
- *
- *  David Daniel, Patricia Fasel, Chung-Hsing Hsu, Zarija Lukic, James Ahrens
- *                      (Los Alamos National Laboratory)
- *
- *                               George Zagaris
- *                                 (Kitware)
- *
- *                            OPEN SOURCE LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *   1. Redistributions of source code must retain the above copyright notice,
- *      this list of conditions and the following disclaimer. Software changes,
- *      modifications, or derivative works, should be noted with comments and
- *      the author and organization's name.
- *
- *   2. Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *
- *   3. Neither the names of UChicago Argonne, LLC or the Department of Energy
- *      nor the names of its contributors may be used to endorse or promote
- *      products derived from this software without specific prior written
- *      permission.
- *
- *   4. The software and the end-user documentation included with the
- *      redistribution, if any, must include the following acknowledgment:
- *
- *     "This product includes software produced by UChicago Argonne, LLC under
- *      Contract No. DE-AC02-06CH11357 with the Department of Energy."
- *
- * *****************************************************************************
- *                                DISCLAIMER
- * THE SOFTWARE IS SUPPLIED "AS IS" WITHOUT WARRANTY OF ANY KIND. NEITHER THE
- * UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT OF ENERGY, NOR
- * UCHICAGO ARGONNE, LLC, NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY,
- * EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE
- * ACCURARY, COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, DATA, APPARATUS,
- * PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
- * PRIVATELY OWNED RIGHTS.
- *
- * *****************************************************************************
- */
-
-#ifndef HACC_DISTRIBUTION_H
-#define HACC_DISTRIBUTION_H
-
-#include <mpi.h>
-
-#include "complex-type.h"
-
-#define PENCIL 1
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-///
-// descriptor for a process grid
-//   cart     Cartesian MPI communicator
-//   nproc[]  dimensions of process grid
-//   period[] periods of process grid
-//   self[]   coordinate of this process in the process grid
-//   n[]      local grid dimensions
-///
-typedef struct {
-  MPI_Comm cart;
-  int nproc[3];
-  int period[3];
-  int self[3];
-  int n[3];
-} process_topology_t;
-
-
-///
-// descriptor for data distribution
-//   debug               toggle debug output
-//   n[3]                (global) grid dimensions
-//   process_topology_1  1-d process topology
-//   process_topology_2  2-d process topology
-//   process_topology_3  3-d process topology
-///
-typedef struct {
-  bool debug;
-  int n[3];
-  process_topology_t process_topology_1;
-  process_topology_t process_topology_2_z;
-  process_topology_t process_topology_2_y;
-  process_topology_t process_topology_2_x;
-  process_topology_t process_topology_3;
-  complex_t *d2_chunk;
-  complex_t *d3_chunk;
-  int *gridmap;
-  int *rankmap;
-  MPI_Comm parent;
-} distribution_t;
-
-
-///
-// create 1-, 2- and 3-d cartesian data distributions
-//   comm   MPI Communicator
-//   d      distribution descriptor
-//   n      (global) grid dimensions (3 element array)
-//   Ndims  3d process grid (3 element array: x, y, z)
-//   rmap   pointer to grid->rank map
-//   debug  debugging output
-///
-void distribution_init(MPI_Comm comm,
-                       const int n[],
-               const int Ndims[],
-                       distribution_t *d,
-               const int* rmap,
-                       bool debug);
-
-
-///
-// create 1-, 2- and 3-d cartesian data distributions with explicitly
-// provided dimension lists
-//   comm       MPI Communicator
-//   n          (global) grid dimensions (3 element array)
-//   nproc_1d   1d process grid (3 element array: x, 1, 1)
-//   nproc_2d   1d process grid (3 element array: x, y, 1)
-//   nproc_3d   3d process grid (3 element array: x, y, z)
-//   d          distribution descriptor
-//   debug      debugging output
-///
-void distribution_init_explicit(MPI_Comm comm,
-                                const int n[],
-                                int nproc_1d[],
-                                int nproc_2d_x[],
-                                int nproc_2d_y[],
-                                int nproc_2d_z[],
-                                int nproc_3d[],
-                                distribution_t *d,
-                                bool debug);
-
-///
-// creates a custom 3D decomposition or uses MPI_Dims_create to do so
-///
-void Custom3D_Dims_create(const int Ndims[], int nproc, int ndims, int dims[]);
-
-
-///
-// clean up the data distribution
-//   d    distribution descriptor
-///
-void distribution_fini(distribution_t *d);
-
-
-///
-// assert that the data and processor grids are commensurate
-//   d    distribution descriptor
-///
-void distribution_assert_commensurate(distribution_t *d);
-
-
-///
-// redistribute a 1-d to a 3-d data distribution
-//   a    input
-//   b    ouput
-//   d    distribution descriptor
-///
-void distribution_1_to_3(const complex_t *a,
-                         complex_t *b,
-                         distribution_t *d);
-
-///
-// redistribute a 3-d to a 1-d data distribution
-//   a    input
-//   b    ouput
-//   d    distribution descriptor
-///
-void distribution_3_to_1(const complex_t *a,
-                         complex_t *b,
-                         distribution_t *d);
-
-///
-// redistribute a 2-d to a 3-d data distribution
-//   a    input
-//   b    ouput
-//   d    distribution descriptor
-///
-void distribution_2_to_3(const complex_t *a,
-                         complex_t *b,
-                         distribution_t *d,
-                         int dim_z);
-
-///
-// redistribute a 3-d to a 2-d data distribution
-//   a    input
-//   b    ouput
-//   d    distribution descriptor
-///
-void distribution_3_to_2(const complex_t *a,
-                         complex_t *b,
-                         distribution_t *d,
-                         int dim_z);
-
-
-///
-// Some accessor functions
-///
-static inline int distribution_get_nproc_1d(distribution_t *d, int direction)
-{
-    return d->process_topology_1.nproc[direction];
-}
-
-static inline int distribution_get_nproc_2d_x(distribution_t *d, int direction)
-{
-    return d->process_topology_2_x.nproc[direction];
-}
-static inline int distribution_get_nproc_2d_y(distribution_t *d, int direction)
-{
-    return d->process_topology_2_y.nproc[direction];
-}
-static inline int distribution_get_nproc_2d_z(distribution_t *d, int direction)
-{
-    return d->process_topology_2_z.nproc[direction];
-}
-
-static inline int distribution_get_nproc_3d(distribution_t *d, int direction)
-{
-    return d->process_topology_3.nproc[direction];
-}
-
-static inline int distribution_get_self_1d(distribution_t *d, int direction)
-{
-    return d->process_topology_1.self[direction];
-}
-
-static inline int distribution_get_self_2d_x(distribution_t *d, int direction)
-{
-    return d->process_topology_2_x.self[direction];
-}
-static inline int distribution_get_self_2d_y(distribution_t *d, int direction)
-{
-    return d->process_topology_2_y.self[direction];
-}
-static inline int distribution_get_self_2d_z(distribution_t *d, int direction)
-{
-    return d->process_topology_2_z.self[direction];
-}
-static inline int distribution_get_self_3d(distribution_t *d, int direction)
-{
-    return d->process_topology_3.self[direction];
-}
-
-void Coord_x_pencils(int myrank, int coord[], distribution_t *d);
-void Rank_x_pencils(int * myrank, int coord[], distribution_t *d);
-void Coord_y_pencils(int myrank, int coord[], distribution_t *d);
-void Rank_y_pencils(int * myrank, int coord[], distribution_t *d);
-void Coord_z_pencils(int myrank, int coord[], distribution_t *d);
-void Rank_z_pencils(int * myrank, int coord[], distribution_t *d);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // HACC_DISTRIBUTION_H
diff --git a/Src/Extern/SWFFT/verbosity.cpp b/Src/Extern/SWFFT/verbosity.cpp
deleted file mode 100644
index 78dae92666..0000000000
--- a/Src/Extern/SWFFT/verbosity.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include "verbosity.h"
-#include "AMReX.H"
-
-extern "C" int verbosity () {
-
-    return amrex::Verbose();
-
-}
diff --git a/Src/Extern/SWFFT/verbosity.h b/Src/Extern/SWFFT/verbosity.h
deleted file mode 100644
index a60da885f6..0000000000
--- a/Src/Extern/SWFFT/verbosity.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef VERBOSITY_H
-#define VERBOSITY_H
-
-// This provides a wrapper for amrex::Verbosity() to control the level of SWFFT output.
-
-#ifdef __cplusplus
-extern "C"
-#endif
-int verbosity ();
-
-#endif
diff --git a/Src/Extern/hpgmg/BL_HPGMG.H b/Src/Extern/hpgmg/BL_HPGMG.H
deleted file mode 100644
index 992ba55743..0000000000
--- a/Src/Extern/hpgmg/BL_HPGMG.H
+++ /dev/null
@@ -1,237 +0,0 @@
-#ifndef BL_HPGMG_H
-#define BL_HPGMG_H
-#include <AMReX_Config.H>
-
-#include <AMReX_MultiFab.H>
-
-#ifdef USEHPGMG
-
-#define STENCIL_MAX_SHAPES 3
-#define VECTOR_ALPHA       5  // cell centered coefficient
-#define BC_PERIODIC        0
-#define VECTOR_E           1  // error used in residual correction FMG
-#define VECTOR_F           3  // original right-hand side (Au=f), cell centered
-#define VECTOR_U           4  // numerical solution
-#define BC_DIRICHLET       1
-#define VECTOR_BETA_I      6 // face centered coefficient (n.b. element 0 is the left face of the ghost zone element)
-#define VECTOR_BETA_J      7 // face centered coefficient (n.b. element 0 is the back face of the ghost zone element)
-#define VECTOR_BETA_K      8 // face centered coefficient (n.b. element 0 is the bottom face of the ghost zone element)
-#define RESTRICT_CELL      0
-
-#ifndef BLOCKCOPY_TILE_I
-#define BLOCKCOPY_TILE_I 10000
-#else
-#warning By overriding BLOCKCOPY_TILE_I, you are tiling in the unit stride.  I hope you know what you are doing.
-#endif
-#ifndef BLOCKCOPY_TILE_J
-#define BLOCKCOPY_TILE_J 8
-#endif
-#ifndef BLOCKCOPY_TILE_K
-#define BLOCKCOPY_TILE_K 8
-#endif
-
-typedef struct {
-  int subtype;                        // e.g. used to calculate normal to domain for BC's
-  struct {int i, j, k;}dim;        // dimensions of the block to copy
-  struct {int box, i, j, k, jStride, kStride;double * __restrict__ ptr;}read,write;
-  // coordinates in the read grid to extract data,
-  // coordinates in the write grid to insert data
-  // if read/write.box<0, then use write/read.ptr, otherwise use boxes[box].vectors[id]
-  // Thus, you can do grid->grid, grid->buf, buf->grid, or buf->buf
-} __attribute__((aligned(64))) blockCopy_type;
-
-
-typedef struct {
-    int                           num_recvs;        //   number of neighbors by type
-    int                           num_sends;        //   number of neighbors by type
-    int     * __restrict__       recv_ranks;        //   MPI rank of each neighbor...          recv_ranks[neighbor]
-    int     * __restrict__       send_ranks;        //   MPI rank of each neighbor...          send_ranks[neighbor]
-    int     * __restrict__       recv_sizes;        //   size of each MPI recv buffer...       recv_sizes[neighbor]
-    int     * __restrict__       send_sizes;        //   size of each MPI send buffer...       send_sizes[neighbor]
-    double ** __restrict__     recv_buffers;        //   MPI recv buffer for each neighbor...  recv_buffers[neighbor][ recv_sizes[neighbor] ]
-    double ** __restrict__     send_buffers;        //   MPI send buffer for each neighbor...  send_buffers[neighbor][ send_sizes[neighbor] ]
-    int                 allocated_blocks[3];        //   number of blocks allocated (not necessarily used) each list...
-    int                       num_blocks[3];        //   number of blocks in each list...        num_blocks[pack,local,unpack]
-    blockCopy_type *              blocks[3];        //   list of block copies...                     blocks[pack,local,unpack]
-    #ifdef BL_USE_MPI
-    MPI_Request * __restrict__     requests;
-    MPI_Status  * __restrict__       status;
-    #endif
-} communicator_type;
-
-typedef struct {
-  int                         global_box_id;        // used to inded into level->rank_of_box
-  struct {int i, j, k;}low;                        // global coordinates of the first (non-ghost) element of subdomain
-  int                                   dim;        // dimension of this box's core (owned)
-  int                                ghosts;        // ghost zone depth
-  int                jStride,kStride,volume;        // useful for offsets
-  int                            numVectors;        //
-  double   ** __restrict__          vectors;        // vectors[c] = pointer to 3D array for vector c for one box
-} box_type;
-
-typedef struct {
-  double h;                                        // grid spacing at this level
-  int active;                                        // I am an active process (I have work to do on this or subsequent levels)
-  int num_ranks;                                // total number of MPI ranks
-  int my_rank;                                        // my MPI rank
-  int box_dim;                                        // dimension of each cubical box (not counting ghost zones)
-  int box_ghosts;                                // ghost zone depth for each box
-  int box_jStride,box_kStride,box_volume;        // useful for offsets
-  int numVectors;                                // number of vectors stored in each box
-  int tag;                                        // tag each level uniquely... FIX... replace with sub commuicator
-  struct {int i, j, k;}boxes_in;                // total number of boxes in i,j,k across this level
-  struct {int i, j, k;}dim;                        // global dimensions at this level (NOTE: dim.i == boxes_in.i * box_dim)
-
-  int * rank_of_box;                                // 3D array containing rank of each box.  i-major ordering
-  int    num_my_boxes;                                //           number of boxes owned by this rank
-  box_type * my_boxes;                                // pointer to array of boxes owned by this rank
-
-  // create flattened FP data... useful for CUDA/OpenMP4/OpenACC when you want to copy an entire vector to/from an accelerator
-  double   ** __restrict__          vectors;        // vectors[v][box][k][j][i] = pointer to 5D array for vector v encompasing all boxes on this process...
-  double    * __restrict__     vectors_base;    // pointer used for malloc/free.  vectors[v] are shifted from this for alignment
-
-  int       allocated_blocks;                        //       number of blocks allocated by this rank (note, this represents a flattening of the box/cell hierarchy to facilitate threading)
-  int          num_my_blocks;                        //       number of blocks     owned by this rank (note, this represents a flattening of the box/cell hierarchy to facilitate threading)
-  blockCopy_type * my_blocks;                        // pointer to array of blocks owned by this rank (note, this represents a flattening of the box/cell hierarchy to facilitate threading)
-
-  struct {
-    int                type;                        // BC_PERIODIC or BC_DIRICHLET
-    int    allocated_blocks[STENCIL_MAX_SHAPES];// number of blocks allocated (not necessarily used) for boundary conditions on this level for [shape]
-    int          num_blocks[STENCIL_MAX_SHAPES];// number of blocks used for boundary conditions on this level for [shape]
-    blockCopy_type * blocks[STENCIL_MAX_SHAPES];// pointer to array of blocks used for boundary conditions on this level for [shape]
-  } boundary_condition;                                // boundary conditions on this level
-
-  communicator_type exchange_ghosts[STENCIL_MAX_SHAPES];// mini program that performs a neighbor ghost zone exchange for [shape]
-  communicator_type restriction[4];                        // mini program that performs restriction and agglomeration for [0=cell centered, 1=i-face, 2=j-face, 3-k-face]
-  communicator_type interpolation;                        // mini program that performs interpolation and dissemination...
-  #ifdef BL_USE_MPI
-  MPI_Comm MPI_COMM_ALLREDUCE;                        // MPI sub communicator for just the ranks that have boxes on this level or any subsequent level...
-  #endif
-  double dominant_eigenvalue_of_DinvA;                // estimate on the dominate eigenvalue of D^{-1}A
-  int must_subtract_mean;                        // e.g. Poisson with Periodic BC's
-  double    * __restrict__ RedBlack_base;       // allocated pointer... will be aligned for the first non ghost zone element
-  double    * __restrict__ RedBlack_FP;                // Red/Black Mask (i.e. 0.0 or 1.0) for even/odd planes (2*kStride).
-
-  int num_threads;
-  double    * __restrict__ fluxes;                // temporary array used to hold the flux values used by FV operators
-
-  // statistics information...
-  struct {
-    uint64_t              smooth;
-    uint64_t            apply_op;
-    uint64_t            residual;
-    uint64_t               blas1;
-    uint64_t               blas3;
-    uint64_t boundary_conditions;
-    // Distributed Restriction
-    uint64_t   restriction_total;
-    uint64_t   restriction_pack;
-    uint64_t   restriction_local;
-    uint64_t   restriction_unpack;
-    uint64_t   restriction_recv;
-    uint64_t   restriction_send;
-    uint64_t   restriction_wait;
-    // Distributed interpolation
-    uint64_t interpolation_total;
-    uint64_t interpolation_pack;
-    uint64_t interpolation_local;
-    uint64_t interpolation_unpack;
-    uint64_t interpolation_recv;
-    uint64_t interpolation_send;
-    uint64_t interpolation_wait;
-    // Ghost Zone Exchanges...
-    uint64_t     ghostZone_total;
-    uint64_t     ghostZone_pack;
-    uint64_t     ghostZone_local;
-    uint64_t     ghostZone_unpack;
-    uint64_t     ghostZone_recv;
-    uint64_t     ghostZone_send;
-    uint64_t     ghostZone_wait;
-    // Collectives...
-    uint64_t   collectives;
-    uint64_t         Total;
-  }cycles;
-  int Krylov_iterations;        // total number of bottom solver iterations
-  int CAKrylov_formations_of_G; // i.e. [G,g] = [P,R]^T[P,R,rt]
-  int vcycles_from_this_level;  // number of vcycles performed that were initiated from this level
-} level_type;
-
-typedef struct {
-  int num_ranks;        // total number of MPI ranks for MPI_COMM_WORLD
-  int my_rank;                // my MPI rank for MPI_COMM_WORLD
-  int       num_levels;        // depth of the v-cycle
-  level_type ** levels;        // array of pointers to levels
-
-  struct {
-    uint64_t MGBuild; // total time spent building the coefficients...
-    uint64_t MGSolve; // total time spent in MGSolve
-  }cycles;
-  int MGSolves_performed;
-} mg_type;
-
-extern "C" {
-    void create_vectors(level_type* level, int numVectors);
-    void append_block_to_list (blockCopy_type ** blocks, int *allocated_blocks,
-                               int *num_blocks, int dim_i, int dim_j, int dim_k,
-                               int read_box, double *read_ptr, int read_i,
-                               int read_j, int read_k, int read_jStride,
-                               int read_kStride, int read_scale, int write_box,
-                               double *write_ptr, int write_i, int write_j,
-                               int write_k, int write_jStride, int write_kStride,
-                               int write_scale, int my_blockcopy_tile_i,
-                               int my_blockcopy_tile_j, int my_blockcopy_tile_k,
-                               int subtype);
-    void build_exchange_ghosts (level_type * level, int shape);
-    void build_boundary_conditions (level_type * level, int shape);
-    void initialize_problem (level_type * level, double hLevel, double a, double b);
-    double dot (level_type * level, int id_a, int id_b);
-    double mean (level_type * level, int id_a);
-    void shift_vector (level_type * level, int id_c, int id_a, double shift_a);
-    void rebuild_operator (level_type * level, level_type * fromLevel, double a, double b);
-    void MGBuild (mg_type * all_grids, level_type * fine_grid, double a, double b, int minCoarseGridDim, const MPI_Comm comm);
-    void MGResetTimers (mg_type * all_grids);
-    void zero_vector (level_type * level, int component_id);
-    void MGSolve (mg_type *all_grids, int onLevel, int u_id, int F_id, double a, double b, double dtol, double rtol);
-    void FMGSolve(mg_type *all_grids, int onLevel, int u_id, int F_id, double a, double b, double rtol);
-    void MGPrintTiming (mg_type * all_grids, int fromLevel);
-    double error (level_type * level, int id_a, int id_b);
-    void destroy_level (level_type * level);
-    int stencil_get_radius();
-    double norm(level_type * level, int component_id);
-    void restriction(level_type * level_c, int id_c, level_type *level_f, int id_f, int restrictionType);
-    void richardson_error(mg_type *all_grids, int levelh, int u_id);
-    void MGDestroy(mg_type *all_grids);
-}
-
-// If we want to use the multigrid solver from HPGMG then we must convert our
-// MultiFabs to its own level data structures. This function essentially
-// replaces the create_level() function in HPGMG.
-void CreateHPGMGLevel(level_type* level,
-                      const amrex::MultiFab& mf,
-                      const int n_cell,
-                      const int max_grid_size,
-                      const int my_rank,
-                      const int num_ranks,
-                      const int domain_boundary_condition,
-                      const int numVectors,
-                      const double h0);
-
-void SetupHPGMGCoefficients(const double a,
-                            const double b,
-                            const amrex::MultiFab& alpha,
-                            const amrex::MultiFab& beta_cc,
-                            level_type* level);
-
-void ConvertToHPGMGLevel(const amrex::MultiFab& mf,
-                         const int n_cell,
-                         const int max_grid_size,
-                         level_type* level,
-                         const int component_id);
-
-void ConvertFromHPGMGLevel(amrex::MultiFab& mf,
-                           const level_type* level,
-                           const int component_id);
-
-#endif /* USEHPGMG */
-
-#endif /* BL_HPGMG_H */
diff --git a/Src/Extern/hpgmg/BL_HPGMG.cpp b/Src/Extern/hpgmg/BL_HPGMG.cpp
deleted file mode 100644
index 5fa763673a..0000000000
--- a/Src/Extern/hpgmg/BL_HPGMG.cpp
+++ /dev/null
@@ -1,522 +0,0 @@
-#include <BL_HPGMG.H>
-
-using namespace amrex;
-
-// If we want to use the multigrid solver from HPGMG then we must convert our
-// MultiFabs to HPGMG's level data structures. This function essentially
-// replaces the create_level() function in HPGMG.
-#ifdef USEHPGMG
-void CreateHPGMGLevel (level_type* level,
-                       const MultiFab& mf,
-                       const int n_cell,
-                       const int max_grid_size,
-                       const int my_rank,
-                       const int num_ranks,
-                       const int domain_boundary_condition,
-                       const int numVectors,
-                       const double h0)
-{
-    int box;
-    const int boxes_in_i = n_cell / max_grid_size;
-    int TotalBoxes = boxes_in_i * boxes_in_i * boxes_in_i;
-
-    // HPGMG requires perfect cubes for all boxes
-    for (MFIter mfi(mf); mfi.isValid(); ++mfi)
-    {
-        const Box& bx = mfi.validbox();
-        if (!bx.isSquare()) {
-             amrex::Error("All boxes must be square in HPGMG");
-        }
-    }
-
-    // HPGMG also requires all boxes to be the same size, so we iterate over
-    // all boxes and make sure they're the same.
-    for (MFIter mfi1(mf); mfi1.isValid(); ++mfi1)
-    {
-        const Box& bx1 = mfi1.validbox();
-        for (MFIter mfi2(mf); mfi2.isValid(); ++mfi2)
-        {
-            const Box& bx2 = mfi2.validbox();
-            if (!(bx1.sameSize(bx2)))
-            {
-                amrex::Error("All boxes must be identical in HPGMG!");
-            }
-        }
-    }
-
-    // All the boxes have identical size and shape, so we just pick one of them
-    // as a representative to fill in all the level data for HPGMG.
-    MFIter mfi(mf);
-    while (!mfi.isValid()) ++mfi;
-
-    const Box& bx = mfi.validbox();
-    const int box_dim = bx.length(0); /* Since we've already checked that all boxes are the same size, we can just use the size from one of them here. */
-
-    if (TotalBoxes / num_ranks == 0)
-      amrex::Error("Must have at least one box per MPI task when using HPGMG");
-
-    if (ParallelDescriptor::IOProcessor())
-    {
-      std::cout << std::endl << "attempting to create a " << box_dim*boxes_in_i << "^3 level from " << TotalBoxes << " x " << box_dim << "^3 boxes distributed among " << num_ranks << " tasks..." << std::endl;
-      if (domain_boundary_condition==BC_DIRICHLET)
-      {
-        std::cout << "boundary condition = BC_DIRICHLET" << std::endl;
-      }
-      else if (domain_boundary_condition==BC_PERIODIC)
-      {
-        std::cout << "boundary condition = BC_PERIODIC" << std::endl;
-      }
-      else
-      {
-        amrex::Error("Unknown boundary condition supplied");
-      }
-    }
-
-    int omp_threads = 1;
-
-#ifdef AMREX_USE_OMP
-#pragma omp parallel
-    {
-#pragma omp master
-      {
-        omp_threads = omp_get_num_threads ();
-      }
-    }
-#endif
-
-    int box_ghosts = stencil_get_radius();
-
-    level->box_dim        = box_dim;
-    level->box_ghosts     = box_ghosts;
-    level->numVectors     = 0; // no vectors have been allocated yet
-    level->vectors_base   = NULL; // pointer returned by bulk malloc
-    level->vectors        = NULL; // pointers to individual vectors
-    level->boxes_in.i     = boxes_in_i;
-    level->boxes_in.j     = boxes_in_i;
-    level->boxes_in.k     = boxes_in_i;
-    level->dim.i          = box_dim*level->boxes_in.i;
-    level->dim.j          = box_dim*level->boxes_in.j;
-    level->dim.k          = box_dim*level->boxes_in.k;
-    level->active         = 1;
-    level->my_rank        = my_rank;
-    level->num_ranks      = num_ranks;
-    level->boundary_condition.type = domain_boundary_condition;
-    level->must_subtract_mean = -1;
-    level->num_threads      = omp_threads;
-    level->my_blocks        = NULL;
-    level->num_my_blocks    = 0;
-    level->allocated_blocks = 0;
-    level->tag              = log2(level->dim.i);
-    level->h                = h0;
-    level->fluxes           = NULL;
-
-    // allocate 3D array of integers to hold the MPI rank of the corresponding box and initialize to -1 (unassigned)
-    level->rank_of_box = (int*)malloc(level->boxes_in.i*level->boxes_in.j*level->boxes_in.k*sizeof(int));
-    if(level->rank_of_box==NULL)
-        amrex::Error("malloc of level->rank_of_box failed");
-    for(box=0;box<level->boxes_in.i*level->boxes_in.j*level->boxes_in.k;box++){level->rank_of_box[box]=-1;}  // -1 denotes that there is no actual box assigned to this region
-
-
-    // Now convert our rank distribution of boxes to HPGMG's rank_of_box array.
-    // This is convoluted because HPGMG first assigns boxes to ranks, and then
-    // lexicographically assigns the coordinates of each box. This
-    // lexicographical ordering of box coordinates is *required* in order for
-    // the MPI communication patterns in HPGMG to function correctly, via the
-    // global_box_id variable. In other words, HPGMG anticipates the geometric
-    // relationship between boxes based on their respective values of
-    // global_box_id, and routes MPI traffic accordingly. However, in BoxLib
-    // the box ranks and indices are not necessarily in this order, so we have
-    // to "fake" the box ordering in HPGMG here (even though the coordinates
-    // aren't actually assigned until we call create_vectors()) in order to
-    // match the box ranks between BoxLib and HPGMG. This whole method is dumb
-    // and deserves a better solution, but I don't know a better way to do it.
-
-    int num_local_boxes = 0;
-    int i,j,k;
-    for(k=0;k<level->boxes_in.k;k++){
-    for(j=0;j<level->boxes_in.j;j++){
-    for(i=0;i<level->boxes_in.i;i++){
-      int jStride = level->boxes_in.i;
-      int kStride = level->boxes_in.i*level->boxes_in.j;
-      int b=i + j*jStride + k*kStride;
-
-      // These will be the coordinates of a box in HPGMG. These are also the
-      // coordinates of a box already created in BoxLib. Now we iterate through
-      // every rank's local boxes until we find the matching one, and assign
-      // the rank of the HPGMG box to the same rank in BoxLib.
-
-      const int low_i      = i*level->box_dim;
-      const int low_j      = j*level->box_dim;
-      const int low_k      = k*level->box_dim;
-
-      bool found = false;
-      for (MFIter mfi(mf); mfi.isValid(); ++mfi)
-      {
-        const Box &bx = mfi.validbox();
-        const int *loVect = bx.loVect();
-
-        // Found the matching box!
-        if ((low_i == loVect[0]) &&
-            (low_j == loVect[1]) &&
-            (low_k == loVect[2]))
-        {
-            found = true;
-            num_local_boxes++;
-            break;
-        }
-      }
-      if (found)
-      {
-        level->rank_of_box[b] = my_rank;
-      }
-    }}}
-
-    // Now tell all the ranks what each other's box ranks are.
-    const int tot_num_boxes = level->boxes_in.i * level->boxes_in.j * level->boxes_in.k;
-    int all_box_ranks[tot_num_boxes];
-    std::fill_n(all_box_ranks, tot_num_boxes, 1);
-    MPI_Allreduce(level->rank_of_box, all_box_ranks, tot_num_boxes, MPI_INT, MPI_PROD, ParallelDescriptor::Communicator());
-    for (unsigned int i = 0; i < tot_num_boxes; ++i)
-    {
-        level->rank_of_box[i] = std::abs(all_box_ranks[i]);
-    }
-
-    std::vector<int> box_ranks(level->rank_of_box, level->rank_of_box + tot_num_boxes);
-
-    // calculate how many boxes I own...
-    level->num_my_boxes=0;
-    for(box=0;box<level->boxes_in.i*level->boxes_in.j*level->boxes_in.k;box++){if(level->rank_of_box[box]==level->my_rank)level->num_my_boxes++;}
-    level->my_boxes = (box_type*)malloc(level->num_my_boxes*sizeof(box_type));
-    if((level->num_my_boxes>0)&&(level->my_boxes==NULL))
-        amrex::Error("malloc failed - create_level/level->my_boxes");
-
-    // allocate flattened vector FP data and create pointers...
-    if (ParallelDescriptor::IOProcessor())
-        std::cout << "Allocating vectors... ";
-    create_vectors (level, numVectors);
-    if (ParallelDescriptor::IOProcessor())
-        std::cout << "done." << std::endl;
-
-    // Build and auxilarlly data structure that flattens boxes into blocks...
-    for(box=0;box<level->num_my_boxes;box++){
-      int blockcopy_i = BLOCKCOPY_TILE_I;
-      int blockcopy_j = BLOCKCOPY_TILE_J;
-      int blockcopy_k = BLOCKCOPY_TILE_K;
-
-      append_block_to_list(&(level->my_blocks),&(level->allocated_blocks),&(level->num_my_blocks),
-        /* dim.i         = */ level->my_boxes[box].dim,
-        /* dim.j         = */ level->my_boxes[box].dim,
-        /* dim.k         = */ level->my_boxes[box].dim,
-        /* read.box      = */ box,
-        /* read.ptr      = */ NULL,
-        /* read.i        = */ 0,
-        /* read.j        = */ 0,
-        /* read.k        = */ 0,
-        /* read.jStride  = */ level->my_boxes[box].jStride,
-        /* read.kStride  = */ level->my_boxes[box].kStride,
-        /* read.scale    = */ 1,
-        /* write.box     = */ box,
-        /* write.ptr     = */ NULL,
-        /* write.i       = */ 0,
-        /* write.j       = */ 0,
-        /* write.k       = */ 0,
-        /* write.jStride = */ level->my_boxes[box].jStride,
-        /* write.kStride = */ level->my_boxes[box].kStride,
-        /* write.scale   = */ 1,
-        /* blockcopy_i   = */ blockcopy_i,
-        /* blockcopy_j   = */ blockcopy_j,
-        /* blockcopy_k   = */ blockcopy_k,
-        /* subtype       = */ 0
-      );
-    }
-
-    // build an assist structure for Gauss Seidel Red Black that would facilitate unrolling and SIMDization...
-    level->RedBlack_base = NULL;
-    level->RedBlack_FP = NULL;
-    if(level->num_my_boxes){
-      int i,j;
-      int kStride = level->my_boxes[0].kStride;
-      int jStride = level->my_boxes[0].jStride;
-      level->RedBlack_base = (double*)malloc(2*kStride*sizeof(double)+256); // used for free()
-      level->RedBlack_FP   = level->RedBlack_base; // aligned version
-      // align first *non-ghost* zone element to a 64-Byte boundary...
-      while( (uint64_t)(level->RedBlack_FP + level->box_ghosts*(1+level->box_jStride)) & 0x3f ){level->RedBlack_FP++;}
-      // initialize RedBlack array...
-      for(j=0-level->box_ghosts;j<level->box_dim+level->box_ghosts;j++){
-      for(i=0-level->box_ghosts;i<level->box_dim+level->box_ghosts;i++){
-        int ij = (i+level->box_ghosts) + (j+level->box_ghosts)*jStride;
-        if((i^j^1)&0x1){
-          level->RedBlack_FP[ij        ]=1.0;
-          level->RedBlack_FP[ij+kStride]=0.0;
-        }else{
-          level->RedBlack_FP[ij        ]=0.0;
-          level->RedBlack_FP[ij+kStride]=1.0;
-        }
-      }}
-    }
-
-    int shape;
-    // create mini program for each stencil shape to perform a ghost zone exchange...
-    for(shape=0;shape<STENCIL_MAX_SHAPES;shape++)build_exchange_ghosts(    level,shape);
-    // create mini program for each stencil shape to perform a boundary condition...
-    for(shape=0;shape<STENCIL_MAX_SHAPES;shape++)build_boundary_conditions(level,shape);
-
-
-    // duplicate the parent communicator to be the communicator for each level
-    #ifdef BL_USE_MPI
-    if (ParallelDescriptor::IOProcessor())
-        std::cout << "Duplicating MPI communicator... ";
-    double time_start = MPI_Wtime();
-    MPI_Comm_dup(ParallelDescriptor::Communicator(),&level->MPI_COMM_ALLREDUCE);
-    double time_end = MPI_Wtime();
-    double time_in_comm_dup = 0;
-    double time_in_comm_dup_send = time_end-time_start;
-    MPI_Allreduce(&time_in_comm_dup_send,&time_in_comm_dup,1,MPI_DOUBLE,MPI_MAX,ParallelDescriptor::Communicator());
-    if (ParallelDescriptor::IOProcessor())
-      std::cout << "done (" << time_in_comm_dup << " seconds)" << std::endl;
-    #endif /* BL_USE_MPI */
-
-    // report on potential load imbalance
-    int BoxesPerProcess = level->num_my_boxes;
-    #ifdef BL_USE_MPI
-    int BoxesPerProcessSend = level->num_my_boxes;
-    MPI_Allreduce(&BoxesPerProcessSend,&BoxesPerProcess,1,MPI_INT,MPI_MAX,ParallelDescriptor::Communicator());
-    #endif /* BL_USE_MPI */
-    if (ParallelDescriptor::IOProcessor())
-      std::cout << "Calculating boxes per process... target=" << (double)TotalBoxes/(double)num_ranks << ", max=" << BoxesPerProcess << std::endl;
-}
-
-
-void SetupHPGMGCoefficients(const double a,
-                            const double b,
-                            const MultiFab& alpha,
-                            const MultiFab& beta_cc,
-                            level_type* level)
-{
-
-    // First set the alphas (cell-centered).
-    bool found = false;
-    for (MFIter mfi(alpha); mfi.isValid(); ++mfi) {
-
-      const Box &bx = mfi.validbox();
-
-      const int *loVect = bx.loVect();
-      unsigned int box;
-      for (box = 0; box < level->num_my_boxes; ++box)
-      {
-        if ((level->my_boxes[box].low.i == loVect[0]) &&
-            (level->my_boxes[box].low.j == loVect[1]) &&
-            (level->my_boxes[box].low.k == loVect[2]))
-        {
-          found = true;
-          break;
-        }
-      }
-      if (!found)
-      {
-        amrex::Error("Could not find matching boxes between HPGMG and BoxLib");
-      }
-
-      const Box &fabbox = mfi.fabbox();
-      const double *alpha_data_ptr = alpha[mfi].dataPtr();
-      int i,j,k;
-      const int jStride = level->my_boxes[box].jStride;
-      const int kStride = level->my_boxes[box].kStride;
-      const int  ghosts = level->my_boxes[box].ghosts;
-      const int   dim_i = level->my_boxes[box].dim;
-      const int   dim_j = level->my_boxes[box].dim;
-      const int   dim_k = level->my_boxes[box].dim;
-
-      const int BL_jStride = fabbox.length(0);
-      const int BL_kStride = fabbox.length(0) * fabbox.length(1);
-      const int BoxLib_ghosts = alpha.nGrow();
-      #ifdef AMREX_USE_OMP
-      #pragma omp parallel for private(k,j,i) collapse(3)
-      #endif
-      for(k=0;k<dim_k;k++){
-      for(j=0;j<dim_j;j++){
-      for(i=0;i<dim_i;i++){
-        int ijk_HPGMG = (i+ghosts) + (j+ghosts)*jStride + (k+ghosts)*kStride;
-        const int ijk_BoxLib = (i+BoxLib_ghosts) + (j+BoxLib_ghosts)*BL_jStride + (k+BoxLib_ghosts)*BL_kStride;
-        level->my_boxes[box].vectors[VECTOR_ALPHA][ijk_HPGMG] = alpha_data_ptr[ijk_BoxLib];
-      }}}
-    }
-
-
-    // Now convert the cell-centered beta to faces.
-    found = false;
-    for (MFIter mfi(beta_cc); mfi.isValid(); ++mfi) {
-
-      const Box &bx = mfi.validbox();
-
-      const int *loVect = bx.loVect();
-      unsigned int box;
-      for (box = 0; box < level->num_my_boxes; ++box)
-      {
-        if ((level->my_boxes[box].low.i == loVect[0]) &&
-            (level->my_boxes[box].low.j == loVect[1]) &&
-            (level->my_boxes[box].low.k == loVect[2]))
-        {
-          found = true;
-          break;
-        }
-      }
-      if (!found)
-      {
-        amrex::Error("Could not find matching boxes between HPGMG and BoxLib");
-      }
-
-      const Box &fabbox = mfi.fabbox();
-
-      const double *beta_data_ptr = beta_cc[mfi].dataPtr();
-      int i,j,k;
-      const int jStride = level->my_boxes[box].jStride;
-      const int kStride = level->my_boxes[box].kStride;
-      const int  ghosts = level->my_boxes[box].ghosts;
-      const int   dim_i = level->my_boxes[box].dim;
-      const int   dim_j = level->my_boxes[box].dim;
-      const int   dim_k = level->my_boxes[box].dim;
-      const int BL_jStride = fabbox.length(0);
-      const int BL_kStride = fabbox.length(0) * fabbox.length(1);
-      const int BoxLib_ghosts = beta_cc.nGrow();
-
-      #ifdef AMREX_USE_OMP
-      #pragma omp parallel for private(k,j,i) collapse(3)
-      #endif
-      for(k=0;k<=dim_k;k++){ // include high face
-      for(j=0;j<=dim_j;j++){ // include high face
-      for(i=0;i<=dim_i;i++){ // include high face
-        //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-        int ijk_HPGMG = (i+ghosts) + (j+ghosts)*jStride + (k+ghosts)*kStride;
-        const int ijk_BoxLib   = (i  +BoxLib_ghosts) + (j  +BoxLib_ghosts)*BL_jStride + (k  +BoxLib_ghosts)*BL_kStride;
-        const int im1jk_BoxLib = (i-1+BoxLib_ghosts) + (j  +BoxLib_ghosts)*BL_jStride + (k  +BoxLib_ghosts)*BL_kStride;
-        const int ijm1k_BoxLib = (i  +BoxLib_ghosts) + (j-1+BoxLib_ghosts)*BL_jStride + (k  +BoxLib_ghosts)*BL_kStride;
-        const int ijkm1_BoxLib = (i  +BoxLib_ghosts) + (j  +BoxLib_ghosts)*BL_jStride + (k-1+BoxLib_ghosts)*BL_kStride;
-        level->my_boxes[box].vectors[VECTOR_BETA_I][ijk_HPGMG] = 0.5 * (beta_data_ptr[ijk_BoxLib] + beta_data_ptr[im1jk_BoxLib]);
-        level->my_boxes[box].vectors[VECTOR_BETA_J][ijk_HPGMG] = 0.5 * (beta_data_ptr[ijk_BoxLib] + beta_data_ptr[ijm1k_BoxLib]);
-        level->my_boxes[box].vectors[VECTOR_BETA_K][ijk_HPGMG] = 0.5 * (beta_data_ptr[ijk_BoxLib] + beta_data_ptr[ijkm1_BoxLib]);
-      }}}
-    }
-}
-
-
-void ConvertToHPGMGLevel (const MultiFab& mf,
-                     const int n_cell,
-                     const int max_grid_size,
-                     level_type* level,
-                     const int component_id)
-{
-    bool found = false;
-    for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
-
-      const Box &bx = mfi.validbox();
-
-      // The local box indices are ordered differently in HPGMG and BoxLib. So
-      // as a simple (but SLOW) hack, we just find the boxes with matching
-      // lower indices.
-      // TODO: make this box matching less hacky
-      const int *loVect = bx.loVect();
-      unsigned int box;
-      for (box = 0; box < level->num_my_boxes; ++box)
-      {
-        if ((level->my_boxes[box].low.i == loVect[0]) &&
-            (level->my_boxes[box].low.j == loVect[1]) &&
-            (level->my_boxes[box].low.k == loVect[2]))
-        {
-          found = true;
-          break;
-        }
-      }
-
-      if (!found)
-      {
-        amrex::Error("Could not find matching boxes between HPGMG and BoxLib");
-      }
-
-      const Box &fabbox = mfi.fabbox();
-      const int BL_jStride = fabbox.length(0);
-      const int BL_kStride = fabbox.length(0) * fabbox.length(1);
-
-      const double *fab_data = mf[mfi].dataPtr();
-      int i,j,k;
-      const int jStride = level->my_boxes[box].jStride;
-      const int kStride = level->my_boxes[box].kStride;
-      const int  ghosts = level->my_boxes[box].ghosts;
-      const int   dim_i = level->my_boxes[box].dim;
-      const int   dim_j = level->my_boxes[box].dim;
-      const int   dim_k = level->my_boxes[box].dim;
-      const int BoxLib_ghosts = mf.nGrow();
-
-      #ifdef AMREX_USE_OMP
-      #pragma omp parallel for private(k,j,i) collapse(3)
-      #endif
-      for(k=0;k<dim_k;k++){
-      for(j=0;j<dim_j;j++){
-      for(i=0;i<dim_i;i++){
-
-        // The HPGMG strides are padded to align memory and encourage
-        // SIMD-ization, so they are different than the BoxLib strides.
-
-        const int ijk_HPGMG = (i+ghosts) + (j+ghosts)*jStride + (k+ghosts)*kStride;
-        const int ijk_BoxLib = (i+BoxLib_ghosts) + (j+BoxLib_ghosts)*BL_jStride + (k+BoxLib_ghosts)*BL_kStride;
-
-        level->my_boxes[box].vectors[component_id][ijk_HPGMG] = fab_data[ijk_BoxLib];
-
-      }}}
-
-    }
-}
-
-void ConvertFromHPGMGLevel(MultiFab& mf,
-                           const level_type* level,
-                           const int component_id)
-{
-  for (MFIter mfi(mf); mfi.isValid(); ++mfi)
-  {
-      const Box &bx = mfi.validbox();
-      double *fab_data = mf[mfi].dataPtr();
-
-      // First find the HPGMG box corresponding to this BoxLib box.
-      const int *loVect = bx.loVect();
-      int box;
-      for (box = 0; box < level->num_my_boxes; ++box)
-      {
-        if ((level->my_boxes[box].low.i == loVect[0]) &&
-            (level->my_boxes[box].low.j == loVect[1]) &&
-            (level->my_boxes[box].low.k == loVect[2]))
-          break;
-      }
-
-      const Box &fabbox = mfi.fabbox();
-
-      // Found the matching boxes, now fill the data.
-      const int dim_i = level->my_boxes[box].dim;
-      const int dim_j = level->my_boxes[box].dim;
-      const int dim_k = level->my_boxes[box].dim;
-      const int ghosts = level->my_boxes[box].ghosts;
-      const int jStride = level->my_boxes[box].jStride;
-      const int kStride = level->my_boxes[box].kStride;
-      const int BoxLib_ghosts = mf.nGrow();
-
-      int i, j, k;
-      #ifdef AMREX_USE_OMP
-      #pragma omp parallel for private(k,j,i) collapse(3)
-      #endif
-      for(k=0;k<dim_k;k++){
-      for(j=0;j<dim_j;j++){
-      for(i=0;i<dim_i;i++){
-
-        const int ijk_HPGMG = (i+ghosts) + (j+ghosts)*jStride + (k+ghosts)*kStride;
-
-        // WARNING: this indexing stride works for FABs *ONLY* if we have ONE
-        // component in the FAB. If we have more than one we have to stride
-        // over the components in the outermost loop (outside of k).
-        const int BL_jStride = fabbox.length(0);
-        const int BL_kStride = fabbox.length(0) * fabbox.length(1);
-        const int ijk_BoxLib = (i+BoxLib_ghosts) + (j+BoxLib_ghosts)*BL_jStride + (k+BoxLib_ghosts)*BL_kStride;
-
-        fab_data[ijk_BoxLib] = level->my_boxes[box].vectors[VECTOR_U][ijk_HPGMG];
-      }}}
-  }
-}
-#endif /* USEHPGMG */
diff --git a/Src/Extern/hpgmg/Make.package b/Src/Extern/hpgmg/Make.package
deleted file mode 100644
index f8145e91cf..0000000000
--- a/Src/Extern/hpgmg/Make.package
+++ /dev/null
@@ -1,5 +0,0 @@
-CEXE_sources += BL_HPGMG.cpp
-CEXE_headers += BL_HPGMG.H
-
-VPATH_LOCATIONS += $(AMREX_HOME)/Src/Extern/hpgmg
-INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/Extern/hpgmg
diff --git a/Src/FFT/AMReX_FFT.H b/Src/FFT/AMReX_FFT.H
index f8050fff93..11bf4f4cc8 100644
--- a/Src/FFT/AMReX_FFT.H
+++ b/Src/FFT/AMReX_FFT.H
@@ -2,968 +2,16 @@
 #define AMREX_FFT_H_
 #include <AMReX_Config.H>
 
-#include <AMReX_MultiFab.H>
-#include <AMReX_FFT_Helper.H>
-#include <numeric>
-#include <tuple>
-#include <utility>
-
-#if defined(AMREX_USE_CUDA)
-#  include <cufft.h>
-#  include <cuComplex.h>
-#elif defined(AMREX_USE_HIP)
-#  if __has_include(<rocfft/rocfft.h>)  // ROCm 5.3+
-#    include <rocfft/rocfft.h>
-#  else
-#    include <rocfft.h>
-#  endif
-#  include <hip/hip_complex.h>
-#elif defined(AMREX_USE_SYCL)
-#  include <oneapi/mkl/dfti.hpp>
-#else
-#  include <fftw3.h>
-#endif
+#include <AMReX_FFT_LocalR2C.H>
+#include <AMReX_FFT_OpenBCSolver.H>
+#include <AMReX_FFT_R2C.H>
+#include <AMReX_FFT_R2X.H>
 
 namespace amrex::FFT
 {
-
-/**
- * \brief Discrete Fourier Transform
- *
- * This class supports Fourier transforms between real and complex data. The
- * name R2C indicates that the forward transform converts real data to
- * complex data, while the backward transform converts complex data to real
- * data. It should be noted that both directions of transformation are
- * supported, not just from real to complex. The scaling follows the FFTW
- * convention, where applying the forward transform followed by the backward
- * transform scales the original data by the size of the input array.
- *
- * For more details, we refer the users to
- * https://amrex-codes.github.io/amrex/docs_html/FFT_Chapter.html.
- */
-template <typename T = Real, FFT::Direction D = FFT::Direction::both>
-class R2C
-{
-public:
-    using MF = std::conditional_t<std::is_same_v<T,Real>,
-                                  MultiFab, FabArray<BaseFab<T> > >;
-    using cMF = FabArray<BaseFab<GpuComplex<T> > >;
-
-    /**
-     * \brief Constructor
-     *
-     * \param domain the forward domain (i.e., the domain of the real data)
-     * \param info optional information
-     */
-    explicit R2C (Box const& domain, Info const& info = Info{});
-
-    ~R2C ();
-
-    R2C (R2C const&) = delete;
-    R2C (R2C &&) = delete;
-    R2C& operator= (R2C const&) = delete;
-    R2C& operator= (R2C &&) = delete;
-
-    /**
-     * \brief Forward and then backward transform
-     *
-     * This function is available only when this class template is
-     * instantiated for transforms in both directions. It's more efficient
-     * than calling the forward function that stores the spectral data in a
-     * caller provided container followed by the backward function, because
-     * this can avoid parallel communication between the internal data and
-     * the caller's data container.
-     *
-     * \param inmf         input data in MultiFab or FabArray<BaseFab<float>>
-     * \param outmf        output data in MultiFab or FabArray<BaseFab<float>>
-     * \param post_forward a callable object for processing the post-forward
-     *                     data before the backward transform. Its interface
-     *                     is `(int,int,int,GpuComplex<T>&)`, where the integers
-     *                     are indices in the spectral space, and the reference
-     *                     to the complex number allows for the modification of
-     *                     the spectral data at that location.
-     */
-    template <typename F, Direction DIR=D,
-              std::enable_if_t<DIR == Direction::both, int> = 0>
-    void forwardThenBackward (MF const& inmf, MF& outmf, F const& post_forward)
-    {
-        this->forward(inmf);
-        this->post_forward_doit(post_forward);
-        this->backward(outmf);
-    }
-
-    /**
-     * \brief Forward transform
-     *
-     * The output is stored in this object's internal data. This function is
-     * not available when this class template is instantiated for
-     * backward-only transform.
-     *
-     * \param inmf input data in MultiFab or FabArray<BaseFab<float>>
-     */
-    template <Direction DIR=D, std::enable_if_t<DIR == Direction::forward ||
-                                                DIR == Direction::both, int> = 0>
-    void forward (MF const& inmf);
-
-    /**
-     * \brief Forward transform
-     *
-     * This function is not available when this class template is
-     * instantiated for backward-only transform.
-     *
-     * \param inmf input data in MultiFab or FabArray<BaseFab<float>>
-     * \param outmf output data in FabArray<BaseFab<GpuComplex<T>>>
-     */
-    template <Direction DIR=D, std::enable_if_t<DIR == Direction::forward ||
-                                                DIR == Direction::both, int> = 0>
-    void forward (MF const& inmf, cMF& outmf);
-
-    /**
-     * \brief Backward transform
-     *
-     * This function is available only when this class template is
-     * instantiated for transforms in both directions.
-     *
-     * \param outmf output data in MultiFab or FabArray<BaseFab<float>>
-     */
-    template <Direction DIR=D, std::enable_if_t<DIR == Direction::both, int> = 0>
-    void backward (MF& outmf);
-
-    /**
-     * \brief Backward transform
-     *
-     * This function is not available when this class template is
-     * instantiated for forward-only transform.
-     *
-     * \param inmf input data in FabArray<BaseFab<GpuComplex<T>>>
-     * \param outmf output data in MultiFab or FabArray<BaseFab<float>>
-     */
-    template <Direction DIR=D, std::enable_if_t<DIR == Direction::backward ||
-                                                DIR == Direction::both, int> = 0>
-    void backward (cMF const& inmf, MF& outmf);
-
-    /**
-     * \brief Get the internal spectral data
-     *
-     * This function is not available when this class template is
-     * instantiated for backward-only transform. For performance reasons,
-     * the returned data array does not have the usual ordering of
-     * `(x,y,z)`. The order is specified in the second part of the return
-     * value.
-     */
-    template <Direction DIR=D, std::enable_if_t<DIR == Direction::forward ||
-                                                DIR == Direction::both, int> = 0>
-    std::pair<cMF*,IntVect> getSpectralData ();
-
-    struct Swap01
-    {
-        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept
-        {
-            return {i.y, i.x, i.z};
-        }
-
-        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept
-        {
-            return {i.y, i.x, i.z};
-        }
-
-        [[nodiscard]] IndexType operator() (IndexType it) const noexcept
-        {
-            return it;
-        }
-
-        [[nodiscard]] IndexType Inverse (IndexType it) const noexcept
-        {
-            return it;
-        }
-    };
-
-    struct Swap02
-    {
-        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept
-        {
-            return {i.z, i.y, i.x};
-        }
-
-        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept
-        {
-            return {i.z, i.y, i.x};
-        }
-
-        [[nodiscard]] IndexType operator() (IndexType it) const noexcept
-        {
-            return it;
-        }
-
-        [[nodiscard]] IndexType Inverse (IndexType it) const noexcept
-        {
-            return it;
-        }
-    };
-
-    struct RotateFwd
-    {
-        // dest -> src: (x,y,z) -> (y,z,x)
-        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept
-        {
-            return {i.y, i.z, i.x};
-        }
-
-        // src -> dest: (x,y,z) -> (z,x,y)
-        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept
-        {
-            return {i.z, i.x, i.y};
-        }
-
-        [[nodiscard]] IndexType operator() (IndexType it) const noexcept
-        {
-            return it;
-        }
-
-        [[nodiscard]] IndexType Inverse (IndexType it) const noexcept
-        {
-            return it;
-        }
-    };
-
-    struct RotateBwd
-    {
-        // dest -> src: (x,y,z) -> (z,x,y)
-        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept
-        {
-            return {i.z, i.x, i.y};
-        }
-
-        // src -> dest: (x,y,z) -> (y,z,x)
-        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept
-        {
-            return {i.y, i.z, i.x};
-        }
-
-        [[nodiscard]] IndexType operator() (IndexType it) const noexcept
-        {
-            return it;
-        }
-
-        [[nodiscard]] IndexType Inverse (IndexType it) const noexcept
-        {
-            return it;
-        }
-    };
-
-    // public for cuda
-    template <typename F>
-    void post_forward_doit (F const& post_forward);
-
-private:
-
-#if defined(AMREX_USE_CUDA)
-    using VendorPlan = cufftHandle;
-    using VendorPlan2 = VendorPlan;
-    using FFTComplex = std::conditional_t<std::is_same_v<float,T>,
-                                          cuComplex, cuDoubleComplex>;
-#elif defined(AMREX_USE_HIP)
-    using VendorPlan = rocfft_plan;
-    using VendorPlan2 = VendorPlan;
-    using FFTComplex = std::conditional_t<std::is_same_v<float,T>,
-                                          float2, double2>;
-#elif defined(AMREX_USE_SYCL)
-    using VendorPlan = oneapi::mkl::dft::descriptor<
-        std::is_same_v<float,T> ? oneapi::mkl::dft::precision::SINGLE
-                                : oneapi::mkl::dft::precision::DOUBLE,
-        oneapi::mkl::dft::domain::REAL> *;
-    using VendorPlan2 = oneapi::mkl::dft::descriptor<
-        std::is_same_v<float,T> ? oneapi::mkl::dft::precision::SINGLE
-                                : oneapi::mkl::dft::precision::DOUBLE,
-        oneapi::mkl::dft::domain::COMPLEX> *;
-    using FFTComplex = GpuComplex<T>;
-#else
-    using VendorPlan = std::conditional_t<std::is_same_v<float,T>,
-                                          fftwf_plan, fftw_plan>;
-    using VendorPlan2 = VendorPlan;
-    using FFTComplex = std::conditional_t<std::is_same_v<float,T>,
-                                          fftwf_complex, fftw_complex>;
-#endif
-
-    struct Plan {
-        bool defined = false;
-        VendorPlan plan = 0; // NOLINT
-    };
-
-    struct Plan2 {
-        bool defined = false;
-        VendorPlan2 plan = 0; // NOLINT
-    };
-
-    template <typename FA>
-    static typename FA::FABType::value_type *
-    get_fab (FA& fa) {
-        auto myproc = ParallelContext::MyProcSub();
-        if (myproc < fa.size()) {
-            return fa.fabPtr(myproc);
-        } else {
-            return nullptr;
-        }
-    }
-
-    static void exec_r2c (Plan plan, MF& in, cMF& out);
-    static void exec_c2r (Plan plan, cMF& in, MF& out);
-    template <Direction direction>
-    static void exec_c2c (Plan2 plan, cMF& inout);
-
-    template <typename P>
-    static void destroy_plan (P plan);
-    static std::pair<Plan2,Plan2> make_c2c_plans (cMF& inout);
-
-    void backward_doit (MF& outmf);
-
-    Plan m_fft_fwd_x{};
-    Plan m_fft_bwd_x{};
-    Plan2 m_fft_fwd_y{};
-    Plan2 m_fft_bwd_y{};
-    Plan2 m_fft_fwd_z{};
-    Plan2 m_fft_bwd_z{};
-
-    // Comm meta-data. In the forward phase, we start with (x,y,z),
-    // transpose to (y,x,z) and then (z,x,y). In the backward phase, we
-    // perform inverse transpose.
-    std::unique_ptr<MultiBlockCommMetaData> m_cmd_x2y; // (x,y,z) -> (y,x,z)
-    std::unique_ptr<MultiBlockCommMetaData> m_cmd_y2x; // (y,x,z) -> (x,y,z)
-    std::unique_ptr<MultiBlockCommMetaData> m_cmd_y2z; // (y,x,z) -> (z,x,y)
-    std::unique_ptr<MultiBlockCommMetaData> m_cmd_z2y; // (z,x,y) -> (y,x,z)
-    Swap01 m_dtos_x2y{};
-    Swap01 m_dtos_y2x{};
-    Swap02 m_dtos_y2z{};
-    Swap02 m_dtos_z2y{};
-
-    MF  m_rx;
-    cMF m_cx;
-    cMF m_cy;
-    cMF m_cz;
-
-    Box m_real_domain;
-    Box m_spectral_domain_x;
-    Box m_spectral_domain_y;
-    Box m_spectral_domain_z;
-
-    Info m_info;
-};
-
-template <typename T, Direction D>
-R2C<T,D>::R2C (Box const& domain, Info const& info)
-    : m_real_domain(domain),
-      m_spectral_domain_x(IntVect(0), IntVect(AMREX_D_DECL(domain.length(0)/2,
-                                                           domain.bigEnd(1),
-                                                           domain.bigEnd(2)))),
-#if (AMREX_SPACEDIM >= 2)
-      m_spectral_domain_y(IntVect(0), IntVect(AMREX_D_DECL(domain.bigEnd(1),
-                                                           domain.length(0)/2,
-                                                           domain.bigEnd(2)))),
-#if (AMREX_SPACEDIM == 3)
-      m_spectral_domain_z(IntVect(0), IntVect(AMREX_D_DECL(domain.bigEnd(2),
-                                                           domain.length(0)/2,
-                                                           domain.bigEnd(1)))),
-#endif
-#endif
-      m_info(info)
-{
-    static_assert(std::is_same_v<float,T> || std::is_same_v<double,T>);
-    AMREX_ALWAYS_ASSERT(m_real_domain.smallEnd() == 0 &&
-                        m_real_domain.length(0) > 1 &&
-                        m_real_domain.cellCentered());
-#if (AMREX_SPACEDIM == 3)
-    AMREX_ALWAYS_ASSERT(m_real_domain.length(2) > 1 || ! m_info.batch_mode);
-    AMREX_ALWAYS_ASSERT(m_real_domain.length(1) > 1 || m_real_domain.length(2) == 1);
-#else
-    AMREX_ALWAYS_ASSERT(! m_info.batch_mode);
-#endif
-
-    int myproc = ParallelContext::MyProcSub();
-    int nprocs = ParallelContext::NProcsSub();
-
-    auto bax = amrex::decompose(m_real_domain, nprocs, {AMREX_D_DECL(false,true,true)});
-    DistributionMapping dmx = detail::make_iota_distromap(bax.size());
-    m_rx.define(bax, dmx, 1, 0);
-
-    {
-        BoxList bl = bax.boxList();
-        for (auto & b : bl) {
-            b.setBig(0, m_spectral_domain_x.bigEnd(0));
-        }
-        BoxArray cbax(std::move(bl));
-        m_cx.define(cbax, dmx, 1, 0);
-    }
-
-    // plans for x-direction
-    if (myproc < m_rx.size())
-    {
-        Box const local_box = m_rx.boxArray()[myproc];
-        int n = local_box.length(0);
-        int howmany = AMREX_D_TERM(1, *local_box.length(1), *local_box.length(2));
-
-#if defined(AMREX_USE_CUDA)
-        if constexpr (D == Direction::both || D == Direction::forward) {
-            cufftType fwd_type = std::is_same_v<float,T> ? CUFFT_R2C : CUFFT_D2Z;
-            AMREX_CUFFT_SAFE_CALL
-                (cufftPlanMany(&m_fft_fwd_x.plan, 1, &n,
-                               nullptr, 1, m_real_domain.length(0),
-                               nullptr, 1, m_spectral_domain_x.length(0),
-                               fwd_type, howmany));
-            AMREX_CUFFT_SAFE_CALL(cufftSetStream(m_fft_fwd_x.plan, Gpu::gpuStream()));
-        }
-        if constexpr (D == Direction::both || D == Direction::backward) {
-            cufftType bwd_type = std::is_same_v<float,T> ? CUFFT_C2R : CUFFT_Z2D;
-            AMREX_CUFFT_SAFE_CALL
-                (cufftPlanMany(&m_fft_bwd_x.plan, 1, &n,
-                               nullptr, 1, m_spectral_domain_x.length(0),
-                               nullptr, 1, m_real_domain.length(0),
-                               bwd_type, howmany));
-            AMREX_CUFFT_SAFE_CALL(cufftSetStream(m_fft_bwd_x.plan, Gpu::gpuStream()));
-        }
-#elif defined(AMREX_USE_HIP)
-
-        auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
-        const std::size_t length = n;
-        if constexpr (D == Direction::both || D == Direction::forward) {
-            AMREX_ROCFFT_SAFE_CALL
-                (rocfft_plan_create(&m_fft_fwd_x.plan, rocfft_placement_notinplace,
-                                    rocfft_transform_type_real_forward, prec, 1,
-                                    &length, howmany, nullptr));
-        }
-        if constexpr (D == Direction::both || D == Direction::backward) {
-            AMREX_ROCFFT_SAFE_CALL
-                (rocfft_plan_create(&m_fft_bwd_x.plan, rocfft_placement_notinplace,
-                                    rocfft_transform_type_real_inverse, prec, 1,
-                                    &length, howmany, nullptr));
-        }
-
-#elif defined(AMREX_USE_SYCL)
-
-        m_fft_fwd_x.plan = new std::remove_pointer_t<VendorPlan>(n);
-        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                                    DFTI_NOT_INPLACE);
-        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
-                                    howmany);
-        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE,
-                                    m_real_domain.length(0));
-        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE,
-                                    m_spectral_domain_x.length(0));
-        std::array<std::int64_t,2> strides{0,1};
-        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES,
-                                    strides.data());
-        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES,
-                                    strides.data());
-        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
-                                    oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
-        m_fft_fwd_x.plan->commit(amrex::Gpu::Device::streamQueue());
-
-        m_fft_bwd_x.plan = m_fft_fwd_x.plan;
-
-#else /* FFTW */
-
-        auto* in = m_rx[myproc].dataPtr();
-        auto* out = (FFTComplex*)(m_cx[myproc].dataPtr());
-
-        if constexpr (std::is_same_v<float,T>) {
-            if constexpr (D == Direction::both || D == Direction::forward) {
-                m_fft_fwd_x.plan = fftwf_plan_many_dft_r2c
-                    (1, &n, howmany, in, nullptr, 1, m_real_domain.length(0),
-                     out, nullptr, 1, m_spectral_domain_x.length(0),
-                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
-            }
-            if constexpr (D == Direction::both || D == Direction::backward) {
-                m_fft_bwd_x.plan = fftwf_plan_many_dft_c2r
-                    (1, &n, howmany, out, nullptr, 1, m_spectral_domain_x.length(0),
-                     in, nullptr, 1, m_real_domain.length(0),
-                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
-            }
-        } else {
-            if constexpr (D == Direction::both || D == Direction::forward) {
-                m_fft_fwd_x.plan = fftw_plan_many_dft_r2c
-                    (1, &n, howmany, in, nullptr, 1, m_real_domain.length(0),
-                     out, nullptr, 1, m_spectral_domain_x.length(0),
-                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
-            }
-            if constexpr (D == Direction::both || D == Direction::backward) {
-                m_fft_bwd_x.plan = fftw_plan_many_dft_c2r
-                    (1, &n, howmany, out, nullptr, 1, m_spectral_domain_x.length(0),
-                     in, nullptr, 1, m_real_domain.length(0),
-                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
-            }
-        }
-#endif
-        if constexpr (D == Direction::both || D == Direction::forward) {
-            m_fft_fwd_x.defined = true;
-        }
-        if constexpr (D == Direction::both || D == Direction::backward) {
-            m_fft_bwd_x.defined = true;
-        }
-    }
-
-#if (AMREX_SPACEDIM >= 2)
-    DistributionMapping cdmy;
-    if (m_real_domain.length(1) > 1) {
-        auto cbay = amrex::decompose(m_spectral_domain_y, nprocs, {AMREX_D_DECL(false,true,true)});
-        if (cbay.size() == dmx.size()) {
-            cdmy = dmx;
-        } else {
-            cdmy = detail::make_iota_distromap(cbay.size());
-        }
-        m_cy.define(cbay, cdmy, 1, 0);
-
-        std::tie(m_fft_fwd_y, m_fft_bwd_y) = make_c2c_plans(m_cy);
-
-        // comm meta-data between x and y phases
-        m_cmd_x2y = std::make_unique<MultiBlockCommMetaData>
-            (m_cy, m_spectral_domain_y, m_cx, IntVect(0), m_dtos_x2y);
-        m_cmd_y2x = std::make_unique<MultiBlockCommMetaData>
-            (m_cx, m_spectral_domain_x, m_cy, IntVect(0), m_dtos_y2x);
-    }
-
-#if (AMREX_SPACEDIM == 3)
-    if (m_real_domain.length(1) > 1 &&
-        (! m_info.batch_mode && m_real_domain.length(2) > 1))
-    {
-        auto cbaz = amrex::decompose(m_spectral_domain_z, nprocs, {false,true,true});
-        DistributionMapping cdmz;
-        if (cbaz.size() == dmx.size()) {
-            cdmz = dmx;
-        } else if (cbaz.size() == cdmy.size()) {
-            cdmz = cdmy;
-        } else {
-            cdmz = detail::make_iota_distromap(cbaz.size());
-        }
-         m_cz.define(cbaz, cdmz, 1, 0);
-
-        std::tie(m_fft_fwd_z, m_fft_bwd_z) = make_c2c_plans(m_cz);
-
-        // comm meta-data between y and z phases
-        m_cmd_y2z = std::make_unique<MultiBlockCommMetaData>
-            (m_cz, m_spectral_domain_z, m_cy, IntVect(0), m_dtos_y2z);
-        m_cmd_z2y = std::make_unique<MultiBlockCommMetaData>
-            (m_cy, m_spectral_domain_y, m_cz, IntVect(0), m_dtos_z2y);
-    }
-#endif
-#endif
-}
-
-template <typename T, Direction D>
-template <typename P>
-void R2C<T,D>::destroy_plan (P plan)
-{
-    if (! plan.defined) { return; }
-
-#if defined(AMREX_USE_CUDA)
-    AMREX_CUFFT_SAFE_CALL(cufftDestroy(plan.plan));
-#elif defined(AMREX_USE_HIP)
-    AMREX_ROCFFT_SAFE_CALL(rocfft_plan_destroy(plan.plan));
-#elif defined(AMREX_USE_SYCL)
-    delete plan.plan;
-#else
-    if constexpr (std::is_same_v<float,T>) {
-        fftwf_destroy_plan(plan.plan);
-    } else {
-        fftw_destroy_plan(plan.plan);
-    }
-#endif
-
-    plan.defined = false;
-}
-
-template <typename T, Direction D>
-R2C<T,D>::~R2C<T,D> ()
-{
-#if defined(AMREX_USE_SYCL)
-    if constexpr (D == Direction::both || D == Direction::forward) {
-        destroy_plan(m_fft_fwd_x);
-        destroy_plan(m_fft_fwd_y);
-        destroy_plan(m_fft_fwd_z);
-    } else {
-        destroy_plan(m_fft_bwd_x);
-        destroy_plan(m_fft_bwd_y);
-        destroy_plan(m_fft_bwd_z);
-    }
-#else
-    destroy_plan(m_fft_fwd_x);
-    destroy_plan(m_fft_fwd_y);
-    destroy_plan(m_fft_fwd_z);
-    destroy_plan(m_fft_bwd_x);
-    destroy_plan(m_fft_bwd_y);
-    destroy_plan(m_fft_bwd_z);
-#endif
-}
-
-#ifdef AMREX_USE_HIP
-namespace detail { void hip_execute (rocfft_plan plan, void **in, void **out); }
-#endif
-
-#ifdef AMREX_USE_SYCL
-namespace detail
-{
-template <typename T, Direction direction, typename P, typename TI, typename TO>
-void sycl_execute (P plan, TI* in, TO* out)
-{
-    std::size_t workspaceSize = 0;
-    plan->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES,
-                    &workspaceSize);
-    auto* buffer = (T*)amrex::The_Arena()->alloc(workspaceSize);
-    plan->set_workspace(buffer);
-    sycl::event r;
-    if (std::is_same_v<TI,TO>) {
-        amrex::ignore_unused(in);
-        if constexpr (direction == Direction::forward) {
-            r = oneapi::mkl::dft::compute_forward(*plan, out);
-        } else {
-            r = oneapi::mkl::dft::compute_backward(*plan, out);
-        }
-    } else {
-        if constexpr (direction == Direction::forward) {
-            r = oneapi::mkl::dft::compute_forward(*plan, in, out);
-        } else {
-            r = oneapi::mkl::dft::compute_backward(*plan, in, out);
-        }
-    }
-    r.wait();
-    amrex::The_Arena()->free(buffer);
-}
-}
-#endif
-
-template <typename T, Direction D>
-void R2C<T,D>::exec_r2c (Plan plan, MF& in, cMF& out)
-{
-    if (! plan.defined) { return; }
-
-#if defined(AMREX_USE_GPU)
-    auto* pin = in[ParallelContext::MyProcSub()].dataPtr();
-    auto* pout = out[ParallelContext::MyProcSub()].dataPtr();
-#else
-    amrex::ignore_unused(in,out);
-#endif
-
-#if defined(AMREX_USE_CUDA)
-    if constexpr (std::is_same_v<float,T>) {
-        AMREX_CUFFT_SAFE_CALL(cufftExecR2C(plan.plan, pin, (FFTComplex*)pout));
-    } else {
-        AMREX_CUFFT_SAFE_CALL(cufftExecD2Z(plan.plan, pin, (FFTComplex*)pout));
-    }
-#elif defined(AMREX_USE_HIP)
-    detail::hip_execute(plan.plan, (void**)&pin, (void**)&pout);
-#elif defined(AMREX_USE_SYCL)
-    detail::sycl_execute<T,Direction::forward>(plan.plan, pin, (std::complex<T>*)pout);
-#else
-    if constexpr (std::is_same_v<float,T>) {
-        fftwf_execute(plan.plan);
-    } else {
-        fftw_execute(plan.plan);
-    }
-#endif
-}
-
-template <typename T, Direction D>
-void R2C<T,D>::exec_c2r (Plan plan, cMF& in, MF& out)
-{
-    if (! plan.defined) { return; }
-
-#if defined(AMREX_USE_GPU)
-    auto* pin = in[ParallelContext::MyProcSub()].dataPtr();
-    auto* pout = out[ParallelContext::MyProcSub()].dataPtr();
-#else
-    amrex::ignore_unused(in,out);
-#endif
-
-#if defined(AMREX_USE_CUDA)
-    if constexpr (std::is_same_v<float,T>) {
-        AMREX_CUFFT_SAFE_CALL(cufftExecC2R(plan.plan, (FFTComplex*)pin, pout));
-    } else {
-        AMREX_CUFFT_SAFE_CALL(cufftExecZ2D(plan.plan, (FFTComplex*)pin, pout));
-    }
-#elif defined(AMREX_USE_HIP)
-    detail::hip_execute(plan.plan, (void**)&pin, (void**)&pout);
-#elif defined(AMREX_USE_SYCL)
-    detail::sycl_execute<T,Direction::backward>(plan.plan, (std::complex<T>*)pin, pout);
-#else
-    if constexpr (std::is_same_v<float,T>) {
-        fftwf_execute(plan.plan);
-    } else {
-        fftw_execute(plan.plan);
-    }
-#endif
-}
-
-template <typename T, Direction D>
-template <Direction direction>
-void R2C<T,D>::exec_c2c (Plan2 plan, cMF& inout)
-{
-    if (! plan.defined) { return; }
-
-    amrex::ignore_unused(inout);
-#if defined(AMREX_USE_GPU)
-    auto* p = inout[ParallelContext::MyProcSub()].dataPtr();
-#endif
-
-#if defined(AMREX_USE_CUDA)
-    auto cufft_direction = (direction == Direction::forward) ? CUFFT_FORWARD : CUFFT_INVERSE;
-    if constexpr (std::is_same_v<float,T>) {
-        AMREX_CUFFT_SAFE_CALL(cufftExecC2C(plan.plan, (FFTComplex*)p, (FFTComplex*)p,
-                                           cufft_direction));
-    } else {
-        AMREX_CUFFT_SAFE_CALL(cufftExecZ2Z(plan.plan, (FFTComplex*)p, (FFTComplex*)p,
-                                           cufft_direction));
-    }
-#elif defined(AMREX_USE_HIP)
-    detail::hip_execute(plan.plan, (void**)&p, (void**)&p);
-#elif defined(AMREX_USE_SYCL)
-    detail::sycl_execute<T,direction>(plan.plan, (std::complex<T>*)p, (std::complex<T>*)p);
-#else
-    if constexpr (std::is_same_v<float,T>) {
-        fftwf_execute(plan.plan);
-    } else {
-        fftw_execute(plan.plan);
-    }
-#endif
-}
-
-template <typename T, Direction D>
-template <Direction DIR, std::enable_if_t<DIR == Direction::forward ||
-                                          DIR == Direction::both, int> >
-void R2C<T,D>::forward (MF const& inmf)
-{
-    m_rx.ParallelCopy(inmf, 0, 0, 1);
-    exec_r2c(m_fft_fwd_x, m_rx, m_cx);
-
-    if (                          m_cmd_x2y) {
-        ParallelCopy(m_cy, m_cx, *m_cmd_x2y, 0, 0, 1, m_dtos_x2y);
-    }
-    exec_c2c<Direction::forward>(m_fft_fwd_y, m_cy);
-
-    if (                          m_cmd_y2z) {
-        ParallelCopy(m_cz, m_cy, *m_cmd_y2z, 0, 0, 1, m_dtos_y2z);
-    }
-    exec_c2c<Direction::forward>(m_fft_fwd_z, m_cz);
-}
-
-template <typename T, Direction D>
-template <Direction DIR, std::enable_if_t<DIR == Direction::both, int> >
-void R2C<T,D>::backward (MF& outmf)
-{
-    backward_doit(outmf);
-}
-
-template <typename T, Direction D>
-void R2C<T,D>::backward_doit (MF& outmf)
-{
-    exec_c2c<Direction::backward>(m_fft_bwd_z, m_cz);
-    if (                          m_cmd_z2y) {
-        ParallelCopy(m_cy, m_cz, *m_cmd_z2y, 0, 0, 1, m_dtos_z2y);
-    }
-
-    exec_c2c<Direction::backward>(m_fft_bwd_y, m_cy);
-    if (                          m_cmd_y2x) {
-        ParallelCopy(m_cx, m_cy, *m_cmd_y2x, 0, 0, 1, m_dtos_y2x);
-    }
-
-    exec_c2r(m_fft_bwd_x, m_cx, m_rx);
-    outmf.ParallelCopy(m_rx, 0, 0, 1);
-}
-
-template <typename T, Direction D>
-std::pair<typename R2C<T,D>::Plan2, typename R2C<T,D>::Plan2>
-R2C<T,D>::make_c2c_plans (cMF& inout)
-{
-    Plan2 fwd;
-    Plan2 bwd;
-
-    auto* fab = get_fab(inout);
-    if (!fab) { return {fwd, bwd};}
-
-    Box const& local_box = fab->box();
-
-    int n = local_box.length(0);
-    int howmany = AMREX_D_TERM(1, *local_box.length(1), *local_box.length(2));
-
-#if defined(AMREX_USE_CUDA)
-
-    if constexpr (D == Direction::both || D == Direction::forward) {
-        cufftType fwd_type = std::is_same_v<float,T> ? CUFFT_C2C : CUFFT_Z2Z;
-        AMREX_CUFFT_SAFE_CALL
-            (cufftPlanMany(&fwd.plan, 1, &n, nullptr, 1, n, nullptr, 1, n,
-                           fwd_type, howmany));
-        AMREX_CUFFT_SAFE_CALL(cufftSetStream(fwd.plan, Gpu::gpuStream()));
-    }
-    if constexpr (D == Direction::both || D == Direction::backward) {
-        cufftType bwd_type = std::is_same_v<float,T> ? CUFFT_C2C : CUFFT_Z2Z;
-        AMREX_CUFFT_SAFE_CALL
-            (cufftPlanMany(&bwd.plan, 1, &n, nullptr, 1, n, nullptr, 1, n,
-                           bwd_type, howmany));
-        AMREX_CUFFT_SAFE_CALL(cufftSetStream(bwd.plan, Gpu::gpuStream()));
-    }
-
-#elif defined(AMREX_USE_HIP)
-
-    auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
-    const std::size_t length = n;
-    if constexpr (D == Direction::both || D == Direction::forward) {
-        AMREX_ROCFFT_SAFE_CALL
-            (rocfft_plan_create(&fwd.plan, rocfft_placement_inplace,
-                                rocfft_transform_type_complex_forward, prec, 1,
-                                &length, howmany, nullptr));
-    }
-    if constexpr (D == Direction::both || D == Direction::backward) {
-        AMREX_ROCFFT_SAFE_CALL
-            (rocfft_plan_create(&bwd.plan, rocfft_placement_inplace,
-                                rocfft_transform_type_complex_inverse, prec, 1,
-                                &length, howmany, nullptr));
-    }
-
-#elif defined(AMREX_USE_SYCL)
-
-    fwd.plan = new std::remove_pointer_t<VendorPlan2>(n);
-    fwd.plan->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                        DFTI_INPLACE);
-    fwd.plan->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
-                        howmany);
-    fwd.plan->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, n);
-    fwd.plan->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, n);
-    std::array<std::int64_t,2> strides{0,1};
-    fwd.plan->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
-    fwd.plan->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides.data());
-    fwd.plan->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
-                        oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
-    fwd.plan->commit(amrex::Gpu::Device::streamQueue());
-
-    bwd.plan = fwd.plan;
-
-#else
-    auto* pinout = (FFTComplex*)fab->dataPtr();
-
-    if constexpr (std::is_same_v<float,T>) {
-        if constexpr (D == Direction::both || D == Direction::forward) {
-            fwd.plan = fftwf_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n,
-                                           pinout, nullptr, 1, n, -1, FFTW_ESTIMATE);
-        }
-        if constexpr (D == Direction::both || D == Direction::backward) {
-            bwd.plan = fftwf_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n,
-                                           pinout, nullptr, 1, n, +1, FFTW_ESTIMATE);
-        }
-    } else {
-        if constexpr (D == Direction::both || D == Direction::forward) {
-            fwd.plan = fftw_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n,
-                                          pinout, nullptr, 1, n, -1, FFTW_ESTIMATE);
-        }
-        if constexpr (D == Direction::both || D == Direction::backward) {
-            bwd.plan = fftw_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n,
-                                          pinout, nullptr, 1, n, +1, FFTW_ESTIMATE);
-        }
-    }
-#endif
-
-    if constexpr (D == Direction::both || D == Direction::forward) {
-        fwd.defined = true;
-    }
-    if constexpr (D == Direction::both || D == Direction::backward) {
-        bwd.defined = true;
-    }
-
-    return {fwd,bwd};
-}
-
-template <typename T, Direction D>
-template <typename F>
-void R2C<T,D>::post_forward_doit (F const& post_forward)
-{
-    if (m_info.batch_mode) {
-        amrex::Abort("xxxxx todo: post_forward");
-    } else {
-        if (                           ! m_cz.empty()) {
-            auto* spectral_fab = get_fab(m_cz);
-            if (spectral_fab) {
-                auto const& a = spectral_fab->array(); // m_cz's ordering is z,x,y
-                ParallelFor(spectral_fab->box(),
-                [=] AMREX_GPU_DEVICE (int iz, int jx, int ky)
-                {
-                    post_forward(jx,ky,iz,a(iz,jx,ky));
-                });
-            }
-        } else if (                    ! m_cy.empty()) {
-            auto* spectral_fab = get_fab(m_cy);
-            if (spectral_fab) {
-                auto const& a = spectral_fab->array(); // m_cy's ordering is y,x,z
-                ParallelFor(spectral_fab->box(),
-                [=] AMREX_GPU_DEVICE (int iy, int jx, int k)
-                {
-                    post_forward(jx,iy,k,a(iy,jx,k));
-                });
-            }
-        } else {
-            auto* spectral_fab = get_fab(m_cx);
-            if (spectral_fab) {
-                auto const& a = spectral_fab->array();
-                ParallelFor(spectral_fab->box(),
-                [=] AMREX_GPU_DEVICE (int i, int j, int k)
-                {
-                    post_forward(i,j,k,a(i,j,k));
-                });
-            }
-        }
-    }
-}
-
-template <typename T, Direction D>
-template <Direction DIR, std::enable_if_t<DIR == Direction::forward ||
-                                          DIR == Direction::both, int> >
-std::pair<typename R2C<T,D>::cMF *, IntVect>
-R2C<T,D>::getSpectralData ()
-{
-    if (!m_cz.empty()) {
-        return std::make_pair(&m_cz, IntVect{AMREX_D_DECL(2,0,1)});
-    } else if (!m_cy.empty()) {
-        return std::make_pair(&m_cy, IntVect{AMREX_D_DECL(1,0,2)});
-    } else {
-        return std::make_pair(&m_cx, IntVect{AMREX_D_DECL(0,1,2)});
-    }
-}
-
-template <typename T, Direction D>
-template <Direction DIR, std::enable_if_t<DIR == Direction::forward ||
-                                          DIR == Direction::both, int> >
-void R2C<T,D>::forward (MF const& inmf, cMF& outmf)
-{
-    forward(inmf);
-    if (!m_cz.empty()) { // m_cz's order (z,x,y) -> (x,y,z)
-        RotateBwd dtos{};
-        MultiBlockCommMetaData cmd
-            (outmf, m_spectral_domain_x, m_cz, IntVect(0), dtos);
-        ParallelCopy(outmf, m_cz, cmd, 0, 0, 1, dtos);
-    } else if (!m_cy.empty()) { // m_cy's order (y,x,z) -> (x,y,z)
-        MultiBlockCommMetaData cmd
-            (outmf, m_spectral_domain_x, m_cy, IntVect(0), m_dtos_y2x);
-        ParallelCopy(outmf, m_cy, cmd, 0, 0, 1, m_dtos_y2x);
-    } else {
-        outmf.ParallelCopy(m_cx, 0, 0, 1);
-    }
-}
-
-template <typename T, Direction D>
-template <Direction DIR, std::enable_if_t<DIR == Direction::backward ||
-                                          DIR == Direction::both, int> >
-void R2C<T,D>::backward (cMF const& inmf, MF& outmf)
-{
-    if (!m_cz.empty()) { // (x,y,z) -> m_cz's order (z,x,y)
-        RotateFwd dtos{};
-        MultiBlockCommMetaData cmd
-            (m_cz, m_spectral_domain_z, inmf, IntVect(0), dtos);
-        ParallelCopy(m_cz, inmf, cmd, 0, 0, 1, dtos);
-    } else if (!m_cy.empty()) { // (x,y,z) -> m_cy's ordering (y,x,z)
-        MultiBlockCommMetaData cmd
-            (m_cy, m_spectral_domain_y, inmf, IntVect(0), m_dtos_x2y);
-        ParallelCopy(m_cy, inmf, cmd, 0, 0, 1, m_dtos_x2y);
-    } else {
-        m_cx.ParallelCopy(inmf, 0, 0, 1);
-    }
-    backward_doit(outmf);
-}
-
+    void Initialize (); //!< One-time FFT setup; registers Finalize() via amrex::ExecOnFinalize
+    void Finalize ();   //!< Destroys cached plans and resets the module if it was initialized
+    void Clear ();      //!< Destroys every vendor plan held in the plan caches
 }
 
 #endif
diff --git a/Src/FFT/AMReX_FFT.cpp b/Src/FFT/AMReX_FFT.cpp
index 68984a8f24..91ac1a7a92 100644
--- a/Src/FFT/AMReX_FFT.cpp
+++ b/Src/FFT/AMReX_FFT.cpp
@@ -1,5 +1,86 @@
 #include <AMReX_FFT.H>
-#include <algorithm>
+#include <AMReX_FFT_Helper.H>
+
+#include <map>
+
+namespace amrex::FFT
+{
+
+namespace
+{
+    bool s_initialized = false;     // set by Initialize(), reset by Finalize()
+    std::map<Key, PlanD> s_plans_d; // Key -> cached vendor plan (destroyed via Plan<double>)
+    std::map<Key, PlanF> s_plans_f; // Key -> cached vendor plan (destroyed via Plan<float>)
+}
+
+// One-time FFT module setup. Every call also registers Finalize() through
+// amrex::ExecOnFinalize; repeats are harmless because Finalize() checks s_initialized.
+void Initialize ()
+{
+    if (!s_initialized) {
+        s_initialized = true;
+#if defined(AMREX_USE_HIP) && defined(AMREX_USE_FFT)
+        AMREX_ROCFFT_SAFE_CALL(rocfft_setup());
+#endif
+    }
+
+    amrex::ExecOnFinalize(amrex::FFT::Finalize);
+}
+
+// Tear down the FFT module: destroy cached plans, then (HIP only) clean up rocFFT.
+// Does nothing unless Initialize() has run since the last Finalize().
+void Finalize ()
+{
+    if (!s_initialized) { return; }
+    s_initialized = false;
+
+    // Cached plans are destroyed before the vendor library itself is cleaned up.
+    Clear();
+#if defined(AMREX_USE_HIP) && defined(AMREX_USE_FFT)
+    AMREX_ROCFFT_SAFE_CALL(rocfft_cleanup());
+#endif
+}
+
+// Destroy every cached vendor plan and empty both caches, so that no stale
+// handle can be returned by a later lookup or destroyed a second time.
+void Clear ()
+{
+    for (auto& [k, p] : s_plans_d) { Plan<double>::destroy_vendor_plan(p); }
+    s_plans_d.clear();
+
+    for (auto& [k, p] : s_plans_f) { Plan<float>::destroy_vendor_plan(p); }
+    s_plans_f.clear();
+}
+
+// Return a pointer to the vendor plan cached under `key` in s_plans_d, or
+// nullptr when no plan has been stored for that key.
+PlanD* get_vendor_plan_d (Key const& key)
+{
+    auto const hit = s_plans_d.find(key);
+    if (hit == s_plans_d.end()) { return nullptr; }
+    return &(hit->second);
+}
+
+// Return a pointer to the vendor plan cached under `key` in s_plans_f, or
+// nullptr when no plan has been stored for that key.
+PlanF* get_vendor_plan_f (Key const& key)
+{
+    auto const hit = s_plans_f.find(key);
+    if (hit == s_plans_f.end()) { return nullptr; }
+    return &(hit->second);
+}
+
+void add_vendor_plan_d (Key const& key, PlanD plan)
+{
+    s_plans_d[key] = plan; // replaces any entry already stored under key without destroying it
+}
+
+void add_vendor_plan_f (Key const& key, PlanF plan)
+{
+    s_plans_f[key] = plan; // replaces any entry already stored under key without destroying it
+}
+
+}
 
 namespace amrex::FFT::detail
 {
diff --git a/Src/FFT/AMReX_FFT_Helper.H b/Src/FFT/AMReX_FFT_Helper.H
index c8ae2b74ea..efe7ab0b1e 100644
--- a/Src/FFT/AMReX_FFT_Helper.H
+++ b/Src/FFT/AMReX_FFT_Helper.H
@@ -2,12 +2,55 @@
 #define AMREX_FFT_HELPER_H_
 #include <AMReX_Config.H>
 
+#include <AMReX.H>
+#include <AMReX_BLProfiler.H>
+#include <AMReX_DataAllocator.H>
 #include <AMReX_DistributionMapping.H>
+#include <AMReX_Enum.H>
+#include <AMReX_Gpu.H>
+#include <AMReX_GpuComplex.H>
+#include <AMReX_Math.H>
+
+#if defined(AMREX_USE_CUDA)
+#  include <cufft.h>
+#  include <cuComplex.h>
+#elif defined(AMREX_USE_HIP)
+#  if __has_include(<rocfft/rocfft.h>)  // ROCm 5.3+
+#    include <rocfft/rocfft.h>
+#  else
+#    include <rocfft.h>
+#  endif
+#  include <hip/hip_complex.h>
+#elif defined(AMREX_USE_SYCL)
+#  if __has_include(<oneapi/mkl/dft.hpp>) // oneAPI 2025.0
+#    include <oneapi/mkl/dft.hpp>
+#else
+#    define AMREX_USE_MKL_DFTI_2024 1
+#    include <oneapi/mkl/dfti.hpp>
+#  endif
+#else
+#  include <fftw3.h>
+#endif
+
+#include <algorithm>
+#include <complex>
+#include <limits>
+#include <memory>
+#include <tuple>
+#include <utility>
+#include <variant>
 
 namespace amrex::FFT
 {
 
-enum struct Direction { forward, backward, both };
+enum struct Direction { forward, backward, both, none };
+// NOTE(review): slab vs. pencil presumably names the domain decomposition -- confirm
+enum struct DomainStrategy { slab, pencil };
+// Passed in pairs; get_fftw_kind maps the even/odd pairs to FFTW REDFT/RODFT kinds
+AMREX_ENUM( Boundary, periodic, even, odd );
+// Stored in Plan::kind; init_r2c/init_c2c choose the _f or _b value from Direction
+enum struct Kind { none, r2c_f, r2c_b, c2c_f, c2c_b, r2r_ee_f, r2r_ee_b,
+                   r2r_oo_f, r2r_oo_b, r2r_eo, r2r_oe };
 
 struct Info
 {
@@ -16,14 +59,1391 @@ struct Info
     //! batch size.
     bool batch_mode = false;
 
+    //! Max number of processes to use
+    int nprocs = std::numeric_limits<int>::max();
+
     Info& setBatchMode (bool x) { batch_mode = x; return *this; }
+    Info& setNumProcs (int n) { nprocs = n; return *this; }
 };
 
+#ifdef AMREX_USE_HIP
+namespace detail { void hip_execute (rocfft_plan plan, void **in, void **out); }
+#endif
+
+#ifdef AMREX_USE_SYCL
+namespace detail
+{
+// Run a oneMKL DFT with a workspace taken from The_Arena(), then wait and free it.
+template <typename T, Direction direction, typename P, typename TI, typename TO>
+void sycl_execute (P* plan, TI* in, TO* out)
+{
+#ifndef AMREX_USE_MKL_DFTI_2024
+    std::int64_t ws_bytes = 0;
+#else
+    std::size_t ws_bytes = 0;
+#endif
+    plan->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES, &ws_bytes);
+    auto* workspace = (T*)amrex::The_Arena()->alloc(ws_bytes);
+    plan->set_workspace(workspace);
+    sycl::event done;
+    if (std::is_same_v<TI,TO>) { // same element type: transform `out` in place
+        amrex::ignore_unused(in);
+        if constexpr (direction == Direction::forward) {
+            done = oneapi::mkl::dft::compute_forward(*plan, out);
+        } else {
+            done = oneapi::mkl::dft::compute_backward(*plan, out);
+        }
+    } else {                     // different element types: transform `in` into `out`
+        if constexpr (direction == Direction::forward) {
+            done = oneapi::mkl::dft::compute_forward(*plan, in, out);
+        } else {
+            done = oneapi::mkl::dft::compute_backward(*plan, in, out);
+        }
+    }
+    done.wait(); // the workspace is released only after the transform has completed
+    amrex::The_Arena()->free(workspace);
+}
+}
+#endif
+
+template <typename T>
+struct Plan
+{
+#if defined(AMREX_USE_CUDA)
+    using VendorPlan = cufftHandle;
+    using VendorComplex = std::conditional_t<std::is_same_v<float,T>,
+                                             cuComplex, cuDoubleComplex>;
+#elif defined(AMREX_USE_HIP)
+    using VendorPlan = rocfft_plan;
+    using VendorComplex = std::conditional_t<std::is_same_v<float,T>,
+                                             float2, double2>;
+#elif defined(AMREX_USE_SYCL)
+    using mkl_desc_r = oneapi::mkl::dft::descriptor<std::is_same_v<float,T>
+                                     ? oneapi::mkl::dft::precision::SINGLE
+                                     : oneapi::mkl::dft::precision::DOUBLE,
+                                     oneapi::mkl::dft::domain::REAL>;
+    using mkl_desc_c = oneapi::mkl::dft::descriptor<std::is_same_v<float,T>
+                                     ? oneapi::mkl::dft::precision::SINGLE
+                                     : oneapi::mkl::dft::precision::DOUBLE,
+                                     oneapi::mkl::dft::domain::COMPLEX>;
+    using VendorPlan = std::variant<mkl_desc_r*,mkl_desc_c*>;
+    using VendorComplex = std::complex<T>;
+#else
+    using VendorPlan = std::conditional_t<std::is_same_v<float,T>,
+                                          fftwf_plan, fftw_plan>;
+    using VendorComplex = std::conditional_t<std::is_same_v<float,T>,
+                                             fftwf_complex, fftw_complex>;
+#endif
+
+    int n = 0;                  // transform size; set by init_r2c / init_c2c
+    int howmany = 0;            // number of batched transforms; set by init_r2c / init_c2c
+    Kind kind = Kind::none;     // which transform this plan performs
+    bool r2r_data_is_complex = false;
+    bool defined = false;       // set by init_r2c / init_c2c, cleared by destroy()
+    bool defined2 = false;      // flag guarding plan2 in destroy()
+    VendorPlan plan{};
+    VendorPlan plan2{};         // NOTE(review): destroy() releases this on non-GPU builds only
+    void* pf = nullptr;         // init_r2c stores the real buffer here; init_c2c stores p
+    void* pb = nullptr;         // init_r2c stores the complex buffer here; init_c2c stores p
+
+#ifdef AMREX_USE_GPU
+    // Replace the stored data pointers without touching the vendor plan:
+    // pf takes p0 and pb takes p1.
+    // Only compiled when AMREX_USE_GPU is defined.
+    void set_ptrs (void* p0, void* p1) { pf = p0; pb = p1; }
+#endif
+
+    // Release the vendor plan(s) and clear their flags. Calling this again is a
+    // no-op, because each plan is only destroyed while its flag is still set.
+    void destroy ()
+    {
+        if (defined) { destroy_vendor_plan(plan); }
+        defined = false;
+#if !defined(AMREX_USE_GPU)
+        // NOTE(review): plan2 is released on non-GPU builds only; confirm that
+        // GPU builds never create it.
+        if (defined2) { destroy_vendor_plan(plan2); }
+        defined2 = false;
+#endif
+    }
+
+    // Create a batched real<->complex plan over `box` for direction D. A rank-1 plan
+    // transforms along dimension 0; with is_2d_transform, dimensions 0 and 1 form one
+    // 2D transform. `pr`/`pc` are the real/complex buffers: FFTW plans against them,
+    // and every backend records them in pf/pb. Also sets kind, n, howmany and defined.
+    template <Direction D>
+    void init_r2c (Box const& box, T* pr, VendorComplex* pc, bool is_2d_transform = false)
+    {
+        static_assert(D == Direction::forward || D == Direction::backward);
+        int rank = is_2d_transform ? 2 : 1;
+        kind = (D == Direction::forward) ? Kind::r2c_f : Kind::r2c_b;
+        defined = true;
+        pf = (void*)pr;
+        pb = (void*)pc;
+
+        int len[2] = {};
+        if (rank == 1) {
+            len[0] = box.length(0);
+            len[1] = box.length(0); // Not used except for HIP. Yes it's `(0)`.
+        } else {
+            len[0] = box.length(1); // Most FFT libraries assume row-major ordering
+            len[1] = box.length(0); // except for rocfft
+        }
+        int nr = (rank == 1) ? len[0] : len[0]*len[1];
+        n = nr;
+        int nc = (rank == 1) ? (len[0]/2+1) : (len[1]/2+1)*len[0];
+#if (AMREX_SPACEDIM == 1)
+        howmany = 1;
+#else
+        howmany = (rank == 1) ? AMREX_D_TERM(1, *box.length(1), *box.length(2))
+                              : AMREX_D_TERM(1, *1            , *box.length(2));
+#endif
+        amrex::ignore_unused(nc);
+
+#if defined(AMREX_USE_CUDA)
+        AMREX_CUFFT_SAFE_CALL(cufftCreate(&plan));
+        AMREX_CUFFT_SAFE_CALL(cufftSetAutoAllocation(plan, 0));
+        std::size_t work_size;
+        if constexpr (D == Direction::forward) {
+            cufftType fwd_type = std::is_same_v<float,T> ? CUFFT_R2C : CUFFT_D2Z;
+            AMREX_CUFFT_SAFE_CALL
+                (cufftMakePlanMany(plan, rank, len, nullptr, 1, nr, nullptr, 1, nc, fwd_type, howmany, &work_size));
+        } else {
+            cufftType bwd_type = std::is_same_v<float,T> ? CUFFT_C2R : CUFFT_Z2D;
+            AMREX_CUFFT_SAFE_CALL
+                (cufftMakePlanMany(plan, rank, len, nullptr, 1, nc, nullptr, 1, nr, bwd_type, howmany, &work_size));
+        }
+
+#elif defined(AMREX_USE_HIP)
+        auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
+        // switch to column-major ordering
+        std::size_t length[2] = {std::size_t(len[1]), std::size_t(len[0])};
+        if constexpr (D == Direction::forward) {
+            AMREX_ROCFFT_SAFE_CALL
+                (rocfft_plan_create(&plan, rocfft_placement_notinplace,
+                                    rocfft_transform_type_real_forward, prec, rank,
+                                    length, howmany, nullptr));
+        } else {
+            AMREX_ROCFFT_SAFE_CALL
+                (rocfft_plan_create(&plan, rocfft_placement_notinplace,
+                                    rocfft_transform_type_real_inverse, prec, rank,
+                                    length, howmany, nullptr));
+        }
+
+#elif defined(AMREX_USE_SYCL)
+        mkl_desc_r* pp;
+        if (rank == 1) {
+            pp = new mkl_desc_r(len[0]);
+        } else {
+            pp = new mkl_desc_r({std::int64_t(len[0]), std::int64_t(len[1])});
+        }
+#ifndef AMREX_USE_MKL_DFTI_2024
+        pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                      oneapi::mkl::dft::config_value::NOT_INPLACE);
+#else
+        pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_NOT_INPLACE);
+#endif
+        pp->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, howmany);
+        pp->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, nr);
+        pp->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, nc);
+        // 2D rows: len[1] reals (forward side) vs. len[1]/2+1 complex values (backward side)
+        std::vector<std::int64_t> fwd_strides = {0, 1};
+        std::vector<std::int64_t> bwd_strides = {0, 1};
+        if (rank == 2) {
+            fwd_strides = {0, len[1], 1};
+            bwd_strides = {0, len[1]/2+1, 1};
+        }
+#ifndef AMREX_USE_MKL_DFTI_2024
+        pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, fwd_strides);
+        pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, bwd_strides);
+#else
+        pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, fwd_strides.data());
+        pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, bwd_strides.data());
+#endif
+        pp->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
+                      oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
+        pp->commit(amrex::Gpu::Device::streamQueue());
+        plan = pp;
+
+#else /* FFTW */
+        if constexpr (std::is_same_v<float,T>) {
+            if constexpr (D == Direction::forward) {
+                plan = fftwf_plan_many_dft_r2c
+                    (rank, len, howmany, pr, nullptr, 1, nr, pc, nullptr, 1, nc,
+                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
+            } else {
+                plan = fftwf_plan_many_dft_c2r
+                    (rank, len, howmany, pc, nullptr, 1, nc, pr, nullptr, 1, nr,
+                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
+            }
+        } else {
+            if constexpr (D == Direction::forward) {
+                plan = fftw_plan_many_dft_r2c
+                    (rank, len, howmany, pr, nullptr, 1, nr, pc, nullptr, 1, nc,
+                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
+            } else {
+                plan = fftw_plan_many_dft_c2r
+                    (rank, len, howmany, pc, nullptr, 1, nc, pr, nullptr, 1, nr,
+                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
+            }
+        }
+#endif
+    }
+
+    template <Direction D, int M>
+    void init_r2c (IntVectND<M> const& fft_size, void*, void*, bool cache);
+
+    // Create a batched, in-place complex-to-complex plan for direction D. Each
+    // transform has box.length(0) points; the remaining box dimensions give the
+    // batch count. `p` is stored in both pf and pb, and FFTW also plans against it.
+    // Sets kind, n, howmany and marks the plan as defined.
+    template <Direction D>
+    void init_c2c (Box const& box, VendorComplex* p)
+    {
+        static_assert(D == Direction::forward || D == Direction::backward);
+
+        kind = (D == Direction::forward) ? Kind::c2c_f : Kind::c2c_b;
+        defined = true;
+        pf = (void*)p;
+        pb = (void*)p;
+
+        // n points per transform along dimension 0, one transform per remaining index
+        n = box.length(0);
+        howmany = AMREX_D_TERM(1, *box.length(1), *box.length(2));
+
+#if defined(AMREX_USE_CUDA)
+        AMREX_CUFFT_SAFE_CALL(cufftCreate(&plan));
+        AMREX_CUFFT_SAFE_CALL(cufftSetAutoAllocation(plan, 0));
+
+        // rank 1, n points per transform, unit stride, consecutive batches n apart
+        cufftType c2c_type = std::is_same_v<float,T> ? CUFFT_C2C : CUFFT_Z2Z;
+        std::size_t work_size;
+        AMREX_CUFFT_SAFE_CALL
+            (cufftMakePlanMany(plan, 1, &n, nullptr, 1, n, nullptr, 1, n, c2c_type, howmany, &work_size));
+
+#elif defined(AMREX_USE_HIP)
+
+        auto prec = std::is_same_v<float,T> ? rocfft_precision_single
+                                            : rocfft_precision_double;
+        // rocFFT takes the direction as part of the transform type
+        auto xform = (D == Direction::forward) ? rocfft_transform_type_complex_forward
+                                               : rocfft_transform_type_complex_inverse;
+        const std::size_t length = n;
+        AMREX_ROCFFT_SAFE_CALL
+            (rocfft_plan_create(&plan, rocfft_placement_inplace, xform, prec, 1,
+                                &length, howmany, nullptr));
+
+#elif defined(AMREX_USE_SYCL)
+
+        auto* desc = new mkl_desc_c(n);
+#ifndef AMREX_USE_MKL_DFTI_2024
+        desc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                        oneapi::mkl::dft::config_value::INPLACE);
+#else
+        desc->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_INPLACE);
+#endif
+        desc->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, howmany);
+        desc->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, n);
+        desc->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, n);
+        // zero offset and unit stride; the same layout is used for both directions
+        std::vector<std::int64_t> unit_strides = {0,1};
+#ifndef AMREX_USE_MKL_DFTI_2024
+        desc->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, unit_strides);
+        desc->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, unit_strides);
+#else
+        desc->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, unit_strides.data());
+        desc->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, unit_strides.data());
+#endif
+        // external workspace: detail::sycl_execute supplies it through set_workspace
+        desc->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
+                        oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
+        desc->commit(amrex::Gpu::Device::streamQueue());
+        plan = desc;
+
+#else /* FFTW */
+
+        // FFTW encodes the direction as the sign of the exponent:
+        // FFTW_FORWARD is -1 and FFTW_BACKWARD is +1.
+        constexpr int sign = (D == Direction::forward) ? FFTW_FORWARD : FFTW_BACKWARD;
+        if constexpr (std::is_same_v<float,T>) {
+            plan = fftwf_plan_many_dft
+                (1, &n, howmany, p, nullptr, 1, n, p, nullptr, 1, n, sign,
+                 FFTW_ESTIMATE);
+        } else {
+            plan = fftw_plan_many_dft
+                (1, &n, howmany, p, nullptr, 1, n, p, nullptr, 1, n, sign,
+                 FFTW_ESTIMATE);
+        }
+#endif
+    }
+
+#ifndef AMREX_USE_GPU
+    template <Direction D>
+    fftw_r2r_kind get_fftw_kind (std::pair<Boundary,Boundary> const& bc)
+    {
+        // Translate the (first, second) boundary pair into an FFTW r2r kind.
+        // Only the even/even and odd/odd pairs depend on the direction D.
+        constexpr bool is_forward = (D == Direction::forward);
+        auto const lo = bc.first;
+        auto const hi = bc.second;
+        if (lo == Boundary::even) {
+            if (hi == Boundary::even) {
+                return is_forward ? FFTW_REDFT10 : FFTW_REDFT01;
+            } else if (hi == Boundary::odd) {
+                return FFTW_REDFT11;
+            }
+        } else if (lo == Boundary::odd) {
+            if (hi == Boundary::even) {
+                return FFTW_RODFT11;
+            } else if (hi == Boundary::odd) {
+                return is_forward ? FFTW_RODFT10 : FFTW_RODFT01;
+            }
+        }
+        // Every other combination is rejected.
+        amrex::Abort("FFT: unsupported BC");
+        return fftw_r2r_kind{};
+    }
+#endif
+
+    template <Direction D>
+    Kind get_r2r_kind (std::pair<Boundary,Boundary> const& bc)
+    {
+        // Translate the (first, second) boundary pair into a Kind tag.
+        // Only the even/even and odd/odd pairs depend on the direction D.
+        constexpr bool is_forward = (D == Direction::forward);
+        auto const lo = bc.first;
+        auto const hi = bc.second;
+        if (lo == Boundary::even) {
+            if (hi == Boundary::even) {
+                return is_forward ? Kind::r2r_ee_f : Kind::r2r_ee_b;
+            } else if (hi == Boundary::odd) {
+                return Kind::r2r_eo;
+            }
+        } else if (lo == Boundary::odd) {
+            if (hi == Boundary::even) {
+                return Kind::r2r_oe;
+            } else if (hi == Boundary::odd) {
+                return is_forward ? Kind::r2r_oo_f : Kind::r2r_oo_b;
+            }
+        }
+        // Every other combination is rejected.
+        amrex::Abort("FFT: unsupported BC");
+        return Kind::none;
+    }
+
+    template <Direction D>  // Build a batched 1D real-to-real plan along dimension 0 of `box`, operating in place on `p`.
+    void init_r2r (Box const& box, T* p, std::pair<Boundary,Boundary> const& bc,  // bc: boundary pair that selects the transform kind
+                   int howmany_initval = 1)  // leading factor of the batch count; the VendorComplex overload passes 2 on GPU
+    {
+        static_assert(D == Direction::forward || D == Direction::backward);
+
+        kind = get_r2r_kind<D>(bc);
+        defined = true;
+        pf = (void*)p;  // in place: the forward and backward buffers are the same pointer
+        pb = (void*)p;
+
+        n = box.length(0);  // length of each 1D transform
+        howmany = AMREX_D_TERM(howmany_initval, *box.length(1), *box.length(2));  // batch count; presumably AMREX_D_TERM keeps only the factors for the compiled dimensionality -- confirm
+
+#if defined(AMREX_USE_GPU)
+        int nex=0;  // length of the extended real sequence handed to the vendor real-to-complex FFT
+        if (bc.first == Boundary::odd && bc.second == Boundary::odd &&
+            Direction::forward == D) {
+            nex = 2*n;
+        } else if (bc.first == Boundary::odd && bc.second == Boundary::odd &&
+            Direction::backward == D) {
+            nex = 4*n;
+        } else if (bc.first == Boundary::even && bc.second == Boundary::even &&
+            Direction::forward == D) {
+            nex = 2*n;
+        } else if (bc.first == Boundary::even && bc.second == Boundary::even &&
+            Direction::backward == D) {
+            nex = 4*n;
+        } else if ((bc.first == Boundary::even && bc.second == Boundary::odd) ||
+                   (bc.first == Boundary::odd && bc.second == Boundary::even)) {
+            nex = 4*n;  // mixed boundaries use the 4n extension in both directions
+        } else {
+            amrex::Abort("FFT: unsupported BC");
+        }
+        int nc = (nex/2) + 1;  // complex elements per batch entry; the real side uses a distance of nc*2 below
+
+#if defined (AMREX_USE_CUDA)
+
+        AMREX_CUFFT_SAFE_CALL(cufftCreate(&plan));
+        AMREX_CUFFT_SAFE_CALL(cufftSetAutoAllocation(plan, 0));  // no automatic work area; compute_r2r supplies one via cufftSetWorkArea
+        cufftType fwd_type = std::is_same_v<float,T> ? CUFFT_R2C : CUFFT_D2Z;  // a real-to-complex plan is created for both directions
+        std::size_t work_size;  // output argument of cufftMakePlanMany; not used afterwards in this function
+        AMREX_CUFFT_SAFE_CALL
+            (cufftMakePlanMany(plan, 1, &nex, nullptr, 1, nc*2, nullptr, 1, nc, fwd_type, howmany, &work_size));
+
+#elif defined(AMREX_USE_HIP)
+
+        amrex::ignore_unused(nc);  // the rocFFT plan below takes only the length and the batch count
+        auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
+        const std::size_t length = nex;
+        AMREX_ROCFFT_SAFE_CALL
+            (rocfft_plan_create(&plan, rocfft_placement_inplace,
+                                rocfft_transform_type_real_forward, prec, 1,
+                                &length, howmany, nullptr));
+
+#elif defined(AMREX_USE_SYCL)
+
+        auto* pp = new mkl_desc_r(nex);  // released in destroy_vendor_plan via delete
+#ifndef AMREX_USE_MKL_DFTI_2024
+        pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                      oneapi::mkl::dft::config_value::INPLACE);
+#else
+        pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_INPLACE);
+#endif
+        pp->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, howmany);
+        pp->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, nc*2);  // same distances as the CUDA plan: nc*2 reals, nc complex values
+        pp->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, nc);
+        std::vector<std::int64_t> strides = {0,1};
+#ifndef AMREX_USE_MKL_DFTI_2024
+        pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides);
+        pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides);
+#else
+        pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
+        pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides.data());
+#endif
+        pp->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
+                      oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
+        pp->commit(amrex::Gpu::Device::streamQueue());
+        plan = pp;
+
+#endif
+
+#else /* FFTW */
+        auto fftw_kind = get_fftw_kind<D>(bc);  // FFTW provides the r2r kinds directly, so no extended sequence is built
+        if constexpr (std::is_same_v<float,T>) {
+            plan = fftwf_plan_many_r2r
+                (1, &n, howmany, p, nullptr, 1, n, p, nullptr, 1, n, &fftw_kind,
+                 FFTW_ESTIMATE);  // in place: same pointer, stride 1 and distance n for input and output
+        } else {
+            plan = fftw_plan_many_r2r
+                (1, &n, howmany, p, nullptr, 1, n, p, nullptr, 1, n, &fftw_kind,
+                 FFTW_ESTIMATE);
+        }
+#endif
+    }
+
+    template <Direction D>
+    void init_r2r (Box const& box, VendorComplex* pc,
+                   std::pair<Boundary,Boundary> const& bc)
+    {
+        // Real-to-real setup for complex data viewed as interleaved T values;
+        // the two components of each complex value are transformed separately.
+        static_assert(D == Direction::forward || D == Direction::backward);
+        auto* pcomp0 = (T*)pc;
+
+#if defined(AMREX_USE_GPU)
+
+        // Same setup as for real data, with a doubled batch factor.
+        init_r2r<D>(box, pcomp0, bc, 2);
+        r2r_data_is_complex = true;
+
+#else
+
+        kind = get_r2r_kind<D>(bc);
+        defined = true;
+        defined2 = true;
+        pf = pb = (void*)pcomp0;
+        n = box.length(0);
+        howmany = AMREX_D_TERM(1, *box.length(1), *box.length(2));
+
+        // Two in-place plans with stride 2 and distance 2n: `plan` starts at
+        // component 0 and `plan2` at component 1.
+        auto* pcomp1 = pcomp0 + 1;
+        int const dist = n*2;
+        auto fftw_kind = get_fftw_kind<D>(bc);
+        if constexpr (std::is_same_v<float,T>) {
+            plan = fftwf_plan_many_r2r(1, &n, howmany, pcomp0, nullptr, 2, dist,
+                                       pcomp0, nullptr, 2, dist, &fftw_kind, FFTW_ESTIMATE);
+            plan2 = fftwf_plan_many_r2r(1, &n, howmany, pcomp1, nullptr, 2, dist,
+                                        pcomp1, nullptr, 2, dist, &fftw_kind, FFTW_ESTIMATE);
+        } else {
+            plan = fftw_plan_many_r2r(1, &n, howmany, pcomp0, nullptr, 2, dist,
+                                      pcomp0, nullptr, 2, dist, &fftw_kind, FFTW_ESTIMATE);
+            plan2 = fftw_plan_many_r2r(1, &n, howmany, pcomp1, nullptr, 2, dist,
+                                       pcomp1, nullptr, 2, dist, &fftw_kind, FFTW_ESTIMATE);
+        }
+#endif
+    }
+
+    template <Direction D>
+    void compute_r2c ()
+    {
+        // Run the plan: forward maps T data at pf to complex data at pb; backward is the reverse.
+        static_assert(D == Direction::forward || D == Direction::backward);
+        if (!defined) { return; }
+
+        constexpr bool is_forward = (D == Direction::forward);
+        using SrcT = std::conditional_t<is_forward, T, VendorComplex>;
+        using DstT = std::conditional_t<is_forward, VendorComplex, T>;
+        auto* src = (SrcT*)(is_forward ? pf : pb);
+        auto* dst = (DstT*)(is_forward ? pb : pf);
+
+#if defined(AMREX_USE_CUDA)
+        AMREX_CUFFT_SAFE_CALL(cufftSetStream(plan, Gpu::gpuStream()));
+        // Borrow a work area from the arena for the duration of this call.
+        std::size_t nbytes = 0;
+        AMREX_CUFFT_SAFE_CALL(cufftGetSize(plan, &nbytes));
+        auto* workspace = The_Arena()->alloc(nbytes);
+        AMREX_CUFFT_SAFE_CALL(cufftSetWorkArea(plan, workspace));
+        if constexpr (std::is_same_v<float,T>) {
+            if constexpr (is_forward) {
+                AMREX_CUFFT_SAFE_CALL(cufftExecR2C(plan, src, dst));
+            } else {
+                AMREX_CUFFT_SAFE_CALL(cufftExecC2R(plan, src, dst));
+            }
+        } else {
+            if constexpr (is_forward) {
+                AMREX_CUFFT_SAFE_CALL(cufftExecD2Z(plan, src, dst));
+            } else {
+                AMREX_CUFFT_SAFE_CALL(cufftExecZ2D(plan, src, dst));
+            }
+        }
+        Gpu::streamSynchronize();
+        The_Arena()->free(workspace);
+#elif defined(AMREX_USE_HIP)
+        detail::hip_execute(plan, (void**)&src, (void**)&dst);
+#elif defined(AMREX_USE_SYCL)
+        detail::sycl_execute<T,D>(std::get<0>(plan), src, dst);
+#else
+        amrex::ignore_unused(src,dst);
+        if constexpr (std::is_same_v<float,T>) {
+            fftwf_execute(plan);
+        } else {
+            fftw_execute(plan);
+        }
+#endif
+    }
+
+    template <Direction D>
+    void compute_c2c ()
+    {
+        // Run the in-place complex-to-complex plan on the buffer at pf.
+        static_assert(D == Direction::forward || D == Direction::backward);
+        if (!defined) { return; }
+
+        auto* data = (VendorComplex*)pf;
+
+#if defined(AMREX_USE_CUDA)
+        AMREX_CUFFT_SAFE_CALL(cufftSetStream(plan, Gpu::gpuStream()));
+        // Borrow a work area from the arena for the duration of this call.
+        std::size_t nbytes = 0;
+        AMREX_CUFFT_SAFE_CALL(cufftGetSize(plan, &nbytes));
+        auto* workspace = The_Arena()->alloc(nbytes);
+        AMREX_CUFFT_SAFE_CALL(cufftSetWorkArea(plan, workspace));
+
+        auto sign = (D == Direction::forward) ? CUFFT_FORWARD : CUFFT_INVERSE;
+        if constexpr (std::is_same_v<float,T>) {
+            AMREX_CUFFT_SAFE_CALL(cufftExecC2C(plan, data, data, sign));
+        } else {
+            AMREX_CUFFT_SAFE_CALL(cufftExecZ2Z(plan, data, data, sign));
+        }
+        Gpu::streamSynchronize();
+        The_Arena()->free(workspace);
+#elif defined(AMREX_USE_HIP)
+        detail::hip_execute(plan, (void**)&data, (void**)&data);
+#elif defined(AMREX_USE_SYCL)
+        detail::sycl_execute<T,D>(std::get<1>(plan), data, data);
+#else
+        amrex::ignore_unused(data);
+        if constexpr (!std::is_same_v<float,T>) {
+            fftw_execute(plan);
+        } else {
+            fftwf_execute(plan);
+        }
+#endif
+    }
+
+#ifdef AMREX_USE_GPU
+    [[nodiscard]] void* alloc_scratch_space () const
+    {
+        // Complex elements per batch entry are nex/2+1 for the extended length
+        // nex that pack_r2r_buffer uses (2n -> n+1, 4n -> 2n+1).
+        bool const ext2n = (kind == Kind::r2r_oo_f) || (kind == Kind::r2r_ee_f);
+        bool const ext4n = (kind == Kind::r2r_oo_b) || (kind == Kind::r2r_ee_b) ||
+                           (kind == Kind::r2r_oe)   || (kind == Kind::r2r_eo);
+        if (!ext2n && !ext4n) {
+            amrex::Abort("FFT: alloc_scratch_space: unsupported kind");
+        }
+        int const nc = ext2n ? (n + 1) : (ext4n ? (2*n + 1) : 0);
+        return The_Arena()->alloc(sizeof(GpuComplex<T>)*nc*howmany);
+    }
+
+    static void free_scratch_space (void* p) { The_Arena()->free(p); }  // Return a buffer to The_Arena(), the allocator alloc_scratch_space() draws from.
+
+    void pack_r2r_buffer (void* pbuf, T const* psrc) const  // Fill pbuf with the extended real sequence built from psrc, one row per batch entry; compute_r2r then runs the real-to-complex FFT on it.
+    {
+        auto* pdst = (T*) pbuf;
+        if (kind == Kind::r2r_oo_f || kind == Kind::r2r_ee_f) {  // 2n extension: x followed by sign * reverse(x)
+            T sign = (kind == Kind::r2r_oo_f) ? T(-1) : T(1);  // odd/odd mirrors with a sign flip, even/even without
+            int ostride = (n+1)*2;  // row pitch in T: room for n+1 complex values
+            int istride = n;
+            int nex = 2*n;  // extended length per row
+            int norig = n;
+            Long nelems = Long(nex)*howmany;
+            if (r2r_data_is_complex) {  // source is interleaved complex data; its two components go to separate rows
+                ParallelFor(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)  // each work item handles both components, hence half the count
+                {
+                    auto batch = ielem / Long(nex);
+                    auto i = int(ielem - batch*nex);
+                    for (int ir = 0; ir < 2; ++ir) {  // ir selects the component within each interleaved pair
+                        auto* po = pdst + (2*batch+ir)*ostride + i;
+                        auto const* pi = psrc + 2*batch*istride + ir;
+                        if (i < norig) {
+                            *po = pi[i*2];
+                        } else {
+                            *po = sign * pi[(2*norig-1-i)*2];
+                        }
+                    }
+                });
+            } else {
+                ParallelFor(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(nex);
+                    auto i = int(ielem - batch*nex);  // position within the extended row
+                    auto* po = pdst + batch*ostride + i;
+                    auto const* pi = psrc + batch*istride;
+                    if (i < norig) {
+                        *po = pi[i];
+                    } else {
+                        *po = sign * pi[2*norig-1-i];
+                    }
+                });
+            }
+        } else if (kind == Kind::r2r_oo_b) {  // 4n extension with zeros at positions 2n-1 and 4n-1
+            int ostride = (2*n+1)*2;  // row pitch in T: room for 2n+1 complex values
+            int istride = n;
+            int nex = 4*n;
+            int norig = n;
+            Long nelems = Long(nex)*howmany;
+            if (r2r_data_is_complex) {
+                ParallelFor(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(nex);
+                    auto i = int(ielem - batch*nex);
+                    for (int ir = 0; ir < 2; ++ir) {
+                        auto* po = pdst + (2*batch+ir)*ostride + i;
+                        auto const* pi = psrc + 2*batch*istride + ir;
+                        if (i < norig) {
+                            *po = pi[i*2];
+                        } else if (i < (2*norig-1)) {
+                            *po = pi[(2*norig-2-i)*2];
+                        } else if (i == (2*norig-1)) {
+                            *po = T(0);
+                        } else if (i < (3*norig)) {
+                            *po = -pi[(i-2*norig)*2];
+                        } else if (i < (4*norig-1)) {
+                            *po = -pi[(4*norig-2-i)*2];
+                        } else {
+                            *po = T(0);
+                        }
+                    }
+                });
+            } else {
+                ParallelFor(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(nex);
+                    auto i = int(ielem - batch*nex);
+                    auto* po = pdst + batch*ostride + i;
+                    auto const* pi = psrc + batch*istride;
+                    if (i < norig) {
+                        *po = pi[i];
+                    } else if (i < (2*norig-1)) {
+                        *po = pi[2*norig-2-i];  // mirror that skips the last input element
+                    } else if (i == (2*norig-1)) {
+                        *po = T(0);
+                    } else if (i < (3*norig)) {
+                        *po = -pi[i-2*norig];
+                    } else if (i < (4*norig-1)) {
+                        *po = -pi[4*norig-2-i];
+                    } else {
+                        *po = T(0);  // i == 4n-1
+                    }
+                });
+            }
+        } else if (kind == Kind::r2r_ee_b) {  // 4n extension with zeros at positions n and 3n
+            int ostride = (2*n+1)*2;
+            int istride = n;
+            int nex = 4*n;
+            int norig = n;
+            Long nelems = Long(nex)*howmany;
+            if (r2r_data_is_complex) {
+                ParallelFor(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(nex);
+                    auto i = int(ielem - batch*nex);
+                    for (int ir = 0; ir < 2; ++ir) {
+                        auto* po = pdst + (2*batch+ir)*ostride + i;
+                        auto const* pi = psrc + 2*batch*istride + ir;
+                        if (i < norig) {
+                            *po = pi[i*2];
+                        } else if (i == norig) {
+                            *po = T(0);
+                        } else if (i < (2*norig+1)) {
+                            *po = -pi[(2*norig-i)*2];
+                        } else if (i < (3*norig)) {
+                            *po = -pi[(i-2*norig)*2];
+                        } else if (i == 3*norig) {
+                            *po = T(0);
+                        } else {
+                            *po = pi[(4*norig-i)*2];
+                        }
+                    }
+                });
+            } else {
+                ParallelFor(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(nex);
+                    auto i = int(ielem - batch*nex);
+                    auto* po = pdst + batch*ostride + i;
+                    auto const* pi = psrc + batch*istride;
+                    if (i < norig) {
+                        *po = pi[i];
+                    } else if (i == norig) {
+                        *po = T(0);
+                    } else if (i < (2*norig+1)) {
+                        *po = -pi[2*norig-i];
+                    } else if (i < (3*norig)) {
+                        *po = -pi[i-2*norig];
+                    } else if (i == 3*norig) {
+                        *po = T(0);
+                    } else {
+                        *po = pi[4*norig-i];
+                    }
+                });
+            }
+        } else if (kind == Kind::r2r_eo) {  // 4n extension: x, -reverse(x), -x, reverse(x)
+            int ostride = (2*n+1)*2;
+            int istride = n;
+            int nex = 4*n;
+            int norig = n;
+            Long nelems = Long(nex)*howmany;
+            if (r2r_data_is_complex) {
+                ParallelFor(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(nex);
+                    auto i = int(ielem - batch*nex);
+                    for (int ir = 0; ir < 2; ++ir) {
+                        auto* po = pdst + (2*batch+ir)*ostride + i;
+                        auto const* pi = psrc + 2*batch*istride + ir;
+                        if (i < norig) {
+                            *po = pi[i*2];
+                        } else if (i < (2*norig)) {
+                            *po = -pi[(2*norig-1-i)*2];
+                        } else if (i < (3*norig)) {
+                            *po = -pi[(i-2*norig)*2];
+                        } else {
+                            *po = pi[(4*norig-1-i)*2];
+                        }
+                    }
+                });
+            } else {
+                ParallelFor(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(nex);
+                    auto i = int(ielem - batch*nex);
+                    auto* po = pdst + batch*ostride + i;
+                    auto const* pi = psrc + batch*istride;
+                    if (i < norig) {
+                        *po = pi[i];
+                    } else if (i < (2*norig)) {
+                        *po = -pi[2*norig-1-i];
+                    } else if (i < (3*norig)) {
+                        *po = -pi[i-2*norig];
+                    } else {
+                        *po = pi[4*norig-1-i];
+                    }
+                });
+            }
+        } else if (kind == Kind::r2r_oe) {  // 4n extension: x, reverse(x), -x, -reverse(x)
+            int ostride = (2*n+1)*2;
+            int istride = n;
+            int nex = 4*n;
+            int norig = n;
+            Long nelems = Long(nex)*howmany;
+            if (r2r_data_is_complex) {
+                ParallelFor(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(nex);
+                    auto i = int(ielem - batch*nex);
+                    for (int ir = 0; ir < 2; ++ir) {
+                        auto* po = pdst + (2*batch+ir)*ostride + i;
+                        auto const* pi = psrc + 2*batch*istride + ir;
+                        if (i < norig) {
+                            *po = pi[i*2];
+                        } else if (i < (2*norig)) {
+                            *po = pi[(2*norig-1-i)*2];
+                        } else if (i < (3*norig)) {
+                            *po = -pi[(i-2*norig)*2];
+                        } else {
+                            *po = -pi[(4*norig-1-i)*2];
+                        }
+                    }
+                });
+            } else {
+                ParallelFor(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(nex);
+                    auto i = int(ielem - batch*nex);
+                    auto* po = pdst + batch*ostride + i;
+                    auto const* pi = psrc + batch*istride;
+                    if (i < norig) {
+                        *po = pi[i];
+                    } else if (i < (2*norig)) {
+                        *po = pi[2*norig-1-i];
+                    } else if (i < (3*norig)) {
+                        *po = -pi[i-2*norig];
+                    } else {
+                        *po = -pi[4*norig-1-i];
+                    }
+                });
+            }
+        } else {
+            amrex::Abort("FFT: pack_r2r_buffer: unsupported kind");
+        }
+    }
+
+    void unpack_r2r_buffer (T* pdst, void const* pbuf) const  // Turn the complex FFT output in pbuf into n real values per batch entry in pdst.
+    {
+        auto const* psrc = (GpuComplex<T> const*) pbuf;
+        int norig = n;
+        Long nelems = Long(norig)*howmany;  // total number of real outputs
+        int ostride = n;
+
+        if (kind == Kind::r2r_oo_f) {
+            int istride = n+1;  // complex values per row, matching the 2n extension in pack_r2r_buffer
+            if (r2r_data_is_complex) {  // destination is interleaved complex data; each work item writes both components
+                ParallelFor(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(norig);
+                    auto k = int(ielem - batch*norig);
+                    auto [s, c] = Math::sincospi(T(k+1)/T(2*norig));  // presumably (sin, cos) of pi times the argument -- confirm against Math::sincospi
+                    for (int ir = 0; ir < 2; ++ir) {
+                        auto const& yk = psrc[(2*batch+ir)*istride+k+1];
+                        pdst[2*batch*ostride+ir+k*2] = s * yk.real() - c * yk.imag();
+                    }
+                });
+            } else {
+                ParallelFor(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(norig);
+                    auto k = int(ielem - batch*norig);  // output index within the row
+                    auto [s, c] = Math::sincospi(T(k+1)/T(2*norig));
+                    auto const& yk = psrc[batch*istride+k+1];  // output k reads complex element k+1
+                    pdst[batch*ostride+k] = s * yk.real() - c * yk.imag();
+                });
+            }
+        } else if (kind == Kind::r2r_oo_b) {
+            int istride = 2*n+1;  // complex values per row, matching the 4n extension in pack_r2r_buffer
+            if (r2r_data_is_complex) {
+                ParallelFor(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(norig);
+                    auto k = int(ielem - batch*norig);
+                    auto [s, c] = Math::sincospi(T(2*k+1)/T(2*norig));
+                    for (int ir = 0; ir < 2; ++ir) {
+                        auto const& yk = psrc[(2*batch+ir)*istride+2*k+1];
+                        pdst[2*batch*ostride+ir+k*2] = T(0.5)*(s * yk.real() - c * yk.imag());
+                    }
+                });
+            } else {
+                ParallelFor(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(norig);
+                    auto k = int(ielem - batch*norig);
+                    auto [s, c] = Math::sincospi(T(2*k+1)/T(2*norig));
+                    auto const& yk = psrc[batch*istride+2*k+1];  // only the odd-indexed complex elements are read
+                    pdst[batch*ostride+k] = T(0.5)*(s * yk.real() - c * yk.imag());
+                });
+            }
+        } else if (kind == Kind::r2r_ee_f) {
+            int istride = n+1;
+            if (r2r_data_is_complex) {
+                ParallelFor(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(norig);
+                    auto k = int(ielem - batch*norig);
+                    auto [s, c] = Math::sincospi(T(k)/T(2*norig));
+                    for (int ir = 0; ir < 2; ++ir) {
+                        auto const& yk = psrc[(2*batch+ir)*istride+k];
+                        pdst[2*batch*ostride+ir+k*2] = c * yk.real() + s * yk.imag();
+                    }
+                });
+            } else {
+                ParallelFor(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(norig);
+                    auto k = int(ielem - batch*norig);
+                    auto [s, c] = Math::sincospi(T(k)/T(2*norig));
+                    auto const& yk = psrc[batch*istride+k];
+                    pdst[batch*ostride+k] = c * yk.real() + s * yk.imag();
+                });
+            }
+        } else if (kind == Kind::r2r_ee_b) {
+            int istride = 2*n+1;
+            if (r2r_data_is_complex) {
+                ParallelFor(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(norig);
+                    auto k = int(ielem - batch*norig);
+                    for (int ir = 0; ir < 2; ++ir) {
+                        auto const& yk = psrc[(2*batch+ir)*istride+2*k+1];
+                        pdst[2*batch*ostride+ir+k*2] = T(0.5) * yk.real();
+                    }
+                });
+            } else {
+                ParallelFor(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(norig);
+                    auto k = int(ielem - batch*norig);
+                    auto const& yk = psrc[batch*istride+2*k+1];
+                    pdst[batch*ostride+k] = T(0.5) * yk.real();  // no twiddle factor here: only the real part is used
+                });
+            }
+        } else if (kind == Kind::r2r_eo) {
+            int istride = 2*n+1;
+            if (r2r_data_is_complex) {
+                ParallelFor(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(norig);
+                    auto k = int(ielem - batch*norig);
+                    auto [s, c] = Math::sincospi((k+T(0.5))/T(2*norig));
+                    for (int ir = 0; ir < 2; ++ir) {
+                        auto const& yk = psrc[(2*batch+ir)*istride+2*k+1];
+                        pdst[2*batch*ostride+ir+k*2] = T(0.5) * (c * yk.real() + s * yk.imag());
+                    }
+                });
+            } else {
+                ParallelFor(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(norig);
+                    auto k = int(ielem - batch*norig);
+                    auto [s, c] = Math::sincospi((k+T(0.5))/T(2*norig));
+                    auto const& yk = psrc[batch*istride+2*k+1];
+                    pdst[batch*ostride+k] = T(0.5) * (c * yk.real() + s * yk.imag());
+                });
+            }
+        } else if (kind == Kind::r2r_oe) {
+            int istride = 2*n+1;
+            if (r2r_data_is_complex) {
+                ParallelFor(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(norig);
+                    auto k = int(ielem - batch*norig);
+                    auto [s, c] = Math::sincospi((k+T(0.5))/T(2*norig));
+                    for (int ir = 0; ir < 2; ++ir) {
+                        auto const& yk = psrc[(2*batch+ir)*istride+2*k+1];
+                        pdst[2*batch*ostride+ir+k*2] = T(0.5) * (s * yk.real() - c * yk.imag());
+                    }
+                });
+            } else {
+                ParallelFor(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
+                {
+                    auto batch = ielem / Long(norig);
+                    auto k = int(ielem - batch*norig);
+                    auto [s, c] = Math::sincospi((k+T(0.5))/T(2*norig));
+                    auto const& yk = psrc[batch*istride+2*k+1];
+                    pdst[batch*ostride+k] = T(0.5) * (s * yk.real() - c * yk.imag());
+                });
+            }
+        } else {
+            amrex::Abort("FFT: unpack_r2r_buffer: unsupported kind");
+        }
+    }
+#endif
+
+    template <Direction D>  // Execute the real-to-real transform prepared by init_r2r.
+    void compute_r2r ()
+    {
+        static_assert(D == Direction::forward || D == Direction::backward);
+        if (!defined) { return; }  // nothing to do when the plan is not marked as defined
+
+#if defined(AMREX_USE_GPU)
+
+        auto* pscratch = alloc_scratch_space();  // holds the extended real input and, after the FFT, the complex output
+
+        pack_r2r_buffer(pscratch, (T*)((D == Direction::forward) ? pf : pb));  // forward reads pf, backward reads pb
+
+#if defined(AMREX_USE_CUDA)
+
+        AMREX_CUFFT_SAFE_CALL(cufftSetStream(plan, Gpu::gpuStream()));
+
+        std::size_t work_size = 0;
+        AMREX_CUFFT_SAFE_CALL(cufftGetSize(plan, &work_size));
+
+        auto* work_area = The_Arena()->alloc(work_size);  // freed at the end of this function, after the stream synchronize
+        AMREX_CUFFT_SAFE_CALL(cufftSetWorkArea(plan, work_area));
+
+        if constexpr (std::is_same_v<float,T>) {
+            AMREX_CUFFT_SAFE_CALL(cufftExecR2C(plan, (T*)pscratch, (VendorComplex*)pscratch));  // in place: input and output share pscratch
+        } else {
+            AMREX_CUFFT_SAFE_CALL(cufftExecD2Z(plan, (T*)pscratch, (VendorComplex*)pscratch));
+        }
+
+#elif defined(AMREX_USE_HIP)
+        detail::hip_execute(plan, (void**)&pscratch, (void**)&pscratch);
+#elif defined(AMREX_USE_SYCL)
+        detail::sycl_execute<T,Direction::forward>(std::get<0>(plan), (T*)pscratch, (VendorComplex*)pscratch);  // always the forward transform, independent of D
+#endif
+
+        unpack_r2r_buffer((T*)((D == Direction::forward) ? pb : pf), pscratch);  // forward writes pb, backward writes pf
+
+        Gpu::streamSynchronize();  // synchronize before the buffers used above are released
+        free_scratch_space(pscratch);
+#if defined(AMREX_USE_CUDA)
+        The_Arena()->free(work_area);
+#endif
+
+#else /* FFTW */
+
+        if constexpr (std::is_same_v<float,T>) {
+            fftwf_execute(plan);
+            if (defined2) { fftwf_execute(plan2); }  // plan2 is created by the VendorComplex overload of init_r2r
+        } else {
+            fftw_execute(plan);
+            if (defined2) { fftw_execute(plan2); }
+        }
+
+#endif
+    }
+
+    static void destroy_vendor_plan (VendorPlan plan)
+    {
+        // Release the backend plan handle passed in by value.
+        // Exactly one of the branches below is compiled.
+#if defined(AMREX_USE_CUDA)
+        AMREX_CUFFT_SAFE_CALL(cufftDestroy(plan));
+#elif defined(AMREX_USE_HIP)
+        AMREX_ROCFFT_SAFE_CALL(rocfft_plan_destroy(plan));
+#elif defined(AMREX_USE_SYCL)
+        std::visit([](auto&& desc) { delete desc; }, plan);
+#else
+        // FFTW: pick the destructor that matches the precision T.
+        if constexpr (!std::is_same_v<float,T>) { fftw_destroy_plan(plan); }
+        else                                    { fftwf_destroy_plan(plan); }
+#endif
+    }
+};
+
+using Key = std::tuple<IntVectND<3>,Direction,Kind>;  // plan-cache key: FFT size expanded to 3 components, direction, and kind (built this way in Plan<T>::init_r2c)
+using PlanD = typename Plan<double>::VendorPlan;  // vendor plan handle for double precision
+using PlanF = typename Plan<float>::VendorPlan;  // vendor plan handle for single precision
+
+PlanD* get_vendor_plan_d (Key const& key);  // look up a cached double-precision plan; Plan<T>::init_r2c treats a null result as a miss
+PlanF* get_vendor_plan_f (Key const& key);  // single-precision counterpart of get_vendor_plan_d
+
+void add_vendor_plan_d (Key const& key, PlanD plan);  // presumably stores `plan` under `key` for later lookups -- defined elsewhere, confirm
+void add_vendor_plan_f (Key const& key, PlanF plan);  // single-precision counterpart of add_vendor_plan_d
+
+template <typename T>
+template <Direction D, int M>
+void Plan<T>::init_r2c (IntVectND<M> const& fft_size, void* pbf, void* pbb, bool cache)
+{
+    static_assert(D == Direction::forward || D == Direction::backward);
+    // Set up a real<->complex plan of extent fft_size for direction D.
+    kind = (D == Direction::forward) ? Kind::r2c_f : Kind::r2c_b;
+    defined = true;
+    pf = pbf;
+    pb = pbb;
+    // n is the product of the extents in fft_size.
+    n = 1;
+    for (auto s : fft_size) { n *= s; }
+    howmany = 1;
+    // On GPU builds, reuse a cached vendor plan for this key when allowed.
+#if defined(AMREX_USE_GPU)
+    Key key = {fft_size.template expand<3>(), D, kind};
+    if (cache) {
+        VendorPlan* cached_plan = nullptr;
+        if constexpr (std::is_same_v<float,T>) {
+            cached_plan = get_vendor_plan_f(key);
+        } else {
+            cached_plan = get_vendor_plan_d(key);
+        }
+        if (cached_plan) {
+            plan = *cached_plan;
+            return;
+        }
+    }
+#else
+    amrex::ignore_unused(cache);
+#endif
+
+#if defined(AMREX_USE_CUDA)
+
+    AMREX_CUFFT_SAFE_CALL(cufftCreate(&plan));
+    AMREX_CUFFT_SAFE_CALL(cufftSetAutoAllocation(plan, 0));
+    cufftType type;
+    if constexpr (D == Direction::forward) {
+        type = std::is_same_v<float,T> ? CUFFT_R2C : CUFFT_D2Z;
+    } else {
+        type = std::is_same_v<float,T> ? CUFFT_C2R : CUFFT_Z2D;
+    }
+    std::size_t work_size;
+    if constexpr (M == 1) {
+        AMREX_CUFFT_SAFE_CALL
+            (cufftMakePlan1d(plan, fft_size[0], type, howmany, &work_size));
+    } else if constexpr (M == 2) {
+        AMREX_CUFFT_SAFE_CALL
+            (cufftMakePlan2d(plan, fft_size[1], fft_size[0], type, &work_size));
+    } else if constexpr (M == 3) {
+        AMREX_CUFFT_SAFE_CALL
+            (cufftMakePlan3d(plan, fft_size[2], fft_size[1], fft_size[0], type, &work_size));
+    }
+
+#elif defined(AMREX_USE_HIP)
+
+    auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
+    std::size_t length[M];
+    for (int idim = 0; idim < M; ++idim) { length[idim] = fft_size[idim]; }
+    if constexpr (D == Direction::forward) {
+        AMREX_ROCFFT_SAFE_CALL
+            (rocfft_plan_create(&plan, rocfft_placement_notinplace,
+                                rocfft_transform_type_real_forward, prec, M,
+                                length, howmany, nullptr));
+    } else {
+        AMREX_ROCFFT_SAFE_CALL
+            (rocfft_plan_create(&plan, rocfft_placement_notinplace,
+                                rocfft_transform_type_real_inverse, prec, M,
+                                length, howmany, nullptr));
+    }
+
+#elif defined(AMREX_USE_SYCL)
+
+    mkl_desc_r* pp;
+    if (M == 1) {
+        pp = new mkl_desc_r(fft_size[0]);
+    } else {
+        std::vector<std::int64_t> len(M);
+        for (int idim = 0; idim < M; ++idim) {
+            len[idim] = fft_size[M-1-idim];
+        }
+        pp = new mkl_desc_r(len);
+    }
+#ifndef AMREX_USE_MKL_DFTI_2024
+    pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                  oneapi::mkl::dft::config_value::NOT_INPLACE);
+#else
+    pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_NOT_INPLACE);
+#endif
+
+    std::vector<std::int64_t> strides(M+1);
+    strides[0] = 0;
+    strides[M] = 1;
+    for (int i = M-1; i >= 1; --i) {
+        strides[i] = strides[i+1] * fft_size[M-1-i];
+    }
+
+#ifndef AMREX_USE_MKL_DFTI_2024
+    pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides);
+    pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides);
+#else
+    pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
+    // Do not set BWD_STRIDES
+#endif
+    pp->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
+                  oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
+    pp->commit(amrex::Gpu::Device::streamQueue());
+    plan = pp;
+
+#else /* FFTW */
+    // Without both arrays no FFTW plan is created here, so mark it undefined.
+    if (pf == nullptr || pb == nullptr) { defined = false; return; }
+
+    int size_for_row_major[M];
+    for (int idim = 0; idim < M; ++idim) {
+        size_for_row_major[idim] = fft_size[M-1-idim];
+    }
+
+    if constexpr (std::is_same_v<float,T>) {
+        if constexpr (D == Direction::forward) {
+            plan = fftwf_plan_dft_r2c
+                (M, size_for_row_major, (float*)pf, (fftwf_complex*)pb,
+                 FFTW_ESTIMATE);
+        } else {
+            plan = fftwf_plan_dft_c2r
+                (M, size_for_row_major, (fftwf_complex*)pb, (float*)pf,
+                 FFTW_ESTIMATE);
+        }
+    } else {
+        if constexpr (D == Direction::forward) {
+            plan = fftw_plan_dft_r2c
+                (M, size_for_row_major, (double*)pf, (fftw_complex*)pb,
+                 FFTW_ESTIMATE);
+        } else {
+            plan = fftw_plan_dft_c2r
+                (M, size_for_row_major, (fftw_complex*)pb, (double*)pf,
+                 FFTW_ESTIMATE);
+        }
+    }
+#endif
+    // Hand a newly created GPU plan to the plan cache when caching is on.
+#if defined(AMREX_USE_GPU)
+    if (cache) {
+        if constexpr (std::is_same_v<float,T>) {
+            add_vendor_plan_f(key, plan);
+        } else {
+            add_vendor_plan_d(key, plan);
+        }
+    }
+#endif
+}
+
 namespace detail
 {
     DistributionMapping make_iota_distromap (Long n);
+
+    //! Return a pointer to the FAB whose index equals the value returned by
+    //! ParallelContext::MyProcSub(), or nullptr if that index is not below
+    //! fa.size().
+    template <typename FA>
+    typename FA::FABType::value_type * get_fab (FA& fa)
+    {
+        auto const rank = ParallelContext::MyProcSub();
+        if (rank >= fa.size()) { return nullptr; }
+        return fa.fabPtr(rank);
+    }
+
+    template <typename FA1, typename FA2>
+    std::unique_ptr<char,DataDeleter> make_mfs_share (FA1& fa1, FA2& fa2)
+    {
+        bool not_same_fa = true;
+        if constexpr (std::is_same_v<FA1,FA2>) {
+            not_same_fa = (&fa1 != &fa2);
+        }
+        using FAB1 = typename FA1::FABType::value_type;
+        using FAB2 = typename FA2::FABType::value_type;
+        using T1 = typename FAB1::value_type;
+        using T2 = typename FAB2::value_type;
+        auto myproc = ParallelContext::MyProcSub();
+        bool alloc_1 = (myproc < fa1.size());
+        bool alloc_2 = (myproc < fa2.size()) && not_same_fa;
+        void* p = nullptr;
+        if (alloc_1 && alloc_2) {
+            Box const& box1 = fa1.fabbox(myproc);
+            Box const& box2 = fa2.fabbox(myproc);
+            int ncomp1 = fa1.nComp();
+            int ncomp2 = fa2.nComp();
+            p = The_Arena()->alloc(std::max(sizeof(T1)*box1.numPts()*ncomp1,
+                                            sizeof(T2)*box2.numPts()*ncomp2));
+            fa1.setFab(myproc, FAB1(box1, ncomp1, (T1*)p));
+            fa2.setFab(myproc, FAB2(box2, ncomp2, (T2*)p));
+        } else if (alloc_1) {
+            Box const& box1 = fa1.fabbox(myproc);
+            int ncomp1 = fa1.nComp();
+            p = The_Arena()->alloc(sizeof(T1)*box1.numPts()*ncomp1);
+            fa1.setFab(myproc, FAB1(box1, ncomp1, (T1*)p));
+        } else if (alloc_2) {
+            Box const& box2 = fa2.fabbox(myproc);
+            int ncomp2 = fa2.nComp();
+            p = The_Arena()->alloc(sizeof(T2)*box2.numPts()*ncomp2);
+            fa2.setFab(myproc, FAB2(box2, ncomp2, (T2*)p));
+        } else {
+            return nullptr;
+        }
+        return std::unique_ptr<char,DataDeleter>((char*)p, DataDeleter{The_Arena()});
+    }
 }
 
+struct Swap01
+{
+    [[nodiscard]] constexpr Dim3 operator() (Dim3 i) const noexcept
+    {
+        return {i.y, i.x, i.z};
+    }
+
+    static constexpr Dim3 Inverse (Dim3 i)
+    {
+        return {i.y, i.x, i.z};
+    }
+
+    [[nodiscard]] constexpr IndexType operator() (IndexType it) const noexcept
+    {
+        return it;
+    }
+
+    static constexpr IndexType Inverse (IndexType it)
+    {
+        return it;
+    }
+};
+
+struct Swap02
+{
+    [[nodiscard]] constexpr Dim3 operator() (Dim3 i) const noexcept
+    {
+        return {i.z, i.y, i.x};
+    }
+
+    static constexpr Dim3 Inverse (Dim3 i)
+    {
+        return {i.z, i.y, i.x};
+    }
+
+    [[nodiscard]] constexpr IndexType operator() (IndexType it) const noexcept
+    {
+        return it;
+    }
+
+    static constexpr IndexType Inverse (IndexType it)
+    {
+        return it;
+    }
+};
+
+struct RotateFwd
+{
+    // dest -> src: (x,y,z) -> (y,z,x)
+    [[nodiscard]] constexpr Dim3 operator() (Dim3 i) const noexcept
+    {
+        return {i.y, i.z, i.x};
+    }
+
+    // src -> dest: (x,y,z) -> (z,x,y)
+    static constexpr Dim3 Inverse (Dim3 i)
+    {
+        return {i.z, i.x, i.y};
+    }
+
+    [[nodiscard]] constexpr IndexType operator() (IndexType it) const noexcept
+    {
+        return it;
+    }
+
+    static constexpr IndexType Inverse (IndexType it)
+    {
+        return it;
+    }
+};
+
+struct RotateBwd
+{
+    // dest -> src: (x,y,z) -> (z,x,y)
+    [[nodiscard]] constexpr Dim3 operator() (Dim3 i) const noexcept
+    {
+        return {i.z, i.x, i.y};
+    }
+
+    // src -> dest: (x,y,z) -> (y,z,x)
+    static constexpr Dim3 Inverse (Dim3 i)
+    {
+        return {i.y, i.z, i.x};
+    }
+
+    [[nodiscard]] constexpr IndexType operator() (IndexType it) const noexcept
+    {
+        return it;
+    }
+
+    static constexpr IndexType Inverse (IndexType it)
+    {
+        return it;
+    }
+};
+
 }
 
 #endif
diff --git a/Src/FFT/AMReX_FFT_LocalR2C.H b/Src/FFT/AMReX_FFT_LocalR2C.H
new file mode 100644
index 0000000000..11b4be6149
--- /dev/null
+++ b/Src/FFT/AMReX_FFT_LocalR2C.H
@@ -0,0 +1,333 @@
+#ifndef AMREX_FFT_LOCAL_R2C_H_
+#define AMREX_FFT_LOCAL_R2C_H_
+#include <AMReX_Config.H>
+
+#include <AMReX_Arena.H>
+#include <AMReX_FFT_Helper.H>
+
+namespace amrex::FFT
+{
+
+/**
+ * \brief Local Discrete Fourier Transform
+ *
+ * This class supports Fourier transforms between real and complex data. The
+ * name R2C indicates that the forward transform converts real data to
+ * complex data, while the backward transform converts complex data to real
+ * data. It should be noted that both directions of transformation are
+ * supported, not just from real to complex. The scaling follows the FFTW
+ * convention, where applying the forward transform followed by the backward
+ * transform scales the original data by the size of the input array.
+ *
+ * For more details, we refer the users to
+ * https://amrex-codes.github.io/amrex/docs_html/FFT_Chapter.html.
+ */
+template <typename T, FFT::Direction D = FFT::Direction::both,
+          int M = AMREX_SPACEDIM>
+class LocalR2C
+{
+public:
+    /**
+     * \brief Constructor
+     *
+     * Given the diverse interfaces of FFT libraries we use, this constructor
+     * has a number of optional arguments.
+     *
+     * The user can provide the data pointers to the constructor. They are
+     * only needed by FFTW because its plan creation requires the input and
+     * output arrays. If they are null, we will delay the plan creation for
+     * FFTW until the forward or backward function is called.
+     *
+     * The cache_plan option is only used when we use cufft, rocfft and
+     * onemkl, but not FFTW.
+     *
+     * \param fft_size The forward domain size (i.e., the domain of the real data)
+     * \param p_fwd Forward domain data pointer (optional)
+     * \param p_bwd Backward domain data pointer (optional)
+     * \param cache_plan Try to cache the plan or not (optional)
+     */
+    explicit LocalR2C (IntVectND<M> const& fft_size,
+                       T* p_fwd = nullptr,
+                       GpuComplex<T>* p_bwd = nullptr,
+#ifdef AMREX_USE_GPU
+                       bool cache_plan = true);
+#else
+                       bool cache_plan = false);
+#endif
+
+    ~LocalR2C ();
+
+    LocalR2C () = default;
+    LocalR2C (LocalR2C &&) noexcept;
+    LocalR2C& operator= (LocalR2C &&) noexcept;
+
+    LocalR2C (LocalR2C const&) = delete;
+    LocalR2C& operator= (LocalR2C const&) = delete;
+
+    /**
+     * \brief Forward transform
+     *
+     * This function is not available when this class template is
+     * instantiated for backward-only transform. For GPUs, this function is
+     * synchronous on the host.
+     *
+     * \param indata input data
+     * \param outdata output data
+     */
+    template <Direction DIR=D, std::enable_if_t<DIR == Direction::forward ||
+                                                DIR == Direction::both, int> = 0>
+    void forward (T const* indata, GpuComplex<T>* outdata);
+
+    void clear ();
+
+    /**
+     * \brief Backward transform
+     *
+     * This function is not available when this class template is
+     * instantiated for forward-only transform. For GPUs, this function is
+     * synchronous on the host.
+     *
+     * \param indata input data
+     * \param outdata output data
+     */
+    template <Direction DIR=D, std::enable_if_t<DIR == Direction::backward ||
+                                                DIR == Direction::both, int> = 0>
+    void backward (GpuComplex<T> const* indata, T* outdata);
+
+    //! Scaling factor. If the data goes through forward and then backward,
+    //! the result multiplied by the scaling factor is equal to the original
+    //! data.
+    [[nodiscard]] T scalingFactor () const;
+
+    //! Spectral domain size
+    [[nodiscard]] IntVectND<M> const& spectralSize () const {
+        return m_spectral_size;
+    }
+
+private:
+
+    Plan<T> m_fft_fwd;
+    Plan<T> m_fft_bwd;
+
+    T* m_p_fwd = nullptr;
+    GpuComplex<T>* m_p_bwd = nullptr;
+
+#if defined(AMREX_USE_SYCL)
+    gpuStream_t m_gpu_stream{};
+#endif
+
+    IntVectND<M> m_real_size;
+    IntVectND<M> m_spectral_size;
+
+    bool m_cache_plan = false;
+};
+
+template <typename T, FFT::Direction D, int M>
+LocalR2C<T,D,M>::LocalR2C (IntVectND<M> const& fft_size, T* p_fwd,
+                           GpuComplex<T>* p_bwd, bool cache_plan)
+    : m_p_fwd(p_fwd),
+      m_p_bwd(p_bwd),
+      m_real_size(fft_size),
+      m_spectral_size(fft_size)
+#if defined(AMREX_USE_GPU)
+      , m_cache_plan(cache_plan)
+#endif
+{
+#if !defined(AMREX_USE_GPU)
+    amrex::ignore_unused(cache_plan);
+#endif
+
+    BL_PROFILE("FFT::LocalR2C");
+    m_spectral_size[0] = m_real_size[0]/2 + 1;
+
+#if defined(AMREX_USE_SYCL)
+
+    auto current_stream = Gpu::gpuStream();
+    Gpu::Device::resetStreamIndex();
+    m_gpu_stream = Gpu::gpuStream();
+
+#endif
+
+    auto* pf = (void*)m_p_fwd;
+    auto* pb = (void*)m_p_bwd;
+
+#ifdef AMREX_USE_SYCL
+    m_fft_fwd.template init_r2c<Direction::forward,M>(m_real_size, pf, pb, m_cache_plan);
+    m_fft_bwd = m_fft_fwd;
+#else
+    if constexpr (D == Direction::both || D == Direction::forward) {
+        m_fft_fwd.template init_r2c<Direction::forward,M>(m_real_size, pf, pb, m_cache_plan);
+    }
+    if constexpr (D == Direction::both || D == Direction::backward) {
+        m_fft_bwd.template init_r2c<Direction::backward,M>(m_real_size, pf, pb, m_cache_plan);
+    }
+#endif
+
+#if defined(AMREX_USE_SYCL)
+    Gpu::Device::setStream(current_stream);
+#endif
+}
+
+template <typename T, FFT::Direction D, int M>
+void LocalR2C<T,D,M>::clear ()
+{
+    if (!m_cache_plan) {
+        if (m_fft_bwd.plan != m_fft_fwd.plan) {
+            m_fft_bwd.destroy();
+        }
+        m_fft_fwd.destroy();
+    }
+
+    m_fft_fwd = Plan<T>{};
+    m_fft_bwd = Plan<T>{};
+}
+
+template <typename T, FFT::Direction D, int M>
+LocalR2C<T,D,M>::~LocalR2C ()
+{
+    static_assert(M >= 1 && M <= 3);
+    clear();
+}
+
+template <typename T, FFT::Direction D, int M>
+LocalR2C<T,D,M>::LocalR2C (LocalR2C && rhs) noexcept
+    : m_fft_fwd(rhs.m_fft_fwd), // initializers listed in member declaration order
+      m_fft_bwd(rhs.m_fft_bwd),
+      m_p_fwd(rhs.m_p_fwd),
+      m_p_bwd(rhs.m_p_bwd),
+#if defined(AMREX_USE_SYCL)
+      m_gpu_stream(rhs.m_gpu_stream),
+#endif
+      m_real_size(rhs.m_real_size),
+      m_spectral_size(rhs.m_spectral_size),
+      m_cache_plan(rhs.m_cache_plan)
+{
+    rhs.m_cache_plan = true; // So that plans in rhs are not destroyed.
+}
+
+template <typename T, FFT::Direction D, int M>
+LocalR2C<T,D,M>& LocalR2C<T,D,M>::operator= (LocalR2C && rhs) noexcept
+{
+    if (this == &rhs) { return *this; }
+
+    this->clear();
+
+    m_p_fwd = rhs.m_p_fwd;
+    m_p_bwd = rhs.m_p_bwd;
+    m_fft_fwd = rhs.m_fft_fwd;
+    m_fft_bwd = rhs.m_fft_bwd;
+#if defined(AMREX_USE_SYCL)
+    m_gpu_stream = rhs.m_gpu_stream;
+#endif
+    m_real_size = rhs.m_real_size;
+    m_spectral_size = rhs.m_spectral_size;
+    m_cache_plan = rhs.m_cache_plan;
+
+    rhs.m_cache_plan = true; // So that plans in rhs are not destroyed.
+
+    return *this;
+}
+
+template <typename T, FFT::Direction D, int M>
+template <Direction DIR, std::enable_if_t<DIR == Direction::forward ||
+                                         DIR == Direction::both, int> >
+void LocalR2C<T,D,M>::forward (T const* indata, GpuComplex<T>* outdata)
+{
+    BL_PROFILE("FFT::LocalR2C::forward");
+
+#if defined(AMREX_USE_GPU)
+
+    m_fft_fwd.set_ptrs((void*)indata, (void*)outdata);
+
+#if defined(AMREX_USE_SYCL)
+    auto current_stream = Gpu::gpuStream();
+    if (current_stream != m_gpu_stream) {
+        Gpu::streamSynchronize();
+        Gpu::Device::setStream(m_gpu_stream);
+    }
+#endif
+
+#else /* FFTW */
+
+    if (((T*)indata != m_p_fwd) || (outdata != m_p_bwd)) {
+        m_p_fwd = (T*)indata;
+        m_p_bwd = outdata;
+        auto* pf = (void*)m_p_fwd;
+        auto* pb = (void*)m_p_bwd;
+        m_fft_fwd.destroy();
+        m_fft_fwd.template init_r2c<Direction::forward,M>(m_real_size, pf, pb, false);
+        if constexpr (D == Direction::both) {
+            m_fft_bwd.destroy();
+            m_fft_bwd.template init_r2c<Direction::backward,M>(m_real_size, pf, pb, false);
+        }
+    }
+
+#endif
+
+    m_fft_fwd.template compute_r2c<Direction::forward>();
+
+#if defined(AMREX_USE_SYCL)
+    if (current_stream != m_gpu_stream) {
+        Gpu::Device::setStream(current_stream);
+    }
+#endif
+}
+
+template <typename T, FFT::Direction D, int M>
+template <Direction DIR, std::enable_if_t<DIR == Direction::backward ||
+                                          DIR == Direction::both, int> >
+void LocalR2C<T,D,M>::backward (GpuComplex<T> const* indata, T* outdata)
+{
+    BL_PROFILE("FFT::LocalR2C::backward");
+
+#if defined(AMREX_USE_GPU)
+
+    m_fft_bwd.set_ptrs((void*)outdata, (void*)indata);
+
+#if defined(AMREX_USE_SYCL)
+    auto current_stream = Gpu::gpuStream();
+    if (current_stream != m_gpu_stream) {
+        Gpu::streamSynchronize();
+        Gpu::Device::setStream(m_gpu_stream);
+    }
+#endif
+
+#else /* FFTW */
+
+    if (((GpuComplex<T>*)indata != m_p_bwd) || (outdata != m_p_fwd)) {
+        m_p_fwd = outdata;
+        m_p_bwd = (GpuComplex<T>*)indata;
+        auto* pf = (void*)m_p_fwd;
+        auto* pb = (void*)m_p_bwd;
+        m_fft_bwd.destroy();
+        m_fft_bwd.template init_r2c<Direction::backward,M>(m_real_size, pf, pb, false);
+        if constexpr (D == Direction::both) {
+            m_fft_fwd.destroy();
+            m_fft_fwd.template init_r2c<Direction::forward,M>(m_real_size, pf, pb, false);
+        }
+    }
+
+#endif
+
+    m_fft_bwd.template compute_r2c<Direction::backward>();
+
+#if defined(AMREX_USE_SYCL)
+    if (current_stream != m_gpu_stream) {
+        Gpu::Device::setStream(current_stream);
+    }
+#endif
+}
+
+//! Returns 1/N, where N is the product of the extents in m_real_size.
+template <typename T, FFT::Direction D, int M>
+T LocalR2C<T,D,M>::scalingFactor () const
+{
+    T npts = T(1);
+    for (int idim = 0; idim < M; ++idim) { npts *= T(m_real_size[idim]); }
+    return T(1)/npts;
+}
+
+}
+
+#endif
diff --git a/Src/FFT/AMReX_FFT_OpenBCSolver.H b/Src/FFT/AMReX_FFT_OpenBCSolver.H
new file mode 100644
index 0000000000..1f75d18719
--- /dev/null
+++ b/Src/FFT/AMReX_FFT_OpenBCSolver.H
@@ -0,0 +1,206 @@
+#ifndef AMREX_FFT_OPENBC_SOLVER_H_
+#define AMREX_FFT_OPENBC_SOLVER_H_
+
+#include <AMReX_FFT_R2C.H>
+
+#include <AMReX_VisMF.H>
+
+namespace amrex::FFT
+{
+
+template <typename T = Real>
+class OpenBCSolver
+{
+public:
+    using MF = typename R2C<T>::MF;
+    using cMF = typename R2C<T>::cMF;
+
+    explicit OpenBCSolver (Box const& domain, Info const& info = Info{});
+
+    template <class F>
+    void setGreensFunction (F const& greens_function);
+
+    void solve (MF& phi, MF const& rho);
+
+    [[nodiscard]] Box const& Domain () const { return m_domain; }
+
+private:
+    static Box make_grown_domain (Box const& domain, Info const& info);
+
+    Box m_domain;
+    Info m_info;
+    R2C<T> m_r2c;
+    cMF m_G_fft;
+    std::unique_ptr<R2C<T>> m_r2c_green;
+};
+
+template <typename T>
+Box OpenBCSolver<T>::make_grown_domain (Box const& domain, Info const& info)
+{
+    IntVect len = domain.length();
+#if (AMREX_SPACEDIM == 3)
+    if (info.batch_mode) { len[2] = 0; }
+#else
+    amrex::ignore_unused(info);
+#endif
+    return Box(domain.smallEnd(), domain.bigEnd()+len, domain.ixType());
+}
+
+template <typename T>
+OpenBCSolver<T>::OpenBCSolver (Box const& domain, Info const& info)
+    : m_domain(domain),
+      m_info(info),
+      m_r2c(OpenBCSolver<T>::make_grown_domain(domain,info), info)
+{
+#if (AMREX_SPACEDIM == 3)
+    if (m_info.batch_mode) {
+        auto gdom = make_grown_domain(domain,m_info);
+        gdom.enclosedCells(2);
+        gdom.setSmall(2, 0);
+        int nprocs = std::min({ParallelContext::NProcsSub(),
+                               m_info.nprocs,
+                               m_domain.length(2)});
+        gdom.setBig(2, nprocs-1);
+        m_r2c_green = std::make_unique<R2C<T>>(gdom,info);
+        auto [sd, ord] = m_r2c_green->getSpectralData();
+        m_G_fft = cMF(*sd, amrex::make_alias, 0, 1);
+    } else
+#endif
+    {
+        amrex::ignore_unused(m_r2c_green);
+        auto [sd, ord] = m_r2c.getSpectralData();
+        amrex::ignore_unused(ord);
+        m_G_fft.define(sd->boxArray(), sd->DistributionMap(), 1, 0);
+    }
+}
+
+template <typename T>
+template <class F>
+void OpenBCSolver<T>::setGreensFunction (F const& greens_function)
+{
+    // Sample greens_function on this process's real-space FAB, then FFT it.
+    auto* infab = detail::get_fab(m_info.batch_mode ? m_r2c_green->m_rx : m_r2c.m_rx);
+    auto const& lo = m_domain.smallEnd();
+    auto const& lo3 = lo.dim3();
+    auto const& len = m_domain.length3d();
+    if (infab) {
+        auto const& a = infab->array();
+        auto box = infab->box();
+        GpuArray<int,3> nimages{1,1,1}; // 2 where the box spans the doubled domain
+        int ndims = m_info.batch_mode ? AMREX_SPACEDIM-1 : AMREX_SPACEDIM; // batch dim is not doubled
+        for (int idim = 0; idim < ndims; ++idim) {
+            if (box.smallEnd(idim) == lo[idim] && box.length(idim) == 2*len[idim]) {
+                box.growHi(idim, -len[idim]+1); // +1 to include the middle plane
+                nimages[idim] = 2;
+            }
+        }
+        AMREX_ASSERT(nimages[0] == 2);
+        box.shift(-lo);
+        amrex::ParallelFor(box, [=] AMREX_GPU_DEVICE (int i, int j, int k)
+        {
+            T G;
+            if (i == len[0] || j == len[1] || k == len[2]) {
+                G = 0;
+            } else {
+                auto ii = i;
+                auto jj = (j > len[1]) ? 2*len[1]-j : j;
+                auto kk = (k > len[2]) ? 2*len[2]-k : k;
+                G = greens_function(ii+lo3.x,jj+lo3.y,kk+lo3.z);
+            }
+            for (int koff = 0; koff < nimages[2]; ++koff) {
+                int k2 = (koff == 0) ?  k : 2*len[2]-k;
+                if ((k2 == 2*len[2]) || (koff == 1 && k == len[2])) {
+                    continue;
+                }
+                for (int joff = 0; joff < nimages[1]; ++joff) {
+                    int j2 = (joff == 0) ?  j : 2*len[1]-j;
+                    if ((j2 == 2*len[1]) || (joff == 1 && j == len[1])) {
+                        continue;
+                    }
+                    for (int ioff = 0; ioff < nimages[0]; ++ioff) {
+                        int i2 = (ioff == 0) ?  i : 2*len[0]-i;
+                        if ((i2 == 2*len[0]) || (ioff == 1 && i == len[0])) {
+                            continue;
+                        }
+                        a(i2+lo3.x,j2+lo3.y,k2+lo3.z) = G;
+                    }
+                }
+            }
+        });
+    }
+    // Forward-transform the array that was just filled.
+    if (m_info.batch_mode) {
+        m_r2c_green->forward(m_r2c_green->m_rx);
+    } else {
+        m_r2c.forward(m_r2c.m_rx);
+    }
+    // Outside batch mode, copy m_r2c's spectral data into m_G_fft.
+    if (!m_info.batch_mode) {
+        auto [sd, ord] = m_r2c.getSpectralData();
+        amrex::ignore_unused(ord);
+        auto const* srcfab = detail::get_fab(*sd);
+        if (srcfab) {
+            auto* dstfab = detail::get_fab(m_G_fft);
+            if (dstfab) {
+#if defined(AMREX_USE_GPU)
+                Gpu::dtod_memcpy_async
+#else
+                std::memcpy
+#endif
+                    (dstfab->dataPtr(), srcfab->dataPtr(), dstfab->nBytes());
+            } else {
+                amrex::Abort("FFT::OpenBCSolver: how did this happen");
+            }
+        }
+    }
+}
+
+template <typename T>
+void OpenBCSolver<T>::solve (MF& phi, MF const& rho)
+{
+    auto& inmf = m_r2c.m_rx;
+    inmf.setVal(T(0));
+    inmf.ParallelCopy(rho, 0, 0, 1);
+
+    m_r2c.forward(inmf);
+
+    auto scaling_factor = m_r2c.scalingFactor();
+
+    auto const* gfab = detail::get_fab(m_G_fft);
+    if (gfab) {
+        auto [sd, ord] = m_r2c.getSpectralData();
+        amrex::ignore_unused(ord);
+        auto* rhofab = detail::get_fab(*sd);
+        if (rhofab) {
+            auto* pdst = rhofab->dataPtr();
+            auto const* psrc = gfab->dataPtr();
+            Box const& rhobox = rhofab->box();
+#if (AMREX_SPACEDIM == 3)
+            Long leng = gfab->box().numPts();
+            if (m_info.batch_mode) {
+                AMREX_ASSERT(gfab->box().length(2) == 1 &&
+                             leng == (rhobox.length(0) * rhobox.length(1)));
+            } else {
+                AMREX_ASSERT(leng == rhobox.numPts());
+            }
+#endif
+            amrex::ParallelFor(rhobox.numPts(), [=] AMREX_GPU_DEVICE (Long i)
+            {
+#if (AMREX_SPACEDIM == 3)
+                Long isrc = i % leng;
+#else
+                Long isrc = i;
+#endif
+                pdst[i] *= psrc[isrc] * scaling_factor;
+            });
+        } else {
+            amrex::Abort("FFT::OpenBCSolver::solve: how did this happen?");
+        }
+    }
+
+    m_r2c.backward_doit(phi, phi.nGrowVect());
+}
+
+}
+
+#endif
diff --git a/Src/FFT/AMReX_FFT_Poisson.H b/Src/FFT/AMReX_FFT_Poisson.H
index 6206206210..8ab467cc54 100644
--- a/Src/FFT/AMReX_FFT_Poisson.H
+++ b/Src/FFT/AMReX_FFT_Poisson.H
@@ -8,32 +8,86 @@ namespace amrex::FFT
 {
 
 /**
- * \brief Poisson solver for all periodic boundaries using FFT
+ * \brief Poisson solver for periodic, Dirichlet & Neumann boundaries using
+ * FFT.
  */
-template <typename MF>
+template <typename MF = MultiFab>
 class Poisson
 {
 public:
 
+    template <typename FA=MF, std::enable_if_t<IsFabArray_v<FA>,int> = 0>
+    Poisson (Geometry const& geom,
+             Array<std::pair<Boundary,Boundary>,AMREX_SPACEDIM> const& bc)
+        : m_geom(geom), m_bc(bc)
+    {
+        bool all_periodic = true;
+        for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+            all_periodic = all_periodic
+                && (bc[idim].first == Boundary::periodic)
+                && (bc[idim].second == Boundary::periodic);
+        }
+        if (all_periodic) {
+            m_r2c = std::make_unique<R2C<typename MF::value_type>>(m_geom.Domain());
+        } else {
+            m_r2x = std::make_unique<R2X<typename MF::value_type>> (m_geom.Domain(), m_bc);
+        }
+    }
+
     template <typename FA=MF, std::enable_if_t<IsFabArray_v<FA>,int> = 0>
     explicit Poisson (Geometry const& geom)
-        : m_geom(geom), m_r2c(geom.Domain())
+        : m_geom(geom),
+          m_bc{AMREX_D_DECL(std::make_pair(Boundary::periodic,Boundary::periodic),
+                            std::make_pair(Boundary::periodic,Boundary::periodic),
+                            std::make_pair(Boundary::periodic,Boundary::periodic))}
     {
-        AMREX_ALWAYS_ASSERT(geom.isAllPeriodic());
+        if (m_geom.isAllPeriodic()) {
+            m_r2c = std::make_unique<R2C<typename MF::value_type>>(m_geom.Domain());
+        } else {
+            amrex::Abort("FFT::Poisson: wrong BC");
+        }
     }
 
     void solve (MF& soln, MF const& rhs);
 
 private:
     Geometry m_geom;
-    R2C<typename MF::value_type, Direction::both> m_r2c;
+    Array<std::pair<Boundary,Boundary>,AMREX_SPACEDIM> m_bc;
+    std::unique_ptr<R2X<typename MF::value_type>> m_r2x;
+    std::unique_ptr<R2C<typename MF::value_type>> m_r2c;
 };
 
+#if (AMREX_SPACEDIM == 3)
+/**
+ * \brief Poisson solver for open boundary conditions using FFT.
+ */
+template <typename MF = MultiFab>
+class PoissonOpenBC
+{
+public:
+
+    template <typename FA=MF, std::enable_if_t<IsFabArray_v<FA>,int> = 0>
+    explicit PoissonOpenBC (Geometry const& geom,
+                            IndexType ixtype = IndexType::TheCellType(),
+                            IntVect const& ngrow = IntVect(0));
+
+    void solve (MF& soln, MF const& rhs);
+
+    void define_doit (); // has to be public for cuda
+
+private:
+    Geometry m_geom;
+    Box m_grown_domain;
+    IntVect m_ngrow;
+    OpenBCSolver<typename MF::value_type> m_solver;
+};
+#endif
+
 /**
  * \brief 3D Poisson solver for periodic boundaries in the first two
  * dimensions and Neumann in the last dimension.
  */
-template <typename MF>
+template <typename MF = MultiFab>
 class PoissonHybrid
 {
 public:
@@ -59,48 +113,121 @@ private:
 template <typename MF>
 void Poisson<MF>::solve (MF& soln, MF const& rhs)
 {
+    BL_PROFILE("FFT::Poisson::solve");
+
     using T = typename MF::value_type;
 
     GpuArray<T,AMREX_SPACEDIM> fac
-        {AMREX_D_DECL(T(2)*Math::pi<T>()/T(m_geom.ProbLength(0)),
-                      T(2)*Math::pi<T>()/T(m_geom.ProbLength(1)),
-                      T(2)*Math::pi<T>()/T(m_geom.ProbLength(2)))};
-    GpuArray<T,AMREX_SPACEDIM> dx
-        {AMREX_D_DECL(T(m_geom.CellSize(0)),
-                      T(m_geom.CellSize(1)),
-                      T(m_geom.CellSize(2)))};
-    auto scale = T(1.0/m_geom.Domain().d_numPts());
-#if (AMREX_SPACEDIM > 1)
-    auto const& len = m_geom.Domain().length();
-#endif
+        {AMREX_D_DECL(Math::pi<T>()/T(m_geom.Domain().length(0)),
+                      Math::pi<T>()/T(m_geom.Domain().length(1)),
+                      Math::pi<T>()/T(m_geom.Domain().length(2)))};
+    for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+        if (m_bc[idim].first == Boundary::periodic) {
+            fac[idim] *= T(2);
+        }
+    }
+    GpuArray<T,AMREX_SPACEDIM> dxfac
+        {AMREX_D_DECL(T(2)/T(m_geom.CellSize(0)*m_geom.CellSize(0)),
+                      T(2)/T(m_geom.CellSize(1)*m_geom.CellSize(1)),
+                      T(2)/T(m_geom.CellSize(2)*m_geom.CellSize(2)))};
+    auto scale = (m_r2x) ? m_r2x->scalingFactor() : m_r2c->scalingFactor();
+
+    GpuArray<T,AMREX_SPACEDIM> offset{AMREX_D_DECL(T(0),T(0),T(0))};
+    // Not sure about odd-even and even-odd yet
+    for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+        if (m_bc[idim].first == Boundary::odd &&
+            m_bc[idim].second == Boundary::odd)
+        {
+            offset[idim] = T(1);
+        }
+        else if ((m_bc[idim].first == Boundary::odd &&
+                  m_bc[idim].second == Boundary::even) ||
+                 (m_bc[idim].first == Boundary::even &&
+                  m_bc[idim].second == Boundary::odd))
+        {
+            offset[idim] = T(0.5);
+        }
+    }
 
-    m_r2c.forwardThenBackward(rhs, soln,
-                              [=] AMREX_GPU_DEVICE (int i, int j, int k,
-                                                    GpuComplex<T>& spectral_data)
+    auto f = [=] AMREX_GPU_DEVICE (int i, int j, int k, auto& spectral_data)
     {
-        amrex::ignore_unused(i,j,k);
-        // the values in the upper-half of the spectral array in y and z
-        // are here interpreted as negative wavenumbers
-        AMREX_D_TERM(T a = fac[0]*i;,
-                     T b = (j < len[1]/2) ? fac[1]*j : fac[1]*(len[1]-j);,
-                     T c = (k < len[2]/2) ? fac[2]*k : fac[2]*(len[2]-k));
-        T k2 = AMREX_D_TERM(T(2)*(std::cos(a*dx[0])-T(1))/(dx[0]*dx[0]),
-                           +T(2)*(std::cos(b*dx[1])-T(1))/(dx[1]*dx[1]),
-                           +T(2)*(std::cos(c*dx[2])-T(1))/(dx[2]*dx[2]));
+        amrex::ignore_unused(j,k);
+        AMREX_D_TERM(T a = fac[0]*(i+offset[0]);,
+                     T b = fac[1]*(j+offset[1]);,
+                     T c = fac[2]*(k+offset[2]));
+        T k2 = AMREX_D_TERM(dxfac[0]*(std::cos(a)-T(1)),
+                           +dxfac[1]*(std::cos(b)-T(1)),
+                           +dxfac[2]*(std::cos(c)-T(1)));
         if (k2 != T(0)) {
             spectral_data /= k2;
-        } else {
-            // interpretation here is that the average value of the
-            // solution is zero
-            spectral_data = 0;
         }
         spectral_data *= scale;
+    };
+
+    if (m_r2x) {
+        m_r2x->forwardThenBackward(rhs, soln, f);
+    } else {
+        m_r2c->forwardThenBackward(rhs, soln, f);
+    }
+}
+
+#if (AMREX_SPACEDIM == 3)
+
+template <typename MF>
+template <typename FA, std::enable_if_t<IsFabArray_v<FA>,int> FOO>
+PoissonOpenBC<MF>::PoissonOpenBC (Geometry const& geom, IndexType ixtype,
+                                  IntVect const& ngrow)
+    : m_geom(geom),
+      m_grown_domain(amrex::grow(amrex::convert(geom.Domain(),ixtype),ngrow)),
+      m_ngrow(ngrow),
+      m_solver(m_grown_domain)
+{
+    define_doit();
+}
+
+template <typename MF>
+void PoissonOpenBC<MF>::define_doit ()
+{
+    using T = typename MF::value_type;
+    auto const& lo = m_grown_domain.smallEnd();
+    auto const dx = T(m_geom.CellSize(0));
+    auto const dy = T(m_geom.CellSize(1));
+    auto const dz = T(m_geom.CellSize(2));
+    auto const gfac = T(1)/T(std::sqrt(T(12))); // Gauss point offset, 1/sqrt(12) of a cell
+    // 0.125 = 1/8 averages over the 8 Gauss quadrature points summed below
+    auto const fac = T(-0.125) * (dx*dy*dz) / (T(4)*Math::pi<T>());
+    m_solver.setGreensFunction([=] AMREX_GPU_DEVICE (int i, int j, int k) -> T
+    {
+        auto x = (T(i-lo[0]) - gfac) * dx; // first Gauss quadrature point
+        auto y = (T(j-lo[1]) - gfac) * dy;
+        auto z = (T(k-lo[2]) - gfac) * dz;
+        T r = 0;
+        for (int gx = 0; gx < 2; ++gx) {
+        for (int gy = 0; gy < 2; ++gy) {
+        for (int gz = 0; gz < 2; ++gz) {
+            auto xg = x + 2*gx*gfac*dx;
+            auto yg = y + 2*gy*gfac*dy;
+            auto zg = z + 2*gz*gfac*dz;
+            r += T(1)/std::sqrt(xg*xg+yg*yg+zg*zg);
+        }}}
+        return fac * r;
     });
 }
 
+template <typename MF>
+void PoissonOpenBC<MF>::solve (MF& soln, MF const& rhs)
+{
+    AMREX_ASSERT(m_grown_domain.ixType() == soln.ixType() && m_grown_domain.ixType() == rhs.ixType());
+    m_solver.solve(soln, rhs);
+}
+
+#endif /* AMREX_SPACEDIM == 3 */
+
 template <typename MF>
 void PoissonHybrid<MF>::solve (MF& soln, MF const& rhs)
 {
+    BL_PROFILE("FFT::PoissonHybrid::solve");
+
 #if (AMREX_SPACEDIM < 3)
     amrex::ignore_unused(soln, rhs);
 #else
diff --git a/Src/FFT/AMReX_FFT_R2C.H b/Src/FFT/AMReX_FFT_R2C.H
new file mode 100644
index 0000000000..456a3ddf7d
--- /dev/null
+++ b/Src/FFT/AMReX_FFT_R2C.H
@@ -0,0 +1,638 @@
+#ifndef AMREX_FFT_R2C_H_
+#define AMREX_FFT_R2C_H_
+#include <AMReX_Config.H>
+
+#include <AMReX_MultiFab.H>
+#include <AMReX_FFT_Helper.H>
+#include <algorithm>
+#include <numeric>
+#include <tuple>
+
+namespace amrex::FFT
+{
+
+template <typename T> class OpenBCSolver;
+
+/**
+ * \brief Parallel Discrete Fourier Transform
+ *
+ * This class supports Fourier transforms between real and complex data. The
+ * name R2C indicates that the forward transform converts real data to
+ * complex data, while the backward transform converts complex data to real
+ * data. It should be noted that both directions of transformation are
+ * supported, not just from real to complex. The scaling follows the FFTW
+ * convention, where applying the forward transform followed by the backward
+ * transform scales the original data by the size of the input array.
+ *
+ * For more details, we refer the users to
+ * https://amrex-codes.github.io/amrex/docs_html/FFT_Chapter.html.
+ */
+template <typename T = Real, FFT::Direction D = FFT::Direction::both,
+          FFT::DomainStrategy = FFT::DomainStrategy::slab>
+          // Don't change the default. Otherwise OpenBCSolver might break.
+class R2C
+{
+public:
+    using MF = std::conditional_t<std::is_same_v<T,Real>,
+                                  MultiFab, FabArray<BaseFab<T> > >;
+    using cMF = FabArray<BaseFab<GpuComplex<T> > >;
+
+    template <typename U> friend class OpenBCSolver;
+
+    /**
+     * \brief Constructor
+     *
+     * \param domain the forward domain (i.e., the domain of the real data)
+     * \param info optional information
+     */
+    explicit R2C (Box const& domain, Info const& info = Info{});
+
+    ~R2C ();
+
+    R2C (R2C const&) = delete;
+    R2C (R2C &&) = delete;
+    R2C& operator= (R2C const&) = delete;
+    R2C& operator= (R2C &&) = delete;
+
+    /**
+     * \brief Forward and then backward transform
+     *
+     * This function is available only when this class template is
+     * instantiated for transforms in both directions. It's more efficient
+     * than calling the forward function that stores the spectral data in a
+     * caller provided container followed by the backward function, because
+     * this can avoid parallel communication between the internal data and
+     * the caller's data container.
+     *
+     * \param inmf         input data in MultiFab or FabArray<BaseFab<float>>
+     * \param outmf        output data in MultiFab or FabArray<BaseFab<float>>
+     * \param post_forward a callable object for processing the post-forward
+     *                     data before the backward transform. Its interface
+     *                     is `(int,int,int,GpuComplex<T>&)`, where the integers
+     *                     are indices in the spectral space, and the reference
+     *                     to the complex number allows for the modification of
+     *                     the spectral data at that location.
+     */
+    template <typename F, Direction DIR=D,
+              std::enable_if_t<DIR == Direction::both, int> = 0>
+    void forwardThenBackward (MF const& inmf, MF& outmf, F const& post_forward)
+    {
+        BL_PROFILE("FFT::R2C::forwardbackward");
+        this->forward(inmf);
+        this->post_forward_doit(post_forward);
+        this->backward(outmf);
+    }
+
+    /**
+     * \brief Forward transform
+     *
+     * The output is stored in this object's internal data. This function is
+     * not available when this class template is instantiated for
+     * backward-only transform.
+     *
+     * \param inmf input data in MultiFab or FabArray<BaseFab<float>>
+     */
+    template <Direction DIR=D, std::enable_if_t<DIR == Direction::forward ||
+                                                DIR == Direction::both, int> = 0>
+    void forward (MF const& inmf);
+
+    /**
+     * \brief Forward transform
+     *
+     * This function is not available when this class template is
+     * instantiated for backward-only transform.
+     *
+     * \param inmf input data in MultiFab or FabArray<BaseFab<float>>
+     * \param outmf output data in FabArray<BaseFab<GpuComplex<T>>>
+     */
+    template <Direction DIR=D, std::enable_if_t<DIR == Direction::forward ||
+                                                DIR == Direction::both, int> = 0>
+    void forward (MF const& inmf, cMF& outmf);
+
+    /**
+     * \brief Backward transform
+     *
+     * This function is available only when this class template is
+     * instantiated for transforms in both directions.
+     *
+     * \param outmf output data in MultiFab or FabArray<BaseFab<float>>
+     */
+    template <Direction DIR=D, std::enable_if_t<DIR == Direction::both, int> = 0>
+    void backward (MF& outmf);
+
+    /**
+     * \brief Backward transform
+     *
+     * This function is not available when this class template is
+     * instantiated for forward-only transform.
+     *
+     * \param inmf input data in FabArray<BaseFab<GpuComplex<T>>>
+     * \param outmf output data in MultiFab or FabArray<BaseFab<float>>
+     */
+    template <Direction DIR=D, std::enable_if_t<DIR == Direction::backward ||
+                                                DIR == Direction::both, int> = 0>
+    void backward (cMF const& inmf, MF& outmf);
+
+    //! Scaling factor. If the data goes through forward and then backward,
+    //! the result multiplied by the scaling factor is equal to the original
+    //! data.
+    [[nodiscard]] T scalingFactor () const;
+
+    /**
+     * \brief Get the internal spectral data
+     *
+     * This function is not available when this class template is
+     * instantiated for backward-only transform. For performance reasons,
+     * the returned data array does not have the usual ordering of
+     * `(x,y,z)`. The order is specified in the second part of the return
+     * value.
+     */
+    template <Direction DIR=D, std::enable_if_t<DIR == Direction::forward ||
+                                                DIR == Direction::both, int> = 0>
+    std::pair<cMF*,IntVect> getSpectralData ();
+
+    /**
+     * \brief Get BoxArray and DistributionMapping for spectral data
+     *
+     * The returned BoxArray and DistributionMapping can be used to build
+     * FabArray<BaseFab<GpuComplex<T>>> for spectral data. The returned
+     * BoxArray has the usual order of `(x,y,z)`.
+     */
+    [[nodiscard]] std::pair<BoxArray,DistributionMapping> getSpectralDataLayout () const;
+
+    // public for cuda
+    template <typename F>
+    void post_forward_doit (F const& post_forward);
+
+private:
+
+    static std::pair<Plan<T>,Plan<T>> make_c2c_plans (cMF& inout);
+
+    void backward_doit (MF& outmf, IntVect const& ngout = IntVect(0));
+
+    Plan<T> m_fft_fwd_x{};
+    Plan<T> m_fft_bwd_x{};
+    Plan<T> m_fft_fwd_y{};
+    Plan<T> m_fft_bwd_y{};
+    Plan<T> m_fft_fwd_z{};
+    Plan<T> m_fft_bwd_z{};
+
+    // Comm meta-data. In the forward phase, we start with (x,y,z),
+    // transpose to (y,x,z) and then (z,x,y). In the backward phase, we
+    // perform inverse transpose.
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_x2y; // (x,y,z) -> (y,x,z)
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_y2x; // (y,x,z) -> (x,y,z)
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_y2z; // (y,x,z) -> (z,x,y)
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_z2y; // (z,x,y) -> (y,x,z)
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_x2z; // (x,y,z) -> (z,x,y)
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_z2x; // (z,x,y) -> (x,y,z)
+    Swap01 m_dtos_x2y{};
+    Swap01 m_dtos_y2x{};
+    Swap02 m_dtos_y2z{};
+    Swap02 m_dtos_z2y{};
+    RotateFwd m_dtos_x2z{};
+    RotateBwd m_dtos_z2x{};
+
+    MF  m_rx;
+    cMF m_cx;
+    cMF m_cy;
+    cMF m_cz;
+
+    std::unique_ptr<char,DataDeleter> m_data_1;
+    std::unique_ptr<char,DataDeleter> m_data_2;
+
+    Box m_real_domain;
+    Box m_spectral_domain_x;
+    Box m_spectral_domain_y;
+    Box m_spectral_domain_z;
+
+    Info m_info;
+
+    bool m_slab_decomp = false;
+};
+
+template <typename T, Direction D, DomainStrategy S>
+R2C<T,D,S>::R2C (Box const& domain, Info const& info)
+    : m_real_domain(domain),
+      m_spectral_domain_x(IntVect(0), IntVect(AMREX_D_DECL(domain.length(0)/2,
+                                                           domain.length(1)-1,
+                                                           domain.length(2)-1)),
+                          domain.ixType()),
+#if (AMREX_SPACEDIM >= 2)
+      m_spectral_domain_y(IntVect(0), IntVect(AMREX_D_DECL(domain.length(1)-1,
+                                                           domain.length(0)/2,
+                                                           domain.length(2)-1)),
+                          domain.ixType()),
+#if (AMREX_SPACEDIM == 3)
+      m_spectral_domain_z(IntVect(0), IntVect(AMREX_D_DECL(domain.length(2)-1,
+                                                           domain.length(0)/2,
+                                                           domain.length(1)-1)),
+                          domain.ixType()),
+#endif
+#endif
+      m_info(info)
+{
+    BL_PROFILE("FFT::R2C");
+
+    static_assert(std::is_same_v<float,T> || std::is_same_v<double,T>);
+    AMREX_ALWAYS_ASSERT(m_real_domain.length(0) > 1);
+#if (AMREX_SPACEDIM == 3)
+    AMREX_ALWAYS_ASSERT(m_real_domain.length(2) > 1 || ! m_info.batch_mode);
+    AMREX_ALWAYS_ASSERT(m_real_domain.length(1) > 1 || m_real_domain.length(2) == 1);
+#else
+    AMREX_ALWAYS_ASSERT(! m_info.batch_mode);
+#endif
+
+    int myproc = ParallelContext::MyProcSub();
+    int nprocs = std::min(ParallelContext::NProcsSub(), m_info.nprocs);
+
+#if (AMREX_SPACEDIM == 3)
+    if (S == DomainStrategy::slab && (m_real_domain.length(1) > 1)) {
+        m_slab_decomp = true;
+    }
+#endif
+
+    //
+    // make data containers
+    //
+
+    auto bax = amrex::decompose(m_real_domain, nprocs,
+                                {AMREX_D_DECL(false,!m_slab_decomp,true)}, true);
+    DistributionMapping dmx = detail::make_iota_distromap(bax.size());
+    m_rx.define(bax, dmx, 1, 0, MFInfo().SetAlloc(false));
+
+    {
+        BoxList bl = bax.boxList();
+        for (auto & b : bl) {
+            b.shift(-m_real_domain.smallEnd());
+            b.setBig(0, m_spectral_domain_x.bigEnd(0));
+        }
+        BoxArray cbax(std::move(bl));
+        m_cx.define(cbax, dmx, 1, 0, MFInfo().SetAlloc(false));
+    }
+
+#if (AMREX_SPACEDIM >= 2)
+    DistributionMapping cdmy;
+    if ((m_real_domain.length(1) > 1) && !m_slab_decomp) {
+        auto cbay = amrex::decompose(m_spectral_domain_y, nprocs,
+                                     {AMREX_D_DECL(false,true,true)}, true);
+        if (cbay.size() == dmx.size()) {
+            cdmy = dmx;
+        } else {
+            cdmy = detail::make_iota_distromap(cbay.size());
+        }
+        m_cy.define(cbay, cdmy, 1, 0, MFInfo().SetAlloc(false));
+    }
+#endif
+
+#if (AMREX_SPACEDIM == 3)
+    if (m_real_domain.length(1) > 1 &&
+        (! m_info.batch_mode && m_real_domain.length(2) > 1))
+    {
+        auto cbaz = amrex::decompose(m_spectral_domain_z, nprocs,
+                                     {false,true,true}, true);
+        DistributionMapping cdmz;
+        if (cbaz.size() == dmx.size()) {
+            cdmz = dmx;
+        } else if (cbaz.size() == cdmy.size()) {
+            cdmz = cdmy;
+        } else {
+            cdmz = detail::make_iota_distromap(cbaz.size());
+        }
+         m_cz.define(cbaz, cdmz, 1, 0, MFInfo().SetAlloc(false));
+    }
+#endif
+
+    if (m_slab_decomp) {
+        m_data_1 = detail::make_mfs_share(m_rx, m_cz);
+        m_data_2 = detail::make_mfs_share(m_cx, m_cx);
+    } else {
+        m_data_1 = detail::make_mfs_share(m_rx, m_cy);
+        m_data_2 = detail::make_mfs_share(m_cx, m_cz);
+    }
+
+    //
+    // make copiers
+    //
+
+#if (AMREX_SPACEDIM >= 2)
+    if (! m_cy.empty()) {
+        // comm meta-data between x and y phases
+        m_cmd_x2y = std::make_unique<MultiBlockCommMetaData>
+            (m_cy, m_spectral_domain_y, m_cx, IntVect(0), m_dtos_x2y);
+        m_cmd_y2x = std::make_unique<MultiBlockCommMetaData>
+            (m_cx, m_spectral_domain_x, m_cy, IntVect(0), m_dtos_y2x);
+    }
+#endif
+#if (AMREX_SPACEDIM == 3)
+    if (! m_cz.empty() ) {
+        if (m_slab_decomp) {
+            // comm meta-data between xy and z phases
+            m_cmd_x2z = std::make_unique<MultiBlockCommMetaData>
+                (m_cz, m_spectral_domain_z, m_cx, IntVect(0), m_dtos_x2z);
+            m_cmd_z2x = std::make_unique<MultiBlockCommMetaData>
+                (m_cx, m_spectral_domain_x, m_cz, IntVect(0), m_dtos_z2x);
+        } else {
+            // comm meta-data between y and z phases
+            m_cmd_y2z = std::make_unique<MultiBlockCommMetaData>
+                (m_cz, m_spectral_domain_z, m_cy, IntVect(0), m_dtos_y2z);
+            m_cmd_z2y = std::make_unique<MultiBlockCommMetaData>
+                (m_cy, m_spectral_domain_y, m_cz, IntVect(0), m_dtos_z2y);
+        }
+    }
+#endif
+
+    //
+    // make plans
+    //
+
+    if (myproc < m_rx.size())
+    {
+        Box const& box = m_rx.box(myproc);
+        auto* pr = m_rx[myproc].dataPtr();
+        auto* pc = (typename Plan<T>::VendorComplex *)m_cx[myproc].dataPtr();
+#ifdef AMREX_USE_SYCL
+        m_fft_fwd_x.template init_r2c<Direction::forward>(box, pr, pc, m_slab_decomp);
+        m_fft_bwd_x = m_fft_fwd_x;
+#else
+        if constexpr (D == Direction::both || D == Direction::forward) {
+            m_fft_fwd_x.template init_r2c<Direction::forward>(box, pr, pc, m_slab_decomp);
+        }
+        if constexpr (D == Direction::both || D == Direction::backward) {
+            m_fft_bwd_x.template init_r2c<Direction::backward>(box, pr, pc, m_slab_decomp);
+        }
+#endif
+    }
+
+#if (AMREX_SPACEDIM >= 2)
+    if (! m_cy.empty()) {
+        std::tie(m_fft_fwd_y, m_fft_bwd_y) = make_c2c_plans(m_cy);
+    }
+#endif
+#if (AMREX_SPACEDIM == 3)
+    if (! m_cz.empty()) {
+        std::tie(m_fft_fwd_z, m_fft_bwd_z) = make_c2c_plans(m_cz);
+    }
+#endif
+}
+
+template <typename T, Direction D, DomainStrategy S>
+R2C<T,D,S>::~R2C () // plain ~R2C: a template-id destructor name is ill-formed in C++20
+{
+    if (m_fft_bwd_x.plan != m_fft_fwd_x.plan) { // bwd may be a copy of fwd (SYCL path)
+        m_fft_bwd_x.destroy();
+    }
+    if (m_fft_bwd_y.plan != m_fft_fwd_y.plan) { // same guard for the y plans
+        m_fft_bwd_y.destroy();
+    }
+    if (m_fft_bwd_z.plan != m_fft_fwd_z.plan) { // same guard for the z plans
+        m_fft_bwd_z.destroy();
+    }
+    m_fft_fwd_x.destroy(); // forward plans are always destroyed exactly once
+    m_fft_fwd_y.destroy();
+    m_fft_fwd_z.destroy();
+}
+
+template <typename T, Direction D, DomainStrategy S>
+template <Direction DIR, std::enable_if_t<DIR == Direction::forward ||
+                                          DIR == Direction::both, int> >
+void R2C<T,D,S>::forward (MF const& inmf)
+{
+    BL_PROFILE("FFT::R2C::forward(in)");
+
+    if (&m_rx != &inmf) {
+        m_rx.ParallelCopy(inmf, 0, 0, 1);
+    }
+    m_fft_fwd_x.template compute_r2c<Direction::forward>();
+
+    if (                          m_cmd_x2y) {
+        ParallelCopy(m_cy, m_cx, *m_cmd_x2y, 0, 0, 1, m_dtos_x2y);
+    }
+    m_fft_fwd_y.template compute_c2c<Direction::forward>();
+
+    if (                          m_cmd_y2z) {
+        ParallelCopy(m_cz, m_cy, *m_cmd_y2z, 0, 0, 1, m_dtos_y2z);
+    }
+#if (AMREX_SPACEDIM == 3)
+    else if (                     m_cmd_x2z) {
+        ParallelCopy(m_cz, m_cx, *m_cmd_x2z, 0, 0, 1, m_dtos_x2z);
+    }
+#endif
+    m_fft_fwd_z.template compute_c2c<Direction::forward>();
+}
+
+template <typename T, Direction D, DomainStrategy S>
+template <Direction DIR, std::enable_if_t<DIR == Direction::both, int> >
+void R2C<T,D,S>::backward (MF& outmf)
+{
+    backward_doit(outmf);
+}
+
+template <typename T, Direction D, DomainStrategy S>
+void R2C<T,D,S>::backward_doit (MF& outmf, IntVect const& ngout)
+{
+    BL_PROFILE("FFT::R2C::backward(out)");
+
+    m_fft_bwd_z.template compute_c2c<Direction::backward>();
+    if (                          m_cmd_z2y) {
+        ParallelCopy(m_cy, m_cz, *m_cmd_z2y, 0, 0, 1, m_dtos_z2y);
+    }
+#if (AMREX_SPACEDIM == 3)
+    else if (                     m_cmd_z2x) {
+        ParallelCopy(m_cx, m_cz, *m_cmd_z2x, 0, 0, 1, m_dtos_z2x);
+    }
+#endif
+
+    m_fft_bwd_y.template compute_c2c<Direction::backward>();
+    if (                          m_cmd_y2x) {
+        ParallelCopy(m_cx, m_cy, *m_cmd_y2x, 0, 0, 1, m_dtos_y2x);
+    }
+
+    m_fft_bwd_x.template compute_r2c<Direction::backward>();
+    outmf.ParallelCopy(m_rx, 0, 0, 1, IntVect(0), ngout);
+}
+
+template <typename T, Direction D, DomainStrategy S>
+std::pair<Plan<T>, Plan<T>>
+R2C<T,D,S>::make_c2c_plans (cMF& inout)
+{
+    Plan<T> fwd;
+    Plan<T> bwd;
+
+    auto* fab = detail::get_fab(inout);
+    if (!fab) { return {fwd, bwd};}
+
+    Box const& box = fab->box();
+    auto* pio = (typename Plan<T>::VendorComplex *)fab->dataPtr();
+
+#ifdef AMREX_USE_SYCL
+    fwd.template init_c2c<Direction::forward>(box, pio);
+    bwd = fwd;
+#else
+    if constexpr (D == Direction::both || D == Direction::forward) {
+        fwd.template init_c2c<Direction::forward>(box, pio);
+    }
+    if constexpr (D == Direction::both || D == Direction::backward) {
+        bwd.template init_c2c<Direction::backward>(box, pio);
+    }
+#endif
+
+    return {fwd, bwd};
+}
+
+template <typename T, Direction D, DomainStrategy S>
+template <typename F>
+void R2C<T,D,S>::post_forward_doit (F const& post_forward)
+{
+    if (m_info.batch_mode) {
+        amrex::Abort("xxxxx todo: post_forward");
+    } else {
+        if (                           ! m_cz.empty()) {
+            auto* spectral_fab = detail::get_fab(m_cz);
+            if (spectral_fab) {
+                auto const& a = spectral_fab->array(); // m_cz's ordering is z,x,y
+                ParallelFor(spectral_fab->box(),
+                [=] AMREX_GPU_DEVICE (int iz, int jx, int ky)
+                {
+                    post_forward(jx,ky,iz,a(iz,jx,ky));
+                });
+            }
+        } else if (                    ! m_cy.empty()) {
+            auto* spectral_fab = detail::get_fab(m_cy);
+            if (spectral_fab) {
+                auto const& a = spectral_fab->array(); // m_cy's ordering is y,x,z
+                ParallelFor(spectral_fab->box(),
+                [=] AMREX_GPU_DEVICE (int iy, int jx, int k)
+                {
+                    post_forward(jx,iy,k,a(iy,jx,k));
+                });
+            }
+        } else {
+            auto* spectral_fab = detail::get_fab(m_cx);
+            if (spectral_fab) {
+                auto const& a = spectral_fab->array();
+                ParallelFor(spectral_fab->box(),
+                [=] AMREX_GPU_DEVICE (int i, int j, int k)
+                {
+                    post_forward(i,j,k,a(i,j,k));
+                });
+            }
+        }
+    }
+}
+
+template <typename T, Direction D, DomainStrategy S>
+T R2C<T,D,S>::scalingFactor () const
+{
+#if (AMREX_SPACEDIM == 3)
+    if (m_info.batch_mode) { // batch mode: normalize by the x-y extent only
+        return T(1)/T(Long(m_real_domain.length(0)) *
+                      Long(m_real_domain.length(1)));
+    } else
+#endif
+    { // otherwise normalize by the number of points in the real domain
+        return T(1)/T(m_real_domain.numPts());
+    }
+}
+
+template <typename T, Direction D, DomainStrategy S>
+template <Direction DIR, std::enable_if_t<DIR == Direction::forward ||
+                                          DIR == Direction::both, int> >
+std::pair<typename R2C<T,D,S>::cMF *, IntVect>
+R2C<T,D,S>::getSpectralData ()
+{
+    if (!m_cz.empty()) {
+        return std::make_pair(&m_cz, IntVect{AMREX_D_DECL(2,0,1)});
+    } else if (!m_cy.empty()) {
+        return std::make_pair(&m_cy, IntVect{AMREX_D_DECL(1,0,2)});
+    } else {
+        return std::make_pair(&m_cx, IntVect{AMREX_D_DECL(0,1,2)});
+    }
+}
+
+template <typename T, Direction D, DomainStrategy S>
+template <Direction DIR, std::enable_if_t<DIR == Direction::forward ||
+                                          DIR == Direction::both, int> >
+void R2C<T,D,S>::forward (MF const& inmf, cMF& outmf)
+{
+    BL_PROFILE("FFT::R2C::forward(inout)");
+
+    forward(inmf);
+    if (!m_cz.empty()) { // m_cz's order (z,x,y) -> (x,y,z)
+        RotateBwd dtos{};
+        MultiBlockCommMetaData cmd
+            (outmf, m_spectral_domain_x, m_cz, IntVect(0), dtos);
+        ParallelCopy(outmf, m_cz, cmd, 0, 0, 1, dtos);
+    } else if (!m_cy.empty()) { // m_cy's order (y,x,z) -> (x,y,z)
+        MultiBlockCommMetaData cmd
+            (outmf, m_spectral_domain_x, m_cy, IntVect(0), m_dtos_y2x);
+        ParallelCopy(outmf, m_cy, cmd, 0, 0, 1, m_dtos_y2x);
+    } else {
+        outmf.ParallelCopy(m_cx, 0, 0, 1);
+    }
+}
+
+template <typename T, Direction D, DomainStrategy S>
+template <Direction DIR, std::enable_if_t<DIR == Direction::backward ||
+                                          DIR == Direction::both, int> >
+void R2C<T,D,S>::backward (cMF const& inmf, MF& outmf)
+{
+    BL_PROFILE("FFT::R2C::backward(inout)");
+
+    if (!m_cz.empty()) { // (x,y,z) -> m_cz's order (z,x,y)
+        RotateFwd dtos{};
+        MultiBlockCommMetaData cmd
+            (m_cz, m_spectral_domain_z, inmf, IntVect(0), dtos);
+        ParallelCopy(m_cz, inmf, cmd, 0, 0, 1, dtos);
+    } else if (!m_cy.empty()) { // (x,y,z) -> m_cy's ordering (y,x,z)
+        MultiBlockCommMetaData cmd
+            (m_cy, m_spectral_domain_y, inmf, IntVect(0), m_dtos_x2y);
+        ParallelCopy(m_cy, inmf, cmd, 0, 0, 1, m_dtos_x2y);
+    } else {
+        m_cx.ParallelCopy(inmf, 0, 0, 1);
+    }
+    backward_doit(outmf);
+}
+
+template <typename T, Direction D, DomainStrategy S>
+std::pair<BoxArray,DistributionMapping>
+R2C<T,D,S>::getSpectralDataLayout () const
+{
+#if (AMREX_SPACEDIM == 3)
+    if (!m_cz.empty()) { // m_cz's order (z,x,y) -> (x,y,z)
+        BoxList bl = m_cz.boxArray().boxList();
+        for (auto& b : bl) {
+            auto lo = b.smallEnd();
+            auto hi = b.bigEnd();
+            std::swap(lo[0], lo[1]);
+            std::swap(lo[1], lo[2]);
+            std::swap(hi[0], hi[1]);
+            std::swap(hi[1], hi[2]);
+            b.setSmall(lo);
+            b.setBig(hi);
+        }
+        return std::make_pair(BoxArray(std::move(bl)), m_cz.DistributionMap());
+    } else
+#endif
+#if (AMREX_SPACEDIM >= 2)
+    if (!m_cy.empty()) { // m_cy's order (y,x,z) -> (x,y,z); use it only when defined
+        BoxList bl = m_cy.boxArray().boxList();
+        for (auto& b : bl) {
+            auto lo = b.smallEnd();
+            auto hi = b.bigEnd();
+            std::swap(lo[0], lo[1]);
+            std::swap(hi[0], hi[1]);
+            b.setSmall(lo);
+            b.setBig(hi);
+        }
+        return std::make_pair(BoxArray(std::move(bl)), m_cy.DistributionMap());
+    } else
+#endif
+    { // spectral data lives in m_cx, which already has (x,y,z) order
+        return std::make_pair(m_cx.boxArray(), m_cx.DistributionMap());
+    }
+}
+
+}
+
+#endif
diff --git a/Src/FFT/AMReX_FFT_R2X.H b/Src/FFT/AMReX_FFT_R2X.H
new file mode 100644
index 0000000000..5d916ada3c
--- /dev/null
+++ b/Src/FFT/AMReX_FFT_R2X.H
@@ -0,0 +1,666 @@
+#ifndef AMREX_FFT_R2X_H_
+#define AMREX_FFT_R2X_H_
+#include <AMReX_Config.H>
+
+#include <AMReX_MultiFab.H>
+#include <AMReX_FFT_Helper.H>
+#include <algorithm>
+#include <numeric>
+#include <tuple>
+
+namespace amrex::FFT
+{
+
+/**
+ * \brief Discrete Fourier Transform
+ *
+ * This class supports Fourier transforms including cosine and sine
+ * transforms.
+ */
+template <typename T = Real>
+class R2X
+{
+public:
+    using MF = std::conditional_t<std::is_same_v<T,Real>, // real-valued container: MultiFab when T is Real
+                                  MultiFab, FabArray<BaseFab<T> > >;
+    using cMF = FabArray<BaseFab<GpuComplex<T> > >; // complex-valued container
+
+    R2X (Box const& domain, // index domain of the transform (stored as m_dom_0)
+         Array<std::pair<Boundary,Boundary>,AMREX_SPACEDIM> const& bc, // (first,second) boundary type per dimension
+         Info const& info = Info{});
+
+    ~R2X ();
+
+    R2X (R2X const&) = delete; // neither copyable nor movable
+    R2X (R2X &&) = delete;
+    R2X& operator= (R2X const&) = delete;
+    R2X& operator= (R2X &&) = delete;
+
+    [[nodiscard]] T scalingFactor () const; // 1 / (cells in domain * 2 per non-periodic dimension longer than one cell)
+
+    template <typename F>
+    void forwardThenBackward (MF const& inmf, MF& outmf, F const& post_forward); // forward transform inmf, apply post_forward to the spectral data, backward transform into outmf
+
+    // public for cuda
+    template <int dim, typename FAB, typename F>
+    void post_forward_doit (FAB* fab, F const& f); // calls f on every element of *fab; no-op when fab is null
+
+private:
+    Box m_dom_0; // domain passed to the constructor
+    Array<std::pair<Boundary,Boundary>,AMREX_SPACEDIM> m_bc; // boundary types passed to the constructor
+
+    Plan<T> m_fft_fwd_x{}; // forward/backward plans, one pair per direction
+    Plan<T> m_fft_bwd_x{};
+    Plan<T> m_fft_fwd_y{};
+    Plan<T> m_fft_bwd_y{};
+    Plan<T> m_fft_fwd_z{};
+    Plan<T> m_fft_bwd_z{};
+
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_cx2cy; // copy metadata used on the forward pass (x->y, y->z)
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_rx2ry;
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_cy2cz;
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_ry2rz;
+
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_cy2cx; // copy metadata used on the backward pass (y->x, z->y)
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_ry2rx;
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_cz2cy;
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_rz2ry;
+
+    Swap01 m_dtos_x2y{}; // index mappings handed to the copies above
+    Swap01 m_dtos_y2x{};
+    Swap02 m_dtos_y2z{};
+    Swap02 m_dtos_z2y{};
+
+    MF m_rx; // real work arrays for the x-, y- and z-direction transforms
+    MF m_ry;
+    MF m_rz;
+    cMF m_cx; // complex work arrays; left undefined when the chosen path does not need them
+    cMF m_cy;
+    cMF m_cz;
+
+    std::unique_ptr<char,DataDeleter> m_data_1; // buffers returned by detail::make_mfs_share for pairs of work arrays
+    std::unique_ptr<char,DataDeleter> m_data_2;
+
+    Box m_dom_rx; // index domains of the corresponding work arrays
+    Box m_dom_ry;
+    Box m_dom_rz;
+    Box m_dom_cx;
+    Box m_dom_cy;
+    Box m_dom_cz;
+
+    Info m_info; // options passed to the constructor (m_info.nprocs caps the number of processes used)
+};
+
+template <typename T>
+R2X<T>::R2X (Box const& domain,
+             Array<std::pair<Boundary,Boundary>,AMREX_SPACEDIM> const& bc,
+             Info const& info)
+    : m_dom_0(domain),
+      m_bc(bc),
+      m_info(info)
+{ // Sets up the work arrays, the transposing copiers and the per-direction FFT plans.
+    BL_PROFILE("FFT::R2X");
+
+    static_assert(std::is_same_v<float,T> || std::is_same_v<double,T>);
+    AMREX_ALWAYS_ASSERT(domain.smallEnd() == 0 &&
+                        domain.length(0) > 1 &&
+                        domain.cellCentered());
+#if (AMREX_SPACEDIM == 3)
+    AMREX_ALWAYS_ASSERT(domain.length(1) > 1 || domain.length(2) == 1);
+#endif
+    for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+        AMREX_ALWAYS_ASSERT(domain.length(idim) > 1); // NOTE(review): stricter than the assertion above, which allows length(2) == 1 -- confirm which is intended
+        if (bc[idim].first == Boundary::periodic ||
+            bc[idim].second == Boundary::periodic) {
+            AMREX_ALWAYS_ASSERT(bc[idim].first == bc[idim].second); // periodic must be set on both sides
+        }
+    }
+
+    int myproc = ParallelContext::MyProcSub();
+    int nprocs = std::min(ParallelContext::NProcsSub(), m_info.nprocs);
+
+    //
+    // make data containers
+    //
+
+    m_dom_rx = m_dom_0;
+    auto bax = amrex::decompose(m_dom_rx, nprocs, {AMREX_D_DECL(false,true,true)});
+    DistributionMapping dmx = detail::make_iota_distromap(bax.size());
+    m_rx.define(bax, dmx, 1, 0, MFInfo().SetAlloc(false)); // defined without allocation; see make_mfs_share below
+
+    // x-direction
+    if (bc[0].first == Boundary::periodic) {
+        // x-fft: r2c(m_rx->m_cx)
+        m_dom_cx = Box(IntVect(0), IntVect(AMREX_D_DECL(domain.length(0)/2, // complex domain holds length(0)/2+1 points in x
+                                                        domain.bigEnd(1),
+                                                        domain.bigEnd(2))));
+        BoxList bl = bax.boxList();
+        for (auto & b : bl) {
+            b.setBig(0, m_dom_cx.bigEnd(0));
+        }
+        BoxArray cbax(std::move(bl));
+        m_cx.define(cbax, dmx, 1, 0, MFInfo().SetAlloc(false));
+    } // else: x-fft: r2r(m_rx)
+
+#if (AMREX_SPACEDIM >= 2)
+    if (domain.length(1) > 1) {
+        if (! m_cx.empty()) {
+            // copy(m_cx->m_cy)
+            m_dom_cy = Box(IntVect(0), IntVect(AMREX_D_DECL(m_dom_cx.bigEnd(1), // first two extents exchanged relative to m_dom_cx
+                                                            m_dom_cx.bigEnd(0),
+                                                            m_dom_cx.bigEnd(2))));
+            auto ba = amrex::decompose(m_dom_cy, nprocs, {AMREX_D_DECL(false,true,true)});
+            DistributionMapping dm;
+            if (ba.size() == m_cx.size()) {
+                dm = m_cx.DistributionMap(); // reuse the existing mapping when the box counts match
+            } else {
+                dm = detail::make_iota_distromap(ba.size());
+            }
+            m_cy.define(ba, dm, 1, 0, MFInfo().SetAlloc(false));
+            // if bc[1] is periodic:
+            //     c2c(m_cy->m_cy)
+            // else:
+            //     r2r(m_cy.re) & r2r(m_cy.im)
+        } else {
+            // copy(m_rx->m_ry)
+            m_dom_ry = Box(IntVect(0), IntVect(AMREX_D_DECL(m_dom_rx.bigEnd(1),
+                                                            m_dom_rx.bigEnd(0),
+                                                            m_dom_rx.bigEnd(2))));
+            auto ba = amrex::decompose(m_dom_ry, nprocs, {AMREX_D_DECL(false,true,true)});
+            DistributionMapping dm;
+            if (ba.size() == m_rx.size()) {
+                dm = m_rx.DistributionMap();
+            } else {
+                dm = detail::make_iota_distromap(ba.size());
+            }
+            m_ry.define(ba, dm, 1, 0, MFInfo().SetAlloc(false));
+            // if bc[1] is periodic:
+            //     r2c(m_ry->m_cy)
+            // else:
+            //     r2r(m_ry)
+            if (bc[1].first == Boundary::periodic) {
+                m_dom_cy = Box(IntVect(0), IntVect(AMREX_D_DECL(m_dom_ry.length(0)/2,
+                                                                m_dom_ry.bigEnd(1),
+                                                                m_dom_ry.bigEnd(2))));
+                BoxList bl = ba.boxList();
+                for (auto & b : bl) {
+                    b.setBig(0, m_dom_cy.bigEnd(0));
+                }
+                BoxArray cba(std::move(bl));
+                m_cy.define(cba, dm, 1, 0, MFInfo().SetAlloc(false));
+            }
+        }
+    }
+#endif
+
+#if (AMREX_SPACEDIM == 3)
+    if (domain.length(2) > 1) {
+        if (! m_cy.empty()) {
+            // copy(m_cy, m_cz)
+            m_dom_cz = Box(IntVect(0), IntVect(AMREX_D_DECL(m_dom_cy.bigEnd(2), // first and last extents exchanged relative to m_dom_cy
+                                                            m_dom_cy.bigEnd(1),
+                                                            m_dom_cy.bigEnd(0))));
+            auto ba = amrex::decompose(m_dom_cz, nprocs, {AMREX_D_DECL(false,true,true)});
+            DistributionMapping dm;
+            if (ba.size() == m_cy.size()) {
+                dm = m_cy.DistributionMap();
+            } else {
+                dm = detail::make_iota_distromap(ba.size());
+            }
+            m_cz.define(ba, dm, 1, 0, MFInfo().SetAlloc(false));
+            // if bc[2] is periodic:
+            //     c2c(m_cz->m_cz)
+            // else:
+            //     r2r(m_cz.re) & r2r(m_cz.im)
+        } else {
+            // copy(m_ry, m_rz)
+            m_dom_rz = Box(IntVect(0), IntVect(AMREX_D_DECL(m_dom_ry.bigEnd(2),
+                                                            m_dom_ry.bigEnd(1),
+                                                            m_dom_ry.bigEnd(0))));
+            auto ba = amrex::decompose(m_dom_rz, nprocs, {AMREX_D_DECL(false,true,true)});
+            DistributionMapping dm;
+            if (ba.size() == m_ry.size()) {
+                dm = m_ry.DistributionMap();
+            } else {
+                dm = detail::make_iota_distromap(ba.size());
+            }
+            m_rz.define(ba, dm, 1, 0, MFInfo().SetAlloc(false));
+            // if bc[2] is periodic:
+            //     r2c(m_rz->m_cz)
+            // else:
+            //     r2r(m_rz)
+            if (bc[2].first == Boundary::periodic) {
+                m_dom_cz = Box(IntVect(0), IntVect(AMREX_D_DECL(m_dom_rz.length(0)/2,
+                                                                m_dom_rz.bigEnd(1),
+                                                                m_dom_rz.bigEnd(2))));
+                BoxList bl = ba.boxList();
+                for (auto & b : bl) {
+                    b.setBig(0, m_dom_cz.bigEnd(0));
+                }
+                BoxArray cba(std::move(bl));
+                m_cz.define(cba, dm, 1, 0, MFInfo().SetAlloc(false));
+            }
+        }
+    }
+#endif
+
+    // There are several different execution paths.
+    //
+    // (1) x-r2c(m_rx->m_cx), copy(m_cx->m_cy), y-fft(m_cy),
+    //     copy(m_cy->m_cz), z-fft(m_cz)
+    //     In this case, we have m_rx, m_cx, m_cy, & m_cz.
+    //     we can alias(m_rx,m_cy) and alias(m_cx,m_cz).
+    //
+    // (2) x_r2r(m_rx), copy(m_rx->m_ry), y-r2c(m_ry->m_cy),
+    //     copy(m_cy->m_cz), z-fft(m_cz)
+    //     In this case, we have m_rx, m_ry, m_cy, & m_cz.
+    //     We can alias(m_rx,m_cy) and alias(m_ry,m_cz).
+    //
+    // (3) x_r2r(m_rx), copy(m_rx->m_ry), y-r2r(m_ry),
+    //     copy(m_ry->m_rz), z-r2c(m_rz->m_cz)
+    //     In this case, we have m_rx, m_ry, m_rz, & m_cz
+    //     We can alias(m_rx,m_rz) and alias(m_ry,m_cz)
+    //
+    // (4) x_r2r(m_rx), copy(m_rx->m_ry), y-r2r(m_ry),
+    //     copy(m_ry->m_rz), z-r2r(m_rz)
+    //     In this case, we have m_rx, m_ry, & m_rz.
+    //     We can alias(m_rx,m_rz).
+
+    if (! m_cx.empty()) { // path (1)
+        m_data_1 = detail::make_mfs_share(m_rx, m_cy);
+        m_data_2 = detail::make_mfs_share(m_cx, m_cz);
+    } else if (! m_cy.empty()) { // path (2)
+        m_data_1 = detail::make_mfs_share(m_rx, m_cy);
+        m_data_2 = detail::make_mfs_share(m_ry, m_cz);
+    } else if (! m_cz.empty()) { // path (3)
+        m_data_1 = detail::make_mfs_share(m_rx, m_rz);
+        m_data_2 = detail::make_mfs_share(m_ry, m_cz);
+    } else { // path (4)
+        m_data_1 = detail::make_mfs_share(m_rx, m_rz);
+        m_data_2 = detail::make_mfs_share(m_ry, m_cz); // It's okay m_cz is empty.
+    }
+
+    //
+    // make copiers
+    //
+
+#if (AMREX_SPACEDIM >= 2)
+    if (domain.length(1) > 1) {
+        if (! m_cx.empty()) {
+            // copy(m_cx->m_cy)
+            m_cmd_cx2cy = std::make_unique<MultiBlockCommMetaData>
+                (m_cy, m_dom_cy, m_cx, IntVect(0), m_dtos_x2y);
+            m_cmd_cy2cx = std::make_unique<MultiBlockCommMetaData> // reverse direction, used on the backward pass
+                (m_cx, m_dom_cx, m_cy, IntVect(0), m_dtos_y2x);
+        } else {
+            // copy(m_rx->m_ry)
+            m_cmd_rx2ry = std::make_unique<MultiBlockCommMetaData>
+                (m_ry, m_dom_ry, m_rx, IntVect(0), m_dtos_x2y);
+            m_cmd_ry2rx = std::make_unique<MultiBlockCommMetaData>
+                (m_rx, m_dom_rx, m_ry, IntVect(0), m_dtos_y2x);
+        }
+    }
+#endif
+
+#if (AMREX_SPACEDIM == 3)
+    if (domain.length(2) > 1) {
+        if (! m_cy.empty()) {
+            // copy(m_cy, m_cz)
+            m_cmd_cy2cz = std::make_unique<MultiBlockCommMetaData>
+                (m_cz, m_dom_cz, m_cy, IntVect(0), m_dtos_y2z);
+            m_cmd_cz2cy = std::make_unique<MultiBlockCommMetaData>
+                (m_cy, m_dom_cy, m_cz, IntVect(0), m_dtos_z2y);
+        } else {
+            // copy(m_ry, m_rz)
+            m_cmd_ry2rz = std::make_unique<MultiBlockCommMetaData>
+                (m_rz, m_dom_rz, m_ry, IntVect(0), m_dtos_y2z);
+            m_cmd_rz2ry = std::make_unique<MultiBlockCommMetaData>
+                (m_ry, m_dom_ry, m_rz, IntVect(0), m_dtos_z2y);
+        }
+    }
+#endif
+
+    //
+    // make plans
+    //
+
+    using VendorComplex = typename Plan<T>::VendorComplex;
+
+    if (myproc < m_rx.size()) // box index equals process index here (m_rx uses the iota mapping dmx)
+    {
+        Box const& box = m_rx.box(myproc);
+        auto* pf = m_rx[myproc].dataPtr();
+        if (bc[0].first == Boundary::periodic) {
+            auto* pb = (VendorComplex*) m_cx[myproc].dataPtr();
+            m_fft_fwd_x.template init_r2c<Direction::forward>(box, pf, pb);
+#if defined(AMREX_USE_SYCL)
+            m_fft_bwd_x = m_fft_fwd_x; // SYCL: backward reuses the forward plan
+#else
+            m_fft_bwd_x.template init_r2c<Direction::backward>(box, pf, pb);
+#endif
+        } else {
+            m_fft_fwd_x.template init_r2r<Direction::forward>(box, pf, bc[0]);
+#if defined(AMREX_USE_GPU)
+            if ((bc[0].first == Boundary::even && bc[0].second == Boundary::odd) || // GPU, mixed even/odd: backward reuses the forward plan
+                (bc[0].first == Boundary::odd && bc[0].second == Boundary::even)) {
+                m_fft_bwd_x = m_fft_fwd_x;
+            } else
+#endif
+            {
+                m_fft_bwd_x.template init_r2r<Direction::backward>(box, pf, bc[0]);
+            }
+        }
+    }
+
+#if (AMREX_SPACEDIM >= 2)
+    if (m_ry.empty() && m_bc[1].first == Boundary::periodic) { // y: c2c in place on m_cy
+        if (myproc < m_cy.size()) {
+            Box const& box = m_cy.box(myproc);
+            auto* p = (VendorComplex *)m_cy[myproc].dataPtr();
+            m_fft_fwd_y.template init_c2c<Direction::forward>(box, p);
+#if defined(AMREX_USE_SYCL)
+            m_fft_bwd_y = m_fft_fwd_y;
+#else
+            m_fft_bwd_y.template init_c2c<Direction::backward>(box, p);
+#endif
+        }
+    } else if (!m_ry.empty() && m_bc[1].first == Boundary::periodic) { // y: r2c from m_ry to m_cy
+        if (myproc < m_ry.size()) {
+            Box const& box = m_ry.box(myproc);
+            auto* pr =                 m_ry[myproc].dataPtr();
+            auto* pc = (VendorComplex*)m_cy[myproc].dataPtr();
+            m_fft_fwd_y.template init_r2c<Direction::forward>(box, pr, pc);
+#if defined(AMREX_USE_SYCL)
+            m_fft_bwd_y = m_fft_fwd_y;
+#else
+            m_fft_bwd_y.template init_r2c<Direction::backward>(box, pr, pc);
+#endif
+        }
+    } else if (!m_cy.empty()) { // y: r2r on the complex array m_cy
+        if (myproc < m_cy.size()) {
+            Box const& box = m_cy.box(myproc);
+            auto* p = (VendorComplex*) m_cy[myproc].dataPtr();
+            m_fft_fwd_y.template init_r2r<Direction::forward>(box, p, bc[1]);
+#if defined(AMREX_USE_GPU)
+            if ((bc[1].first == Boundary::even && bc[1].second == Boundary::odd) ||
+                (bc[1].first == Boundary::odd && bc[1].second == Boundary::even)) {
+                m_fft_bwd_y = m_fft_fwd_y;
+            } else
+#endif
+            {
+                m_fft_bwd_y.template init_r2r<Direction::backward>(box, p, bc[1]);
+            }
+        }
+    } else { // y: r2r on the real array m_ry
+        if (myproc < m_ry.size()) {
+            Box const& box = m_ry.box(myproc);
+            auto* p = m_ry[myproc].dataPtr();
+            m_fft_fwd_y.template init_r2r<Direction::forward>(box, p, bc[1]);
+#if defined(AMREX_USE_GPU)
+            if ((bc[1].first == Boundary::even && bc[1].second == Boundary::odd) ||
+                (bc[1].first == Boundary::odd && bc[1].second == Boundary::even)) {
+                m_fft_bwd_y = m_fft_fwd_y;
+            } else
+#endif
+            {
+                m_fft_bwd_y.template init_r2r<Direction::backward>(box, p, bc[1]);
+            }
+        }
+    }
+#endif
+
+#if (AMREX_SPACEDIM == 3)
+    if (m_rz.empty() && m_bc[2].first == Boundary::periodic) { // z: c2c in place on m_cz
+        if (myproc < m_cz.size()) {
+            Box const& box = m_cz.box(myproc);
+            auto* p = (VendorComplex*)m_cz[myproc].dataPtr();
+            m_fft_fwd_z.template init_c2c<Direction::forward>(box, p);
+#if defined(AMREX_USE_SYCL)
+            m_fft_bwd_z = m_fft_fwd_z;
+#else
+            m_fft_bwd_z.template init_c2c<Direction::backward>(box, p);
+#endif
+        }
+    } else if (!m_rz.empty() && m_bc[2].first == Boundary::periodic) { // z: r2c from m_rz to m_cz
+        if (myproc < m_rz.size()) {
+            Box const& box = m_rz.box(myproc);
+            auto* pr =                 m_rz[myproc].dataPtr();
+            auto* pc = (VendorComplex*)m_cz[myproc].dataPtr();
+            m_fft_fwd_z.template init_r2c<Direction::forward>(box, pr, pc);
+#if defined(AMREX_USE_SYCL)
+            m_fft_bwd_z = m_fft_fwd_z;
+#else
+            m_fft_bwd_z.template init_r2c<Direction::backward>(box, pr, pc);
+#endif
+        }
+    } else if (!m_cz.empty()) { // z: r2r on the complex array m_cz
+        if (myproc < m_cz.size()) {
+            Box const& box = m_cz.box(myproc);
+            auto* p = (VendorComplex*) m_cz[myproc].dataPtr();
+            m_fft_fwd_z.template init_r2r<Direction::forward>(box, p, bc[2]);
+#if defined(AMREX_USE_GPU)
+            if ((bc[2].first == Boundary::even && bc[2].second == Boundary::odd) ||
+                (bc[2].first == Boundary::odd && bc[2].second == Boundary::even)) {
+                m_fft_bwd_z = m_fft_fwd_z;
+            } else
+#endif
+            {
+                m_fft_bwd_z.template init_r2r<Direction::backward>(box, p, bc[2]);
+            }
+        }
+    } else { // z: r2r on the real array m_rz
+        if (myproc < m_rz.size()) {
+            Box const& box = m_rz.box(myproc);
+            auto* p = m_rz[myproc].dataPtr();
+            m_fft_fwd_z.template init_r2r<Direction::forward>(box, p, bc[2]);
+#if defined(AMREX_USE_GPU)
+            if ((bc[2].first == Boundary::even && bc[2].second == Boundary::odd) ||
+                (bc[2].first == Boundary::odd && bc[2].second == Boundary::even)) {
+                m_fft_bwd_z = m_fft_fwd_z;
+            } else
+#endif
+            {
+                m_fft_bwd_z.template init_r2r<Direction::backward>(box, p, bc[2]);
+            }
+        }
+    }
+#endif
+}
+
+template <typename T>
+R2X<T>::~R2X ()
+{ // Destroys the backward plans first, then the forward plans.
+    if (m_fft_bwd_x.plan != m_fft_fwd_x.plan) { // the constructor may copy the forward plan into the backward one; then only the forward destroy() below runs
+        m_fft_bwd_x.destroy();
+    }
+    if (m_fft_bwd_y.plan != m_fft_fwd_y.plan) {
+        m_fft_bwd_y.destroy();
+    }
+    if (m_fft_bwd_z.plan != m_fft_fwd_z.plan) {
+        m_fft_bwd_z.destroy();
+    }
+    m_fft_fwd_x.destroy();
+    m_fft_fwd_y.destroy();
+    m_fft_fwd_z.destroy();
+}
+
+template <typename T>
+T R2X<T>::scalingFactor () const
+{ // Returns 1/r; presumably the normalization for a forward+backward round trip -- confirm against callers.
+    auto r = m_dom_0.numPts(); // start from the number of cells in the domain
+    for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+        if (m_bc[idim].first != Boundary::periodic
+            && m_dom_0.length(idim) > 1)
+        {
+            r *= 2; // one extra factor of 2 per non-periodic dimension longer than one cell
+        }
+    }
+    return T(1)/T(r);
+}
+
+template <typename T>
+template <typename F>
+void R2X<T>::forwardThenBackward (MF const& inmf, MF& outmf, F const& post_forward)
+{ // Forward-transforms inmf direction by direction, applies post_forward to the spectral data, then reverses every step into outmf.
+    BL_PROFILE("FFT::R2X::forwardbackward");
+
+    // forward
+
+    m_rx.ParallelCopy(inmf, 0, 0, 1); // component 0 of inmf -> internal layout
+    if (m_bc[0].first == Boundary::periodic) {
+        m_fft_fwd_x.template compute_r2c<Direction::forward>();
+    } else {
+        m_fft_fwd_x.template compute_r2r<Direction::forward>();
+    }
+
+#if (AMREX_SPACEDIM >= 2)
+    if (                          m_cmd_cx2cy) { // only the copier created by the constructor is non-null
+        ParallelCopy(m_cy, m_cx, *m_cmd_cx2cy, 0, 0, 1, m_dtos_x2y);
+    } else if (                   m_cmd_rx2ry) {
+        ParallelCopy(m_ry, m_rx, *m_cmd_rx2ry, 0, 0, 1, m_dtos_x2y);
+    }
+    if (m_bc[1].first != Boundary::periodic)
+    {
+        m_fft_fwd_y.template compute_r2r<Direction::forward>();
+    }
+    else if (m_bc[0].first == Boundary::periodic) // data are already complex after the x r2c
+    {
+        m_fft_fwd_y.template compute_c2c<Direction::forward>();
+    }
+    else
+    {
+        m_fft_fwd_y.template compute_r2c<Direction::forward>();
+    }
+#endif
+
+#if (AMREX_SPACEDIM == 3)
+    if (                          m_cmd_cy2cz) {
+        ParallelCopy(m_cz, m_cy, *m_cmd_cy2cz, 0, 0, 1, m_dtos_y2z);
+    } else if (                   m_cmd_ry2rz) {
+        ParallelCopy(m_rz, m_ry, *m_cmd_ry2rz, 0, 0, 1, m_dtos_y2z);
+    }
+    if (m_bc[2].first != Boundary::periodic)
+    {
+        m_fft_fwd_z.template compute_r2r<Direction::forward>();
+    }
+    else if (m_bc[0].first == Boundary::periodic || // data are already complex after an earlier r2c
+             m_bc[1].first == Boundary::periodic)
+    {
+        m_fft_fwd_z.template compute_c2c<Direction::forward>();
+    }
+    else
+    {
+        m_fft_fwd_z.template compute_r2c<Direction::forward>();
+    }
+#endif
+
+    // post-forward
+
+    int actual_dim = AMREX_SPACEDIM; // number of leading dimensions longer than one cell
+#if (AMREX_SPACEDIM >= 2)
+    if (m_dom_0.length(1) == 1) { actual_dim = 1; }
+#endif
+#if (AMREX_SPACEDIM == 3)
+    if ((m_dom_0.length(2) == 1) && (m_dom_0.length(1) > 1)) { actual_dim = 2; }
+#endif
+
+    if (actual_dim == 1) { // apply post_forward to the array written by the last forward transform
+        if (m_cx.empty()) {
+            post_forward_doit<0>(detail::get_fab(m_rx), post_forward);
+        } else {
+            post_forward_doit<0>(detail::get_fab(m_cx), post_forward);
+        }
+    }
+#if (AMREX_SPACEDIM >= 2)
+    else if (actual_dim == 2) {
+        if (m_cy.empty()) {
+            post_forward_doit<1>(detail::get_fab(m_ry), post_forward);
+        } else {
+            post_forward_doit<1>(detail::get_fab(m_cy), post_forward);
+        }
+    }
+#endif
+#if (AMREX_SPACEDIM == 3)
+    else if (actual_dim == 3) {
+        if (m_cz.empty()) {
+            post_forward_doit<2>(detail::get_fab(m_rz), post_forward);
+        } else {
+            post_forward_doit<2>(detail::get_fab(m_cz), post_forward);
+        }
+    }
+#endif
+
+    // backward
+
+#if (AMREX_SPACEDIM == 3)
+    if (m_bc[2].first != Boundary::periodic) // same selection as the forward z step, run in the backward direction
+    {
+        m_fft_bwd_z.template compute_r2r<Direction::backward>();
+    }
+    else if (m_bc[0].first == Boundary::periodic ||
+             m_bc[1].first == Boundary::periodic)
+    {
+        m_fft_bwd_z.template compute_c2c<Direction::backward>();
+    }
+    else
+    {
+        m_fft_bwd_z.template compute_r2c<Direction::backward>();
+    }
+    if (                          m_cmd_cz2cy) {
+        ParallelCopy(m_cy, m_cz, *m_cmd_cz2cy, 0, 0, 1, m_dtos_z2y);
+    } else if (                   m_cmd_rz2ry) {
+        ParallelCopy(m_ry, m_rz, *m_cmd_rz2ry, 0, 0, 1, m_dtos_z2y);
+    }
+#endif
+
+#if (AMREX_SPACEDIM >= 2)
+    if (m_bc[1].first != Boundary::periodic) // same selection as the forward y step, run in the backward direction
+    {
+        m_fft_bwd_y.template compute_r2r<Direction::backward>();
+    }
+    else if (m_bc[0].first == Boundary::periodic)
+    {
+        m_fft_bwd_y.template compute_c2c<Direction::backward>();
+    }
+    else
+    {
+        m_fft_bwd_y.template compute_r2c<Direction::backward>();
+    }
+    if (                          m_cmd_cy2cx) {
+        ParallelCopy(m_cx, m_cy, *m_cmd_cy2cx, 0, 0, 1, m_dtos_y2x);
+    } else if (                   m_cmd_ry2rx) {
+        ParallelCopy(m_rx, m_ry, *m_cmd_ry2rx, 0, 0, 1, m_dtos_y2x);
+    }
+#endif
+
+    if (m_bc[0].first == Boundary::periodic) {
+        m_fft_bwd_x.template compute_r2c<Direction::backward>();
+    } else {
+        m_fft_bwd_x.template compute_r2r<Direction::backward>();
+    }
+    outmf.ParallelCopy(m_rx, 0, 0, 1); // internal layout -> component 0 of outmf
+}
+
+template <typename T>
+template <int dim, typename FAB, typename F>
+void R2X<T>::post_forward_doit (FAB* fab, F const& f)
+{ // Calls f(index0, index1, index2, value) for every element of *fab, permuting the array indices according to dim.
+    if (fab) { // a null fab is a no-op
+        auto const& a = fab->array();
+        ParallelFor(fab->box(),
+        [f=f,a=a] AMREX_GPU_DEVICE (int i, int j, int k)
+        {
+            if constexpr (dim == 0) {
+                f(i,j,k,a(i,j,k)); // indices passed through unchanged
+            } else if constexpr (dim == 1) {
+                f(j,i,k,a(i,j,k)); // first two indices exchanged; presumably undoes the Swap01 copy -- confirm
+            } else {
+                f(j,k,i,a(i,j,k)); // array (i,j,k) passed as (j,k,i); presumably undoes Swap01 followed by Swap02 -- confirm
+            }
+        });
+    }
+}
+
+}
+
+#endif
diff --git a/Src/FFT/CMakeLists.txt b/Src/FFT/CMakeLists.txt
index 2c695a9aec..6dd8150711 100644
--- a/Src/FFT/CMakeLists.txt
+++ b/Src/FFT/CMakeLists.txt
@@ -7,6 +7,10 @@ foreach(D IN LISTS AMReX_SPACEDIM)
        PRIVATE
        AMReX_FFT.H
        AMReX_FFT.cpp
+       AMReX_FFT_LocalR2C.H
+       AMReX_FFT_OpenBCSolver.H
+       AMReX_FFT_R2C.H
+       AMReX_FFT_R2X.H
        AMReX_FFT_Helper.H
        AMReX_FFT_Poisson.H
        )
diff --git a/Src/FFT/Make.package b/Src/FFT/Make.package
index 1dcd714f64..fb369b7caf 100644
--- a/Src/FFT/Make.package
+++ b/Src/FFT/Make.package
@@ -2,6 +2,8 @@ ifndef AMREX_FFT_MAKE
        AMREX_FFT_MAKE := 1
 
 CEXE_headers += AMReX_FFT.H AMReX_FFT_Helper.H AMReX_FFT_Poisson.H
+CEXE_headers += AMReX_FFT_OpenBCSolver.H AMReX_FFT_R2C.H AMReX_FFT_R2X.H
+CEXE_headers += AMReX_FFT_LocalR2C.H
 CEXE_sources += AMReX_FFT.cpp
 
 VPATH_LOCATIONS += $(AMREX_HOME)/Src/FFT
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H
index b613a4f3a8..30d0b96a6f 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H
@@ -182,7 +182,6 @@ MLCGSolverT<MF>::solve_bicgstab (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs)
         Saxpy(sol, alpha, p, 0, 0, ncomp, nghost); // sol += alpha * p
         Saxpy(r,  -alpha, v, 0, 0, ncomp, nghost); // r += -alpha * v
 
-        rnorm = norm_inf(r);
         rnorm = norm_inf(r);
 
         if ( verbose > 2 && ParallelDescriptor::IOProcessor() )
diff --git a/Src/Particle/AMReX_ParticleBufferMap.H b/Src/Particle/AMReX_ParticleBufferMap.H
index 476d3d5343..33cec4b823 100644
--- a/Src/Particle/AMReX_ParticleBufferMap.H
+++ b/Src/Particle/AMReX_ParticleBufferMap.H
@@ -36,10 +36,10 @@ struct GetBucket
     const int* m_lev_gid_to_bucket;
     const int* m_lev_offsets;
 
-    GetBucket (const Gpu::DeviceVector<int>& lev_gid_to_bucket,
-               const Gpu::DeviceVector<int>& lev_offsets)
-        : m_lev_gid_to_bucket(lev_gid_to_bucket.dataPtr()),
-          m_lev_offsets(lev_offsets.dataPtr())
+    GetBucket (const int* lev_gid_to_bucket_ptr,
+               const int* lev_offsets_ptr)
+        : m_lev_gid_to_bucket(lev_gid_to_bucket_ptr),
+          m_lev_offsets(lev_offsets_ptr)
         {}
 
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
@@ -154,7 +154,9 @@ public:
     }
 
     [[nodiscard]] GetPID getPIDFunctor () const noexcept { return GetPID(d_bucket_to_pid, d_lev_gid_to_bucket, d_lev_offsets);}
-    [[nodiscard]] GetBucket getBucketFunctor () const noexcept { return GetBucket(d_lev_gid_to_bucket, d_lev_offsets);}
+    [[nodiscard]] GetBucket getBucketFunctor () const noexcept { return GetBucket(d_lev_gid_to_bucket.data(), d_lev_offsets.data());}
+    [[nodiscard]] GetBucket getHostBucketFunctor () const noexcept { return GetBucket(m_lev_gid_to_bucket.data(), m_lev_offsets.data());}
+
 };
 
 } // namespace amrex
diff --git a/Src/Particle/AMReX_ParticleCommunication.H b/Src/Particle/AMReX_ParticleCommunication.H
index 91933c75e5..00bf423478 100644
--- a/Src/Particle/AMReX_ParticleCommunication.H
+++ b/Src/Particle/AMReX_ParticleCommunication.H
@@ -154,7 +154,12 @@ struct ParticleCopyPlan
         m_box_counts_d.resize(num_buckets+1, 0);
         m_box_offsets.resize(num_buckets+1);
         auto* p_dst_box_counts = m_box_counts_d.dataPtr();
-        auto getBucket = pc.BufferMap().getBucketFunctor();
+        auto getBucket = pc.stableRedistribute() ? pc.BufferMap().getHostBucketFunctor() : pc.BufferMap().getBucketFunctor();
+
+        Gpu::HostVector<unsigned int> h_box_counts;
+        if (pc.stableRedistribute() ) {
+            h_box_counts.resize(m_box_counts_d.size(), 0);
+        }
 
         m_dst_indices.resize(num_levels);
         for (int lev = 0; lev < num_levels; ++lev)
@@ -166,24 +171,49 @@ struct ParticleCopyPlan
                 if (num_copies == 0) { continue; }
                 m_dst_indices[lev][gid].resize(num_copies);
 
-                const auto* p_boxes = op.m_boxes[lev].at(gid).dataPtr();
-                const auto* p_levs = op.m_levels[lev].at(gid).dataPtr();
-                auto* p_dst_indices = m_dst_indices[lev][gid].dataPtr();
-
-                AMREX_FOR_1D ( num_copies, i,
-                {
-                    int dst_box = p_boxes[i];
-                    if (dst_box >= 0)
-                    {
-                        int dst_lev = p_levs[i];
-                        int index = static_cast<int>(Gpu::Atomic::Add(
-                            &p_dst_box_counts[getBucket(dst_lev, dst_box)], 1U));
-                        p_dst_indices[i] = index;
+                if (pc.stableRedistribute()) {
+                    const Gpu::DeviceVector<int>& d_boxes = op.m_boxes[lev].at(gid);
+                    Gpu::HostVector<int> h_boxes(d_boxes.size());
+                    Gpu::copy(Gpu::deviceToHost,d_boxes.begin(),d_boxes.end(),h_boxes.begin());
+
+                    const Gpu::DeviceVector<int>& d_levs = op.m_levels[lev].at(gid);
+                    Gpu::HostVector<int> h_levs(d_levs.size());
+                    Gpu::copy(Gpu::deviceToHost,d_levs.begin(),d_levs.end(),h_levs.begin());
+
+                    Gpu::HostVector<int> h_dst_indices(num_copies);
+                    for (int i = 0; i < num_copies; ++i) {
+                        int dst_box = h_boxes[i];
+                        if (dst_box >= 0) {
+                            int dst_lev = h_levs[i];
+                            int index = static_cast<int>(h_box_counts[getBucket(dst_lev, dst_box)]++);
+                            h_dst_indices[i] = index;
+                        }
                     }
-                });
+                    Gpu::copy(Gpu::hostToDevice,h_dst_indices.begin(),h_dst_indices.end(),m_dst_indices[lev][gid].begin());
+                }
+                else {
+                    const auto* p_boxes = op.m_boxes[lev].at(gid).dataPtr();
+                    const auto* p_levs = op.m_levels[lev].at(gid).dataPtr();
+                    auto* p_dst_indices = m_dst_indices[lev][gid].dataPtr();
+                    AMREX_FOR_1D ( num_copies, i,
+                    {
+                        int dst_box = p_boxes[i];
+                        if (dst_box >= 0)
+                        {
+                            int dst_lev = p_levs[i];
+                            int index = static_cast<int>(Gpu::Atomic::Add(
+                                &p_dst_box_counts[getBucket(dst_lev, dst_box)], 1U));
+                            p_dst_indices[i] = index;
+                        }
+                    });
+                }
             }
         }
 
+        if (pc.stableRedistribute()) {
+            Gpu::copy(Gpu::hostToDevice,h_box_counts.begin(),h_box_counts.end(),m_box_counts_d.begin());
+        }
+
         amrex::Gpu::exclusive_scan(m_box_counts_d.begin(), m_box_counts_d.end(),
                                    m_box_offsets.begin());
 
diff --git a/Src/Particle/AMReX_ParticleContainerBase.H b/Src/Particle/AMReX_ParticleContainerBase.H
index 64adf750f0..433890b157 100644
--- a/Src/Particle/AMReX_ParticleContainerBase.H
+++ b/Src/Particle/AMReX_ParticleContainerBase.H
@@ -231,6 +231,10 @@ public:
 
     void SetVerbose (int verbose) { m_verbose = verbose; }
 
+    [[nodiscard]] int stableRedistribute () const {return m_stable_redistribute; }
+
+    void setStableRedistribute (int stable) { m_stable_redistribute = stable; }
+
     const ParticleBufferMap& BufferMap () const {return m_buffer_map;}
 
     Vector<int> NeighborProcs(int ngrow) const
@@ -260,6 +264,7 @@ protected:
     void defineBufferMap () const;
 
     int         m_verbose{0};
+    int m_stable_redistribute = 0;
     std::unique_ptr<ParGDB> m_gdb_object = std::make_unique<ParGDB>();
     ParGDBBase* m_gdb{nullptr};
     Vector<std::unique_ptr<MultiFab> > m_dummy_mf;
diff --git a/Tests/FFT/OpenBC/CMakeLists.txt b/Tests/FFT/OpenBC/CMakeLists.txt
new file mode 100644
index 0000000000..41d3fbcdc2
--- /dev/null
+++ b/Tests/FFT/OpenBC/CMakeLists.txt
@@ -0,0 +1,12 @@
+if (NOT (3 IN_LIST AMReX_SPACEDIM))
+    return()
+endif()
+
+set(_sources  main.cpp)
+
+set(_input_files)
+
+setup_test(3 _sources _input_files)
+
+unset(_sources)
+unset(_input_files)
diff --git a/Tests/FFT/OpenBC/GNUmakefile b/Tests/FFT/OpenBC/GNUmakefile
new file mode 100644
index 0000000000..93376f4485
--- /dev/null
+++ b/Tests/FFT/OpenBC/GNUmakefile
@@ -0,0 +1,26 @@
+AMREX_HOME := ../../..
+
+DEBUG	= FALSE
+
+DIM	= 3
+
+COMP    = gcc
+
+USE_MPI   = TRUE
+USE_OMP   = FALSE
+USE_CUDA  = FALSE
+USE_HIP   = FALSE
+USE_SYCL  = FALSE
+
+USE_FFT = TRUE
+
+BL_NO_FORT = TRUE
+
+TINY_PROFILE = FALSE
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.defs
+
+include ./Make.package
+include $(AMREX_HOME)/Src/Base/Make.package
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.rules
diff --git a/Tests/FFT/OpenBC/Make.package b/Tests/FFT/OpenBC/Make.package
new file mode 100644
index 0000000000..6b4b865e8f
--- /dev/null
+++ b/Tests/FFT/OpenBC/Make.package
@@ -0,0 +1 @@
+CEXE_sources += main.cpp
diff --git a/Tests/FFT/OpenBC/main.cpp b/Tests/FFT/OpenBC/main.cpp
new file mode 100644
index 0000000000..0f2d92d289
--- /dev/null
+++ b/Tests/FFT/OpenBC/main.cpp
@@ -0,0 +1,123 @@
+#include <AMReX_FFT_Poisson.H> // Put this at the top for testing
+
+#include <AMReX.H>
+#include <AMReX_MultiFab.H>
+#include <AMReX_MultiFabUtil.H>
+#include <AMReX_ParmParse.H>
+
+using namespace amrex;
+
+int main (int argc, char* argv[]) // test driver: solves with FFT::PoissonOpenBC and checks phi at the domain corners
+{
+    static_assert(AMREX_SPACEDIM == 3); // compile-time guard: this test is 3D only
+
+    amrex::Initialize(argc, argv);
+    { // inner scope closes before amrex::Finalize() is called
+        BL_PROFILE("main");
+
+        int n_cell_x = 128; // default number of cells per direction
+        int n_cell_y = 128;
+        int n_cell_z = 128;
+
+        int max_grid_size_x = 32; // default values passed to BoxArray::maxSize below
+        int max_grid_size_y = 32;
+        int max_grid_size_z = 32;
+
+        ParmParse pp; // runtime inputs for the sizes initialized above
+        pp.query("n_cell_x", n_cell_x);
+        pp.query("n_cell_y", n_cell_y);
+        pp.query("n_cell_z", n_cell_z);
+        pp.query("max_grid_size_x", max_grid_size_x);
+        pp.query("max_grid_size_y", max_grid_size_y);
+        pp.query("max_grid_size_z", max_grid_size_z);
+
+        Box domain(IntVect(0), IntVect(n_cell_x-1,n_cell_y-1,n_cell_z-1)); // index range [0, n_cell-1] per direction
+        BoxArray ba(domain);
+        ba.maxSize(IntVect(max_grid_size_x, max_grid_size_y, max_grid_size_z));
+        DistributionMapping dm(ba);
+
+        Geometry geom(domain, RealBox(-1._rt, -1._rt, -1._rt, 1._rt, 1._rt, 1._rt), // physical box [-1,1]^3
+                      CoordSys::cartesian, {AMREX_D_DECL(0,0,0)}); // last argument: periodicity flags, all off
+
+        auto const& dx = geom.CellSizeArray();
+        auto const& problo = geom.ProbLoArray();
+
+        std::array<IndexType,2> ixtypes{IndexType::TheCellType(), // the test body runs once per index type
+                                        IndexType::TheNodeType()};
+        for (auto const ixtype : ixtypes)
+        {
+            amrex::Print() << "\nTesting " << ixtype << "\n";
+
+            BoxArray const& iba = amrex::convert(ba, ixtype); // same grids, converted to the current index type
+            int ng = ixtype.cellCentered() ? 1 : 0; // phi gets one ghost cell when cell-centered, none when nodal
+            MultiFab rho(iba,dm,1,0);
+            MultiFab phi(iba,dm,1,ng);
+            phi.setVal(std::numeric_limits<Real>::max()); // sentinel fill; presumably so unwritten values fail the check below
+
+            auto const& rhoma = rho.arrays();
+
+            constexpr int nsub = 4; // sub-samples per direction used to estimate the fill fraction of each point
+            Real dxsub = dx[0]/nsub;
+            Real dysub = dx[1]/nsub;
+            Real dzsub = dx[2]/nsub;
+
+            ParallelFor(rho, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k)
+            {
+                Real x = (i+0.5_rt/nsub)*dx[0] + problo[0]; // position of the first sub-sample in each direction
+                Real y = (j+0.5_rt/nsub)*dx[1] + problo[1];
+                Real z = (k+0.5_rt/nsub)*dx[2] + problo[2];
+                if (ixtype.nodeCentered()) { // nodal data: shift half a cell so the samples are centered on the node
+                    x -= 0.5_rt*dx[0];
+                    y -= 0.5_rt*dx[1];
+                    z -= 0.5_rt*dx[2];
+                }
+                int n = 0; // number of sub-samples that fall inside the sphere
+                for (int isub = 0; isub < nsub; ++isub) {
+                for (int jsub = 0; jsub < nsub; ++jsub) {
+                for (int ksub = 0; ksub < nsub; ++ksub) {
+                    auto xs = x + isub*dxsub;
+                    auto ys = y + jsub*dysub;
+                    auto zs = z + ksub*dzsub;
+                    if ((xs*xs+ys*ys+zs*zs) < 0.25_rt) { ++n; } // inside the sphere of radius 0.5 about the origin
+                }}}
+                rhoma[b](i,j,k) = Real(n) / Real(nsub*nsub*nsub); // rho = fraction of sub-samples inside the sphere
+            });
+
+            FFT::PoissonOpenBC solver(geom, ixtype, IntVect(ng)); // ng matches the ghost width phi was built with
+            solver.solve(phi, rho);
+
+            Real mass = rho.sum_unique(0) * dx[0]*dx[1]*dx[2]; // sum of rho (component 0) times the cell volume
+            Real offset = ixtype.cellCentered() ? 0.5_rt : 0.0_rt; // cell centers sit half a cell inside the corner, nodes on it
+            auto x0 = -1._rt + offset*dx[0];
+            auto y0 = -1._rt + offset*dx[1];
+            auto z0 = -1._rt + offset*dx[2];
+            auto r0 = std::sqrt(x0*x0+y0*y0+z0*z0); // radius of the corner cell
+            auto expected = -mass/(4._rt*Math::pi<Real>()*r0); // point-mass potential -M/(4*pi*r) at that radius
+            amrex::Print() << "  Expected phi: " << expected << "\n";
+
+            int iextra = ixtype.cellCentered() ? 1 : 0; // upper corner index is n_cell-1 for cells, n_cell for nodes
+
+            for (int k = 0; k < 2; ++k) { // visit the 8 corners of the domain
+            for (int j = 0; j < 2; ++j) {
+            for (int i = 0; i < 2; ++i) {
+                int ii = (i == 0) ? 0 : n_cell_x-iextra;
+                int jj = (j == 0) ? 0 : n_cell_y-iextra;
+                int kk = (k == 0) ? 0 : n_cell_z-iextra;
+                IntVect corner(ii,jj,kk);
+                auto v = amrex::get_cell_data(phi, corner); // NOTE(review): presumably empty on ranks that do not hold this point
+                if (!v.empty()) {
+                    amrex::AllPrint() << "  phi at " << corner << " is " << v[0] << "\n";
+                    auto error = std::abs(expected-v[0])/std::max(std::abs(expected),std::abs(v[0])); // relative to the larger magnitude
+                    amrex::AllPrint() << "  error " << error << "\n";
+#ifdef AMREX_USE_FLOAT
+                    constexpr Real eps = 1.e-5; // looser tolerance for single-precision builds
+#else
+                    constexpr Real eps = 1.e-6;
+#endif
+                    AMREX_ALWAYS_ASSERT(error < eps);
+                }
+            }}}
+        }
+    }
+    amrex::Finalize();
+}
diff --git a/Tests/FFT/Poisson/main.cpp b/Tests/FFT/Poisson/main.cpp
index 1286d80dad..392a81124f 100644
--- a/Tests/FFT/Poisson/main.cpp
+++ b/Tests/FFT/Poisson/main.cpp
@@ -3,7 +3,6 @@
 #include <AMReX.H>
 #include <AMReX_MultiFab.H>
 #include <AMReX_ParmParse.H>
-#include <AMReX_PlotFileUtil.H>
 
 using namespace amrex;
 
@@ -14,7 +13,7 @@ int main (int argc, char* argv[])
         BL_PROFILE("main");
 
         AMREX_D_TERM(int n_cell_x = 64;,
-                     int n_cell_y = 32;,
+                     int n_cell_y = 48;,
                      int n_cell_z = 128);
 
         AMREX_D_TERM(int max_grid_size_x = 32;,
@@ -24,9 +23,9 @@ int main (int argc, char* argv[])
         AMREX_D_TERM(Real prob_lo_x = 0.;,
                      Real prob_lo_y = 0.;,
                      Real prob_lo_z = 0.);
-        AMREX_D_TERM(Real prob_hi_x = 1.;,
-                     Real prob_hi_y = 1.;,
-                     Real prob_hi_z = 1.);
+        AMREX_D_TERM(Real prob_hi_x = 1.1;,
+                     Real prob_hi_y = 0.8;,
+                     Real prob_hi_z = 1.9);
 
         {
             ParmParse pp;
@@ -53,59 +52,102 @@ int main (int argc, char* argv[])
                         CoordSys::cartesian, {AMREX_D_DECL(1,1,1)});
         }
         auto const& dx = geom.CellSizeArray();
+        GpuArray<Real,AMREX_SPACEDIM> center
+            {AMREX_D_DECL(0.5_rt*(prob_lo_x+prob_hi_x),
+                          0.5_rt*(prob_lo_y+prob_hi_y),
+                          0.5_rt*(prob_lo_z+prob_hi_z))};
+        GpuArray<Real,AMREX_SPACEDIM> problen
+            {AMREX_D_DECL((prob_hi_x-prob_lo_x),
+                          (prob_hi_y-prob_lo_y),
+                          (prob_hi_z-prob_lo_z))};
+
+        // For each dimension, there are 5 possibilities
+        constexpr int ncases = 5;
+        Array<std::pair<FFT::Boundary,FFT::Boundary>,ncases>
+            bcs{std::pair<FFT::Boundary,FFT::Boundary>{FFT::Boundary::periodic,
+                                                       FFT::Boundary::periodic},
+                std::pair<FFT::Boundary,FFT::Boundary>{FFT::Boundary::odd,
+                                                       FFT::Boundary::odd},
+                std::pair<FFT::Boundary,FFT::Boundary>{FFT::Boundary::even,
+                                                       FFT::Boundary::even},
+                std::pair<FFT::Boundary,FFT::Boundary>{FFT::Boundary::odd,
+                                                       FFT::Boundary::even},
+                std::pair<FFT::Boundary,FFT::Boundary>{FFT::Boundary::even,
+                                                       FFT::Boundary::odd}};
+
+        int ncasesy = (AMREX_SPACEDIM > 1) ? ncases : 1;
+        int ncasesz = (AMREX_SPACEDIM > 2) ? ncases : 1;
+        int icase = 0;
+        for (int zcase = 0; zcase < ncasesz; ++zcase) {
+        for (int ycase = 0; ycase < ncasesy; ++ycase) {
+        for (int xcase = 0; xcase < ncases ; ++xcase) {
+            ++icase;
+            Array<std::pair<FFT::Boundary,FFT::Boundary>,AMREX_SPACEDIM>
+                fft_bc{AMREX_D_DECL(bcs[xcase],bcs[ycase],bcs[zcase])};
+            amrex::Print() << "  (" << icase << ") Testing (";
+            for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+                amrex::Print() << "(" << getEnumNameString(fft_bc[idim].first)
+                               << "," << getEnumNameString(fft_bc[idim].second)
+                               << ")";
+                if (idim+1 < AMREX_SPACEDIM) { amrex::Print() << " "; }
+            }
+            amrex::Print() << ")\n";
 
-        MultiFab rhs(ba,dm,1,0);
-        MultiFab soln(ba,dm,1,0);
-        auto const& rhsma = rhs.arrays();
-        ParallelFor(rhs, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k)
-        {
-            AMREX_D_TERM(Real x = (i+0.5_rt) * dx[0] - 0.5_rt;,
-                         Real y = (j+0.5_rt) * dx[1] - 0.5_rt;,
-                         Real z = (k+0.5_rt) * dx[2] - 0.5_rt);
-            rhsma[b](i,j,k) = std::exp(-10._rt*
-                (AMREX_D_TERM(x*x*1.05_rt, + y*y*0.90_rt, + z*z)));
-        });
+            GpuArray<Real,AMREX_SPACEDIM> fac
+                {AMREX_D_DECL(2._rt*Math::pi<Real>()/problen[0],
+                              2._rt*Math::pi<Real>()/problen[1],
+                              2._rt*Math::pi<Real>()/problen[2])};
 
-        // Shift rhs so that its sum is zero.
-        auto rhosum = rhs.sum(0);
-        rhs.plus(-rhosum/geom.Domain().d_numPts(), 0, 1);
+            MultiFab rhs(ba,dm,1,0);
+            MultiFab soln(ba,dm,1,0);
+            soln.setVal(std::numeric_limits<Real>::max());
 
-#if (AMREX_SPACEDIM == 3)
-        Array<int,2> solvers{0,1};
-#else
-        Array<int,2> solvers{0};
-#endif
-
-        for (int solver_type : solvers) {
-            double tsetup, tsolve;
-            if (solver_type == 0) {
-                auto t0 = amrex::second();
-                FFT::Poisson<MultiFab> fft_poisson(geom);
-                auto t1 = amrex::second();
-                tsetup = t1-t0;
-
-                for (int n = 0; n < 2; ++n) {
-                    auto ta = amrex::second();
-                    fft_poisson.solve(soln, rhs);
-                    auto tb = amrex::second();
-                    tsolve = tb-ta;
-                }
-            } else {
-                auto t0 = amrex::second();
-                FFT::PoissonHybrid<MultiFab> fft_poisson(geom);
-                auto t1 = amrex::second();
-                tsetup = t1-t0;
-
-                for (int n = 0; n < 2; ++n) {
-                    auto ta = amrex::second();
-                    fft_poisson.solve(soln, rhs);
-                    auto tb = amrex::second();
-                    tsolve = tb-ta;
+            auto const& rhsma = rhs.arrays();
+            ParallelFor(rhs, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k)
+            {
+                IntVect iv(AMREX_D_DECL(i,j,k));
+                Real r = 1.0_rt;
+                for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+                    Real x = (iv[idim]+0.5_rt) * dx[idim];
+                    if (fft_bc[idim].first == FFT::Boundary::periodic) {
+                        r *= (0.11_rt + std::sin((x+0.1_rt)*fac[idim]));
+                    } else if (fft_bc[idim].first == FFT::Boundary::even &&
+                               fft_bc[idim].second == FFT::Boundary::even) {
+                        r *= (0.12_rt + std::cos(x*2._rt*fac[idim]));
+                    } else if (fft_bc[idim].first == FFT::Boundary::odd &&
+                               fft_bc[idim].second == FFT::Boundary::odd) {
+                        r *= std::sin(x*1.5_rt*fac[idim]);
+                    } else if (fft_bc[idim].first == FFT::Boundary::odd &&
+                               fft_bc[idim].second == FFT::Boundary::even) {
+                        r *= std::sin(x*0.75_rt*fac[idim]);
+                    } else if (fft_bc[idim].first == FFT::Boundary::even &&
+                               fft_bc[idim].second == FFT::Boundary::odd) {
+                        r *= std::cos(x*0.75_rt*fac[idim]);
+                    }
+                    x -= center[idim];
+                    x /= problen[idim];
+                    r *= 1.0_rt + 0.1_rt*Math::abs(std::tanh(x));
                 }
+                rhsma[b](i,j,k) = r;
+            });
+
+            bool has_dirichlet = false;
+            for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+                has_dirichlet = has_dirichlet ||
+                    fft_bc[idim].first == FFT::Boundary::odd ||
+                    fft_bc[idim].second == FFT::Boundary::odd;
+            }
+            if (! has_dirichlet) {
+                // Shift rhs so that its sum is zero.
+                auto rhosum = rhs.sum(0);
+                rhs.plus(-rhosum/geom.Domain().d_numPts(), 0, 1);
             }
 
-            amrex::Print() << "  AMReX FFT setup time: " << tsetup
-                           << ", solve time " << tsolve << "\n";
+            // For non-Dirichlet cases the rhs now sums to zero (it was shifted
+            // above), as solvability requires. Dirichlet cases need no shift.
+
+            FFT::Poisson fft_poisson(geom, fft_bc);
+            fft_poisson.solve(soln, rhs);
 
             MultiFab phi(soln.boxArray(), soln.DistributionMap(), 1, 1);
             MultiFab res(soln.boxArray(), soln.DistributionMap(), 1, 0);
@@ -114,35 +156,64 @@ int main (int argc, char* argv[])
             auto const& res_ma = res.arrays();
             auto const& phi_ma = phi.const_arrays();
             auto const& rhs_ma = rhs.const_arrays();
+            GpuArray<Real,AMREX_SPACEDIM> lapfac
+                {AMREX_D_DECL(1._rt/(dx[0]*dx[0]),
+                              1._rt/(dx[1]*dx[1]),
+                              1._rt/(dx[2]*dx[2]))};
             ParallelFor(res, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k)
             {
                 auto const& phia = phi_ma[b];
-                auto lap = (phia(i-1,j,k)-2._rt*phia(i,j,k)+phia(i+1,j,k)) / (dx[0]*dx[0]);
+                Real lap = 0;
+                if (i == 0 && fft_bc[0].first == FFT::Boundary::odd) {
+                    lap += (-3._rt*phia(i,j,k)+phia(i+1,j,k)) * lapfac[0];
+                } else if (i == 0 && fft_bc[0].first == FFT::Boundary::even) {
+                    lap += (-phia(i,j,k)+phia(i+1,j,k)) * lapfac[0];
+                } else if (i == n_cell_x-1 && fft_bc[0].second == FFT::Boundary::odd) {
+                    lap += (phia(i-1,j,k)-3._rt*phia(i,j,k)) * lapfac[0];
+                } else if (i == n_cell_x-1 && fft_bc[0].second == FFT::Boundary::even) {
+                    lap += (phia(i-1,j,k)-phia(i,j,k)) * lapfac[0];
+                } else {
+                    lap += (phia(i-1,j,k)-2._rt*phia(i,j,k)+phia(i+1,j,k)) * lapfac[0];
+                }
 #if (AMREX_SPACEDIM >= 2)
-                lap += (phia(i,j-1,k)-2._rt*phia(i,j,k)+phia(i,j+1,k)) / (dx[1]*dx[1]);
+                if (j == 0 && fft_bc[1].first == FFT::Boundary::odd) {
+                    lap += (-3._rt*phia(i,j,k)+phia(i,j+1,k)) * lapfac[1];
+                } else if (j == 0 && fft_bc[1].first == FFT::Boundary::even) {
+                    lap += (-phia(i,j,k)+phia(i,j+1,k)) * lapfac[1];
+                } else if (j == n_cell_y-1 && fft_bc[1].second == FFT::Boundary::odd) {
+                    lap += (phia(i,j-1,k)-3._rt*phia(i,j,k)) * lapfac[1];
+                } else if (j == n_cell_y-1 && fft_bc[1].second == FFT::Boundary::even) {
+                    lap += (phia(i,j-1,k)-phia(i,j,k)) * lapfac[1];
+                } else {
+                    lap += (phia(i,j-1,k)-2._rt*phia(i,j,k)+phia(i,j+1,k)) * lapfac[1];
+                }
 #endif
 #if (AMREX_SPACEDIM == 3)
-                if ((solver_type == 1) && (k == 0)) { // Neumann
-                    lap += (-phia(i,j,k)+phia(i,j,k+1)) / (dx[2]*dx[2]);
-                } else if ((solver_type == 1) && ((k+1) == n_cell_z)) { // Neumann
-                    lap += (phia(i,j,k-1)-phia(i,j,k)) / (dx[2]*dx[2]);
+                if (k == 0 && fft_bc[2].first == FFT::Boundary::odd) {
+                    lap += (-3._rt*phia(i,j,k)+phia(i,j,k+1)) * lapfac[2];
+                } else if (k == 0 && fft_bc[2].first == FFT::Boundary::even) {
+                    lap += (-phia(i,j,k)+phia(i,j,k+1)) * lapfac[2];
+                } else if (k == n_cell_z-1 && fft_bc[2].second == FFT::Boundary::odd) {
+                    lap += (phia(i,j,k-1)-3._rt*phia(i,j,k)) * lapfac[2];
+                } else if (k == n_cell_z-1 && fft_bc[2].second == FFT::Boundary::even) {
+                    lap += (phia(i,j,k-1)-phia(i,j,k)) * lapfac[2];
                 } else {
-                    lap += (phia(i,j,k-1)-2._rt*phia(i,j,k)+phia(i,j,k+1)) / (dx[2]*dx[2]);
+                    lap += (phia(i,j,k-1)-2._rt*phia(i,j,k)+phia(i,j,k+1)) * lapfac[2];
                 }
 #endif
                 res_ma[b](i,j,k) = rhs_ma[b](i,j,k) - lap;
             });
             auto bnorm = rhs.norminf();
             auto rnorm = res.norminf();
-            amrex::Print() << "  rhs inf norm " << bnorm << "\n"
-                           << "  res inf norm " << rnorm << "\n";
+            amrex::Print() << "       rhs inf norm " << bnorm << "\n"
+                           << "       res inf norm " << rnorm << "\n";
 #ifdef AMREX_USE_FLOAT
             auto eps = 2.e-3f;
 #else
             auto eps = 1.e-11;
 #endif
             AMREX_ALWAYS_ASSERT(rnorm < eps*bnorm);
-        }
+        }}}
     }
     amrex::Finalize();
 }
diff --git a/Tests/FFT/R2C/main.cpp b/Tests/FFT/R2C/main.cpp
index 7103038575..ee70b43b7b 100644
--- a/Tests/FFT/R2C/main.cpp
+++ b/Tests/FFT/R2C/main.cpp
@@ -17,7 +17,7 @@ int main (int argc, char* argv[])
                      int n_cell_y = 32;,
                      int n_cell_z = 64);
 
-        AMREX_D_TERM(int max_grid_size_x = 32;,
+        AMREX_D_TERM(int max_grid_size_x = 64;,
                      int max_grid_size_y = 32;,
                      int max_grid_size_z = 32);
 
@@ -70,17 +70,19 @@ int main (int argc, char* argv[])
         auto scaling = Real(1) / Real(geom.Domain().d_numPts());
 
         {
-            cMultiFab cmf(ba,dm,1,0);
+            cMultiFab cmf;
 
             // forward
             {
-                FFT::R2C<Real,FFT::Direction::forward> r2c(geom.Domain());
+                FFT::R2C<Real,FFT::Direction::forward,FFT::DomainStrategy::pencil> r2c(geom.Domain());
+                auto const& [cba, cdm] = r2c.getSpectralDataLayout();
+                cmf.define(cba, cdm, 1, 0);
                 r2c.forward(mf,cmf);
             }
 
             // backward
             {
-                FFT::R2C<Real,FFT::Direction::backward> r2c(geom.Domain());
+                FFT::R2C<Real,FFT::Direction::backward,FFT::DomainStrategy::pencil> r2c(geom.Domain());
                 r2c.backward(cmf,mf2);
             }
 
@@ -103,7 +105,7 @@ int main (int argc, char* argv[])
         mf2.setVal(std::numeric_limits<Real>::max());
 
         { // forward and backward
-            FFT::R2C<Real,FFT::Direction::both> r2c(geom.Domain());
+            FFT::R2C<Real,FFT::Direction::both,FFT::DomainStrategy::slab> r2c(geom.Domain());
             r2c.forwardThenBackward(mf, mf2,
                                     [=] AMREX_GPU_DEVICE (int, int, int, auto& sp)
             {
@@ -118,6 +120,49 @@ int main (int argc, char* argv[])
             auto eps = 1.e-6f;
 #else
             auto eps = 1.e-13;
+#endif
+            AMREX_ALWAYS_ASSERT(error < eps);
+        }
+
+        {
+            Real error = 0;
+            BaseFab<GpuComplex<Real>> cfab;
+            for (MFIter mfi(mf); mfi.isValid(); ++mfi)
+            {
+                auto& fab = mf[mfi];
+                auto& fab2 = mf2[mfi];
+                Box const& box = fab.box();
+                {
+                    FFT::LocalR2C<Real,FFT::Direction::both> fft(box.length());
+                    Box cbox(IntVect(0), fft.spectralSize() - 1);
+                    cfab.resize(cbox);
+                    fft.forward(fab.dataPtr(), cfab.dataPtr());
+                    fft.backward(cfab.dataPtr(), fab2.dataPtr());
+                    auto fac = fft.scalingFactor();
+                    fab2.template xpay<RunOn::Device>(-fac, fab, box, box, 0, 0, 1);
+                    auto e = fab2.template norm<RunOn::Device>(0);
+                    error = std::max(e,error);
+                }
+                {
+                    FFT::LocalR2C<Real,FFT::Direction::forward> fft(box.length());
+                    fft.forward(fab.dataPtr(), cfab.dataPtr());
+                }
+                {
+                    FFT::LocalR2C<Real,FFT::Direction::backward> fft(box.length());
+                    fft.backward(cfab.dataPtr(), fab2.dataPtr());
+                    auto fac = fft.scalingFactor();
+                    fab2.template xpay<RunOn::Device>(-fac, fab, box, box, 0, 0, 1);
+                    auto e = fab2.template norm<RunOn::Device>(0);
+                    error = std::max(e,error);
+                }
+            }
+
+            ParallelDescriptor::ReduceRealMax(error);
+            amrex::Print() << "  Expected to be close to zero: " << error << "\n";
+#ifdef AMREX_USE_FLOAT
+            auto eps = 1.e-6f;
+#else
+            auto eps = 1.e-13;
 #endif
             AMREX_ALWAYS_ASSERT(error < eps);
         }
diff --git a/Tests/FFT/R2X/CMakeLists.txt b/Tests/FFT/R2X/CMakeLists.txt
new file mode 100644
index 0000000000..21a9d3b268
--- /dev/null
+++ b/Tests/FFT/R2X/CMakeLists.txt
@@ -0,0 +1,10 @@
+foreach(D IN LISTS AMReX_SPACEDIM)  # one pass per configured space dimension
+    set(_sources  main.cpp)  # the single source file of this test
+
+    set(_input_files)  # left empty: no input files are listed for this test
+
+    setup_test(${D} _sources _input_files)  # NOTE(review): setup_test is defined elsewhere; presumably registers the test for dimension D
+
+    unset(_sources)  # clear the temporaries before the next iteration
+    unset(_input_files)
+endforeach()
diff --git a/Tests/FFT/R2X/GNUmakefile b/Tests/FFT/R2X/GNUmakefile
new file mode 100644
index 0000000000..93376f4485
--- /dev/null
+++ b/Tests/FFT/R2X/GNUmakefile
@@ -0,0 +1,26 @@
+AMREX_HOME := ../../..
+
+DEBUG	= FALSE
+
+DIM	= 3
+
+COMP    = gcc
+
+USE_MPI   = TRUE
+USE_OMP   = FALSE
+USE_CUDA  = FALSE
+USE_HIP   = FALSE
+USE_SYCL  = FALSE
+
+USE_FFT = TRUE
+
+BL_NO_FORT = TRUE
+
+TINY_PROFILE = FALSE
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.defs
+
+include ./Make.package
+include $(AMREX_HOME)/Src/Base/Make.package
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.rules
diff --git a/Tests/FFT/R2X/Make.package b/Tests/FFT/R2X/Make.package
new file mode 100644
index 0000000000..6b4b865e8f
--- /dev/null
+++ b/Tests/FFT/R2X/Make.package
@@ -0,0 +1 @@
+CEXE_sources += main.cpp
diff --git a/Tests/FFT/R2X/main.cpp b/Tests/FFT/R2X/main.cpp
new file mode 100644
index 0000000000..039917ca3a
--- /dev/null
+++ b/Tests/FFT/R2X/main.cpp
@@ -0,0 +1,121 @@
+#include <AMReX_FFT.H> // Put this at the top for testing
+
+#include <AMReX.H>
+#include <AMReX_MultiFab.H>
+#include <AMReX_ParmParse.H>
+#include <AMReX_PlotFileUtil.H>
+
+using namespace amrex;
+
+int main (int argc, char* argv[]) // test driver: FFT::R2X forward-then-backward round trip for every boundary combination
+{
+    amrex::Initialize(argc, argv);
+    { // inner scope closes before amrex::Finalize() is called
+        BL_PROFILE("main");
+
+        AMREX_D_TERM(int n_cell_x = 128;,
+                     int n_cell_y = 32;,
+                     int n_cell_z = 64);
+
+        AMREX_D_TERM(int max_grid_size_x = 32;,
+                     int max_grid_size_y = 32;,
+                     int max_grid_size_z = 32);
+
+        AMREX_D_TERM(Real prob_lo_x = 0.;,
+                     Real prob_lo_y = 0.;,
+                     Real prob_lo_z = 0.);
+        AMREX_D_TERM(Real prob_hi_x = 1.;,
+                     Real prob_hi_y = 1.;,
+                     Real prob_hi_z = 1.);
+
+        { // runtime inputs for the cell counts and grid sizes initialized above
+            ParmParse pp;
+            AMREX_D_TERM(pp.query("n_cell_x", n_cell_x);,
+                         pp.query("n_cell_y", n_cell_y);,
+                         pp.query("n_cell_z", n_cell_z));
+            AMREX_D_TERM(pp.query("max_grid_size_x", max_grid_size_x);,
+                         pp.query("max_grid_size_y", max_grid_size_y);,
+                         pp.query("max_grid_size_z", max_grid_size_z));
+        }
+
+        Box domain(IntVect(0),IntVect(AMREX_D_DECL(n_cell_x-1,n_cell_y-1,n_cell_z-1))); // index range [0, n_cell-1] per direction
+        BoxArray ba(domain);
+        ba.maxSize(IntVect(AMREX_D_DECL(max_grid_size_x,
+                                        max_grid_size_y,
+                                        max_grid_size_z)));
+        DistributionMapping dm(ba);
+
+        Geometry geom;
+        {
+            geom.define(domain,
+                        RealBox(AMREX_D_DECL(prob_lo_x,prob_lo_y,prob_lo_z),
+                                AMREX_D_DECL(prob_hi_x,prob_hi_y,prob_hi_z)),
+                        CoordSys::cartesian, {AMREX_D_DECL(1,1,1)}); // last argument: periodicity flags, all on
+        }
+        auto const& dx = geom.CellSizeArray();
+
+        MultiFab mf(ba,dm,1,0); // input data: one component, no ghost cells
+        auto const& ma = mf.arrays();
+        ParallelFor(mf, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k)
+        {
+            AMREX_D_TERM(Real x = (i+0.5_rt) * dx[0] - 0.5_rt;,
+                         Real y = (j+0.5_rt) * dx[1] - 0.5_rt;,
+                         Real z = (k+0.5_rt) * dx[2] - 0.5_rt);
+            ma[b](i,j,k) = std::exp(-10._rt*
+                (AMREX_D_TERM(x*x*1.05_rt, + y*y*0.90_rt, + z*z)));
+        }); // mf now holds an anisotropic Gaussian of the cell-center offsets from 0.5
+
+        MultiFab mf2(ba,dm,1,0); // receives the forward-then-backward result
+
+        // For each dimension, there are 5 possibilities
+        constexpr int ncases = 5;
+        Array<std::pair<FFT::Boundary,FFT::Boundary>,ncases>
+            bcs{std::pair<FFT::Boundary,FFT::Boundary>{FFT::Boundary::periodic,
+                                                       FFT::Boundary::periodic},
+                std::pair<FFT::Boundary,FFT::Boundary>{FFT::Boundary::even,
+                                                       FFT::Boundary::even},
+                std::pair<FFT::Boundary,FFT::Boundary>{FFT::Boundary::even,
+                                                       FFT::Boundary::odd },
+                std::pair<FFT::Boundary,FFT::Boundary>{FFT::Boundary::odd,
+                                                       FFT::Boundary::even},
+                std::pair<FFT::Boundary,FFT::Boundary>{FFT::Boundary::odd,
+                                                       FFT::Boundary::odd }};
+        int ncasesy = (AMREX_SPACEDIM > 1) ? ncases : 1; // a single pass for a direction that does not exist
+        int ncasesz = (AMREX_SPACEDIM > 2) ? ncases : 1;
+        for (int zcase = 0; zcase < ncasesz; ++zcase) { // every combination of per-direction boundary pairs
+        for (int ycase = 0; ycase < ncasesy; ++ycase) {
+        for (int xcase = 0; xcase < ncases ; ++xcase) {
+            Array<std::pair<FFT::Boundary,FFT::Boundary>,AMREX_SPACEDIM>
+                fft_bc{AMREX_D_DECL(bcs[xcase],bcs[ycase],bcs[zcase])};
+            amrex::Print() << "  Testing (";
+            for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { // print the (low,high) boundary pair of each direction
+                amrex::Print() << "(" << getEnumNameString(fft_bc[idim].first)
+                               << "," << getEnumNameString(fft_bc[idim].second)
+                               << ")";
+                if (idim+1 < AMREX_SPACEDIM) { amrex::Print() << " "; }
+            }
+            amrex::Print() << ")\n";
+
+            mf2.setVal(std::numeric_limits<Real>::max()); // sentinel fill; presumably so stale values fail the check below
+
+            FFT::R2X fft(geom.Domain(), fft_bc);
+            auto scaling_factor = fft.scalingFactor();
+            fft.forwardThenBackward(mf, mf2, [=] AMREX_GPU_DEVICE (int, int, int, auto& sp)
+            {
+                sp *= scaling_factor; // scale the spectral data; the check below expects mf2 to reproduce mf
+            });
+
+            MultiFab::Subtract(mf2, mf, 0, 0, 1, 0); // mf2 -= mf for component 0, no ghost cells
+
+            auto error = mf2.norminf(); // max-norm of the round-trip difference
+            amrex::Print() << "    Expected to be close to zero: " << error << "\n";
+#ifdef AMREX_USE_FLOAT
+            auto eps = 1.e-6f; // looser tolerance for single-precision builds
+#else
+            auto eps = 1.e-13;
+#endif
+            AMREX_ALWAYS_ASSERT(error < eps);
+        }}}
+    }
+    amrex::Finalize();
+}
diff --git a/Tests/Particles/Redistribute/main.cpp b/Tests/Particles/Redistribute/main.cpp
index b08c751590..a9314cce4f 100644
--- a/Tests/Particles/Redistribute/main.cpp
+++ b/Tests/Particles/Redistribute/main.cpp
@@ -328,6 +328,7 @@ struct TestParams
     int do_regrid;
     int sort;
     int test_level_lost = 0;
+    int stable_redistribute = 0;
 };
 
 void testRedistribute();
@@ -358,6 +359,7 @@ void get_test_params(TestParams& params, const std::string& prefix)
     pp.query("num_runtime_real", num_runtime_real);
     pp.query("num_runtime_int", num_runtime_int);
     pp.query("remove_negative", remove_negative);
+    pp.query("stable_redistribute", params.stable_redistribute);
 
     params.sort = 0;
     pp.query("sort", params.sort);
@@ -410,6 +412,7 @@ void testRedistribute ()
     }
 
     TestParticleContainer pc(geom, dm, ba, rr);
+    pc.setStableRedistribute(params.stable_redistribute);
 
     IntVect nppc(params.num_ppc);
 
diff --git a/Tools/GNUMake/Make.defs b/Tools/GNUMake/Make.defs
index 66de8ec6a5..136a30e7e0 100644
--- a/Tools/GNUMake/Make.defs
+++ b/Tools/GNUMake/Make.defs
@@ -1062,12 +1062,6 @@ ifeq ($(TP_PROFILING),FORGE)
   include        $(AMREX_HOME)/Tools/GNUMake/tools/Make.forge
 endif
 
-# Use Sam Williams's HPGMG
-ifeq ($(USE_HPGMG), TRUE)
-  $(info Loading $(AMREX_HOME)/Tools/GNUMake/packages/Make.hpgmg...)
-  include        $(AMREX_HOME)/Tools/GNUMake/packages/Make.hpgmg
-endif
-
 ifneq ("$(wildcard $(AMREX_HOME)/Tools/GNUMake/sites/Make.$(host_name))","")
   $(info Loading $(AMREX_HOME)/Tools/GNUMake/sites/Make.$(host_name)...)
   include        $(AMREX_HOME)/Tools/GNUMake/sites/Make.$(host_name)
diff --git a/Tools/GNUMake/packages/Make.hpgmg b/Tools/GNUMake/packages/Make.hpgmg
deleted file mode 100644
index 4083c85fc4..0000000000
--- a/Tools/GNUMake/packages/Make.hpgmg
+++ /dev/null
@@ -1,64 +0,0 @@
-# Sam Williams's HPGMG
-
-  include $(HPGMG_DIR)/source/Make.package
-  DEFINES += -DUSEHPGMG=1 -DUSE_MPI=1
-
-# use a unique MPI subcommunicator for each level of the multigrid?
-  ifeq ($(HPGMG_USE_SUBCOMM), TRUE)
-    DEFINES += -DUSE_SUBCOMM=1
-  endif
-
-# which bottom solver?
-  ifeq ($(HPGMG_BOTTOM_SOLVER), BICGSTAB)
-    DEFINES += -DUSE_BICGSTAB=1
-  else ifeq ($(HPGMG_BOTTOM_SOLVER), CG)
-    DEFINES += -DUSE_CG=1
-  else ifeq ($(HPGMG_BOTTOM_SOLVER), CABICGSTAB)
-    DEFINES += -DUSE_CABICGSTAB=1
-  else
-    DEFINES += -DUSE_BICGSTAB=1
-  endif
-
-# which smoothing method?
-  ifeq ($(HPGMG_SMOOTHER), GSRB)
-    DEFINES += -DUSE_GSRB=1
-    # HPGMG has several different ways to apply the GSRB smooths
-    ifeq ($(HPGMG_GSRB_TYPE), STRIDE2)
-      DEFINES += -DGSRB_STRIDE2=1
-    else ifeq ($(HPGMG_GSRB_TYPE), FP)
-      DEFINES += -DGSRB_FP=1
-    else ifeq ($(HPGMG_GSRB_TYPE), BRANCH)
-      DEFINES += -DGSRB_BRANCH=1
-    else
-      DEFINES += -DGSRB_STRIDE2=1
-    endif
-    ifeq ($(HPGMG_GSRB_OOP), TRUE)
-      DEFINES += -DGSRB_OOP=1
-    endif
-  else ifeq ($(HPGMG_SMOOTHER), CHEBY)
-    DEFINES += -DUSE_CHEBY=1
-  else ifeq ($(HPGMG_SMOOTHER), JACOBI)
-    DEFINES += -DUSE_JACOBI=1
-  else ifeq ($(HPGMG_SMOOTHER), L1JACOBI)
-    DEFINES += -DUSE_L1JACOBI=1
-  else
-   DEFINES += -DUSE_GSRB=1
-  endif
-
-  DEFINES += -DHPGMG_F_CYCLES=$(HPGMG_F_CYCLES)
-  DEFINES += -DHPGMG_V_CYCLES=$(HPGMG_V_CYCLES)
-
-# is this a Helmholtz problem or a Poisson problem?
-  ifeq ($(HPGMG_HELMHOLTZ), TRUE)
-    DEFINES += -DUSE_HELMHOLTZ=1
-  endif
-
-# the constant-coefficient stencil requires significantly less data movement during the calculation
-  ifeq ($(HPGMG_STENCIL_VARIABLE_COEFFICIENT), TRUE)
-    DEFINES += -DSTENCIL_VARIABLE_COEFFICIENT=1
-  endif
-
-  include $(HPGMG_DIR)/source/Make.package
-  include $(AMREX_HOME)/Src/Extern/hpgmg/Make.package
-  INCLUDE_LOCATIONS += $(HPGMG_DIR)/source
-  VPATH_LOCATIONS   += $(HPGMG_DIR)/source