diff --git a/.github/workflows/apps.yml b/.github/workflows/apps.yml index e909193601..19a3c4699a 100644 --- a/.github/workflows/apps.yml +++ b/.github/workflows/apps.yml @@ -52,7 +52,9 @@ jobs: export AMREX_HOME=${PWD} export MICROPHYSICS_HOME=${PWD}/Microphysics cd Castro/Exec/hydro_tests/Sedov/ - make -j4 CCACHE=ccache USE_MPI=FALSE + make -j4 CCACHE=ccache USE_MPI=FALSE \ + USE_LINEAR_SOLVERS_INCFLO=FALSE \ + USE_LINEAR_SOLVERS_EM=FALSE ccache -s du -hs ~/.cache/ccache @@ -92,7 +94,9 @@ jobs: -DWarpX_QED=OFF \ -DWarpX_OPENPMD=OFF \ -DCMAKE_VERBOSE_MAKEFILE=ON \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DAMReX_FFT=ON \ + -DAMReX_LINEAR_SOLVERS_INCFLO=OFF cmake --build WarpX/build -j 4 ccache -s diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml index c84d19850a..1ed5240164 100644 --- a/.github/workflows/clang.yml +++ b/.github/workflows/clang.yml @@ -44,6 +44,7 @@ jobs: -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DCMAKE_INSTALL_PREFIX=/tmp/my-amrex \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_FORTRAN=ON \ -DAMReX_MPI=OFF \ @@ -104,6 +105,7 @@ jobs: cmake .. \ -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -158,6 +160,7 @@ jobs: cmake .. 
\ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=OFF \ @@ -200,7 +203,7 @@ jobs: export CCACHE_LOGFILE=${{ github.workspace }}/ccache.log.txt ccache -z - ./configure --dim 2 --with-fortran no --comp llvm --with-mpi no + ./configure --dim 2 --with-fortran no --comp llvm --with-mpi no --enable-fft yes make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS="-fno-operator-names" \ CCACHE=ccache make install diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 9e96aefac5..927e99ded4 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -38,6 +38,7 @@ jobs: cmake -S . -B build \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=OFF \ @@ -97,6 +98,7 @@ jobs: cmake -S . -B build \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DAMReX_MPI=OFF \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=OFF \ @@ -153,6 +155,7 @@ jobs: -DCMAKE_VERBOSE_MAKEFILE=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_TEST_TYPE=Small \ + -DAMReX_FFT=ON \ -DAMReX_FORTRAN=ON \ -DAMReX_FORTRAN_INTERFACES=ON \ -DAMReX_GPU_BACKEND=CUDA \ @@ -196,7 +199,7 @@ jobs: ccache -z export PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} - ./configure --dim 3 --with-cuda yes --enable-eb yes --enable-xsdk-defaults yes --with-fortran no + ./configure --dim 3 --with-cuda yes --enable-eb yes --enable-xsdk-defaults yes --with-fortran no --enable-fft yes # # /home/runner/work/amrex/amrex/Src/Base/AMReX_GpuLaunchGlobal.H:16:41: error: unused parameter ‘f0’ [-Werror=unused-parameter] # 16 | AMREX_GPU_GLOBAL void launch_global (L f0) { f0(); } diff --git a/.github/workflows/dependencies/dependencies.sh b/.github/workflows/dependencies/dependencies.sh index 07e461f577..c7cde49651 100755 --- a/.github/workflows/dependencies/dependencies.sh +++ b/.github/workflows/dependencies/dependencies.sh @@ -16,6 +16,7 @@ sudo apt-get 
update sudo apt-get install -y --no-install-recommends\ build-essential \ + libfftw3-dev \ g++ gfortran \ libopenmpi-dev \ openmpi-bin diff --git a/.github/workflows/dependencies/dependencies_clang.sh b/.github/workflows/dependencies/dependencies_clang.sh index 2e96b5196d..4c329321b6 100755 --- a/.github/workflows/dependencies/dependencies_clang.sh +++ b/.github/workflows/dependencies/dependencies_clang.sh @@ -16,5 +16,6 @@ sudo apt-get update sudo apt-get install -y --no-install-recommends \ build-essential \ + libfftw3-dev \ gfortran \ clang-$1 diff --git a/.github/workflows/dependencies/dependencies_gcc.sh b/.github/workflows/dependencies/dependencies_gcc.sh index 2a576c0b52..93d9aa27ec 100755 --- a/.github/workflows/dependencies/dependencies_gcc.sh +++ b/.github/workflows/dependencies/dependencies_gcc.sh @@ -17,6 +17,7 @@ sudo apt-get update sudo apt-get install -y --no-install-recommends \ build-essential \ + libfftw3-dev \ g++-$1 gfortran-$1 \ libopenmpi-dev \ openmpi-bin diff --git a/.github/workflows/dependencies/dependencies_hip.sh b/.github/workflows/dependencies/dependencies_hip.sh index ab5185ce0a..df4f274ef3 100755 --- a/.github/workflows/dependencies/dependencies_hip.sh +++ b/.github/workflows/dependencies/dependencies_hip.sh @@ -56,6 +56,7 @@ sudo apt-get install -y --no-install-recommends \ roctracer-dev \ rocprofiler-dev \ rocrand-dev \ + rocfft-dev \ rocprim-dev # hiprand-dev is a new package that does not exist in old versions diff --git a/.github/workflows/dependencies/dependencies_nvcc.sh b/.github/workflows/dependencies/dependencies_nvcc.sh index abf9504801..2578bd33fe 100755 --- a/.github/workflows/dependencies/dependencies_nvcc.sh +++ b/.github/workflows/dependencies/dependencies_nvcc.sh @@ -35,5 +35,6 @@ sudo apt-get install -y \ cuda-minimal-build-$VERSION_DASHED \ cuda-nvml-dev-$VERSION_DASHED \ cuda-nvtx-$VERSION_DASHED \ + libcufft-dev-$VERSION_DASHED \ libcurand-dev-$VERSION_DASHED sudo ln -s cuda-$VERSION_DOTTED /usr/local/cuda diff 
--git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml index f4ae08f76d..88fe47c988 100644 --- a/.github/workflows/gcc.yml +++ b/.github/workflows/gcc.yml @@ -42,6 +42,7 @@ jobs: mkdir build cd build cmake .. \ + -DAMReX_FFT=ON \ -DAMReX_FORTRAN=ON \ -DAMReX_PLOTFILE_TOOLS=ON \ -DCMAKE_VERBOSE_MAKEFILE=ON \ @@ -86,7 +87,7 @@ jobs: restore-keys: | ccache-${{ github.workflow }}-${{ github.job }}-git- - name: Build & Install - env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -O1 -Wnon-virtual-dtor -Wlogical-op -Wmisleading-indentation -Wduplicated-cond -Wduplicated-branches -Wmissing-include-dirs -Wno-null-dereference"} + env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -O1 -Wnon-virtual-dtor -Wlogical-op -Wmisleading-indentation -Wduplicated-cond -Wduplicated-branches -Wmissing-include-dirs"} # It's too slow with -O0 run: | export CCACHE_COMPRESS=1 @@ -99,6 +100,7 @@ jobs: cmake -S . -B build \ -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -147,6 +149,7 @@ jobs: cmake -S . -B build \ -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -196,6 +199,7 @@ jobs: cmake -S . 
-B build \ -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=OFF \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -248,6 +252,7 @@ jobs: -DCMAKE_VERBOSE_MAKEFILE=ON \ -DAMReX_ASSERTIONS=ON \ -DAMReX_TESTING=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=OFF \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_BOUND_CHECK=ON \ @@ -310,6 +315,7 @@ jobs: -DAMReX_TESTING=ON \ -DAMReX_BOUND_CHECK=ON \ -DAMReX_FPE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -353,10 +359,7 @@ jobs: # /home/runner/work/amrex/amrex/Src/Base/AMReX_IntVect.H:194:92: error: array subscript -1 is below array bounds of ‘int [3]’ [-Werror=array-bounds] # int& operator[] (int i) noexcept { BL_ASSERT(i>=0 && i < AMREX_SPACEDIM); return vect[i]; } # - # inlined from ‘const amrex::MultiFab& amrex::EBFArrayBoxFactory::getVolFrac() const’ at /home/runner/work/amrex/amrex/Src/EB/AMReX_EBFabFactory.H:53:91, - # /usr/include/c++/12/bits/shared_ptr_base.h:1666:16: error: potential null pointer dereference [-Werror=null-dereference] - # - env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wunreachable-code -Wnon-virtual-dtor -Wlogical-op -Wmisleading-indentation -Wduplicated-cond -Wduplicated-branches -Wmissing-include-dirs -Wno-array-bounds -Wno-null-dereference"} + env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wunreachable-code -Wnon-virtual-dtor -Wlogical-op -Wmisleading-indentation -Wduplicated-cond -Wduplicated-branches -Wmissing-include-dirs -Wno-array-bounds"} run: | export CCACHE_COMPRESS=1 export CCACHE_COMPRESSLEVEL=10 @@ -374,10 +377,10 @@ jobs: -DAMReX_TESTING=ON \ -DAMReX_BOUND_CHECK=ON \ -DAMReX_FPE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=OFF \ - -DAMReX_FORTRAN=OFF \ -DCMAKE_C_COMPILER=$(which gcc-12) \ 
-DCMAKE_CXX_COMPILER=$(which g++-12) \ -DCMAKE_CXX_STANDARD=17 \ @@ -461,7 +464,7 @@ jobs: export CCACHE_LOGFILE=${{ github.workspace }}/ccache.log.txt ccache -z - ./configure --dim 3 --enable-eb yes --enable-xsdk-defaults yes + ./configure --dim 3 --enable-eb yes --enable-xsdk-defaults yes --enable-fft yes make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ CCACHE=ccache make install @@ -501,7 +504,8 @@ jobs: export CCACHE_LOGFILE=${{ github.workspace }}/ccache.log.txt ccache -z - ./configure --dim 3 --enable-eb no --enable-xsdk-defaults no --single-precision yes --single-precision-particles yes --enable-tiny-profile yes + ./configure --dim 3 --enable-eb no --enable-xsdk-defaults no --single-precision yes \ + --single-precision-particles yes --enable-tiny-profile yes --enable-fft yes make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ CCACHE=ccache make install @@ -627,6 +631,7 @@ jobs: -DAMReX_OMP=ON \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DAMReX_ENABLE_TESTS=ON \ + -DAMReX_FFT=ON \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache make -j 4 diff --git a/.github/workflows/hip.yml b/.github/workflows/hip.yml index 345d7c468b..22154d6b01 100644 --- a/.github/workflows/hip.yml +++ b/.github/workflows/hip.yml @@ -48,6 +48,7 @@ jobs: cmake -S . -B build \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -103,6 +104,7 @@ jobs: cmake -S . 
-B build_full_legacywrapper \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=OFF \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -145,7 +147,9 @@ jobs: export CCACHE_MAXSIZE=100M ccache -z - ./configure --dim 2 --with-hip yes --enable-eb yes --enable-xsdk-defaults yes --with-mpi no --with-omp no --single-precision yes --single-precision-particles yes + ./configure --dim 2 --with-hip yes --enable-eb yes --enable-xsdk-defaults yes \ + --with-mpi no --with-omp no --single-precision yes \ + --single-precision-particles yes --enable-fft yes make -j4 WARN_ALL=TRUE AMD_ARCH=gfx90a CCACHE=ccache make install diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml index 227b0f9738..15c7bbda58 100644 --- a/.github/workflows/intel.yml +++ b/.github/workflows/intel.yml @@ -41,6 +41,7 @@ jobs: set -e cmake -S . -B build \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=OFF \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -89,6 +90,7 @@ jobs: set -e cmake -S . 
-B build \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=OFF \ diff --git a/CHANGES b/CHANGES index 6c0cf8c6fe..2f5d0e5373 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,81 @@ +# 24.10 + + -- TinyProfiler: Remove unnecessary precision() call (#4174) + + -- Fix GCC 12 & 13 warnings on null-dereference (#4171) + + -- fix: wavefront_size for gfx11* (#4170) + + -- CI: Test GCC-13 (#4169) + + -- allow hidden dimension when calling FFlux routines (#4162) + + -- Deregister BArena from Profiling in Arena::Finalize (#4164) + + -- FillPatchSingleLevel and FillPatchTwoLevels for ERF (#4158) + + -- `ParmParse:addFile`: User-Friendly Error (#4156) + + -- Adding computation of complete elliptic integrals into amrex::Math (#4151) + + -- Fix roundoff issue in SUNDIALS evolve() (#4148) (#4157) + + -- Add a new InterFromCoarseLevel for ERF (#4150) + + -- Add ParmParse features for WarpX (#4149) + + -- ParmParse::queryAsDouble: Support bool and std::optional (#4152) + + -- add geometric terms for spherical 2D support. 
(#4141) + + -- Add std::setfill to PrintMemStats (#4147) + + -- Add ParmParse::query_enum_sloppy that can ignore characters (#4145) + + -- Fix ParmParse::query_enum_case_insensitive (#4144) + + -- AMREX_ENUM: Add more capabilites (#4143) + + -- Add ParmParse::eval (#4142) + + -- AnyCTO with arbitrary number of functions (#4135) + + -- IOFormatSaver (#4104) + + -- amrex::Stack (#4139) + + -- Use BL_PROFILE instead of BL_PROFILE_VAR to time in knapsack()swap (#4134) + + -- Add iMultiFab::sum that returns the sum over a region (#4132) + + -- EB Boundary Area: Fix issues for anisotropic cell size (#4131) + + -- `ParmParse`: Prefix to `FILE` (#4126) + + -- MLMG: Minimum domain width (#4129) + + -- Capability adds for ParmParse enum (#4119) + + -- use perl instead of sed in style checks for portability to MacOS (#4127) + + -- Fortran Interfaces: Add new average down functions (#4124) + + -- TinyProfiler: A few updates (#4102) + + -- ArenaProfiler: Fix clang-tidy warning (#4128) + + -- CTOParallelFor with BoxND / add AnyCTO (#4109) + + -- TinyProfiler with BArena and PArena (#4113) + + -- Fix Fortran interface compilation issue using `nvfortran` (#4115) + + -- `AMREX_DEVICE_PRINTF`: Host (#4116) + + -- EB: don't abort for no-op case in unsupported addFineLevels functions (#4123) + + -- Fix FillPatchNLevels (#4117) + # 24.09 -- Curl Curl Solver: Option to use PCG instead of LU (#3812) diff --git a/Docs/sphinx_documentation/source/BuildingAMReX.rst b/Docs/sphinx_documentation/source/BuildingAMReX.rst index 90fb4d6eb3..831346765b 100644 --- a/Docs/sphinx_documentation/source/BuildingAMReX.rst +++ b/Docs/sphinx_documentation/source/BuildingAMReX.rst @@ -463,12 +463,20 @@ The list of available options is reported in the :ref:`table ` bel +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ | AMReX_LINEAR_SOLVERS | Build AMReX linear solvers | YES | YES, NO | 
+------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_LINEAR_SOLVERS_INCFLO | Build AMReX linear solvers for incompressible | YES | YES, NO | + | | flow | | + +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_LINEAR_SOLVERS_EM | Build AMReX linear solvers for electromagnetic | YES | YES, NO | + | | solvers | | + +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ | AMReX_AMRDATA | Build data services | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ | AMReX_AMRLEVEL | Build AmrLevel class | YES | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ | AMReX_EB | Build Embedded Boundary support | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_FFT | Build FFT support | NO | YES, NO | + +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ | AMReX_PARTICLES | Build particle classes | YES | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ | AMReX_PARTICLES_PRECISION | Set reals precision in particle classes | Same as AMReX_PRECISION | DOUBLE, SINGLE | @@ -681,12 +689,18 @@ A list of AMReX component names and related configure options are shown in the t +------------------------------+-----------------+ | AMReX_LINEAR_SOLVERS | LSOLVERS | +------------------------------+-----------------+ + | AMReX_LINEAR_SOLVERS_INCFLO | LSOLVERS_INCFLO | + 
+------------------------------+-----------------+ | AMReX_AMRDATA | AMRDATA | +------------------------------+-----------------+ | AMReX_AMRLEVEL | AMRLEVEL | +------------------------------+-----------------+ | AMReX_EB | EB | +------------------------------+-----------------+ + | AMReX_FFT | FFT | + +------------------------------+-----------------+ | AMReX_PARTICLES | PARTICLES | +------------------------------+-----------------+ | AMReX_PARTICLES_PRECISION | PDOUBLE, PSINGLE| diff --git a/Docs/sphinx_documentation/source/FFT.rst b/Docs/sphinx_documentation/source/FFT.rst new file mode 100644 index 0000000000..3fc24fcab8 --- /dev/null +++ b/Docs/sphinx_documentation/source/FFT.rst @@ -0,0 +1,71 @@ +.. role:: cpp(code) + :language: c++ + +.. _sec:FFT:r2c: + +FFT::R2C Class +============== + +Class template `FFT::R2C` supports discrete Fourier transforms between real +and complex data. The name R2C indicates that the forward transform converts +real data to complex data, while the backward transform converts complex +data to real data. It should be noted that both directions of transformation +are supported, not just from real to complex. + +The implementation utilizes cuFFT, rocFFT, oneMKL and FFTW, for CUDA, HIP, +SYCL and CPU builds, respectively. Because the parallel communication is +handled by AMReX, it does not need the parallel version of +FFTW. Furthermore, there is no constraint on the domain decomposition such +as one Box per process. This class performs parallel FFT on AMReX's parallel +data containers (e.g., :cpp:`MultiFab` and +:cpp:`FabArray<BaseFab<GpuComplex<Real>>>`). For local FFT, the users can +use FFTW, cuFFT, rocFFT, or oneMKL directly. + +Other than using column-major order, AMReX follows the convention of +FFTW. Applying the forward transform followed by the backward transform +scales the original data by the size of the input array. 
The layout of the +complex data also follows the FFTW convention, where the complex Hermitian +output array has `(nx/2+1,ny,nz)` elements. Here `nx`, `ny` and `nz` are the +sizes of the real array and the division is rounded down. + +Below are examples of using :cpp:`FFT:R2C`. + +.. highlight:: c++ + +:: + + Geometry geom(...); + MultiFab mfin(...); + MultiFab mfout(...); + + auto scaling = 1. / geom.Domain().d_numPts(); + + FFT::R2C r2c(geom.Domain()); + r2c.forwardThenBackward(mfin, mfout, + [=] AMREX_GPU_DEVICE (int, int, int, auto& sp) + { + sp *= scaling; + }); + + cMultiFab cmf(...); + FFT::R2C r2c_forward(geom.Domain()); + r2c_forward(mfin, cmf); + + FFT::R2C r2c_backward(geom.Domain()); + r2c_backward(cmf, mfout); + +Note that using :cpp:`forwardThenBackward` is expected to be more efficient +than separate calls to :cpp:`forward` and :cpp:`backward` because some +parallel communication can be avoided. It should also be noted that a lot of +preparation works are done in the construction of an :cpp:`FFT::R2C` +object. Therefore, one should cache it for reuse if possible. + + +Poisson Solver +============== + +AMReX provides FFT based Poisson solvers. :cpp:`FFT::Poisson` supports all +periodic boundaries using purely FFT. :cpp:`FFT::PoissonHybrid` is a 3D only +solver that supports periodic boundaries in the first two dimensions and +Neumann boundary in the last dimension. Similar to :cpp:`FFT::R2C`, the +Poisson solvers should be cached for reuse. diff --git a/Docs/sphinx_documentation/source/FFT_Chapter.rst b/Docs/sphinx_documentation/source/FFT_Chapter.rst new file mode 100644 index 0000000000..9d6e9505d4 --- /dev/null +++ b/Docs/sphinx_documentation/source/FFT_Chapter.rst @@ -0,0 +1,16 @@ +.. _Chap:FFT: + +.. _sec:FFT:FFTOverview: + +Discrete Fourier Transform +========================== + +AMReX provides support for parallel discrete Fourier transform. 
The +implementation utilizes cuFFT, rocFFT, oneMKL and FFTW, for CUDA, HIP, SYCL +and CPU builds, respectively. It also provides FFT based Poisson +solvers. + +.. toctree:: + :maxdepth: 1 + + FFT diff --git a/Docs/sphinx_documentation/source/index.rst b/Docs/sphinx_documentation/source/index.rst index 203545cf40..09ffbb5c0b 100644 --- a/Docs/sphinx_documentation/source/index.rst +++ b/Docs/sphinx_documentation/source/index.rst @@ -52,6 +52,7 @@ Documentation on migration from BoxLib is available in the AMReX repository at D Fortran_Chapter Python_Chapter EB_Chapter + FFT_Chapter TimeIntegration_Chapter GPU_Chapter Visualization_Chapter diff --git a/GNUmakefile.in b/GNUmakefile.in index b85c2e0c35..67c789d97c 100644 --- a/GNUmakefile.in +++ b/GNUmakefile.in @@ -26,6 +26,9 @@ ifeq ($(USE_LINEAR_SOLVERS),TRUE) Pdirs += F_Interfaces/LinearSolvers endif endif +ifeq ($(USE_FFT),TRUE) + Pdirs += FFT +endif ifeq ($(USE_EB),TRUE) Pdirs += EB endif diff --git a/Src/Base/AMReX_Array.H b/Src/Base/AMReX_Array.H index 15ddde4d1e..d9f5fd1af5 100644 --- a/Src/Base/AMReX_Array.H +++ b/Src/Base/AMReX_Array.H @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -148,10 +149,6 @@ namespace amrex { * order (last index moving fastest). If not specified, Fortran order is * assumed. */ - namespace Order { - struct C {}; - struct F {}; - } /** * A GPU-compatible one-dimensional array. 
@@ -280,7 +277,7 @@ namespace amrex { * default if not given) */ template + Order ORDER = Order::F> struct Array2D { /** @@ -370,8 +367,7 @@ namespace amrex { * If the order is not specified, Fortran column-major order is assumed * (the index \c i moves the fastest) */ - template ,int> = 0> + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE const T& operator() (int i, int j) const noexcept { AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); @@ -384,8 +380,7 @@ namespace amrex { * If the order is not specified, Fortran column-major order is assumed * (the index \c i moves the fastest) */ - template ,int> = 0> + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T& operator() (int i, int j) noexcept { AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); @@ -398,8 +393,7 @@ namespace amrex { * When the order is manually specified as Order::C, row-major order * is used (the index \c j moves the fastest). */ - template ,int> = 0> + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE const T& operator() (int i, int j) const noexcept { AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); @@ -412,8 +406,7 @@ namespace amrex { * When the order is manually specified as Order::C, row-major order * is used (the index \c j moves the fastest). 
*/ - template ,int> = 0> + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T& operator() (int i, int j) noexcept { AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); @@ -551,7 +544,7 @@ namespace amrex { * default if not given) */ template + Order ORDER=Order::F> struct Array3D { /** @@ -662,8 +655,7 @@ namespace amrex { * If the order is not specified, Fortran column-major order is assumed * (the index \c i moves the fastest) */ - template ,int> = 0> + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE const T& operator() (int i, int j, int k) const noexcept { return arr[i+j*(XHI-XLO+1)+k*((XHI-XLO+1)*(YHI-YLO+1)) @@ -676,8 +668,7 @@ namespace amrex { * If the order is not specified, Fortran column-major order is assumed * (the index \c i moves the fastest) */ - template ,int> = 0> + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T& operator() (int i, int j, int k) noexcept { return arr[i+j*(XHI-XLO+1)+k*((XHI-XLO+1)*(YHI-YLO+1)) @@ -690,8 +681,7 @@ namespace amrex { * When the order is manually specified as Order::C, row-major order * is used (the index \c k moves the fastest). */ - template ,int> = 0> + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE const T& operator() (int i, int j, int k) const noexcept { return arr[k+j*(ZHI-ZLO+1)+i*((ZHI-ZLO+1)*(YHI-YLO+1)) @@ -704,8 +694,7 @@ namespace amrex { * When the order is manually specified as Order::C, row-major order * is used (the index \c k moves the fastest). */ - template ,int> = 0> + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T& operator() (int i, int j, int k) noexcept { return arr[k+j*(ZHI-ZLO+1)+i*((ZHI-ZLO+1)*(YHI-YLO+1)) diff --git a/Src/Base/AMReX_BoxArray.H b/Src/Base/AMReX_BoxArray.H index b3b339c33b..e85946872c 100644 --- a/Src/Base/AMReX_BoxArray.H +++ b/Src/Base/AMReX_BoxArray.H @@ -53,6 +53,24 @@ namespace amrex //! Note that two BoxArrays that match are not necessarily equal. 
[[nodiscard]] bool match (const BoxArray& x, const BoxArray& y); + /** + * \brief Decompose domain box into BoxArray + * + * The returned BoxArray has nboxes Boxes, unless the domain is too + * small. We aim to decompose the domain into subdomains that are as + * cubic as possible, even if this results in Boxes with odd numbers of + * cells. Thus, this function is generally not suited for applications + * with multiple AMR levels or for multigrid solvers. + * + * \param domain Domain Box + * \param nboxes the target number of Boxes + * \param decomp controls whether domain decomposition should be done in + * that direction. + */ + [[nodiscard]] BoxArray decompose (Box const& domain, int nboxes, + Array const& decomp + = {AMREX_D_DECL(true,true,true)}); + struct BARef { BARef (); diff --git a/Src/Base/AMReX_BoxArray.cpp b/Src/Base/AMReX_BoxArray.cpp index 9bca594352..576d4cb870 100644 --- a/Src/Base/AMReX_BoxArray.cpp +++ b/Src/Base/AMReX_BoxArray.cpp @@ -12,6 +12,9 @@ #include +#include +#include +#include #include namespace amrex { @@ -1887,6 +1890,173 @@ bool match (const BoxArray& x, const BoxArray& y) } } +BoxArray decompose (Box const& domain, int nboxes, + Array const& decomp) +{ + auto ndecomp = std::count(decomp.begin(), decomp.end(), true); + + if (nboxes <= 1 || ndecomp == 0) { + return BoxArray(domain); + } + + Box const& ccdomain = amrex::enclosedCells(domain); + IntVect const& ncells = ccdomain.length(); + IntVect nprocs(1); + + if (ndecomp == 1) { + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + if (decomp[idim]) { + nprocs[idim] = nboxes; + } + } + } else { + // Factorization of nboxes + Vector factors; + { + int x = 2; + int n = nboxes; + while (x*x <= n) { + std::div_t dv = std::div(n, x); + if (dv.rem == 0) { + factors.push_back(x); + n = dv.quot; + } else { + ++x; + } + } + if (n != 1) { + factors.push_back(n); + } + AMREX_ALWAYS_ASSERT(nboxes == std::accumulate(factors.begin(), factors.end(), + 1, std::multiplies<>())); + } + + struct 
ProcDim + { + int nproc; + int idim; + Vector procs; + ProcDim (int np, int dim) : nproc(np), idim(dim) {} + }; + + Vector procdim; + procdim.reserve(AMREX_SPACEDIM); + + Array nblocks; + + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + if (decomp[idim]) { + nblocks[idim] = ncells[idim]; + procdim.emplace_back(1,idim); + } else { + nblocks[idim] = 0; // This dimension will not be decomposed. + } + } + + auto comp = [&] (ProcDim const& a, ProcDim const& b) { + if (nblocks[a.idim]*b.nproc < + nblocks[b.idim]*a.nproc) { + return true; + } else if (nblocks[a.idim]*b.nproc > + nblocks[b.idim]*a.nproc) { + return false; + } else { + return a.procs.size() > b.procs.size(); + } + }; + + int nprocs_tot = 1; + while (!factors.empty()) { + std::sort(procdim.begin(), procdim.end(), comp); + auto f = factors.back(); + factors.pop_back(); + procdim.back().nproc *= f; + procdim.back().procs.push_back(f); + nprocs_tot *= f; + if (nprocs_tot == nboxes) { + break; + } + } + + // swap to see if the decomposition can be improved. + while (true) + { + std::sort(procdim.begin(), procdim.end(), comp); + auto fit = std::find_if(procdim.begin(),procdim.end(), + [] (ProcDim const& x) { return x.nproc > 1; }); + if (fit == procdim.end()) { break; } // This should not actually happen. 
+ auto& light = *fit; + auto& heavy = procdim.back(); + Long w0 = nblocks[light.idim] * heavy.nproc; + Long w1 = nblocks[heavy.idim] * light.nproc; + if (w0 >= w1) { break; } + bool swapped = false; + for (auto& f0 : light.procs) { + for (auto& f1 : heavy.procs) { + if ((f0 > f1) && (w0*f0 < w1*f1)) { + light.nproc /= f0; + light.nproc *= f1; + heavy.nproc /= f1; + heavy.nproc *= f0; + std::swap(f0,f1); + swapped = true; + break; + } + } + if (swapped) { break;} + } + if (!swapped) { break; } + } + + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + if (!decomp[idim]) { + procdim.emplace_back(1,idim); + } + } + for (auto const& pd : procdim) { + nprocs[pd.idim] = pd.nproc; + } + } + + AMREX_ALWAYS_ASSERT(AMREX_D_TERM(nprocs[0],*nprocs[1],*nprocs[2]) == nboxes); + + IntVect const domlo = ccdomain.smallEnd(); + IntVect const sz = ncells / nprocs; + IntVect const extra = ncells - sz*nprocs; + auto ixtyp = domain.ixType(); + BoxList bl(ixtyp); +#if (AMREX_SPACEDIM == 3) + for (int k = 0; k < nprocs[2]; ++k) { + // The first extra[2] blocks get one extra cell with a total of + // sz[2]+1. The rest get sz[2] cells. The decomposition in y + // and x directions are similar. + int klo = (k < extra[2]) ? k*(sz[2]+1) : (k*sz[2]+extra[2]); + int khi = (k < extra[2]) ? klo+(sz[2]+1)-1 : klo+sz[2]-1; + klo += domlo[2]; + khi += domlo[2]; +#endif +#if (AMREX_SPACEDIM >= 2) + for (int j = 0; j < nprocs[1]; ++j) { + int jlo = (j < extra[1]) ? j*(sz[1]+1) : (j*sz[1]+extra[1]); + int jhi = (j < extra[1]) ? jlo+(sz[1]+1)-1 : jlo+sz[1]-1; + jlo += domlo[1]; + jhi += domlo[1]; +#endif + for (int i = 0; i < nprocs[0]; ++i) { + int ilo = (i < extra[0]) ? i*(sz[0]+1) : (i*sz[0]+extra[0]); + int ihi = (i < extra[0]) ? 
ilo+(sz[0]+1)-1 : ilo+sz[0]-1; + ilo += domlo[0]; + ihi += domlo[0]; + Box b{IntVect(AMREX_D_DECL(ilo,jlo,klo)), + IntVect(AMREX_D_DECL(ihi,jhi,khi))}; + if (b.ok()) { + bl.push_back(b.convert(ixtyp)); + } + AMREX_D_TERM(},},}) + + return BoxArray(std::move(bl)); +} + std::ostream& operator<< (std::ostream& os, const BoxArray::RefID& id) { diff --git a/Src/Base/AMReX_ConstexprFor.H b/Src/Base/AMReX_ConstexprFor.H new file mode 100644 index 0000000000..972dd1ac30 --- /dev/null +++ b/Src/Base/AMReX_ConstexprFor.H @@ -0,0 +1,38 @@ +#ifndef AMREX_CONSTEXPR_FOR_H_ +#define AMREX_CONSTEXPR_FOR_H_ +#include + +#include +#include + +#include + +namespace amrex { + +// Implementation of "constexpr for" based on +// https://artificial-mind.net/blog/2020/10/31/constexpr-for +// +// Approximates what one would get from a compile-time +// unrolling of the loop +// for (int i = 0; i < N; ++i) { +// f(i); +// } +// +// The mechanism is recursive: we evaluate f(i) at the current +// i and then call the for loop at i+1. f() is a lambda function +// that provides the body of the loop and takes only an integer +// i as its argument. 
+ +template +AMREX_GPU_HOST_DEVICE AMREX_INLINE +constexpr void constexpr_for (F const& f) +{ + if constexpr (I < N) { + f(std::integral_constant()); + constexpr_for(f); + } +} + +} + +#endif diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H index 69970d6401..a67a72f0a3 100644 --- a/Src/Base/AMReX_FabArray.H +++ b/Src/Base/AMReX_FabArray.H @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -3679,6 +3680,8 @@ FabArray::norminf (FabArray const& mask, int comp, int ncomp, return nm0; } +using cMultiFab = FabArray > >; + } #endif /*BL_FABARRAY_H*/ diff --git a/Src/Base/AMReX_GpuComplex.H b/Src/Base/AMReX_GpuComplex.H index 274da82604..42dfc7626e 100644 --- a/Src/Base/AMReX_GpuComplex.H +++ b/Src/Base/AMReX_GpuComplex.H @@ -41,16 +41,16 @@ struct alignas(2*sizeof(T)) GpuComplex /** * \brief Return the real part. */ - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE constexpr T real () const noexcept { return m_real; } /** * \brief Return the imaginary part. */ - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE constexpr T imag () const noexcept { return m_imag; } - /** + /** * \brief Add a real number to this complex number. 
*/ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE diff --git a/Src/Base/AMReX_GpuError.H b/Src/Base/AMReX_GpuError.H index ce3ac188a8..65457c8f4e 100644 --- a/Src/Base/AMReX_GpuError.H +++ b/Src/Base/AMReX_GpuError.H @@ -84,6 +84,16 @@ namespace Gpu { std::string errStr(std::string("CURAND error in file ") + __FILE__ \ + " line " + std::to_string(__LINE__)); \ amrex::Abort(errStr); }} while(0) + +#define AMREX_CUFFT_SAFE_CALL(call) { \ + cufftResult_t amrex_i_err = call; \ + if (CUFFT_SUCCESS != amrex_i_err) { \ + std::string errStr(std::string("CUFFT error ")+std::to_string(amrex_i_err) \ + + std::string(" in file ") + __FILE__ \ + + " line " + std::to_string(__LINE__)); \ + amrex::Abort(errStr); \ + }} + #endif #ifdef AMREX_USE_HIP @@ -100,6 +110,16 @@ namespace Gpu { std::string errStr(std::string("HIPRAND error in file ") + __FILE__ \ + " line " + std::to_string(__LINE__)); \ amrex::Abort(errStr); }} while(0) + +#define AMREX_ROCFFT_SAFE_CALL(call) { \ + auto amrex_i_err = call; \ + if (rocfft_status_success != amrex_i_err) { \ + std::string errStr(std::string("rocFFT error ")+std::to_string(amrex_i_err) \ + + std::string(" in file ") + __FILE__ \ + + " line " + std::to_string(__LINE__)); \ + amrex::Abort(errStr); \ + }} + #endif #define AMREX_GPU_ERROR_CHECK() amrex::Gpu::ErrorCheck(__FILE__, __LINE__) diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H index 435a11f342..5f1e61e008 100644 --- a/Src/Base/AMReX_GpuLaunch.H +++ b/Src/Base/AMReX_GpuLaunch.H @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -176,6 +177,47 @@ namespace Gpu { { return makeExecutionConfig(box.numPts()); } + + struct ExecConfig + { + Long start_idx; + int nblocks; + }; + + template + Vector makeNExecutionConfigs (Long N) noexcept + { + // Max # of blocks in a kernel launch + int numblocks_max = std::numeric_limits::max(); + // Max # of threads in a kernel launch + Long nmax = Long(MT) * numblocks_max; + // # of launches needed for N 
elements without using grid-stride + // loops inside GPU kernels. + auto nlaunches = int((N+nmax-1)/nmax); + Vector r(nlaunches); + Long ndone = 0; + for (int i = 0; i < nlaunches; ++i) { + int nblocks; + if (N > nmax) { + nblocks = numblocks_max; + N -= nmax; + } else { + nblocks = int((N+MT-1)/MT); + } + // At which element ID the kernel should start + r[i].start_idx = ndone; + ndone += Long(nblocks) * MT; + // # of blocks in this launch + r[i].nblocks = nblocks; + } + return r; + } + + template + Vector makeNExecutionConfigs (BoxND const& box) noexcept + { + return makeNExecutionConfigs(box.numPts()); + } #endif } diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index 7955410f8b..56a95dbc5b 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -747,17 +747,45 @@ void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noe launch(nblocks, nthreads_per_block, 0, stream, std::forward(f)); } -template +template,int> FOO = 0> void launch (T const& n, L const& f) noexcept { + static_assert(sizeof(T) >= 2); if (amrex::isEmpty(n)) { return; } - const auto ec = Gpu::makeExecutionConfig(n); - AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (auto const i : Gpu::Range(n)) { - f(i); - } - }); + const auto& nec = Gpu::makeNExecutionConfigs(n); + for (auto const& ec : nec) { + const T start_idx = T(ec.start_idx); + const T nleft = n - start_idx; + AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + // This will not overflow, even though nblocks*MT might. 
+ auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x); + if (tid < nleft) { + f(tid+start_idx); + } + }); + } + AMREX_GPU_ERROR_CHECK(); +} + +template +void launch (BoxND const& box, L const& f) noexcept +{ + if (box.isEmpty()) { return; } + const auto& nec = Gpu::makeNExecutionConfigs(box); + const BoxIndexerND indexer(box); + const auto type = box.ixType(); + for (auto const& ec : nec) { + const auto start_idx = std::uint64_t(ec.start_idx); + AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx; + if (icell < indexer.numPts()) { + auto iv = indexer.intVect(icell); + f(BoxND(iv,iv,type)); + } + }); + } AMREX_GPU_ERROR_CHECK(); } @@ -765,17 +793,23 @@ template ::value> ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept { + static_assert(sizeof(T) >= 2); if (amrex::isEmpty(n)) { return; } - const auto ec = Gpu::makeExecutionConfig(n); - AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (Long i = Long(blockDim.x)*blockIdx.x+threadIdx.x, stride = Long(blockDim.x)*gridDim.x; - i < Long(n); i += stride) { - detail::call_f_scalar_handler(f, T(i), - Gpu::Handler(amrex::min((std::uint64_t(n)-i+(std::uint64_t)threadIdx.x), - (std::uint64_t)blockDim.x))); - } - }); + const auto& nec = Gpu::makeNExecutionConfigs(n); + for (auto const& ec : nec) { + const T start_idx = T(ec.start_idx); + const T nleft = n - start_idx; + AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + // This will not overflow, even though nblocks*MT might. 
+ auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x); + if (tid < nleft) { + detail::call_f_scalar_handler(f, tid+start_idx, + Gpu::Handler(amrex::min((std::uint64_t(nleft-tid)+(std::uint64_t)threadIdx.x), + (std::uint64_t)blockDim.x))); + } + }); + } AMREX_GPU_ERROR_CHECK(); } @@ -785,18 +819,20 @@ ParallelFor (Gpu::KernelInfo const&, BoxND const& box, L const& f) noexcept { if (amrex::isEmpty(box)) { return; } const BoxIndexerND indexer(box); - const auto ec = Gpu::makeExecutionConfig(box.numPts()); - AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; - icell < indexer.numPts(); icell += stride) - { - auto iv = indexer.intVect(icell); - detail::call_f_intvect_handler(f, iv, - Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x), - (std::uint64_t)blockDim.x))); - } - }); + const auto& nec = Gpu::makeNExecutionConfigs(box); + for (auto const& ec : nec) { + const auto start_idx = std::uint64_t(ec.start_idx); + AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx; + if (icell < indexer.numPts()) { + auto iv = indexer.intVect(icell); + detail::call_f_intvect_handler(f, iv, + Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x), + (std::uint64_t)blockDim.x))); + } + }); + } AMREX_GPU_ERROR_CHECK(); } @@ -806,17 +842,20 @@ ParallelFor (Gpu::KernelInfo const&, BoxND const& box, T ncomp, L const& f) { if (amrex::isEmpty(box)) { return; } const BoxIndexerND indexer(box); - const auto ec = Gpu::makeExecutionConfig(box.numPts()); - AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = 
std::uint64_t(blockDim.x)*gridDim.x; - icell < indexer.numPts(); icell += stride) { - auto iv = indexer.intVect(icell); - detail::call_f_intvect_ncomp_handler(f, iv, ncomp, - Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x), - (std::uint64_t)blockDim.x))); - } - }); + const auto& nec = Gpu::makeNExecutionConfigs(box); + for (auto const& ec : nec) { + const auto start_idx = std::uint64_t(ec.start_idx); + AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx; + if (icell < indexer.numPts()) { + auto iv = indexer.intVect(icell); + detail::call_f_intvect_ncomp_handler(f, iv, ncomp, + Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x), + (std::uint64_t)blockDim.x))); + } + }); + } AMREX_GPU_ERROR_CHECK(); } diff --git a/Src/Base/AMReX_Loop.H b/Src/Base/AMReX_Loop.H index fe76b8c988..fe216bac45 100644 --- a/Src/Base/AMReX_Loop.H +++ b/Src/Base/AMReX_Loop.H @@ -3,6 +3,7 @@ #include #include +#include #include namespace amrex { @@ -567,30 +568,6 @@ void LoopConcurrentOnCpu (BoxND const& bx, int ncomp, F const& f) noexcept } } -// Implementation of "constexpr for" based on -// https://artificial-mind.net/blog/2020/10/31/constexpr-for -// -// Approximates what one would get from a compile-time -// unrolling of the loop -// for (int i = 0; i < N; ++i) { -// f(i); -// } -// -// The mechanism is recursive: we evaluate f(i) at the current -// i and then call the for loop at i+1. f() is a lambda function -// that provides the body of the loop and takes only an integer -// i as its argument. 
- -template -AMREX_GPU_HOST_DEVICE AMREX_INLINE -constexpr void constexpr_for (F const& f) -{ - if constexpr (I < N) { - f(std::integral_constant()); - constexpr_for(f); - } -} - #include } diff --git a/Src/Base/AMReX_MultiFabUtil.H b/Src/Base/AMReX_MultiFabUtil.H index 228070a13c..0740069c68 100644 --- a/Src/Base/AMReX_MultiFabUtil.H +++ b/Src/Base/AMReX_MultiFabUtil.H @@ -169,7 +169,8 @@ namespace amrex std::unique_ptr get_slice_data(int dir, Real coord, const MultiFab& cc, const Geometry& geom, int start_comp, int ncomp, - bool interpolate=false); + bool interpolate=false, + RealBox const& bnd_rbx = RealBox()); /** * \brief Get data in a cell of MultiFab/FabArray @@ -188,7 +189,7 @@ namespace amrex * specified by a direction and a cell. */ template ::value,int> FOO = 0> - MF get_line_data (MF const& mf, int dir, IntVect const& cell); + MF get_line_data (MF const& mf, int dir, IntVect const& cell, Box const& bnd_bx = Box()); //! Return an iMultiFab that has the same BoxArray and DistributionMapping //! as the coarse MultiFab cmf. 
Cells covered by the coarsened fine grids @@ -996,8 +997,10 @@ Vector get_cell_data (MF const& mf, IntVect const& cell } template ::value,int> FOO> -MF get_line_data (MF const& mf, int dir, IntVect const& cell) +MF get_line_data (MF const& mf, int dir, IntVect const& cell, Box const& bnd_bx) { + bool do_bnd = (!bnd_bx.isEmpty()); + BoxArray const& ba = mf.boxArray(); DistributionMapping const& dm = mf.DistributionMap(); const auto nboxes = static_cast(ba.size()); @@ -1005,17 +1008,29 @@ MF get_line_data (MF const& mf, int dir, IntVect const& cell) BoxList bl(ba.ixType()); Vector procmap; Vector index_map; - for (int i = 0; i < nboxes; ++i) { - Box const& b = ba[i]; - IntVect lo = cell; - lo[dir] = b.smallEnd(dir); - if (b.contains(lo)) { - IntVect hi = lo; - hi[dir] = b.bigEnd(dir); - Box b1d(lo,hi,b.ixType()); - bl.push_back(b1d); - procmap.push_back(dm[i]); - index_map.push_back(i); + if (!do_bnd) { + for (int i = 0; i < nboxes; ++i) { + Box const& b = ba[i]; + IntVect lo = cell; + lo[dir] = b.smallEnd(dir); + if (b.contains(lo)) { + IntVect hi = lo; + hi[dir] = b.bigEnd(dir); + Box b1d(lo,hi,b.ixType()); + bl.push_back(b1d); + procmap.push_back(dm[i]); + index_map.push_back(i); + } + } + } else { + for (int i = 0; i < nboxes; ++i) { + Box const& b = ba[i]; + Box const& b1d = bnd_bx & b; + if (b1d.ok()) { + bl.push_back(b1d); + procmap.push_back(dm[i]); + index_map.push_back(i); + } } } diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp index 86a1e29054..721919f509 100644 --- a/Src/Base/AMReX_MultiFabUtil.cpp +++ b/Src/Base/AMReX_MultiFabUtil.cpp @@ -9,7 +9,7 @@ namespace { using namespace amrex; Box - getIndexBox(const RealBox& real_box, const Geometry& geom) { + getIndexBox (const RealBox& real_box, const Geometry& geom) { IntVect slice_lo, slice_hi; AMREX_D_TERM(slice_lo[0]=static_cast(std::floor((real_box.lo(0) - geom.ProbLo(0))/geom.CellSize(0)));, @@ -24,12 +24,11 @@ namespace { } - std::unique_ptr allocateSlice(int dir, const 
MultiFab& cell_centered_data, - int ncomp, const Geometry& geom, Real dir_coord, - Vector& slice_to_full_ba_map) { + std::unique_ptr allocateSlice (int dir, const MultiFab& cell_centered_data, + int ncomp, const Geometry& geom, Real dir_coord, + Vector& slice_to_full_ba_map, RealBox real_slice) { // Get our slice and convert to index space - RealBox real_slice = geom.ProbDomain(); real_slice.setLo(dir, dir_coord); real_slice.setHi(dir, dir_coord); Box slice_box = getIndexBox(real_slice, geom); @@ -550,7 +549,7 @@ namespace amrex return amrex::cast > > (imf); } - std::unique_ptr get_slice_data(int dir, Real coord, const MultiFab& cc, const Geometry& geom, int start_comp, int ncomp, bool interpolate) { + std::unique_ptr get_slice_data(int dir, Real coord, const MultiFab& cc, const Geometry& geom, int start_comp, int ncomp, bool interpolate, RealBox const& bnd_rbx) { BL_PROFILE("amrex::get_slice_data"); @@ -559,9 +558,15 @@ namespace amrex } const auto geomdata = geom.data(); + RealBox real_slice; + if (bnd_rbx.ok()) { + real_slice = bnd_rbx; + } else { + real_slice = geom.ProbDomain(); + } Vector slice_to_full_ba_map; - std::unique_ptr slice = allocateSlice(dir, cc, ncomp, geom, coord, slice_to_full_ba_map); + std::unique_ptr slice = allocateSlice(dir, cc, ncomp, geom, coord, slice_to_full_ba_map, real_slice); if (!slice) { return nullptr; diff --git a/Src/Base/AMReX_ParmParse.cpp b/Src/Base/AMReX_ParmParse.cpp index 34d383c99a..3ecfc8503a 100644 --- a/Src/Base/AMReX_ParmParse.cpp +++ b/Src/Base/AMReX_ParmParse.cpp @@ -349,7 +349,7 @@ getToken (const char*& str, std::string& ostr, int& num_linefeeds) // // Return the index of the n'th occurrence of a parameter name, // except if n==-1, return the index of the last occurrence. -// Return 0 if the specified occurrence does not exist. +// Return nullptr if the specified occurrence does not exist. 
// std::vector const* ppindex (const ParmParse::Table& table, int n, const std::string& name) @@ -365,6 +365,9 @@ ppindex (const ParmParse::Table& table, int n, const std::string& name) if (n == ParmParse::LAST) { return &(found->second.m_vals.back()); } else { + if(found->second.m_vals.size() < (std::size_t)n + 1) { + return nullptr; + } return &(found->second.m_vals[n]); } } @@ -642,7 +645,7 @@ squeryval (const ParmParse::Table& table, int occurrence) { // - // Get last occurrence of name in table. + // Get specified occurrence of name in table. // auto const* def = ppindex(table, occurrence, name); if ( def == nullptr ) diff --git a/Src/Base/AMReX_SmallMatrix.H b/Src/Base/AMReX_SmallMatrix.H new file mode 100644 index 0000000000..05305d1839 --- /dev/null +++ b/Src/Base/AMReX_SmallMatrix.H @@ -0,0 +1,490 @@ +#ifndef AMREX_SMALL_MATRIX_H_ +#define AMREX_SMALL_MATRIX_H_ +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace amrex { + + enum struct Order { C, F, RowMajor=C, ColumnMajor=F }; + + /** + * \brief Matrix class with compile-time size + * + * Note that column vectors and row vectors are special cases of a + * Matrix. + * + * \tparam T Matrix element data type. + * \tparam NRows Number of rows. + * \tparam NCols Number of columns. + * \tparam ORDER Memory layout order. Order::F (i.e., column-major) by default. + * \tparam StartIndex Starting index. Either 0 or 1. + */ + template + struct SmallMatrix + { + using value_type = T; + using reference_type = T&; + static constexpr int row_size = NRows; + static constexpr int column_size = NCols; + static constexpr Order ordering = ORDER; + static constexpr int starting_index = StartIndex; + + /** + * \brief Default constructor + * + * The data are uninitialized by default. If you want to initialize + * to zero, you can do `SmallMatrix M{};`. 
+ */ + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + constexpr SmallMatrix () = default; + + /** + * \brief Constructs column- or row-vector + * + * The data are initialized with the given variadic arguments. If + * the number of arguments is less than the size of the vector, the + * rest of the vector is initialized to zero. + */ + template = 0> + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + constexpr explicit SmallMatrix (Ts... vs) + : m_mat{vs...} + { + static_assert(sizeof...(vs) <= std::max(NRows,NCols)); + } + + /** + * \brief Constructs SmallMatrix with nested std::initializer_list + * + * The initializer list is assumed to be in row-major order, even when + * the ordering for the SmallMatrix object is column-major. Below is + * an example of constructing a matrix with 2 rows and 3 columns. + \verbatim + SmallMatrix M{{11., 12., 13.}, + {21., 22., 23.}}; + \endverbatim + */ + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + explicit SmallMatrix (std::initializer_list> const& init) + { + AMREX_ASSERT(NRows == init.size()); + int i = StartIndex; + for (auto const& row : init) { + AMREX_ASSERT(NCols == row.size()); + int j = StartIndex; + for (auto const& x : row) { + (*this)(i,j) = x; + ++j; + } + ++i; + } + } + + //! Returns a const reference to the element at row i and column j. + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + const T& operator() (int i, int j) const noexcept { + static_assert(StartIndex == 0 || StartIndex == 1); + if constexpr (StartIndex == 1) { + --i; + --j; + } + AMREX_ASSERT(i < NRows && j < NCols); + if constexpr (ORDER == Order::F) { + return m_mat[i+j*NRows]; + } else { + return m_mat[j+i*NCols]; + } + } + + //! Returns a reference to the element at row i and column j. 
+ [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T& operator() (int i, int j) noexcept { + static_assert(StartIndex == 0 || StartIndex == 1); + if constexpr (StartIndex == 1) { + --i; + --j; + } + AMREX_ASSERT(i < NRows && j < NCols); + if constexpr (ORDER == Order::F) { + return m_mat[i+j*NRows]; + } else { + return m_mat[j+i*NCols]; + } + } + + //! Returns a const reference to element i of a vector + template = 0> + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + const T& operator() (int i) const noexcept { + static_assert(StartIndex == 0 || StartIndex == 1); + if constexpr (StartIndex == 1) { + --i; + } + AMREX_ASSERT(i < NRows*NCols); + return m_mat[i]; + } + + //! Returns a reference to element i of a vector + template = 0> + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T& operator() (int i) noexcept { + static_assert(StartIndex == 0 || StartIndex == 1); + if constexpr (StartIndex == 1) { + --i; + } + AMREX_ASSERT(i < NRows*NCols); + return m_mat[i]; + } + + //! Returns a const reference to element i of a vector + template = 0> + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + const T& operator[] (int i) const noexcept { + static_assert(StartIndex == 0 || StartIndex == 1); + if constexpr (StartIndex == 1) { + --i; + } + AMREX_ASSERT(i < NRows*NCols); + return m_mat[i]; + } + + //! Returns a reference to element i of a vector + template = 0> + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T& operator[] (int i) noexcept { + static_assert(StartIndex == 0 || StartIndex == 1); + if constexpr (StartIndex == 1) { + --i; + } + AMREX_ASSERT(i < NRows*NCols); + return m_mat[i]; + } + + /** + * Returns a \c const pointer address to the first element of the + * SmallMatrix object, as if the object is treated as one-dimensional. 
+ */ + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + const T* begin () const noexcept { return m_mat; } + + /** + * Returns a \c const pointer address right after the last element of the + * SmallMatrix object, as if the object is treated as one-dimensional. + */ + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + const T* end () const noexcept { return m_mat + NRows*NCols; } + + /** + * Returns a pointer address to the first element of the + * SmallMatrix object, as if the object is treated as one-dimensional. + */ + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T* begin () noexcept { return m_mat; } + + /** + * Returns a pointer address right after the last element of the + * SmallMatrix object, as if the object is treated as one-dimensional. + */ + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T* end () noexcept { return m_mat + NRows*NCols; } + + //! Set all elements in the matrix to the given value + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + SmallMatrix& + setVal (T val) + { + for (auto& x : m_mat) { x = val; } + return *this; + } + + //! Returns an identity matrix + template = 0> + static constexpr + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + SmallMatrix + Identity () noexcept { + static_assert(StartIndex == 0 || StartIndex == 1); + SmallMatrix I{}; + constexpr_for( + [&] (int i) { I(i,i) = T(1); }); + return I; + } + + //! Returns a matrix initialized with zeros + static constexpr + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + SmallMatrix + Zero () noexcept { + SmallMatrix Z{}; + return Z; + } + + //! Returns transposed matrix + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + SmallMatrix + transpose () const + { + SmallMatrix r; + for (int j = StartIndex; j < NRows+StartIndex; ++j) { + for (int i = StartIndex; i < NCols+StartIndex; ++i) { + r(i,j) = (*this)(j,i); + } + } + return r; + } + + //! Transposes a square matrix in-place. 
+ template = 0> + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + SmallMatrix& + transposeInPlace () + { + static_assert(StartIndex == 0 || StartIndex == 1); + for (int j = 1+StartIndex; j < NCols+StartIndex; ++j) { + for (int i = StartIndex; i < j; ++i) { + amrex::Swap((*this)(i,j), (*this)(j,i)); + } + } + return *this; + } + + //! Returns the product of all elements in the matrix + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T product () const + { + T p = 1; + for (auto const& x : m_mat) { + p *= x; + } + return p; + } + + //! Returns the sum of all elements in the matrix + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T sum () const + { + T s = 0; + for (auto const& x : m_mat) { + s += x; + } + return s; + } + + //! Returns the trace of a square matrix + template = 0> + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T trace () const + { + T t = 0; + constexpr_for([&] (int i) { t += (*this)(i,i); }); + return t; + } + + //! Operator += performing matrix addition as in (*this) += rhs + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + SmallMatrix& + operator += (SmallMatrix const& rhs) + { + for (int n = 0; n < NRows*NCols; ++n) { + m_mat[n] += rhs.m_mat[n]; + } + return *this; + } + + //! Binary operator + returning the result of matrix addition, lhs+rhs + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + friend SmallMatrix + operator+ (SmallMatrix lhs, + SmallMatrix const& rhs) + { + lhs += rhs; + return lhs; + } + + //! Operator -= performing matrix subtraction as in (*this) -= rhs + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + SmallMatrix& + operator -= (SmallMatrix const& rhs) + { + for (int n = 0; n < NRows*NCols; ++n) { + m_mat[n] -= rhs.m_mat[n]; + } + return *this; + } + + //! Binary operator - returning the result of matrix subtraction, lhs-rhs + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + friend SmallMatrix + operator- (SmallMatrix lhs, + SmallMatrix const& rhs) + { + lhs -= rhs; + return lhs; + } + + //! 
Unary minus operator + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + SmallMatrix + operator- () const + { + return (*this) * T(-1); + } + + //! Operator *= that scales this matrix in place by a scalar. + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + SmallMatrix& + operator *= (T a) + { + for (auto& x : m_mat) { + x *= a; + } + return *this; + } + + //! Returns the product of a matrix and a scalar + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + friend SmallMatrix + operator* (SmallMatrix m, T a) + { + m *= a; + return m; + } + + //! Returns the product of a scalar and a matrix + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + friend SmallMatrix + operator* (T a, SmallMatrix m) + { + m *= a; + return m; + } + + //! Returns matrix product of two matrices + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + friend SmallMatrix + operator* (SmallMatrix const& lhs, + SmallMatrix const& rhs); + + //! Returns the dot product of two vectors + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T dot (SmallMatrix const& rhs) const + { + T r = 0; + for (int n = 0; n < NRows*NCols; ++n) { + r += m_mat[n] * rhs.m_mat[n]; + } + return r; + } + + template = 0> + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + constexpr T const& get () const { return m_mat[N]; } + + template = 0> + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + constexpr T& get () { return m_mat[N]; } + + private: + T m_mat[NRows*NCols]; + }; + + template + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + SmallMatrix + operator* (SmallMatrix const& lhs, + SmallMatrix const& rhs) + { + static_assert(SI == 0 || SI == 1); + SmallMatrix r; + if constexpr (Ord == Order::F) { + for (int j = SI; j < N3+SI; ++j) { + constexpr_for([&] (int i) { r(i,j) = U(0); }); + for (int k = SI; k < N2+SI; ++k) { + auto b = rhs(k,j); + constexpr_for([&] (int i) + { + r(i,j) += lhs(i,k) * b; + }); + } + } + } else { + for (int i = SI; i < N1+SI; ++i) { + constexpr_for([&] (int j) { r(i,j) = U(0); }); + 
for (int k = SI; k < N2+SI; ++k) { + auto a = lhs(i,k); + constexpr_for([&] (int j) + { + r(i,j) += a * rhs(k,j); + }); + } + } + } + return r; + } + + template + std::ostream& operator<< (std::ostream& os, + SmallMatrix const& mat) + { + for (int i = SI; i < NRows+SI; ++i) { + os << mat(i,SI); + for (int j = 1+SI; j < NCols+SI; ++j) { + os << " " << mat(i,j); + } + os << "\n"; + } + return os; + } + + template + using SmallVector = SmallMatrix; + + template + using SmallRowVector = SmallMatrix; +} + +template +struct std::tuple_size > + : std::integral_constant {}; + +template +struct std::tuple_element > +{ + using type = T; +}; + +#endif + +/* + * Notes on why SmallMatrix matrix{} is zero initialized. + * + * SmallMatrix is not an aggregate, because it has a user declared default + * constructor. The rule is that, for `SmallMatrix matrix{}` with an empty + * brace-enclosed initializer list, value-initialization is performed. The + * effects of value-initialization of SmallMatrix (which has a user-declared + * but not user-provided default constructor) are that the matrix object is + * first zero-initialized and then the object's default constructor is + * applied. Since the default constructor does nothing, the final result is + * the object is zero-initialized. + * + * Why is SmallMatrix's default constructor user-declared not user-provided? + * It's because we first declare it with `SmallMatrix () = default`. 
+ * + * Reference: + * https://en.cppreference.com/w/cpp/language/list_initialization + * https://en.cppreference.com/w/cpp/language/value_initialization + * https://en.cppreference.com/w/cpp/language/zero_initialization + */ diff --git a/Src/Base/AMReX_TableData.H b/Src/Base/AMReX_TableData.H index 9c2a0d06ca..ee2471d36c 100644 --- a/Src/Base/AMReX_TableData.H +++ b/Src/Base/AMReX_TableData.H @@ -72,7 +72,7 @@ struct Table1D #endif }; -template +template struct Table2D { T* AMREX_RESTRICT p = nullptr; @@ -110,9 +110,7 @@ struct Table2D #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) index_assert(i,j); #endif - static_assert(std::is_same_v || - std::is_same_v); - if constexpr (std::is_same_v) { + if constexpr (ORDER == Order::F) { return p[(i-begin[0])+(j-begin[1])*stride1]; } else { return p[(i-begin[0])*stride1+(j-begin[1])]; @@ -146,7 +144,7 @@ private: static constexpr int len0 (GpuArray const& a_begin, GpuArray const& a_end) noexcept { - if constexpr (std::is_same_v) { + if constexpr (ORDER == Order::F) { return a_end[0] - a_begin[0]; } else { return a_end[1] - a_begin[1]; @@ -154,7 +152,7 @@ private: } }; -template +template struct Table3D { T* AMREX_RESTRICT p = nullptr; @@ -195,9 +193,7 @@ struct Table3D #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) index_assert(i,j,k); #endif - static_assert(std::is_same_v || - std::is_same_v); - if constexpr (std::is_same_v) { + if constexpr (ORDER == Order::F) { return p[(i-begin[0])+(j-begin[1])*stride1+(k-begin[2])*stride2]; } else { return p[(i-begin[0])*stride2+(j-begin[1])*stride1+(k-begin[2])]; @@ -234,7 +230,7 @@ private: static constexpr int len0 (GpuArray const& a_begin, GpuArray const& a_end) noexcept { - if constexpr (std::is_same_v) { + if constexpr (ORDER == Order::F) { return a_end[0] - a_begin[0]; } else { return a_end[2] - a_begin[2]; @@ -248,7 +244,7 @@ private: } }; -template +template struct Table4D { T* AMREX_RESTRICT p = nullptr; @@ -292,9 +288,7 @@ struct Table4D #if 
defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) index_assert(i,j,k,n); #endif - static_assert(std::is_same_v || - std::is_same_v); - if constexpr (std::is_same_v) { + if constexpr (ORDER == Order::F) { return p[(i-begin[0])+(j-begin[1])*stride1+(k-begin[2])*stride2+(n-begin[3])*stride3]; } else { return p[(i-begin[0])*stride3+(j-begin[1])*stride2+(k-begin[2])*stride1+(n-begin[3])]; @@ -333,7 +327,7 @@ private: static constexpr int len0 (GpuArray const& a_begin, GpuArray const& a_end) noexcept { - if constexpr (std::is_same_v) { + if constexpr (ORDER == Order::F) { return a_end[0] - a_begin[0]; } else { return a_end[3] - a_begin[3]; @@ -343,7 +337,7 @@ private: static constexpr int len1 (GpuArray const& a_begin, GpuArray const& a_end) noexcept { - if constexpr (std::is_same_v) { + if constexpr (ORDER == Order::F) { return a_end[1] - a_begin[1]; } else { return a_end[2] - a_begin[2]; @@ -353,7 +347,7 @@ private: static constexpr int len2 (GpuArray const& a_begin, GpuArray const& a_end) noexcept { - if constexpr (std::is_same_v) { + if constexpr (ORDER == Order::F) { return a_end[2] - a_begin[2]; } else { return a_end[1] - a_begin[1]; @@ -399,13 +393,13 @@ private: * // We can now use table in device lambda. 
* \endcode */ -template +template class TableData : public DataAllocator { public: - template friend class TableData; + template friend class TableData; using value_type = T; using table_type = std::conditional_t, std::conditional_t, @@ -459,7 +453,7 @@ private: bool m_ptr_owner = false; }; -template +template TableData::TableData (Array const& lo, Array const& hi, Arena* ar) : DataAllocator{ar}, m_lo(lo), m_hi(hi) { @@ -467,7 +461,7 @@ TableData::TableData (Array const& lo, Array const& hi, } -template +template TableData::TableData (TableData&& rhs) noexcept : DataAllocator{rhs.arena()}, m_dptr(rhs.m_dptr), @@ -480,7 +474,7 @@ TableData::TableData (TableData&& rhs) noexcept rhs.m_ptr_owner = false; } -template +template TableData& TableData::operator= (TableData && rhs) noexcept { @@ -498,20 +492,17 @@ TableData::operator= (TableData && rhs) noexcept return *this; } -template +template TableData::~TableData () noexcept { static_assert(std::is_trivially_copyable() && std::is_trivially_destructible(), "TableData: T must be trivially copyable and trivially destructible"); static_assert(N>=1 && N <=4, "TableData: N must be in the range of [1,4]"); - static_assert(std::is_same_v || - std::is_same_v, - "TableDat: ORDER must be either Order::F or Order::C"); clear(); } -template +template void TableData::resize (Array const& lo, Array const& hi, Arena* ar) { @@ -535,7 +526,7 @@ TableData::resize (Array const& lo, Array const& hi, Ar } } -template +template Long TableData::size () const noexcept { @@ -546,7 +537,7 @@ TableData::size () const noexcept return r; } -template +template void TableData::clear () noexcept { @@ -559,7 +550,7 @@ TableData::clear () noexcept } } -template +template void TableData::define () { @@ -574,46 +565,46 @@ TableData::define () } namespace detail { - template + template Table1D make_table (T* p, Array const& lo, Array const& hi) { return Table1D(p, lo[0], hi[0]+1); } - template + template Table2D make_table (T* p, Array const& lo, Array 
const& hi) { return Table2D(p, {lo[0],lo[1]}, {hi[0]+1,hi[1]+1}); } - template + template Table3D make_table (T* p, Array const& lo, Array const& hi) { return Table3D(p, {lo[0],lo[1],lo[2]}, {hi[0]+1,hi[1]+1,hi[2]+1}); } - template + template Table4D make_table (T* p, Array const& lo, Array const& hi) { return Table4D(p, {lo[0],lo[1],lo[2],lo[3]}, {hi[0]+1,hi[1]+1,hi[2]+1,hi[3]+1}); } } -template +template typename TableData::table_type TableData::table () noexcept { return detail::make_table(m_dptr, m_lo, m_hi); } -template +template typename TableData::const_table_type TableData::table () const noexcept { return detail::make_table(m_dptr, m_lo, m_hi); } -template +template typename TableData::const_table_type TableData::const_table () const noexcept { return detail::make_table(m_dptr, m_lo, m_hi); } -template +template void TableData::copy (TableData const& rhs) noexcept { diff --git a/Src/Base/AMReX_TinyProfiler.cpp b/Src/Base/AMReX_TinyProfiler.cpp index 32b35bf770..78a1685855 100644 --- a/Src/Base/AMReX_TinyProfiler.cpp +++ b/Src/Base/AMReX_TinyProfiler.cpp @@ -465,7 +465,6 @@ TinyProfiler::MemoryFinalize (bool bFlushing) noexcept std::ofstream ofs; std::ostream* os = nullptr; - std::streamsize oldprec = 0; if (ParallelDescriptor::IOProcessor()) { auto const& ofile = get_output_file(); if (ofile.empty()) { @@ -487,8 +486,6 @@ TinyProfiler::MemoryFinalize (bool bFlushing) noexcept all_memstats.clear(); all_memnames.clear(); } - - if(os) { os->precision(oldprec); } } bool diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt index 882f401228..2b6387ece1 100644 --- a/Src/Base/CMakeLists.txt +++ b/Src/Base/CMakeLists.txt @@ -14,6 +14,8 @@ foreach(D IN LISTS AMReX_SPACEDIM) AMReX_BlockMutex.cpp AMReX_Enum.H AMReX_GpuComplex.H + AMReX_SmallMatrix.H + AMReX_ConstexprFor.H AMReX_Vector.H AMReX_TableData.H AMReX_Tuple.H diff --git a/Src/Base/Make.package b/Src/Base/Make.package index c64fa50f11..264de0581f 100644 --- a/Src/Base/Make.package +++ 
b/Src/Base/Make.package @@ -4,6 +4,7 @@ AMREX_BASE=EXE C$(AMREX_BASE)_headers += AMReX_ccse-mpi.H AMReX_Algorithm.H AMReX_Any.H AMReX_Array.H C$(AMREX_BASE)_headers += AMReX_Enum.H C$(AMREX_BASE)_headers += AMReX_Vector.H AMReX_TableData.H AMReX_Tuple.H AMReX_Math.H +C$(AMREX_BASE)_headers += AMReX_SmallMatrix.H AMReX_ConstexprFor.H C$(AMREX_BASE)_headers += AMReX_TypeList.H diff --git a/Src/CMakeLists.txt b/Src/CMakeLists.txt index 6e8af043e0..25455d7263 100644 --- a/Src/CMakeLists.txt +++ b/Src/CMakeLists.txt @@ -136,6 +136,10 @@ if (AMReX_PARTICLES) add_subdirectory(Particle) endif () +if (AMReX_FFT) + add_subdirectory(FFT) +endif () + # # Optional external components # diff --git a/Src/EB/AMReX_EB2.cpp b/Src/EB/AMReX_EB2.cpp index f99eb504d2..4f7a5cb84b 100644 --- a/Src/EB/AMReX_EB2.cpp +++ b/Src/EB/AMReX_EB2.cpp @@ -216,6 +216,8 @@ Build (const Geometry& geom, int required_coarsening_level, pp.queryAdd("stl_center", stl_center); bool stl_reverse_normal = false; pp.queryAdd("stl_reverse_normal", stl_reverse_normal); + bool stl_use_bvh = true; + pp.queryAdd("stl_use_bvh", stl_use_bvh); IndexSpace::push(new IndexSpaceSTL(stl_file, stl_scale, // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks) {stl_center[0], stl_center[1], stl_center[2]}, int(stl_reverse_normal), @@ -223,7 +225,8 @@ Build (const Geometry& geom, int required_coarsening_level, max_coarsening_level, ngrow, build_coarse_level_by_coarsening, a_extend_domain_face, - a_num_coarsen_opt)); + a_num_coarsen_opt, + stl_use_bvh)); } else { diff --git a/Src/EB/AMReX_EB2_2D_C.cpp b/Src/EB/AMReX_EB2_2D_C.cpp index b99b5559c7..231faf0cb8 100644 --- a/Src/EB/AMReX_EB2_2D_C.cpp +++ b/Src/EB/AMReX_EB2_2D_C.cpp @@ -342,7 +342,8 @@ int build_faces (Box const& bx, Array4 const& cell, nsmallfaces += *(hp+1); if (*hp > 0 && !cover_multiple_cuts) { - amrex::Abort("amrex::EB2::build_faces: more than 2 cuts not supported"); + amrex::Abort("amrex::EB2::build_faces: more than 2 cuts not supported. 
" + "You can try to fix it by using runtime parameter eb2.cover_multiple_cuts=1."); } return *hp; diff --git a/Src/EB/AMReX_EB2_3D_C.cpp b/Src/EB/AMReX_EB2_3D_C.cpp index 2d02e53bdc..ec7d643391 100644 --- a/Src/EB/AMReX_EB2_3D_C.cpp +++ b/Src/EB/AMReX_EB2_3D_C.cpp @@ -768,7 +768,8 @@ int build_faces (Box const& bx, Array4 const& cell, } }); } else { - amrex::Abort("amrex::EB2::build_faces: more than 2 cuts not supported"); + amrex::Abort("amrex::EB2::build_faces: more than 2 cuts not supported. " + "You can try to fix it by using runtime parameter eb2.cover_multiple_cuts=1."); } } @@ -932,7 +933,8 @@ void build_cells (Box const& bx, Array4 const& cell, if (nsmallcells > 0 || nmulticuts > 0) { if (!cover_multiple_cuts && nmulticuts > 0) { - amrex::Abort("amrex::EB2::build_cells: multi-cuts not supported"); + amrex::Abort("amrex::EB2::build_cells: multi-cuts not supported. " + "You can try to fix it by using runtime parameter eb2.cover_multiple_cuts=1."); } return; } else { diff --git a/Src/EB/AMReX_EB2_IndexSpace_STL.H b/Src/EB/AMReX_EB2_IndexSpace_STL.H index 0c72d076ea..f974daba7a 100644 --- a/Src/EB/AMReX_EB2_IndexSpace_STL.H +++ b/Src/EB/AMReX_EB2_IndexSpace_STL.H @@ -19,7 +19,7 @@ public: const Geometry& geom, int required_coarsening_level, int max_coarsening_level, int ngrow, bool build_coarse_level_by_coarsening, - bool extend_domain_face, int num_coarsen_opt); + bool extend_domain_face, int num_coarsen_opt, bool bvh_optimization); IndexSpaceSTL (IndexSpaceSTL const&) = delete; IndexSpaceSTL (IndexSpaceSTL &&) = delete; diff --git a/Src/EB/AMReX_EB2_IndexSpace_STL.cpp b/Src/EB/AMReX_EB2_IndexSpace_STL.cpp index 70e3b492d8..f8f62684f2 100644 --- a/Src/EB/AMReX_EB2_IndexSpace_STL.cpp +++ b/Src/EB/AMReX_EB2_IndexSpace_STL.cpp @@ -7,11 +7,13 @@ IndexSpaceSTL::IndexSpaceSTL (const std::string& stl_file, Real stl_scale, const Geometry& geom, int required_coarsening_level, int max_coarsening_level, int ngrow, bool build_coarse_level_by_coarsening, - bool 
extend_domain_face, int num_coarsen_opt) + bool extend_domain_face, int num_coarsen_opt, + bool bvh_optimization) { Gpu::LaunchSafeGuard lsg(true); // Always use GPU STLtools stl_tools; + stl_tools.setBVHOptimization(bvh_optimization); stl_tools.read_stl_file(stl_file, stl_scale, stl_center, stl_reverse_normal); // build finest level (i.e., level 0) first diff --git a/Src/EB/AMReX_EB_STL_utils.H b/Src/EB/AMReX_EB_STL_utils.H index eb277202cd..828d5a120c 100644 --- a/Src/EB/AMReX_EB_STL_utils.H +++ b/Src/EB/AMReX_EB_STL_utils.H @@ -7,6 +7,11 @@ #include #include +#include +#include +#include +#include + namespace amrex { @@ -15,33 +20,47 @@ class STLtools public: struct Triangle { XDim3 v1, v2, v3; - }; - - static constexpr int allregular = -1; - static constexpr int mixedcells = 0; - static constexpr int allcovered = 1; - -private: - Gpu::PinnedVector m_tri_pts_h; - Gpu::DeviceVector m_tri_pts_d; - Gpu::DeviceVector m_tri_normals_d; + [[nodiscard]] Real cent (int d) const + { + static_assert(sizeof(XDim3) == sizeof(Real)*3); + return Real(1./3.)*((&v1.x)[d] + (&v2.x)[d] + (&v3.x)[d]); + } + + [[nodiscard]] std::pair minmax (int d) const + { + static_assert(sizeof(XDim3) == sizeof(Real)*3); + return std::minmax({(&v1.x)[d], (&v2.x)[d], (&v3.x)[d]}); + } + }; - int m_num_tri=0; + template + struct BVHNodeT + { + RealBox boundingbox{AMREX_D_DECL(std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max()), + AMREX_D_DECL(std::numeric_limits::lowest(), + std::numeric_limits::lowest(), + std::numeric_limits::lowest())}; + STLtools::Triangle triangles[M]; + XDim3 trinorm[M]; + int children[N]; + std::int8_t ntriangles = 0; + std::int8_t nchildren = 0; + }; - XDim3 m_ptmin; // All triangles are inside the bounding box defined by - XDim3 m_ptmax; // m_ptmin and m_ptmax. - XDim3 m_ptref; // The reference point is slightly outside the bounding box. - bool m_boundry_is_outside; // Is the bounding box boundary outside or inside the object? 
+ static constexpr int m_bvh_max_size = 4; // max # of triangles in a leaf node + static constexpr int m_bvh_max_splits = 4; // max # of children + static constexpr int m_bvh_max_stack_size = 12; // max depth of the tree - void read_ascii_stl_file (std::string const& fname, Real scale, - Array const& center, int reverse_normal); - void read_binary_stl_file (std::string const& fname, Real scale, - Array const& center, int reverse_normal); + using Node = BVHNodeT; -public: + static constexpr int allregular = -1; + static constexpr int mixedcells = 0; + static constexpr int allcovered = 1; - void prepare (); // public for cuda + void setBVHOptimization (bool flag) { m_bvh_optimization = flag; } void read_stl_file (std::string const& fname, Real scale, Array const& center, int reverse_normal); @@ -65,6 +84,32 @@ public: Array,AMREX_SPACEDIM> const& type_arr, Array4 const& lst, Geometry const& geom) ; + void prepare (Gpu::PinnedVector a_tri_pts); // public for cuda + +private: + + bool m_bvh_optimization = true; + + Gpu::DeviceVector m_tri_pts_d; + Gpu::DeviceVector m_tri_normals_d; + Gpu::DeviceVector m_bvh_nodes; + + int m_num_tri=0; + + XDim3 m_ptmin; // All triangles are inside the bounding box defined by + XDim3 m_ptmax; // m_ptmin and m_ptmax. + XDim3 m_ptref; // The reference point is slightly outside the bounding box. + bool m_boundry_is_outside; // Is the bounding box boundary outside or inside the object? 
+ + void read_ascii_stl_file (std::string const& fname, Real scale, + Array const& center, int reverse_normal, + Gpu::PinnedVector& a_tri_pts); + void read_binary_stl_file (std::string const& fname, Real scale, + Array const& center, int reverse_normal, + Gpu::PinnedVector& a_tri_pts); + + static void build_bvh (Triangle* begin, Triangle * end, Gpu::PinnedVector& bvh_nodes); + static void bvh_size (int ntri, std::size_t& nnodes); }; } diff --git a/Src/EB/AMReX_EB_STL_utils.cpp b/Src/EB/AMReX_EB_STL_utils.cpp index f7ce3045d9..3a3070f188 100644 --- a/Src/EB/AMReX_EB_STL_utils.cpp +++ b/Src/EB/AMReX_EB_STL_utils.cpp @@ -1,15 +1,34 @@ +#include #include #include #include +#include +#include + #include +// Reference for BVH: https://rmrsk.github.io/EBGeometry/Concepts.html#bounding-volume-hierarchies + namespace amrex { namespace { + + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + XDim3 triangle_norm (STLtools::Triangle const& tri) + { + XDim3 vec1{tri.v2.x-tri.v1.x, tri.v2.y-tri.v1.y, tri.v2.z-tri.v1.z}; + XDim3 vec2{tri.v3.x-tri.v2.x, tri.v3.y-tri.v2.y, tri.v3.z-tri.v2.z}; + XDim3 norm{vec1.y*vec2.z-vec1.z*vec2.y, + vec1.z*vec2.x-vec1.x*vec2.z, + vec1.x*vec2.y-vec1.y*vec2.x}; + Real tmp = 1._rt / std::sqrt(norm.x*norm.x + norm.y*norm.y + norm.z*norm.z); + return {norm.x * tmp, norm.y * tmp, norm.z * tmp}; + } + // Does line ab intersect with the triangle? 
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - bool line_tri_intersects (Real a[3], Real b[3], STLtools::Triangle const& tri) + bool line_tri_intersects (Real const a[3], Real const b[3], STLtools::Triangle const& tri) { if (amrex::max(a[0],b[0]) < amrex::min(tri.v1.x,tri.v2.x,tri.v3.x) || amrex::min(a[0],b[0]) > amrex::max(tri.v1.x,tri.v2.x,tri.v3.x) || @@ -89,12 +108,95 @@ namespace { return std::make_pair(false,0.0_rt); } } + + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + bool line_box_intersects (Real const a[3], Real const b[3], RealBox const& box) + { + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + if ((a[idim] < box.lo(idim) && b[idim] < box.lo(idim)) || + (a[idim] > box.hi(idim) && b[idim] > box.hi(idim))) { + return false; + } + } + if (box.contains(a) || box.contains(b)) { + return true; + } + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + // Note that we have made bounding box slightly bigger. So it's + // safe to assume that a line in the plane does not intersect + // with the actual bounding box. 
+ if (a[idim] == b[idim]) { continue; } + Real xi[] = {box.lo(idim), box.hi(idim)}; + for (auto xface : xi) { + if (!((a[idim] > xface && b[idim] > xface) || + (a[idim] < xface && b[idim] < xface))) + { + Real w = (xface-a[idim]) / (b[idim]-a[idim]); + bool inside = true; + for (int jdim = 0; jdim < AMREX_SPACEDIM; ++jdim) { + if (idim != jdim) { + Real xpt = a[jdim] + (b[jdim]-a[jdim]) * w; + inside = inside && (xpt >= box.lo(jdim) + && xpt <= box.hi(jdim)); + } + } + if (inside) { return true; } + } + } + } + + return false; + } + + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + void bvh_line_tri_intersects (Real const a[3], Real const b[3], + STLtools::BVHNodeT const* root, + F const& f) + { + // Use stack to avoid recursion + Stack nodes_to_do; + Stack nchildren_done; + + if (line_box_intersects(a, b, root->boundingbox)) { + nodes_to_do.push(0); + nchildren_done.push(0); + } + + while (!nodes_to_do.empty()) { + auto const& node = root[nodes_to_do.top()]; + if (node.nchildren == 0) { // leaf node + int ret = f(node.ntriangles, node.triangles, node.trinorm); + if (ret != 0) { break; } + nodes_to_do.pop(); + nchildren_done.pop(); + } else { + auto& ndone = nchildren_done.top(); + if (ndone < node.nchildren) { + for (auto ichild = ndone; ichild < node.nchildren; ++ichild) { + ++ndone; + int inode = node.children[ichild]; + if (line_box_intersects(a, b, root[inode].boundingbox)) { + nodes_to_do.push(inode); + nchildren_done.push(0); + break; + } + } + } else { + nodes_to_do.pop(); + nchildren_done.pop(); + } + } + } + } } void STLtools::read_stl_file (std::string const& fname, Real scale, Array const& center, int reverse_normal) { + Gpu::PinnedVector tri_pts; + if (ParallelDescriptor::IOProcessor()) { char header[6]; header[5] = '\0'; @@ -107,18 +209,19 @@ STLtools::read_stl_file (std::string const& fname, Real scale, Array con } int is_binary = std::strcmp(header, "solid"); if (is_binary) { - read_binary_stl_file(fname, scale, center, reverse_normal); + 
read_binary_stl_file(fname, scale, center, reverse_normal, tri_pts); } else { - read_ascii_stl_file(fname, scale, center, reverse_normal); + read_ascii_stl_file(fname, scale, center, reverse_normal, tri_pts); } } - prepare(); + prepare(std::move(tri_pts)); } void STLtools::read_binary_stl_file (std::string const& fname, Real scale, - Array const& center, int reverse_normal) + Array const& center, int reverse_normal, + Gpu::PinnedVector& a_tri_pts) { if (ParallelDescriptor::IOProcessor()) { if (amrex::Verbose()) { @@ -140,9 +243,13 @@ STLtools::read_binary_stl_file (std::string const& fname, Real scale, uint32_t numtris; // uint32 - Number of triangles - 4 bytes amrex::readIntData(&numtris, 1, is, uint32_descr); - AMREX_ASSERT(numtris < uint32_t(std::numeric_limits::max())); + AMREX_ALWAYS_ASSERT(numtris < uint32_t(std::numeric_limits::max())); m_num_tri = static_cast(numtris); - m_tri_pts_h.resize(m_num_tri); + // maximum number of triangles allowed for traversing the BVH tree + // using stack. + int max_tri_stack = Math::powi(m_bvh_max_splits)*m_bvh_max_size; + AMREX_ALWAYS_ASSERT(m_num_tri <= max_tri_stack); + a_tri_pts.resize(m_num_tri); if (amrex::Verbose()) { Print() << " Number of triangles: " << m_num_tri << "\n"; @@ -150,7 +257,7 @@ STLtools::read_binary_stl_file (std::string const& fname, Real scale, for (int i=0; i < m_num_tri; ++i) { is.read(tmp, 50); // 50 bytes for each triangle. Vertex 1 starts at 12 bytes. 
- Real* p = &(m_tri_pts_h[i].v1.x); + Real* p = &(a_tri_pts[i].v1.x); RealDescriptor::convertToNativeFormat(p, 9, tmp+12, real32_descr); for (int j = 0; j < 3; ++j) { p[0] = p[0] * scale + center[0]; @@ -159,7 +266,7 @@ STLtools::read_binary_stl_file (std::string const& fname, Real scale, p += 3; } if (reverse_normal) { - std::swap(m_tri_pts_h[i].v1, m_tri_pts_h[i].v2); + std::swap(a_tri_pts[i].v1, a_tri_pts[i].v2); } } } @@ -167,7 +274,8 @@ STLtools::read_binary_stl_file (std::string const& fname, Real scale, void STLtools::read_ascii_stl_file (std::string const& fname, Real scale, - Array const& center, int reverse_normal) + Array const& center, int reverse_normal, + Gpu::PinnedVector& a_tri_pts) { if (ParallelDescriptor::IOProcessor()) { if (amrex::Verbose()) { @@ -200,9 +308,9 @@ STLtools::read_ascii_stl_file (std::string const& fname, Real scale, } m_num_tri = nlines / nlines_per_facet; - m_tri_pts_h.resize(m_num_tri); + a_tri_pts.resize(m_num_tri); static_assert(sizeof(Triangle) == sizeof(Real)*9, "sizeof(Triangle) is wrong"); - Real* p = &(m_tri_pts_h[0].v1.x); + Real* p = &(a_tri_pts[0].v1.x); if (amrex::Verbose()) { Print() << " Number of triangles: " << m_num_tri << "\n"; @@ -230,45 +338,52 @@ STLtools::read_ascii_stl_file (std::string const& fname, Real scale, std::getline(is,tmp); //end facet if (reverse_normal) { - std::swap(m_tri_pts_h[i].v1, m_tri_pts_h[i].v2); + std::swap(a_tri_pts[i].v1, a_tri_pts[i].v2); } } } } void -STLtools::prepare () +STLtools::prepare (Gpu::PinnedVector a_tri_pts) { + BL_PROFILE("STLtools::prepare"); + ParallelDescriptor::Bcast(&m_num_tri, 1); if (!ParallelDescriptor::IOProcessor()) { - m_tri_pts_h.resize(m_num_tri); + a_tri_pts.resize(m_num_tri); } - ParallelDescriptor::Bcast((char*)(m_tri_pts_h.dataPtr()), m_num_tri*sizeof(Triangle)); + ParallelDescriptor::Bcast((char*)(a_tri_pts.dataPtr()), m_num_tri*sizeof(Triangle)); - //device vectors - m_tri_pts_d.resize(m_num_tri); - m_tri_normals_d.resize(m_num_tri); + 
Gpu::PinnedVector bvh_nodes; + if (m_bvh_optimization) { + BL_PROFILE("STLtools::build_bvh"); + std::size_t nnodes = 0; + bvh_size(int(a_tri_pts.size()), nnodes); + bvh_nodes.reserve(nnodes); + build_bvh(a_tri_pts.data(), a_tri_pts.data()+a_tri_pts.size(), bvh_nodes); +#ifdef AMREX_USE_GPU + m_bvh_nodes.resize(bvh_nodes.size()); + Gpu::copyAsync(Gpu::hostToDevice, bvh_nodes.begin(), bvh_nodes.end(), + m_bvh_nodes.begin()); +#else + m_bvh_nodes = std::move(bvh_nodes); +#endif + } - Gpu::copyAsync(Gpu::hostToDevice, m_tri_pts_h.begin(), m_tri_pts_h.end(), - m_tri_pts_d.begin()); + auto const tri0 = a_tri_pts[0]; +#ifdef AMREX_USE_GPU + m_tri_pts_d.resize(m_num_tri); + Gpu::copyAsync(Gpu::hostToDevice, a_tri_pts.begin(), a_tri_pts.end(), + m_tri_pts_d.begin()); +#else + m_tri_pts_d = std::move(a_tri_pts); +#endif Triangle const* tri_pts = m_tri_pts_d.data(); - XDim3* tri_norm = m_tri_normals_d.data(); - // Compute normals in case the STL file does not have valid data for normals - ParallelFor(m_num_tri, [=] AMREX_GPU_DEVICE (int i) noexcept - { - Triangle const& tri = tri_pts[i]; - XDim3 vec1{tri.v2.x-tri.v1.x, tri.v2.y-tri.v1.y, tri.v2.z-tri.v1.z}; - XDim3 vec2{tri.v3.x-tri.v2.x, tri.v3.y-tri.v2.y, tri.v3.z-tri.v2.z}; - XDim3 norm{vec1.y*vec2.z-vec1.z*vec2.y, - vec1.z*vec2.x-vec1.x*vec2.z, - vec1.x*vec2.y-vec1.y*vec2.x}; - Real tmp = 1._rt / std::sqrt(norm.x*norm.x + norm.y*norm.y + norm.z*norm.z); - tri_norm[i].x = norm.x * tmp; - tri_norm[i].y = norm.y * tmp; - tri_norm[i].z = norm.z * tmp; - }); + m_tri_normals_d.resize(m_num_tri); + XDim3* tri_norm = m_tri_normals_d.data(); ReduceOps reduce_op; ReduceData reduce_data(reduce_op); @@ -276,6 +391,7 @@ STLtools::prepare () reduce_op.eval(m_num_tri, reduce_data, [=] AMREX_GPU_DEVICE (int i) -> ReduceTuple { + tri_norm[i] = triangle_norm(tri_pts[i]); return {amrex::min(tri_pts[i].v1.x, tri_pts[i].v2.x, tri_pts[i].v3.x), @@ -309,24 +425,12 @@ STLtools::prepare () // Choose a reference point by extending the normal 
vector of the first // triangle until it's slightly outside the bounding box. - XDim3 cent0; // centroid of the first triangle + XDim3 cent0{tri0.cent(0), tri0.cent(1), tri0.cent(2)}; int is_ref_positive; { - Triangle const& tri = m_tri_pts_h[0]; - cent0 = XDim3{(tri.v1.x + tri.v2.x + tri.v3.x) / 3._rt, - (tri.v1.y + tri.v2.y + tri.v3.y) / 3._rt, - (tri.v1.z + tri.v2.z + tri.v3.z) / 3._rt}; // We are computing the normal ourselves in case the stl file does // not have valid data on normal. - XDim3 vec1{tri.v2.x-tri.v1.x, tri.v2.y-tri.v1.y, tri.v2.z-tri.v1.z}; - XDim3 vec2{tri.v3.x-tri.v2.x, tri.v3.y-tri.v2.y, tri.v3.z-tri.v2.z}; - XDim3 norm{vec1.y*vec2.z-vec1.z*vec2.y, - vec1.z*vec2.x-vec1.x*vec2.z, - vec1.x*vec2.y-vec1.y*vec2.x}; - Real tmp = 1._rt / std::sqrt(norm.x*norm.x + norm.y*norm.y + norm.z*norm.z); - norm.x *= tmp; - norm.y *= tmp; - norm.z *= tmp; + XDim3 norm = triangle_norm(tri0); // Now we need to find out where the normal vector will intersect // with the bounding box defined by m_ptmin and m_ptmax. 
Real Lx, Ly, Lz; @@ -415,10 +519,113 @@ STLtools::prepare () m_boundry_is_outside = num_isects % 2 == 0; } +void +STLtools::build_bvh (Triangle* begin, Triangle* end, Gpu::PinnedVector& bvh_nodes) +{ + auto ntri = int(end - begin); + + if (ntri <= m_bvh_max_size) { + // This is a leaf node + bvh_nodes.push_back(Node()); + auto& node = bvh_nodes.back(); + auto& bbox = node.boundingbox; + for (int tr = 0; tr < ntri; ++tr) { + auto const& tri = begin[tr]; + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + auto const& [xmin,xmax] = tri.minmax(idim); + bbox.setLo(idim,amrex::min(xmin, bbox.lo(idim))); + bbox.setHi(idim,amrex::max(xmax, bbox.hi(idim))); + } + node.triangles[tr] = tri; + node.trinorm[tr] = triangle_norm(tri); + } +#ifdef AMREX_USE_FLOAT + constexpr Real eps = Real(1.e-5); +#else + constexpr Real eps = Real(1.e-10); +#endif + Real small = eps*std::max({AMREX_D_DECL(bbox.length(0), + bbox.length(1), + bbox.length(2))}); + // Make bounding box slightly bigger for robustness. 
+ for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + bbox.setLo(idim,bbox.lo(idim)-small); + bbox.setHi(idim,bbox.hi(idim)+small); + } + node.ntriangles = int(ntri); // NOLINT + return; + } + + RealVect centmin(std::numeric_limits::max()); + RealVect centmax(std::numeric_limits::lowest()); + for (auto* p = begin; p != end; ++p) { + RealVect cent(AMREX_D_DECL(p->cent(0), p->cent(1), p->cent(2))); + centmin.min(cent); + centmax.max(cent); + } + int max_dir = (centmax-centmin).maxDir(false); + std::sort(begin, end, [max_dir] (Triangle const& a, Triangle const& b) -> bool + { return a.cent(max_dir) < b.cent(max_dir); }); + + int nsplits = std::min((ntri + (m_bvh_max_size-1)) / m_bvh_max_size, m_bvh_max_splits); + int tsize = ntri / nsplits; + int nleft = ntri - tsize*nsplits; + + bvh_nodes.push_back(Node()); + bvh_nodes.back().nchildren = std::int8_t(nsplits); + auto this_node = bvh_nodes.size()-1; + + for (int isplit = 0; isplit < nsplits; ++isplit) { + int tbegin, tend; + if (isplit < nleft) { + tbegin = isplit * (tsize+1); + tend = tbegin + tsize + 1; + } else { + tbegin = isplit * tsize + nleft; + tend = tbegin + tsize; + } + bvh_nodes[this_node].children[isplit] = int(bvh_nodes.size()); + build_bvh(begin+tbegin, begin+tend, bvh_nodes); + } + + // update bounding box + auto& node = bvh_nodes[this_node]; + for (int ichild = 0; ichild < node.nchildren; ++ichild) { + int inode = node.children[ichild]; + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + auto lo = node.boundingbox.lo(idim); + auto hi = node.boundingbox.hi(idim); + auto clo = bvh_nodes[inode].boundingbox.lo(idim); + auto chi = bvh_nodes[inode].boundingbox.hi(idim); + node.boundingbox.setLo(idim, std::min(lo,clo)); + node.boundingbox.setHi(idim, std::max(hi,chi)); + } + } +} + +void +STLtools::bvh_size (int ntri, std::size_t& nnodes) +{ + ++nnodes; + + if (ntri <= m_bvh_max_size) { return; } // This is a leaf node + + int nsplits = std::min((ntri + (m_bvh_max_size-1)) / m_bvh_max_size, 
m_bvh_max_splits); + int tsize = ntri / nsplits; + int nleft = ntri - tsize*nsplits; + + for (int isplit = 0; isplit < nsplits; ++isplit) { + int child_size = (isplit < nleft) ? (tsize+1) : tsize; + bvh_size(child_size, nnodes); + } +} + void STLtools::fill (MultiFab& mf, IntVect const& nghost, Geometry const& geom, Real outside_value, Real inside_value) const { + BL_PROFILE("STLtools::fill"); + int num_triangles = m_num_tri; const auto plo = geom.ProbLoArray(); @@ -432,8 +639,15 @@ STLtools::fill (MultiFab& mf, IntVect const& nghost, Geometry const& geom, Real other_value = m_boundry_is_outside ? inside_value : outside_value; auto const& ma = mf.arrays(); + auto const* bvh_root = m_bvh_nodes.data(); + + enum bvh_opt_options : int { no_bvh, yes_bvh }; + int bvh_opt_runtime_option = m_bvh_optimization ? yes_bvh : no_bvh; - ParallelFor(mf, nghost, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + AnyCTO(TypeList>{}, + {bvh_opt_runtime_option}, + [&] (auto cto_func) { ParallelFor(mf, nghost, cto_func); }, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, auto control) noexcept { Real coords[3]; coords[0]=plo[0]+static_cast(i)*dx[0]; @@ -449,9 +663,26 @@ STLtools::fill (MultiFab& mf, IntVect const& nghost, Geometry const& geom, coords[2] >= ptmin.z && coords[2] <= ptmax.z) { Real pr[]={ptref.x, ptref.y, ptref.z}; - for (int tr=0; tr < num_triangles; ++tr) { - if (line_tri_intersects(pr, coords, tri_pts[tr])) { - ++num_intersects; +#ifdef AMREX_USE_CUDA + amrex::ignore_unused(bvh_root, num_triangles, tri_pts); +#endif + if constexpr (control == yes_bvh) { + bvh_line_tri_intersects(pr, coords, bvh_root, + [&] (int ntri, Triangle const* tri, + XDim3 const*) -> int + { + for (int tr=0; tr < ntri; ++tr) { + if (line_tri_intersects(pr, coords, tri[tr])) { + ++num_intersects; + } + } + return 0; + }); + } else { + for (int tr=0; tr < num_triangles; ++tr) { + if (line_tri_intersects(pr, coords, tri_pts[tr])) { + ++num_intersects; + } } } } @@ -463,6 
+694,8 @@ STLtools::fill (MultiFab& mf, IntVect const& nghost, Geometry const& geom, int STLtools::getBoxType (Box const& box, Geometry const& geom, RunOn) const { + BL_PROFILE("STLtools::getBoxType"); + const auto plo = geom.ProbLoArray(); const auto dx = geom.CellSizeArray(); @@ -498,11 +731,19 @@ STLtools::getBoxType (Box const& box, Geometry const& geom, RunOn) const XDim3 ptref = m_ptref; int ref_value = m_boundry_is_outside ? 1 : 0; + auto const* bvh_root = m_bvh_nodes.data(); + ReduceOps reduce_op; ReduceData reduce_data(reduce_op); using ReduceTuple = typename decltype(reduce_data)::Type; - reduce_op.eval(box, reduce_data, - [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple + + enum bvh_opt_options : int { no_bvh, yes_bvh }; + int bvh_opt_runtime_option = m_bvh_optimization ? yes_bvh : no_bvh; + + AnyCTO(TypeList>{}, + {bvh_opt_runtime_option}, + [&] (auto cto_func) { reduce_op.eval(box, reduce_data, cto_func); }, + [=] AMREX_GPU_DEVICE (int i, int j, int k, auto control) -> ReduceTuple { Real coords[3]; coords[0]=plo[0]+static_cast(i)*dx[0]; @@ -519,9 +760,26 @@ STLtools::getBoxType (Box const& box, Geometry const& geom, RunOn) const coords[2] >= ptmin.z && coords[2] <= ptmax.z) { Real pr[]={ptref.x, ptref.y, ptref.z}; - for (int tr=0; tr < num_triangles; ++tr) { - if (line_tri_intersects(pr, coords, tri_pts[tr])) { - ++num_intersects; +#ifdef AMREX_USE_CUDA + amrex::ignore_unused(bvh_root,num_triangles,tri_pts); +#endif + if constexpr (control == yes_bvh) { + bvh_line_tri_intersects(pr, coords, bvh_root, + [&] (int ntri, Triangle const* tri, + XDim3 const*) -> int + { + for (int tr=0; tr < ntri; ++tr) { + if (line_tri_intersects(pr, coords, tri[tr])) { + ++num_intersects; + } + } + return 0; + }); + } else { + for (int tr=0; tr < num_triangles; ++tr) { + if (line_tri_intersects(pr, coords, tri_pts[tr])) { + ++num_intersects; + } } } } @@ -556,9 +814,17 @@ STLtools::fillFab (BaseFab& levelset, const Geometry& geom, RunOn, Box con Real 
reference_value = m_boundry_is_outside ? -1.0_rt : 1.0_rt; Real other_value = m_boundry_is_outside ? 1.0_rt : -1.0_rt; + auto const* bvh_root = m_bvh_nodes.data(); + auto const& a = levelset.array(); const Box& bx = levelset.box(); - ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + + enum bvh_opt_options : int { no_bvh, yes_bvh }; + int bvh_opt_runtime_option = m_bvh_optimization ? yes_bvh : no_bvh; + + ParallelFor(TypeList>{}, + {bvh_opt_runtime_option}, + bx, [=] AMREX_GPU_DEVICE (int i, int j, int k, auto control) noexcept { Real coords[3]; coords[0]=plo[0]+static_cast(i)*dx[0]; @@ -574,9 +840,26 @@ STLtools::fillFab (BaseFab& levelset, const Geometry& geom, RunOn, Box con coords[2] >= ptmin.z && coords[2] <= ptmax.z) { Real pr[]={ptref.x, ptref.y, ptref.z}; - for (int tr=0; tr < num_triangles; ++tr) { - if (line_tri_intersects(pr, coords, tri_pts[tr])) { - ++num_intersects; +#ifdef AMREX_USE_CUDA + amrex::ignore_unused(bvh_root,num_triangles,tri_pts); +#endif + if constexpr (control == yes_bvh) { + bvh_line_tri_intersects(pr, coords, bvh_root, + [&] (int ntri, Triangle const* tri, + XDim3 const*) -> int + { + for (int tr=0; tr < ntri; ++tr) { + if (line_tri_intersects(pr, coords, tri[tr])) { + ++num_intersects; + } + } + return 0; + }); + } else { + for (int tr=0; tr < num_triangles; ++tr) { + if (line_tri_intersects(pr, coords, tri_pts[tr])) { + ++num_intersects; + } } } } @@ -597,13 +880,22 @@ STLtools::getIntercept (Array,AMREX_SPACEDIM> const& inter_arr, const Triangle* tri_pts = m_tri_pts_d.data(); const XDim3* tri_norm = m_tri_normals_d.data(); + const Node* bvh_root = m_bvh_nodes.data(); + + enum bvh_opt_options : int { no_bvh, yes_bvh }; + int bvh_opt_runtime_option = m_bvh_optimization ? 
yes_bvh : no_bvh; for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { Array4 const& inter = inter_arr[idim]; Array4 const& type = type_arr[idim]; const Box bx{inter}; - ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + ParallelFor(TypeList>{}, + {bvh_opt_runtime_option}, + bx, [=] AMREX_GPU_DEVICE (int i, int j, int k, auto bvh_control) noexcept { +#ifdef AMREX_USE_CUDA + amrex::ignore_unused(num_triangles,tri_pts,tri_norm,lst,bvh_root); +#endif Real r = std::numeric_limits::quiet_NaN(); if (type(i,j,k) == EB2::Type::irregular) { XDim3 p1{plo[0]+static_cast(i)*dx[0], @@ -616,62 +908,143 @@ STLtools::getIntercept (Array,AMREX_SPACEDIM> const& inter_arr, }; if (idim == 0) { Real x2 = plo[0]+static_cast(i+1)*dx[0]; - int it; - for (it=0; it < num_triangles; ++it) { - auto const& tri = tri_pts[it]; - auto tmp = edge_tri_intersects(p1.x, x2, p1.y, p1.z, - tri.v1, tri.v2, tri.v3, - tri_norm[it], - lst(i+1,j,k)-lst(i,j,k)); - if (tmp.first) { - r = tmp.second; - break; + bool found = false; + if constexpr (bvh_control == no_bvh) { + for (int it=0; it < num_triangles; ++it) { + auto const& tri = tri_pts[it]; + auto tmp = edge_tri_intersects(p1.x, x2, p1.y, p1.z, + tri.v1, tri.v2, tri.v3, + tri_norm[it], + lst(i+1,j,k)-lst(i,j,k)); + if (tmp.first) { + r = tmp.second; + found = true; + break; + } } + } else { + Real a[3] = {p1.x , p1.y, p1.z}; + Real b[3] = { x2, p1.y, p1.z}; + bvh_line_tri_intersects(a, b, bvh_root, + [&] (int ntri, Triangle const* ptri, + XDim3 const* ptrinorm) -> int + { + for (int it=0; it < ntri; ++it) { + auto const& tri = ptri[it]; + auto tmp = edge_tri_intersects(p1.x, x2, p1.y, p1.z, + tri.v1, tri.v2, tri.v3, + ptrinorm[it], + lst(i+1,j,k)-lst(i,j,k)); + if (tmp.first) { + r = tmp.second; + found = true; + return 1; + } + } + return 0; + }); } - if (it == num_triangles) { + if (!found) { r = (lst(i,j,k) > 0._rt) ? 
p1.x : x2; } } else if (idim == 1) { Real y2 = plo[1]+static_cast(j+1)*dx[1]; - int it; - for (it=0; it < num_triangles; ++it) { - auto const& tri = tri_pts[it]; - auto const& norm = tri_norm[it]; - auto tmp = edge_tri_intersects(p1.y, y2, p1.z, p1.x, - {tri.v1.y, tri.v1.z, tri.v1.x}, - {tri.v2.y, tri.v2.z, tri.v2.x}, - {tri.v3.y, tri.v3.z, tri.v3.x}, - { norm.y, norm.z, norm.x}, - lst(i,j+1,k)-lst(i,j,k)); - if (tmp.first) { - r = tmp.second; - break; + bool found = false; + if constexpr (bvh_control == no_bvh) { + for (int it=0; it < num_triangles; ++it) { + auto const& tri = tri_pts[it]; + auto const& norm = tri_norm[it]; + auto tmp = edge_tri_intersects(p1.y, y2, p1.z, p1.x, + {tri.v1.y, tri.v1.z, tri.v1.x}, + {tri.v2.y, tri.v2.z, tri.v2.x}, + {tri.v3.y, tri.v3.z, tri.v3.x}, + { norm.y, norm.z, norm.x}, + lst(i,j+1,k)-lst(i,j,k)); + if (tmp.first) { + r = tmp.second; + found = true; + break; + } } + } else { + Real a[3] = {p1.x, p1.y , p1.z}; + Real b[3] = {p1.x, y2, p1.z}; + bvh_line_tri_intersects(a, b, bvh_root, + [&] (int ntri, Triangle const* ptri, + XDim3 const* ptrinorm) -> int + { + for (int it=0; it < ntri; ++it) { + auto const& tri = ptri[it]; + auto const& norm = ptrinorm[it]; + auto tmp = edge_tri_intersects(p1.y, y2, p1.z, p1.x, + {tri.v1.y, tri.v1.z, tri.v1.x}, + {tri.v2.y, tri.v2.z, tri.v2.x}, + {tri.v3.y, tri.v3.z, tri.v3.x}, + { norm.y, norm.z, norm.x}, + lst(i,j+1,k)-lst(i,j,k)); + if (tmp.first) { + r = tmp.second; + found = true; + return 1; + } + } + return 0; + }); } - if (it == num_triangles) { + if (!found) { r = (lst(i,j,k) > 0._rt) ? 
p1.y : y2; } - } else { + } +#if (AMREX_SPACEDIM == 3) + else { Real z2 = plo[2]+static_cast(k+1)*dx[2]; - int it; - for (it=0; it < num_triangles; ++it) { - auto const& tri = tri_pts[it]; - auto const& norm = tri_norm[it]; - auto tmp = edge_tri_intersects(p1.z, z2, p1.x, p1.y, - {tri.v1.z, tri.v1.x, tri.v1.y}, - {tri.v2.z, tri.v2.x, tri.v2.y}, - {tri.v3.z, tri.v3.x, tri.v3.y}, - { norm.z, norm.x, norm.y}, - lst(i,j,k+1)-lst(i,j,k)); - if (tmp.first) { - r = tmp.second; - break; + bool found = false; + if constexpr (bvh_control == no_bvh) { + for (int it=0; it < num_triangles; ++it) { + auto const& tri = tri_pts[it]; + auto const& norm = tri_norm[it]; + auto tmp = edge_tri_intersects(p1.z, z2, p1.x, p1.y, + {tri.v1.z, tri.v1.x, tri.v1.y}, + {tri.v2.z, tri.v2.x, tri.v2.y}, + {tri.v3.z, tri.v3.x, tri.v3.y}, + { norm.z, norm.x, norm.y}, + lst(i,j,k+1)-lst(i,j,k)); + if (tmp.first) { + r = tmp.second; + found = true; + break; + } } + } else { + Real a[3] = {p1.x, p1.y, p1.z }; + Real b[3] = {p1.x, p1.y, z2}; + bvh_line_tri_intersects(a, b, bvh_root, + [&] (int ntri, Triangle const* ptri, + XDim3 const* ptrinorm) -> int + { + for (int it=0; it < ntri; ++it) { + auto const& tri = ptri[it]; + auto const& norm = ptrinorm[it]; + auto tmp = edge_tri_intersects(p1.z, z2, p1.x, p1.y, + {tri.v1.z, tri.v1.x, tri.v1.y}, + {tri.v2.z, tri.v2.x, tri.v2.y}, + {tri.v3.z, tri.v3.x, tri.v3.y}, + { norm.z, norm.x, norm.y}, + lst(i,j,k+1)-lst(i,j,k)); + if (tmp.first) { + r = tmp.second; + found = true; + return 1; + } + } + return 0; + }); } - if (it == num_triangles) { + if (!found) { r = (lst(i,j,k) > 0._rt) ? 
p1.z : z2; } } +#endif } inter(i,j,k) = r; }); @@ -723,7 +1096,7 @@ STLtools::updateIntercept (Array,AMREX_SPACEDIM> const& inter_arr, (lst(i,j,k) > Real(0.0) && is_nan)) { inter(i,j,k) = problo[2] + static_cast(k)*dx[2]; - } + } else if (lst(i,j,k+1) == Real(0.0) || (lst(i,j,k+1) > Real(0.0) && is_nan)) { diff --git a/Src/EB/AMReX_EB_triGeomOps_K.H b/Src/EB/AMReX_EB_triGeomOps_K.H index 25a803892b..7ab517efe9 100644 --- a/Src/EB/AMReX_EB_triGeomOps_K.H +++ b/Src/EB/AMReX_EB_triGeomOps_K.H @@ -63,8 +63,8 @@ namespace amrex::tri_geom_ops L[5] = v2[1] - v1[1]; } //================================================================================ - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void side_op3(Real v1[3],Real v2[3], - Real t1[3],Real t2[3],Real t3[3], + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void side_op3(const Real v1[3], const Real v2[3], + const Real t1[3], const Real t2[3], const Real t3[3], Real &S1, Real &S2, Real &S3) { @@ -81,8 +81,8 @@ namespace amrex::tri_geom_ops } //================================================================================ //get normal of triangle pointing at a test-point - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void tri_n(Real P1[3],Real P2[3],Real P3[3], - Real testp[3],Real n[3]) + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void tri_n(const Real P1[3], const Real P2[3], const Real P3[3], + const Real testp[3], Real n[3]) { Real v1[3],v2[3],magn; Real centr[3],c_tp_vec[3]; @@ -92,9 +92,9 @@ namespace amrex::tri_geom_ops CrossProd(v1,v2,n); - centr[0]=Real(0.333333)*(P1[0]+P2[0]+P3[0]); - centr[1]=Real(0.333333)*(P1[1]+P2[1]+P3[1]); - centr[2]=Real(0.333333)*(P1[2]+P2[2]+P3[2]); + centr[0]=Real(1./3.)*(P1[0]+P2[0]+P3[0]); + centr[1]=Real(1./3.)*(P1[1]+P2[1]+P3[1]); + centr[2]=Real(1./3.)*(P1[2]+P2[2]+P3[2]); getvec(centr,testp,c_tp_vec); magn=std::sqrt(n[0]*n[0]+n[1]*n[1]+n[2]*n[2]); @@ -109,7 +109,7 @@ namespace amrex::tri_geom_ops n[2]=n[2]/magn; } 
//================================================================================ - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Real triangle_area(Real P1[3],Real P2[3],Real P3[3]) + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Real triangle_area(const Real P1[3], const Real P2[3], const Real P3[3]) { Real v1[3],v2[3],area[3]; @@ -121,7 +121,7 @@ namespace amrex::tri_geom_ops //================================================================================ //this is only useful when v1-v2 segment intersects the triangle AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE bool find_intersection_point(const Real v1[3],const Real v2[3], - Real t1[3], Real t2[3], Real t3[3],Real ip[3],int bisect_iters=20,Real tol=1e-6) + const Real t1[3], const Real t2[3], const Real t3[3], Real ip[3],int bisect_iters=20,Real tol=1e-6) { Real plane_eq_mid,plane_eq1,plane_eq2; @@ -166,13 +166,13 @@ namespace amrex::tri_geom_ops break; } - if(plane_eq_mid*plane_eq1 < 0.0) + if(plane_eq_mid*plane_eq1 < Real(0.0)) { p2[0]=midp[0]; p2[1]=midp[1]; p2[2]=midp[2]; } - else if(plane_eq_mid*plane_eq2 < 0.0) + else if(plane_eq_mid*plane_eq2 < Real(0.0)) { p1[0]=midp[0]; p1[1]=midp[1]; @@ -182,7 +182,7 @@ namespace amrex::tri_geom_ops //or error: p1,midp and p2 are on the same side //which is not what this function is meant for { - if(plane_eq_mid*plane_eq1 > 0.0 && plane_eq_mid*plane_eq2 > 0.0) + if(plane_eq_mid*plane_eq1 > Real(0.0) && plane_eq_mid*plane_eq2 > Real(0.0)) { all_ok=false; } @@ -197,8 +197,8 @@ namespace amrex::tri_geom_ops return(all_ok); } //================================================================================ - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int lineseg_tri_intersect(Real v1[3],Real v2[3], - Real t1[3],Real t2[3],Real t3[3]) + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int lineseg_tri_intersect(const Real v1[3], const Real v2[3], + const Real t1[3], const Real t2[3], const Real t3[3]) { //see plucker coordinates based method 
//https://members.loria.fr/SLazard/ARC-Visi3D/Pant-project/files/Line_Triangle.html @@ -233,11 +233,11 @@ namespace amrex::tri_geom_ops } } //proper and edge intersection - else if( (S1 < 0.0 && S2 < 0.0 && S3 < 0.0) || - (S1 > 0.0 && S2 > 0.0 && S3 > 0.0) || - (std::abs(S1) < eps && S2*S3 > 0.0) || //S1=0 - (std::abs(S2) < eps && S3*S1 > 0.0) || //S2=0 - (std::abs(S3) < eps && S1*S2 > 0.0) ) //S3=0 + else if( (S1 < Real(0.0) && S2 < Real(0.0) && S3 < Real(0.0)) || + (S1 > Real(0.0) && S2 > Real(0.0) && S3 > Real(0.0)) || + (std::abs(S1) < eps && S2*S3 > Real(0.0)) || //S1=0 + (std::abs(S2) < eps && S3*S1 > Real(0.0)) || //S2=0 + (std::abs(S3) < eps && S1*S2 > Real(0.0)) ) //S3=0 { get_plucker_coords(v1,t1,L2); @@ -253,7 +253,7 @@ namespace amrex::tri_geom_ops ls_s1 = side_op(L4,L3); ls_s2 = side_op(L4,L2); - if(ls_s1*ls_s2 > 0.0) + if(ls_s1*ls_s2 > Real(0.0)) { no_intersections = 0; } diff --git a/Src/EB/AMReX_algoim.cpp b/Src/EB/AMReX_algoim.cpp index 254e15dab0..864ec626a0 100644 --- a/Src/EB/AMReX_algoim.cpp +++ b/Src/EB/AMReX_algoim.cpp @@ -66,8 +66,16 @@ compute_integrals (MultiFab& intgmf, IntVect nghost) if (Gpu::inLaunchRegion()) { +#if defined(AMREX_USE_CUDA) + // It appears that there is a nvcc bug. We have to use the + // 4D ParallelFor here, even though ncomp is 1. 
+ int ncomp = fg.nComp(); + amrex::ParallelFor(bx, ncomp, + [=] AMREX_GPU_DEVICE (int i, int j, int k, int) noexcept +#else amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept +#endif { const auto ebflag = fg(i,j,k); if (ebflag.isRegular()) { diff --git a/Src/Extern/HYPRE/AMReX_Habec_2D_K.H b/Src/Extern/HYPRE/AMReX_Habec_2D_K.H index 731ad04d4b..2d1d63432a 100644 --- a/Src/Extern/HYPRE/AMReX_Habec_2D_K.H +++ b/Src/Extern/HYPRE/AMReX_Habec_2D_K.H @@ -6,7 +6,7 @@ #include #include #include -#include +#include #endif namespace amrex { diff --git a/Src/Extern/HYPRE/AMReX_Habec_3D_K.H b/Src/Extern/HYPRE/AMReX_Habec_3D_K.H index 5d5c054758..6b4e67587d 100644 --- a/Src/Extern/HYPRE/AMReX_Habec_3D_K.H +++ b/Src/Extern/HYPRE/AMReX_Habec_3D_K.H @@ -6,7 +6,7 @@ #include #include #include -#include +#include #endif namespace amrex { diff --git a/Src/FFT/AMReX_FFT.H b/Src/FFT/AMReX_FFT.H new file mode 100644 index 0000000000..f8050fff93 --- /dev/null +++ b/Src/FFT/AMReX_FFT.H @@ -0,0 +1,969 @@ +#ifndef AMREX_FFT_H_ +#define AMREX_FFT_H_ +#include + +#include +#include +#include +#include +#include + +#if defined(AMREX_USE_CUDA) +# include +# include +#elif defined(AMREX_USE_HIP) +# if __has_include() // ROCm 5.3+ +# include +# else +# include +# endif +# include +#elif defined(AMREX_USE_SYCL) +# include +#else +# include +#endif + +namespace amrex::FFT +{ + +/** + * \brief Discrete Fourier Transform + * + * This class supports Fourier transforms between real and complex data. The + * name R2C indicates that the forward transform converts real data to + * complex data, while the backward transform converts complex data to real + * data. It should be noted that both directions of transformation are + * supported, not just from real to complex. The scaling follows the FFTW + * convention, where applying the forward transform followed by the backward + * transform scales the original data by the size of the input array. 
+ * + * For more details, we refer the users to + * https://amrex-codes.github.io/amrex/docs_html/FFT_Chapter.html. + */ +template +class R2C +{ +public: + using MF = std::conditional_t, + MultiFab, FabArray > >; + using cMF = FabArray > >; + + /** + * \brief Constructor + * + * \param domain the forward domain (i.e., the domain of the real data) + * \param info optional information + */ + explicit R2C (Box const& domain, Info const& info = Info{}); + + ~R2C (); + + R2C (R2C const&) = delete; + R2C (R2C &&) = delete; + R2C& operator= (R2C const&) = delete; + R2C& operator= (R2C &&) = delete; + + /** + * \brief Forward and then backward transform + * + * This function is available only when this class template is + * instantiated for transforms in both directions. It's more efficient + * than calling the forward function that stores the spectral data in a + * caller provided container followed by the backward function, because + * this can avoid parallel communication between the internal data and + * the caller's data container. + * + * \param inmf input data in MultiFab or FabArray> + * \param outmf output data in MultiFab or FabArray> + * \param post_forward a callable object for processing the post-forward + * data before the backward transform. Its interface + * is `(int,int,int,GpuComplex&)`, where the integers + * are indices in the spectral space, and the reference + * to the complex number allows for the modification of + * the spectral data at that location. + */ + template = 0> + void forwardThenBackward (MF const& inmf, MF& outmf, F const& post_forward) + { + this->forward(inmf); + this->post_forward_doit(post_forward); + this->backward(outmf); + } + + /** + * \brief Forward transform + * + * The output is stored in this object's internal data. This function is + * not available when this class template is instantiated for + * backward-only transform. 
+ * + * \param inmf input data in MultiFab or FabArray> + */ + template = 0> + void forward (MF const& inmf); + + /** + * \brief Forward transform + * + * This function is not available when this class template is + * instantiated for backward-only transform. + * + * \param inmf input data in MultiFab or FabArray> + * \param outmf output data in FabArray>> + */ + template = 0> + void forward (MF const& inmf, cMF& outmf); + + /** + * \brief Backward transform + * + * This function is available only when this class template is + * instantiated for transforms in both directions. + * + * \param outmf output data in MultiFab or FabArray> + */ + template = 0> + void backward (MF& outmf); + + /** + * \brief Backward transform + * + * This function is not available when this class template is + * instantiated for forward-only transform. + * + * \param inmf input data in FabArray>> + * \param outmf output data in MultiFab or FabArray> + */ + template = 0> + void backward (cMF const& inmf, MF& outmf); + + /** + * \brief Get the internal spectral data + * + * This function is not available when this class template is + * instantiated for backward-only transform. For performance reasons, + * the returned data array does not have the usual ordering of + * `(x,y,z)`. The order is specified in the second part of the return + * value. 
+ */ + template = 0> + std::pair getSpectralData (); + + struct Swap01 + { + [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept + { + return {i.y, i.x, i.z}; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept + { + return {i.y, i.x, i.z}; + } + + [[nodiscard]] IndexType operator() (IndexType it) const noexcept + { + return it; + } + + [[nodiscard]] IndexType Inverse (IndexType it) const noexcept + { + return it; + } + }; + + struct Swap02 + { + [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept + { + return {i.z, i.y, i.x}; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept + { + return {i.z, i.y, i.x}; + } + + [[nodiscard]] IndexType operator() (IndexType it) const noexcept + { + return it; + } + + [[nodiscard]] IndexType Inverse (IndexType it) const noexcept + { + return it; + } + }; + + struct RotateFwd + { + // dest -> src: (x,y,z) -> (y,z,x) + [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept + { + return {i.y, i.z, i.x}; + } + + // src -> dest: (x,y,z) -> (z,x,y) + [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept + { + return {i.z, i.x, i.y}; + } + + [[nodiscard]] IndexType operator() (IndexType it) const noexcept + { + return it; + } + + [[nodiscard]] IndexType Inverse (IndexType it) const noexcept + { + return it; + } + }; + + struct RotateBwd + { + // dest -> src: (x,y,z) -> (z,x,y) + [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept + { + return {i.z, i.x, i.y}; + } + + // src -> dest: (x,y,z) -> (y,z,x) + [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept + { + return {i.y, i.z, i.x}; + } + + [[nodiscard]] IndexType operator() (IndexType it) const noexcept + { + return it; + } + + [[nodiscard]] IndexType Inverse (IndexType it) const noexcept + { + return it; + } + }; + + // public for cuda + template + void post_forward_doit (F const& post_forward); 
+ +private: + +#if defined(AMREX_USE_CUDA) + using VendorPlan = cufftHandle; + using VendorPlan2 = VendorPlan; + using FFTComplex = std::conditional_t, + cuComplex, cuDoubleComplex>; +#elif defined(AMREX_USE_HIP) + using VendorPlan = rocfft_plan; + using VendorPlan2 = VendorPlan; + using FFTComplex = std::conditional_t, + float2, double2>; +#elif defined(AMREX_USE_SYCL) + using VendorPlan = oneapi::mkl::dft::descriptor< + std::is_same_v ? oneapi::mkl::dft::precision::SINGLE + : oneapi::mkl::dft::precision::DOUBLE, + oneapi::mkl::dft::domain::REAL> *; + using VendorPlan2 = oneapi::mkl::dft::descriptor< + std::is_same_v ? oneapi::mkl::dft::precision::SINGLE + : oneapi::mkl::dft::precision::DOUBLE, + oneapi::mkl::dft::domain::COMPLEX> *; + using FFTComplex = GpuComplex; +#else + using VendorPlan = std::conditional_t, + fftwf_plan, fftw_plan>; + using VendorPlan2 = VendorPlan; + using FFTComplex = std::conditional_t, + fftwf_complex, fftw_complex>; +#endif + + struct Plan { + bool defined = false; + VendorPlan plan = 0; // NOLINT + }; + + struct Plan2 { + bool defined = false; + VendorPlan2 plan = 0; // NOLINT + }; + + template + static typename FA::FABType::value_type * + get_fab (FA& fa) { + auto myproc = ParallelContext::MyProcSub(); + if (myproc < fa.size()) { + return fa.fabPtr(myproc); + } else { + return nullptr; + } + } + + static void exec_r2c (Plan plan, MF& in, cMF& out); + static void exec_c2r (Plan plan, cMF& in, MF& out); + template + static void exec_c2c (Plan2 plan, cMF& inout); + + template + static void destroy_plan (P plan); + static std::pair make_c2c_plans (cMF& inout); + + void backward_doit (MF& outmf); + + Plan m_fft_fwd_x{}; + Plan m_fft_bwd_x{}; + Plan2 m_fft_fwd_y{}; + Plan2 m_fft_bwd_y{}; + Plan2 m_fft_fwd_z{}; + Plan2 m_fft_bwd_z{}; + + // Comm meta-data. In the forward phase, we start with (x,y,z), + // transpose to (y,x,z) and then (z,x,y). In the backward phase, we + // perform inverse transpose. 
+ std::unique_ptr m_cmd_x2y; // (x,y,z) -> (y,x,z) + std::unique_ptr m_cmd_y2x; // (y,x,z) -> (x,y,z) + std::unique_ptr m_cmd_y2z; // (y,x,z) -> (z,x,y) + std::unique_ptr m_cmd_z2y; // (z,x,y) -> (y,x,z) + Swap01 m_dtos_x2y{}; + Swap01 m_dtos_y2x{}; + Swap02 m_dtos_y2z{}; + Swap02 m_dtos_z2y{}; + + MF m_rx; + cMF m_cx; + cMF m_cy; + cMF m_cz; + + Box m_real_domain; + Box m_spectral_domain_x; + Box m_spectral_domain_y; + Box m_spectral_domain_z; + + Info m_info; +}; + +template +R2C::R2C (Box const& domain, Info const& info) + : m_real_domain(domain), + m_spectral_domain_x(IntVect(0), IntVect(AMREX_D_DECL(domain.length(0)/2, + domain.bigEnd(1), + domain.bigEnd(2)))), +#if (AMREX_SPACEDIM >= 2) + m_spectral_domain_y(IntVect(0), IntVect(AMREX_D_DECL(domain.bigEnd(1), + domain.length(0)/2, + domain.bigEnd(2)))), +#if (AMREX_SPACEDIM == 3) + m_spectral_domain_z(IntVect(0), IntVect(AMREX_D_DECL(domain.bigEnd(2), + domain.length(0)/2, + domain.bigEnd(1)))), +#endif +#endif + m_info(info) +{ + static_assert(std::is_same_v || std::is_same_v); + AMREX_ALWAYS_ASSERT(m_real_domain.smallEnd() == 0 && + m_real_domain.length(0) > 1 && + m_real_domain.cellCentered()); +#if (AMREX_SPACEDIM == 3) + AMREX_ALWAYS_ASSERT(m_real_domain.length(2) > 1 || ! m_info.batch_mode); + AMREX_ALWAYS_ASSERT(m_real_domain.length(1) > 1 || m_real_domain.length(2) == 1); +#else + AMREX_ALWAYS_ASSERT(! 
m_info.batch_mode); +#endif + + int myproc = ParallelContext::MyProcSub(); + int nprocs = ParallelContext::NProcsSub(); + + auto bax = amrex::decompose(m_real_domain, nprocs, {AMREX_D_DECL(false,true,true)}); + DistributionMapping dmx = detail::make_iota_distromap(bax.size()); + m_rx.define(bax, dmx, 1, 0); + + { + BoxList bl = bax.boxList(); + for (auto & b : bl) { + b.setBig(0, m_spectral_domain_x.bigEnd(0)); + } + BoxArray cbax(std::move(bl)); + m_cx.define(cbax, dmx, 1, 0); + } + + // plans for x-direction + if (myproc < m_rx.size()) + { + Box const local_box = m_rx.boxArray()[myproc]; + int n = local_box.length(0); + int howmany = AMREX_D_TERM(1, *local_box.length(1), *local_box.length(2)); + +#if defined(AMREX_USE_CUDA) + if constexpr (D == Direction::both || D == Direction::forward) { + cufftType fwd_type = std::is_same_v ? CUFFT_R2C : CUFFT_D2Z; + AMREX_CUFFT_SAFE_CALL + (cufftPlanMany(&m_fft_fwd_x.plan, 1, &n, + nullptr, 1, m_real_domain.length(0), + nullptr, 1, m_spectral_domain_x.length(0), + fwd_type, howmany)); + AMREX_CUFFT_SAFE_CALL(cufftSetStream(m_fft_fwd_x.plan, Gpu::gpuStream())); + } + if constexpr (D == Direction::both || D == Direction::backward) { + cufftType bwd_type = std::is_same_v ? CUFFT_C2R : CUFFT_Z2D; + AMREX_CUFFT_SAFE_CALL + (cufftPlanMany(&m_fft_bwd_x.plan, 1, &n, + nullptr, 1, m_spectral_domain_x.length(0), + nullptr, 1, m_real_domain.length(0), + bwd_type, howmany)); + AMREX_CUFFT_SAFE_CALL(cufftSetStream(m_fft_bwd_x.plan, Gpu::gpuStream())); + } +#elif defined(AMREX_USE_HIP) + + auto prec = std::is_same_v ? 
rocfft_precision_single : rocfft_precision_double; + const std::size_t length = n; + if constexpr (D == Direction::both || D == Direction::forward) { + AMREX_ROCFFT_SAFE_CALL + (rocfft_plan_create(&m_fft_fwd_x.plan, rocfft_placement_notinplace, + rocfft_transform_type_real_forward, prec, 1, + &length, howmany, nullptr)); + } + if constexpr (D == Direction::both || D == Direction::backward) { + AMREX_ROCFFT_SAFE_CALL + (rocfft_plan_create(&m_fft_bwd_x.plan, rocfft_placement_notinplace, + rocfft_transform_type_real_inverse, prec, 1, + &length, howmany, nullptr)); + } + +#elif defined(AMREX_USE_SYCL) + + m_fft_fwd_x.plan = new std::remove_pointer_t(n); + m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_NOT_INPLACE); + m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, + howmany); + m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, + m_real_domain.length(0)); + m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, + m_spectral_domain_x.length(0)); + std::array strides{0,1}; + m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, + strides.data()); + m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, + strides.data()); + m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::WORKSPACE, + oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL); + m_fft_fwd_x.plan->commit(amrex::Gpu::Device::streamQueue()); + + m_fft_bwd_x.plan = m_fft_fwd_x.plan; + +#else /* FFTW */ + + auto* in = m_rx[myproc].dataPtr(); + auto* out = (FFTComplex*)(m_cx[myproc].dataPtr()); + + if constexpr (std::is_same_v) { + if constexpr (D == Direction::both || D == Direction::forward) { + m_fft_fwd_x.plan = fftwf_plan_many_dft_r2c + (1, &n, howmany, in, nullptr, 1, m_real_domain.length(0), + out, nullptr, 1, m_spectral_domain_x.length(0), + FFTW_ESTIMATE | FFTW_DESTROY_INPUT); + } + if constexpr (D == Direction::both || D == Direction::backward) { + 
m_fft_bwd_x.plan = fftwf_plan_many_dft_c2r + (1, &n, howmany, out, nullptr, 1, m_spectral_domain_x.length(0), + in, nullptr, 1, m_real_domain.length(0), + FFTW_ESTIMATE | FFTW_DESTROY_INPUT); + } + } else { + if constexpr (D == Direction::both || D == Direction::forward) { + m_fft_fwd_x.plan = fftw_plan_many_dft_r2c + (1, &n, howmany, in, nullptr, 1, m_real_domain.length(0), + out, nullptr, 1, m_spectral_domain_x.length(0), + FFTW_ESTIMATE | FFTW_DESTROY_INPUT); + } + if constexpr (D == Direction::both || D == Direction::backward) { + m_fft_bwd_x.plan = fftw_plan_many_dft_c2r + (1, &n, howmany, out, nullptr, 1, m_spectral_domain_x.length(0), + in, nullptr, 1, m_real_domain.length(0), + FFTW_ESTIMATE | FFTW_DESTROY_INPUT); + } + } +#endif + if constexpr (D == Direction::both || D == Direction::forward) { + m_fft_fwd_x.defined = true; + } + if constexpr (D == Direction::both || D == Direction::backward) { + m_fft_bwd_x.defined = true; + } + } + +#if (AMREX_SPACEDIM >= 2) + DistributionMapping cdmy; + if (m_real_domain.length(1) > 1) { + auto cbay = amrex::decompose(m_spectral_domain_y, nprocs, {AMREX_D_DECL(false,true,true)}); + if (cbay.size() == dmx.size()) { + cdmy = dmx; + } else { + cdmy = detail::make_iota_distromap(cbay.size()); + } + m_cy.define(cbay, cdmy, 1, 0); + + std::tie(m_fft_fwd_y, m_fft_bwd_y) = make_c2c_plans(m_cy); + + // comm meta-data between x and y phases + m_cmd_x2y = std::make_unique + (m_cy, m_spectral_domain_y, m_cx, IntVect(0), m_dtos_x2y); + m_cmd_y2x = std::make_unique + (m_cx, m_spectral_domain_x, m_cy, IntVect(0), m_dtos_y2x); + } + +#if (AMREX_SPACEDIM == 3) + if (m_real_domain.length(1) > 1 && + (! 
m_info.batch_mode && m_real_domain.length(2) > 1)) + { + auto cbaz = amrex::decompose(m_spectral_domain_z, nprocs, {false,true,true}); + DistributionMapping cdmz; + if (cbaz.size() == dmx.size()) { + cdmz = dmx; + } else if (cbaz.size() == cdmy.size()) { + cdmz = cdmy; + } else { + cdmz = detail::make_iota_distromap(cbaz.size()); + } + m_cz.define(cbaz, cdmz, 1, 0); + + std::tie(m_fft_fwd_z, m_fft_bwd_z) = make_c2c_plans(m_cz); + + // comm meta-data between y and z phases + m_cmd_y2z = std::make_unique + (m_cz, m_spectral_domain_z, m_cy, IntVect(0), m_dtos_y2z); + m_cmd_z2y = std::make_unique + (m_cy, m_spectral_domain_y, m_cz, IntVect(0), m_dtos_z2y); + } +#endif +#endif +} + +template +template +void R2C::destroy_plan (P plan) +{ + if (! plan.defined) { return; } + +#if defined(AMREX_USE_CUDA) + AMREX_CUFFT_SAFE_CALL(cufftDestroy(plan.plan)); +#elif defined(AMREX_USE_HIP) + AMREX_ROCFFT_SAFE_CALL(rocfft_plan_destroy(plan.plan)); +#elif defined(AMREX_USE_SYCL) + delete plan.plan; +#else + if constexpr (std::is_same_v) { + fftwf_destroy_plan(plan.plan); + } else { + fftw_destroy_plan(plan.plan); + } +#endif + + plan.defined = false; +} + +template +R2C::~R2C () +{ +#if defined(AMREX_USE_SYCL) + if constexpr (D == Direction::both || D == Direction::forward) { + destroy_plan(m_fft_fwd_x); + destroy_plan(m_fft_fwd_y); + destroy_plan(m_fft_fwd_z); + } else { + destroy_plan(m_fft_bwd_x); + destroy_plan(m_fft_bwd_y); + destroy_plan(m_fft_bwd_z); + } +#else + destroy_plan(m_fft_fwd_x); + destroy_plan(m_fft_fwd_y); + destroy_plan(m_fft_fwd_z); + destroy_plan(m_fft_bwd_x); + destroy_plan(m_fft_bwd_y); + destroy_plan(m_fft_bwd_z); +#endif +} + +#ifdef AMREX_USE_HIP +namespace detail { void hip_execute (rocfft_plan plan, void **in, void **out); } +#endif + +#ifdef AMREX_USE_SYCL +namespace detail +{ +template +void sycl_execute (P plan, TI* in, TO* out) +{ + std::size_t workspaceSize = 0; + plan->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES, + &workspaceSize); + 
auto* buffer = (T*)amrex::The_Arena()->alloc(workspaceSize); + plan->set_workspace(buffer); + sycl::event r; + if (std::is_same_v) { + amrex::ignore_unused(in); + if constexpr (direction == Direction::forward) { + r = oneapi::mkl::dft::compute_forward(*plan, out); + } else { + r = oneapi::mkl::dft::compute_backward(*plan, out); + } + } else { + if constexpr (direction == Direction::forward) { + r = oneapi::mkl::dft::compute_forward(*plan, in, out); + } else { + r = oneapi::mkl::dft::compute_backward(*plan, in, out); + } + } + r.wait(); + amrex::The_Arena()->free(buffer); +} +} +#endif + +template +void R2C::exec_r2c (Plan plan, MF& in, cMF& out) +{ + if (! plan.defined) { return; } + +#if defined(AMREX_USE_GPU) + auto* pin = in[ParallelContext::MyProcSub()].dataPtr(); + auto* pout = out[ParallelContext::MyProcSub()].dataPtr(); +#else + amrex::ignore_unused(in,out); +#endif + +#if defined(AMREX_USE_CUDA) + if constexpr (std::is_same_v) { + AMREX_CUFFT_SAFE_CALL(cufftExecR2C(plan.plan, pin, (FFTComplex*)pout)); + } else { + AMREX_CUFFT_SAFE_CALL(cufftExecD2Z(plan.plan, pin, (FFTComplex*)pout)); + } +#elif defined(AMREX_USE_HIP) + detail::hip_execute(plan.plan, (void**)&pin, (void**)&pout); +#elif defined(AMREX_USE_SYCL) + detail::sycl_execute(plan.plan, pin, (std::complex*)pout); +#else + if constexpr (std::is_same_v) { + fftwf_execute(plan.plan); + } else { + fftw_execute(plan.plan); + } +#endif +} + +template +void R2C::exec_c2r (Plan plan, cMF& in, MF& out) +{ + if (! 
plan.defined) { return; } + +#if defined(AMREX_USE_GPU) + auto* pin = in[ParallelContext::MyProcSub()].dataPtr(); + auto* pout = out[ParallelContext::MyProcSub()].dataPtr(); +#else + amrex::ignore_unused(in,out); +#endif + +#if defined(AMREX_USE_CUDA) + if constexpr (std::is_same_v) { + AMREX_CUFFT_SAFE_CALL(cufftExecC2R(plan.plan, (FFTComplex*)pin, pout)); + } else { + AMREX_CUFFT_SAFE_CALL(cufftExecZ2D(plan.plan, (FFTComplex*)pin, pout)); + } +#elif defined(AMREX_USE_HIP) + detail::hip_execute(plan.plan, (void**)&pin, (void**)&pout); +#elif defined(AMREX_USE_SYCL) + detail::sycl_execute(plan.plan, (std::complex*)pin, pout); +#else + if constexpr (std::is_same_v) { + fftwf_execute(plan.plan); + } else { + fftw_execute(plan.plan); + } +#endif +} + +template +template +void R2C::exec_c2c (Plan2 plan, cMF& inout) +{ + if (! plan.defined) { return; } + + amrex::ignore_unused(inout); +#if defined(AMREX_USE_GPU) + auto* p = inout[ParallelContext::MyProcSub()].dataPtr(); +#endif + +#if defined(AMREX_USE_CUDA) + auto cufft_direction = (direction == Direction::forward) ? 
CUFFT_FORWARD : CUFFT_INVERSE; + if constexpr (std::is_same_v) { + AMREX_CUFFT_SAFE_CALL(cufftExecC2C(plan.plan, (FFTComplex*)p, (FFTComplex*)p, + cufft_direction)); + } else { + AMREX_CUFFT_SAFE_CALL(cufftExecZ2Z(plan.plan, (FFTComplex*)p, (FFTComplex*)p, + cufft_direction)); + } +#elif defined(AMREX_USE_HIP) + detail::hip_execute(plan.plan, (void**)&p, (void**)&p); +#elif defined(AMREX_USE_SYCL) + detail::sycl_execute(plan.plan, (std::complex*)p, (std::complex*)p); +#else + if constexpr (std::is_same_v) { + fftwf_execute(plan.plan); + } else { + fftw_execute(plan.plan); + } +#endif +} + +template +template > +void R2C::forward (MF const& inmf) +{ + m_rx.ParallelCopy(inmf, 0, 0, 1); + exec_r2c(m_fft_fwd_x, m_rx, m_cx); + + if ( m_cmd_x2y) { + ParallelCopy(m_cy, m_cx, *m_cmd_x2y, 0, 0, 1, m_dtos_x2y); + } + exec_c2c(m_fft_fwd_y, m_cy); + + if ( m_cmd_y2z) { + ParallelCopy(m_cz, m_cy, *m_cmd_y2z, 0, 0, 1, m_dtos_y2z); + } + exec_c2c(m_fft_fwd_z, m_cz); +} + +template +template > +void R2C::backward (MF& outmf) +{ + backward_doit(outmf); +} + +template +void R2C::backward_doit (MF& outmf) +{ + exec_c2c(m_fft_bwd_z, m_cz); + if ( m_cmd_z2y) { + ParallelCopy(m_cy, m_cz, *m_cmd_z2y, 0, 0, 1, m_dtos_z2y); + } + + exec_c2c(m_fft_bwd_y, m_cy); + if ( m_cmd_y2x) { + ParallelCopy(m_cx, m_cy, *m_cmd_y2x, 0, 0, 1, m_dtos_y2x); + } + + exec_c2r(m_fft_bwd_x, m_cx, m_rx); + outmf.ParallelCopy(m_rx, 0, 0, 1); +} + +template +std::pair::Plan2, typename R2C::Plan2> +R2C::make_c2c_plans (cMF& inout) +{ + Plan2 fwd; + Plan2 bwd; + + auto* fab = get_fab(inout); + if (!fab) { return {fwd, bwd};} + + Box const& local_box = fab->box(); + + int n = local_box.length(0); + int howmany = AMREX_D_TERM(1, *local_box.length(1), *local_box.length(2)); + +#if defined(AMREX_USE_CUDA) + + if constexpr (D == Direction::both || D == Direction::forward) { + cufftType fwd_type = std::is_same_v ? 
CUFFT_C2C : CUFFT_Z2Z; + AMREX_CUFFT_SAFE_CALL + (cufftPlanMany(&fwd.plan, 1, &n, nullptr, 1, n, nullptr, 1, n, + fwd_type, howmany)); + AMREX_CUFFT_SAFE_CALL(cufftSetStream(fwd.plan, Gpu::gpuStream())); + } + if constexpr (D == Direction::both || D == Direction::backward) { + cufftType bwd_type = std::is_same_v ? CUFFT_C2C : CUFFT_Z2Z; + AMREX_CUFFT_SAFE_CALL + (cufftPlanMany(&bwd.plan, 1, &n, nullptr, 1, n, nullptr, 1, n, + bwd_type, howmany)); + AMREX_CUFFT_SAFE_CALL(cufftSetStream(bwd.plan, Gpu::gpuStream())); + } + +#elif defined(AMREX_USE_HIP) + + auto prec = std::is_same_v ? rocfft_precision_single : rocfft_precision_double; + const std::size_t length = n; + if constexpr (D == Direction::both || D == Direction::forward) { + AMREX_ROCFFT_SAFE_CALL + (rocfft_plan_create(&fwd.plan, rocfft_placement_inplace, + rocfft_transform_type_complex_forward, prec, 1, + &length, howmany, nullptr)); + } + if constexpr (D == Direction::both || D == Direction::backward) { + AMREX_ROCFFT_SAFE_CALL + (rocfft_plan_create(&bwd.plan, rocfft_placement_inplace, + rocfft_transform_type_complex_inverse, prec, 1, + &length, howmany, nullptr)); + } + +#elif defined(AMREX_USE_SYCL) + + fwd.plan = new std::remove_pointer_t(n); + fwd.plan->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_INPLACE); + fwd.plan->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, + howmany); + fwd.plan->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, n); + fwd.plan->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, n); + std::array strides{0,1}; + fwd.plan->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data()); + fwd.plan->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides.data()); + fwd.plan->set_value(oneapi::mkl::dft::config_param::WORKSPACE, + oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL); + fwd.plan->commit(amrex::Gpu::Device::streamQueue()); + + bwd.plan = fwd.plan; + +#else + auto* pinout = (FFTComplex*)fab->dataPtr(); + + if 
constexpr (std::is_same_v) { + if constexpr (D == Direction::both || D == Direction::forward) { + fwd.plan = fftwf_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n, + pinout, nullptr, 1, n, -1, FFTW_ESTIMATE); + } + if constexpr (D == Direction::both || D == Direction::backward) { + bwd.plan = fftwf_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n, + pinout, nullptr, 1, n, +1, FFTW_ESTIMATE); + } + } else { + if constexpr (D == Direction::both || D == Direction::forward) { + fwd.plan = fftw_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n, + pinout, nullptr, 1, n, -1, FFTW_ESTIMATE); + } + if constexpr (D == Direction::both || D == Direction::backward) { + bwd.plan = fftw_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n, + pinout, nullptr, 1, n, +1, FFTW_ESTIMATE); + } + } +#endif + + if constexpr (D == Direction::both || D == Direction::forward) { + fwd.defined = true; + } + if constexpr (D == Direction::both || D == Direction::backward) { + bwd.defined = true; + } + + return {fwd,bwd}; +} + +template +template +void R2C::post_forward_doit (F const& post_forward) +{ + if (m_info.batch_mode) { + amrex::Abort("xxxxx todo: post_forward"); + } else { + if ( ! m_cz.empty()) { + auto* spectral_fab = get_fab(m_cz); + if (spectral_fab) { + auto const& a = spectral_fab->array(); // m_cz's ordering is z,x,y + ParallelFor(spectral_fab->box(), + [=] AMREX_GPU_DEVICE (int iz, int jx, int ky) + { + post_forward(jx,ky,iz,a(iz,jx,ky)); + }); + } + } else if ( ! 
m_cy.empty()) { + auto* spectral_fab = get_fab(m_cy); + if (spectral_fab) { + auto const& a = spectral_fab->array(); // m_cy's ordering is y,x,z + ParallelFor(spectral_fab->box(), + [=] AMREX_GPU_DEVICE (int iy, int jx, int k) + { + post_forward(jx,iy,k,a(iy,jx,k)); + }); + } + } else { + auto* spectral_fab = get_fab(m_cx); + if (spectral_fab) { + auto const& a = spectral_fab->array(); + ParallelFor(spectral_fab->box(), + [=] AMREX_GPU_DEVICE (int i, int j, int k) + { + post_forward(i,j,k,a(i,j,k)); + }); + } + } + } +} + +template +template > +std::pair::cMF *, IntVect> +R2C::getSpectralData () +{ + if (!m_cz.empty()) { + return std::make_pair(&m_cz, IntVect{AMREX_D_DECL(2,0,1)}); + } else if (!m_cy.empty()) { + return std::make_pair(&m_cy, IntVect{AMREX_D_DECL(1,0,2)}); + } else { + return std::make_pair(&m_cx, IntVect{AMREX_D_DECL(0,1,2)}); + } +} + +template +template > +void R2C::forward (MF const& inmf, cMF& outmf) +{ + forward(inmf); + if (!m_cz.empty()) { // m_cz's order (z,x,y) -> (x,y,z) + RotateBwd dtos{}; + MultiBlockCommMetaData cmd + (outmf, m_spectral_domain_x, m_cz, IntVect(0), dtos); + ParallelCopy(outmf, m_cz, cmd, 0, 0, 1, dtos); + } else if (!m_cy.empty()) { // m_cy's order (y,x,z) -> (x,y,z) + MultiBlockCommMetaData cmd + (outmf, m_spectral_domain_x, m_cy, IntVect(0), m_dtos_y2x); + ParallelCopy(outmf, m_cy, cmd, 0, 0, 1, m_dtos_y2x); + } else { + outmf.ParallelCopy(m_cx, 0, 0, 1); + } +} + +template +template > +void R2C::backward (cMF const& inmf, MF& outmf) +{ + if (!m_cz.empty()) { // (x,y,z) -> m_cz's order (z,x,y) + RotateFwd dtos{}; + MultiBlockCommMetaData cmd + (m_cz, m_spectral_domain_z, inmf, IntVect(0), dtos); + ParallelCopy(m_cz, inmf, cmd, 0, 0, 1, dtos); + } else if (!m_cy.empty()) { // (x,y,z) -> m_cy's ordering (y,x,z) + MultiBlockCommMetaData cmd + (m_cy, m_spectral_domain_y, inmf, IntVect(0), m_dtos_x2y); + ParallelCopy(m_cy, inmf, cmd, 0, 0, 1, m_dtos_x2y); + } else { + m_cx.ParallelCopy(inmf, 0, 0, 1); + } + 
backward_doit(outmf); +} + +} + +#endif diff --git a/Src/FFT/AMReX_FFT.cpp b/Src/FFT/AMReX_FFT.cpp new file mode 100644 index 0000000000..68984a8f24 --- /dev/null +++ b/Src/FFT/AMReX_FFT.cpp @@ -0,0 +1,40 @@ +#include +#include + +namespace amrex::FFT::detail +{ + +DistributionMapping make_iota_distromap (Long n) +{ + AMREX_ASSERT(n <= ParallelContext::NProcsSub()); + Vector pm(n); + for (int i = 0; i < n; ++i) { + pm[i] = ParallelContext::local_to_global_rank(i); + } + return DistributionMapping(std::move(pm)); +} + +#ifdef AMREX_USE_HIP +void hip_execute (rocfft_plan plan, void **in, void **out) +{ + rocfft_execution_info execinfo = nullptr; + AMREX_ROCFFT_SAFE_CALL(rocfft_execution_info_create(&execinfo)); + + std::size_t buffersize = 0; + AMREX_ROCFFT_SAFE_CALL(rocfft_plan_get_work_buffer_size(plan, &buffersize)); + + auto* buffer = (void*)amrex::The_Arena()->alloc(buffersize); + AMREX_ROCFFT_SAFE_CALL(rocfft_execution_info_set_work_buffer(execinfo, buffer, buffersize)); + + AMREX_ROCFFT_SAFE_CALL(rocfft_execution_info_set_stream(execinfo, amrex::Gpu::gpuStream())); + + AMREX_ROCFFT_SAFE_CALL(rocfft_execute(plan, in, out, execinfo)); + + amrex::Gpu::streamSynchronize(); + amrex::The_Arena()->free(buffer); + + AMREX_ROCFFT_SAFE_CALL(rocfft_execution_info_destroy(execinfo)); +} +#endif + +} diff --git a/Src/FFT/AMReX_FFT_Helper.H b/Src/FFT/AMReX_FFT_Helper.H new file mode 100644 index 0000000000..c8ae2b74ea --- /dev/null +++ b/Src/FFT/AMReX_FFT_Helper.H @@ -0,0 +1,29 @@ +#ifndef AMREX_FFT_HELPER_H_ +#define AMREX_FFT_HELPER_H_ +#include + +#include + +namespace amrex::FFT +{ + +enum struct Direction { forward, backward, both }; + +struct Info +{ + //! Supported only in 3D. When batch_mode is true, FFT is performed on + //! the first two dimensions only and the third dimension size is the + //! batch size. 
+ bool batch_mode = false; + + Info& setBatchMode (bool x) { batch_mode = x; return *this; } +}; + +namespace detail +{ + DistributionMapping make_iota_distromap (Long n); +} + +} + +#endif diff --git a/Src/FFT/AMReX_FFT_Poisson.H b/Src/FFT/AMReX_FFT_Poisson.H new file mode 100644 index 0000000000..6206206210 --- /dev/null +++ b/Src/FFT/AMReX_FFT_Poisson.H @@ -0,0 +1,259 @@ +#ifndef AMREX_FFT_POISSON_H_ +#define AMREX_FFT_POISSON_H_ + +#include +#include + +namespace amrex::FFT +{ + +/** + * \brief Poisson solver for all periodic boundaries using FFT + */ +template +class Poisson +{ +public: + + template ,int> = 0> + explicit Poisson (Geometry const& geom) + : m_geom(geom), m_r2c(geom.Domain()) + { + AMREX_ALWAYS_ASSERT(geom.isAllPeriodic()); + } + + void solve (MF& soln, MF const& rhs); + +private: + Geometry m_geom; + R2C m_r2c; +}; + +/** + * \brief 3D Poisson solver for periodic boundaries in the first two + * dimensions and Neumann in the last dimension. + */ +template +class PoissonHybrid +{ +public: + + template ,int> = 0> + explicit PoissonHybrid (Geometry const& geom) + : m_geom(geom), m_r2c(geom.Domain(), Info().setBatchMode(true)) + { +#if (AMREX_SPACEDIM == 3) + AMREX_ALWAYS_ASSERT(geom.isPeriodic(0) && geom.isPeriodic(1)); +#else + amrex::Abort("FFT::PoissonHybrid: 1D & 2D todo"); +#endif + } + + void solve (MF& soln, MF const& rhs); + +private: + Geometry m_geom; + R2C m_r2c; +}; + +template +void Poisson::solve (MF& soln, MF const& rhs) +{ + using T = typename MF::value_type; + + GpuArray fac + {AMREX_D_DECL(T(2)*Math::pi()/T(m_geom.ProbLength(0)), + T(2)*Math::pi()/T(m_geom.ProbLength(1)), + T(2)*Math::pi()/T(m_geom.ProbLength(2)))}; + GpuArray dx + {AMREX_D_DECL(T(m_geom.CellSize(0)), + T(m_geom.CellSize(1)), + T(m_geom.CellSize(2)))}; + auto scale = T(1.0/m_geom.Domain().d_numPts()); +#if (AMREX_SPACEDIM > 1) + auto const& len = m_geom.Domain().length(); +#endif + + m_r2c.forwardThenBackward(rhs, soln, + [=] AMREX_GPU_DEVICE (int i, int j, int k, 
+ GpuComplex& spectral_data) + { + amrex::ignore_unused(i,j,k); + // the values in the upper-half of the spectral array in y and z + // are here interpreted as negative wavenumbers + AMREX_D_TERM(T a = fac[0]*i;, + T b = (j < len[1]/2) ? fac[1]*j : fac[1]*(len[1]-j);, + T c = (k < len[2]/2) ? fac[2]*k : fac[2]*(len[2]-k)); + T k2 = AMREX_D_TERM(T(2)*(std::cos(a*dx[0])-T(1))/(dx[0]*dx[0]), + +T(2)*(std::cos(b*dx[1])-T(1))/(dx[1]*dx[1]), + +T(2)*(std::cos(c*dx[2])-T(1))/(dx[2]*dx[2])); + if (k2 != T(0)) { + spectral_data /= k2; + } else { + // interpretation here is that the average value of the + // solution is zero + spectral_data = 0; + } + spectral_data *= scale; + }); +} + +template +void PoissonHybrid::solve (MF& soln, MF const& rhs) +{ +#if (AMREX_SPACEDIM < 3) + amrex::ignore_unused(soln, rhs); +#else + using T = typename MF::value_type; + + auto facx = T(2)*Math::pi()/T(m_geom.ProbLength(0)); + auto facy = T(2)*Math::pi()/T(m_geom.ProbLength(1)); + auto dx = T(m_geom.CellSize(0)); + auto dy = T(m_geom.CellSize(1)); + auto scale = T(1.0)/(T(m_geom.Domain().length(0)) * + T(m_geom.Domain().length(1))); + auto ny = m_geom.Domain().length(1); + auto nz = m_geom.Domain().length(2); + + Gpu::DeviceVector delzv(nz, T(m_geom.CellSize(2))); + auto const* delz = delzv.data(); + + Box cdomain = m_geom.Domain(); + cdomain.setBig(0,cdomain.length(0)/2); + auto cba = amrex::decompose(cdomain, ParallelContext::NProcsSub(), + {AMREX_D_DECL(true,true,false)}); + DistributionMapping dm = detail::make_iota_distromap(cba.size()); + FabArray > > spmf(cba, dm, 1, 0); + + m_r2c.forward(rhs, spmf); + + for (MFIter mfi(spmf); mfi.isValid(); ++mfi) + { + auto const& spectral = spmf.array(mfi); + auto const& box = mfi.validbox(); + auto const& xybox = amrex::makeSlab(box, 2, 0); + +#ifdef AMREX_USE_GPU + // xxxxx TODO: We need to explore how to optimize this + // function. Maybe we can use cusparse. Maybe we should make + // z-direction to be the unit stride direction. 
+ + FArrayBox tridiag_workspace(box,4); + auto const& ald = tridiag_workspace.array(0); + auto const& bd = tridiag_workspace.array(1); + auto const& cud = tridiag_workspace.array(2); + auto const& scratch = tridiag_workspace.array(3); + + amrex::ParallelFor(xybox, [=] AMREX_GPU_DEVICE (int i, int j, int) + { + T a = facx*i; + T b = (j < ny/2) ? facy*j : facy*(ny-j); + + T k2 = T(2)*(std::cos(a*dx)-T(1))/(dx*dx) + + T(2)*(std::cos(b*dy)-T(1))/(dy*dy); + + // Tridiagonal solve with homogeneous Neumann + for(int k=0; k < nz; k++) { + if(k==0) { + ald(i,j,k) = 0.; + cud(i,j,k) = 2.0 /(delz[k]*(delz[k]+delz[k+1])); + bd(i,j,k) = k2 -ald(i,j,k)-cud(i,j,k); + } else if (k == nz-1) { + ald(i,j,k) = 2.0 /(delz[k]*(delz[k]+delz[k-1])); + cud(i,j,k) = 0.; + bd(i,j,k) = k2 -ald(i,j,k)-cud(i,j,k); + if (i == 0 && j == 0) { + bd(i,j,k) *= 2.0; + } + } else { + ald(i,j,k) = 2.0 /(delz[k]*(delz[k]+delz[k-1])); + cud(i,j,k) = 2.0 /(delz[k]*(delz[k]+delz[k+1])); + bd(i,j,k) = k2 -ald(i,j,k)-cud(i,j,k); + } + } + + scratch(i,j,0) = cud(i,j,0)/bd(i,j,0); + spectral(i,j,0) = spectral(i,j,0)/bd(i,j,0); + + for (int k = 1; k < nz; k++) { + if (k < nz-1) { + scratch(i,j,k) = cud(i,j,k) / (bd(i,j,k) - ald(i,j,k) * scratch(i,j,k-1)); + } + spectral(i,j,k) = (spectral(i,j,k) - ald(i,j,k) * spectral(i,j,k - 1)) + / (bd(i,j,k) - ald(i,j,k) * scratch(i,j,k-1)); + } + + for (int k = nz - 2; k >= 0; k--) { + spectral(i,j,k) -= scratch(i,j,k) * spectral(i,j,k + 1); + } + + for (int k = 0; k < nz; ++k) { + spectral(i,j,k) *= scale; + } + }); + Gpu::streamSynchronize(); + +#else + + Gpu::DeviceVector> ald(nz); + Gpu::DeviceVector> bd(nz); + Gpu::DeviceVector> cud(nz); + Gpu::DeviceVector> scratch(nz); + + amrex::LoopOnCpu(xybox, [&] (int i, int j, int) + { + T a = facx*i; + T b = (j < ny/2) ? 
facy*j : facy*(ny-j); + + T k2 = T(2)*(std::cos(a*dx)-T(1))/(dx*dx) + + T(2)*(std::cos(b*dy)-T(1))/(dy*dy); + + // Tridiagonal solve with homogeneous Neumann + for(int k=0; k < nz; k++) { + if(k==0) { + ald[k] = 0.; + cud[k] = 2.0 /(delz[k]*(delz[k]+delz[k+1])); + bd[k] = k2 -ald[k]-cud[k]; + } else if (k == nz-1) { + ald[k] = 2.0 /(delz[k]*(delz[k]+delz[k-1])); + cud[k] = 0.; + bd[k] = k2 -ald[k]-cud[k]; + if (i == 0 && j == 0) { + bd[k] *= 2.0; + } + } else { + ald[k] = 2.0 /(delz[k]*(delz[k]+delz[k-1])); + cud[k] = 2.0 /(delz[k]*(delz[k]+delz[k+1])); + bd[k] = k2 -ald[k]-cud[k]; + } + } + + scratch[0] = cud[0]/bd[0]; + spectral(i,j,0) = spectral(i,j,0)/bd[0]; + + for (int k = 1; k < nz; k++) { + if (k < nz-1) { + scratch[k] = cud[k] / (bd[k] - ald[k] * scratch[k-1]); + } + spectral(i,j,k) = (spectral(i,j,k) - ald[k] * spectral(i,j,k - 1)) + / (bd[k] - ald[k] * scratch[k-1]); + } + + for (int k = nz - 2; k >= 0; k--) { + spectral(i,j,k) -= scratch[k] * spectral(i,j,k + 1); + } + + for (int k = 0; k < nz; ++k) { + spectral(i,j,k) *= scale; + } + }); +#endif + } + + m_r2c.backward(spmf, soln); +#endif +} + +} + +#endif diff --git a/Src/FFT/CMakeLists.txt b/Src/FFT/CMakeLists.txt new file mode 100644 index 0000000000..2c695a9aec --- /dev/null +++ b/Src/FFT/CMakeLists.txt @@ -0,0 +1,14 @@ +add_amrex_define(AMREX_USE_FFT NO_LEGACY) + +foreach(D IN LISTS AMReX_SPACEDIM) + target_include_directories(amrex_${D}d PUBLIC $) + + target_sources(amrex_${D}d + PRIVATE + AMReX_FFT.H + AMReX_FFT.cpp + AMReX_FFT_Helper.H + AMReX_FFT_Poisson.H + ) + +endforeach() diff --git a/Src/FFT/Make.package b/Src/FFT/Make.package new file mode 100644 index 0000000000..1dcd714f64 --- /dev/null +++ b/Src/FFT/Make.package @@ -0,0 +1,10 @@ +ifndef AMREX_FFT_MAKE + AMREX_FFT_MAKE := 1 + +CEXE_headers += AMReX_FFT.H AMReX_FFT_Helper.H AMReX_FFT_Poisson.H +CEXE_sources += AMReX_FFT.cpp + +VPATH_LOCATIONS += $(AMREX_HOME)/Src/FFT +INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/FFT + +endif diff --git 
a/Src/F_Interfaces/Base/AMReX_boxarray_fi.cpp b/Src/F_Interfaces/Base/AMReX_boxarray_fi.cpp index dd7916a9ad..48d8eae7a6 100644 --- a/Src/F_Interfaces/Base/AMReX_boxarray_fi.cpp +++ b/Src/F_Interfaces/Base/AMReX_boxarray_fi.cpp @@ -86,4 +86,9 @@ extern "C" { Box bx(IntVect(lo), IntVect(hi), ba->ixType()); return ba->intersects(bx); } + + int amrex_fi_boxarray_issame (const BoxArray* baa, const BoxArray* bab) + { + return *baa == *bab; + } } diff --git a/Src/F_Interfaces/Base/AMReX_boxarray_mod.F90 b/Src/F_Interfaces/Base/AMReX_boxarray_mod.F90 index 0181c6cfb9..f85b5e8d74 100644 --- a/Src/F_Interfaces/Base/AMReX_boxarray_mod.F90 +++ b/Src/F_Interfaces/Base/AMReX_boxarray_mod.F90 @@ -9,7 +9,8 @@ module amrex_boxarray_module private - public :: amrex_boxarray_build, amrex_boxarray_destroy, amrex_print + public :: amrex_boxarray_build, amrex_boxarray_destroy, amrex_print, & + operator(==) type, public :: amrex_boxarray logical :: owner = .false. @@ -36,6 +37,10 @@ module amrex_boxarray_module #endif end type amrex_boxarray + interface operator(==) + module procedure amrex_boxarray_issame + end interface operator(==) + interface amrex_boxarray_build module procedure amrex_boxarray_build_bx module procedure amrex_boxarray_build_bxs @@ -122,12 +127,18 @@ pure function amrex_fi_boxarray_numpts (ba) bind(c) integer(amrex_long) :: amrex_fi_boxarray_numpts end function amrex_fi_boxarray_numpts - pure integer function amrex_fi_boxarray_intersects_box (ba, lo, hi) bind(c) + pure integer(c_int) function amrex_fi_boxarray_intersects_box (ba, lo, hi) bind(c) import implicit none type(c_ptr), value, intent(in) :: ba integer, intent(in) :: lo(*), hi(*) end function amrex_fi_boxarray_intersects_box + + pure integer(c_int) function amrex_fi_boxarray_issame (baa, bab) bind(c) + import + implicit none + type(c_ptr), value, intent(in) :: baa, bab + end function amrex_fi_boxarray_issame end interface contains @@ -258,4 +269,10 @@ pure function amrex_boxarray_intersects_box (this, bx) 
result(r) r = ir .ne. 0 end function amrex_boxarray_intersects_box + pure logical function amrex_boxarray_issame(baa, bab) result(r) + type(amrex_boxarray), intent(in) :: baa + type(amrex_boxarray), intent(in) :: bab + r = amrex_fi_boxarray_issame(baa%p, bab%p) .ne. 0 + end function amrex_boxarray_issame + end module amrex_boxarray_module diff --git a/Src/F_Interfaces/Base/AMReX_distromap_fi.cpp b/Src/F_Interfaces/Base/AMReX_distromap_fi.cpp index e50031a588..7fc7adc171 100644 --- a/Src/F_Interfaces/Base/AMReX_distromap_fi.cpp +++ b/Src/F_Interfaces/Base/AMReX_distromap_fi.cpp @@ -41,4 +41,9 @@ extern "C" { { AllPrint() << *dm; } + + int amrex_fi_distromap_issame (const DistributionMapping* dma, const DistributionMapping* dmb) + { + return *dma == *dmb; + } } diff --git a/Src/F_Interfaces/Base/AMReX_distromap_mod.F90 b/Src/F_Interfaces/Base/AMReX_distromap_mod.F90 index adbb91b442..9c0884168e 100644 --- a/Src/F_Interfaces/Base/AMReX_distromap_mod.F90 +++ b/Src/F_Interfaces/Base/AMReX_distromap_mod.F90 @@ -8,7 +8,8 @@ module amrex_distromap_module private - public :: amrex_distromap_build, amrex_distromap_destroy, amrex_print + public :: amrex_distromap_build, amrex_distromap_destroy, amrex_print, & + operator(==) type, public :: amrex_distromap logical :: owner = .false. 
@@ -25,6 +26,10 @@ module amrex_distromap_module #endif end type amrex_distromap + interface operator(==) + module procedure amrex_distromap_issame + end interface operator(==) + interface amrex_distromap_build module procedure amrex_distromap_build_ba module procedure amrex_distromap_build_pmap @@ -89,6 +94,12 @@ subroutine amrex_fi_print_distromap (dm) bind(c) implicit none type(c_ptr), value :: dm end subroutine amrex_fi_print_distromap + + pure integer(c_int) function amrex_fi_distromap_issame (dma, dmb) bind(c) + import + implicit none + type(c_ptr), value, intent(in) :: dma, dmb + end function amrex_fi_distromap_issame end interface contains @@ -158,4 +169,9 @@ subroutine amrex_distromap_print (dm) call amrex_fi_print_distromap(dm%p) end subroutine amrex_distromap_print + pure logical function amrex_distromap_issame (dma, dmb) result(r) + type(amrex_distromap), intent(in) :: dma, dmb + r = amrex_fi_distromap_issame(dma%p, dmb%p) .ne. 0 + end function amrex_distromap_issame + end module amrex_distromap_module diff --git a/Src/LinearSolvers/CMakeLists.txt b/Src/LinearSolvers/CMakeLists.txt index 6287ef4b42..ddf2de454a 100644 --- a/Src/LinearSolvers/CMakeLists.txt +++ b/Src/LinearSolvers/CMakeLists.txt @@ -16,6 +16,8 @@ foreach(D IN LISTS AMReX_SPACEDIM) MLMG/AMReX_MLLinOp_K.H MLMG/AMReX_MLCellLinOp.H MLMG/AMReX_MLNodeLinOp.H + MLMG/AMReX_MLNodeLinOp_K.H + MLMG/AMReX_MLNodeLinOp_${D}D_K.H MLMG/AMReX_MLNodeLinOp.cpp MLMG/AMReX_MLCellABecLap.H MLMG/AMReX_MLCellABecLap_K.H @@ -31,30 +33,6 @@ foreach(D IN LISTS AMReX_SPACEDIM) MLMG/AMReX_MLPoisson.H MLMG/AMReX_MLPoisson_K.H MLMG/AMReX_MLPoisson_${D}D_K.H - MLMG/AMReX_MLNodeLaplacian.H - MLMG/AMReX_MLNodeLaplacian.cpp - MLMG/AMReX_MLNodeLaplacian_sync.cpp - MLMG/AMReX_MLNodeLaplacian_sten.cpp - MLMG/AMReX_MLNodeLaplacian_misc.cpp - MLMG/AMReX_MLNodeLap_K.H - MLMG/AMReX_MLNodeLap_${D}D_K.H - MLMG/AMReX_MLNodeTensorLaplacian.H - MLMG/AMReX_MLNodeTensorLaplacian.cpp - MLMG/AMReX_MLNodeTensorLap_K.H - 
MLMG/AMReX_MLNodeTensorLap_${D}D_K.H - MLMG/AMReX_MLTensorOp.H - MLMG/AMReX_MLTensorOp.cpp - MLMG/AMReX_MLTensorOp_grad.cpp - MLMG/AMReX_MLTensor_K.H - MLMG/AMReX_MLTensor_${D}D_K.H - MLMG/AMReX_MLEBNodeFDLaplacian.H - MLMG/AMReX_MLEBNodeFDLaplacian.cpp - MLMG/AMReX_MLEBNodeFDLap_K.H - MLMG/AMReX_MLEBNodeFDLap_${D}D_K.H - MLMG/AMReX_MLNodeABecLaplacian.H - MLMG/AMReX_MLNodeABecLaplacian.cpp - MLMG/AMReX_MLNodeABecLap_K.H - MLMG/AMReX_MLNodeABecLap_${D}D_K.H AMReX_GMRES.H AMReX_GMRES_MLMG.H ) @@ -68,30 +46,72 @@ foreach(D IN LISTS AMReX_SPACEDIM) ) endif () - if (NOT D EQUAL 1) + if (AMReX_LINEAR_SOLVERS_EM) + if (NOT D EQUAL 1 AND AMReX_LINEAR_SOLVERS_EM) + target_sources(amrex_${D}d + PRIVATE + MLMG/AMReX_MLCurlCurl.H + MLMG/AMReX_MLCurlCurl.cpp + MLMG/AMReX_MLCurlCurl_K.H + ) + endif () + target_sources(amrex_${D}d PRIVATE - MLMG/AMReX_MLCurlCurl.H - MLMG/AMReX_MLCurlCurl.cpp - MLMG/AMReX_MLCurlCurl_K.H - ) + MLMG/AMReX_MLEBNodeFDLaplacian.H + MLMG/AMReX_MLEBNodeFDLaplacian.cpp + MLMG/AMReX_MLEBNodeFDLap_K.H + MLMG/AMReX_MLEBNodeFDLap_${D}D_K.H + MLMG/AMReX_MLNodeTensorLaplacian.H + MLMG/AMReX_MLNodeTensorLaplacian.cpp + MLMG/AMReX_MLNodeTensorLap_K.H + MLMG/AMReX_MLNodeTensorLap_${D}D_K.H + ) + endif () + + if (AMReX_LINEAR_SOLVERS_INCFLO) + target_sources(amrex_${D}d + PRIVATE + MLMG/AMReX_MLNodeABecLaplacian.H + MLMG/AMReX_MLNodeABecLaplacian.cpp + MLMG/AMReX_MLNodeABecLap_K.H + MLMG/AMReX_MLNodeABecLap_${D}D_K.H + MLMG/AMReX_MLNodeLaplacian.H + MLMG/AMReX_MLNodeLaplacian.cpp + MLMG/AMReX_MLNodeLaplacian_sync.cpp + MLMG/AMReX_MLNodeLaplacian_sten.cpp + MLMG/AMReX_MLNodeLaplacian_misc.cpp + MLMG/AMReX_MLNodeLap_K.H + MLMG/AMReX_MLNodeLap_${D}D_K.H + MLMG/AMReX_MLTensorOp.H + MLMG/AMReX_MLTensorOp.cpp + MLMG/AMReX_MLTensorOp_grad.cpp + MLMG/AMReX_MLTensor_K.H + MLMG/AMReX_MLTensor_${D}D_K.H + ) endif () if (AMReX_EB AND NOT D EQUAL 1) target_sources(amrex_${D}d PRIVATE - MLMG/AMReX_MLNodeLaplacian_eb.cpp MLMG/AMReX_MLEBABecLap.H MLMG/AMReX_MLEBABecLap.cpp 
MLMG/AMReX_MLEBABecLap_F.cpp MLMG/AMReX_MLEBABecLap_K.H MLMG/AMReX_MLEBABecLap_${D}D_K.H - MLMG/AMReX_MLEBTensorOp.H - MLMG/AMReX_MLEBTensorOp.cpp - MLMG/AMReX_MLEBTensorOp_bc.cpp - MLMG/AMReX_MLEBTensor_K.H - MLMG/AMReX_MLEBTensor_${D}D_K.H ) + + if (AMReX_LINEAR_SOLVERS_INCFLO) + target_sources(amrex_${D}d + PRIVATE + MLMG/AMReX_MLNodeLaplacian_eb.cpp + MLMG/AMReX_MLEBTensorOp.H + MLMG/AMReX_MLEBTensorOp.cpp + MLMG/AMReX_MLEBTensorOp_bc.cpp + MLMG/AMReX_MLEBTensor_K.H + MLMG/AMReX_MLEBTensor_${D}D_K.H + ) + endif () endif () if (AMReX_FORTRAN) @@ -102,7 +122,7 @@ foreach(D IN LISTS AMReX_SPACEDIM) ) endif () - if (AMReX_HYPRE) + if (AMReX_HYPRE AND AMReX_LINEAR_SOLVERS_INCFLO) target_sources(amrex_${D}d PRIVATE MLMG/AMReX_MLNodeLaplacian_hypre.cpp diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H index e04e16f8bd..b318b318eb 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H @@ -536,7 +536,11 @@ MLCellLinOpT::setLevelBC (int amrlev, const MF* a_levelbcdata, const MF* rob { if (this->needsCoarseDataForBC()) { - AMREX_ALWAYS_ASSERT(!this->hasHiddenDimension()); + // AMREX_ALWAYS_ASSERT(!this->hasHiddenDimension()); + if (this->hasHiddenDimension()) { + int hidden_dir = this->hiddenDirection(); + AMREX_ALWAYS_ASSERT(this->m_coarse_data_crse_ratio[hidden_dir] == 1); + } br_ref_ratio = this->m_coarse_data_crse_ratio.allGT(0) ? this->m_coarse_data_crse_ratio : IntVect(2); if (m_crse_sol_br[amrlev] == nullptr && br_ref_ratio.allGT(0)) { @@ -1946,8 +1950,6 @@ MLCellLinOpT::computeVolInv () const m_volinv[amrlev].resize(this->NMGLevels(amrlev)); } - AMREX_ASSERT(this->m_coarse_fine_bc_type == LinOpBCType::Dirichlet || ! 
this->hasHiddenDimension()); - // We don't need to compute for every level auto f = [&] (int amrlev, int mglev) { @@ -2009,11 +2011,11 @@ MLCellLinOpT::normInf (int amrlev, MF const& mf, bool local) const -> RT const int finest_level = this->NAMRLevels() - 1; RT norm = RT(0.0); #ifdef AMREX_USE_EB - if (! mf.isAllRegular()) { + const auto *factory = dynamic_cast(this->Factory(amrlev)); + if (factory && !factory->isAllRegular()) { if constexpr (!std::is_same()) { amrex::Abort("MLCellLinOpT with EB only works with MultiFab"); } else { - const auto *factory = dynamic_cast(this->Factory(amrlev)); const MultiFab& vfrac = factory->getVolFrac(); if (amrlev == finest_level) { #ifdef AMREX_USE_GPU diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBABecLap_K.H b/Src/LinearSolvers/MLMG/AMReX_MLEBABecLap_K.H index 517b1875bc..434789c5c5 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBABecLap_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBABecLap_K.H @@ -6,13 +6,6 @@ #include -namespace amrex { - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - Real get_dx_eb (Real kappa) noexcept { - return amrex::max(Real(0.3),(kappa*kappa-Real(0.25))/(Real(2.0)*kappa)); - } -} - #if (AMREX_SPACEDIM == 2) #include #else diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp index af4a6a6d74..0a0bdf39a1 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include diff --git a/Src/LinearSolvers/MLMG/AMReX_MLLinOp_K.H b/Src/LinearSolvers/MLMG/AMReX_MLLinOp_K.H index a6bc651736..74fcc1302f 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLLinOp_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLLinOp_K.H @@ -1053,6 +1053,15 @@ void mllinop_apply_innu_zhi (int i, int j, int k, } } +#ifdef AMREX_USE_EB + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real get_dx_eb (Real kappa) noexcept { + return 
amrex::max(Real(0.3),(kappa*kappa-Real(0.25))/(Real(2.0)*kappa)); +} + +#endif + } #endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_1D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_1D_K.H index 91d0225739..7b6bc1e1fc 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_1D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_1D_K.H @@ -4,79 +4,6 @@ namespace amrex { -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mlndlap_set_nodal_mask (int i, int, int, Array4 const& nmsk, - Array4 const& cmsk) noexcept -{ - using namespace nodelap_detail; - - int s = cmsk(i-1,0,0) + cmsk(i,0,0); - if (s == 2*crse_cell) { - nmsk(i,0,0) = crse_node; - } else if (s == 2*fine_cell) { - nmsk(i,0,0) = fine_node; - } else { - nmsk(i,0,0) = crse_fine_node; - } -} - -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mlndlap_set_dirichlet_mask (Box const& bx, Array4 const& dmsk, - Array4 const& omsk, Box const& dom, - GpuArray const& bclo, - GpuArray const& bchi) noexcept -{ - const auto lo = bx.smallEnd(0); - const auto hi = bx.bigEnd(0); - AMREX_PRAGMA_SIMD - for (int i = lo; i <= hi; ++i) { - if (!dmsk(i,0,0)) { - dmsk(i,0,0) = (omsk(i-1,0,0) == 1 || omsk(i,0,0) == 1); - } - } - - const auto domlo = dom.smallEnd(0); - const auto domhi = dom.bigEnd(0); - - if (bclo[0] == LinOpBCType::Dirichlet && lo == domlo) { - dmsk(lo,0,0) = 1; - } - - if (bchi[0] == LinOpBCType::Dirichlet && hi == domhi) { - dmsk(hi,0,0) = 1; - } -} - -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mlndlap_set_dot_mask (Box const& bx, Array4 const& dmsk, - Array4 const& omsk, Box const& dom, - GpuArray const& bclo, - GpuArray const& bchi) noexcept -{ - const auto lo = bx.smallEnd(0); - const auto hi = bx.bigEnd(0); - - AMREX_PRAGMA_SIMD - for (int i = lo; i <= hi; ++i) { - dmsk(i,0,0) = static_cast(omsk(i,0,0)); - } - - const auto domlo = dom.smallEnd(0); - const auto domhi = dom.bigEnd(0); - - if ((bclo[0] == LinOpBCType::Neumann || bclo[0] == LinOpBCType::inflow) - && lo == domlo) - { - dmsk(lo,0,0) *= Real(0.5); - 
} - - if ((bchi[0] == LinOpBCType::Neumann || bchi[0] == LinOpBCType::inflow) - && hi == domhi) - { - dmsk(hi,0,0) *= Real(0.5); - } -} - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void mlndlap_zero_fine (int i, int, int, Array4 const& phi, Array4 const& msk, int fine_flag) noexcept @@ -106,39 +33,6 @@ void mlndlap_semi_avgdown_coeff (int i, int j, int k, Array4 const& crse, mlndlap_avgdown_coeff_x(i,j,k,crse,fine); } -template -void mlndlap_bc_doit (Box const& vbx, Array4 const& a, Box const& domain, - GpuArray const& bflo, - GpuArray const& bfhi) noexcept -{ - Box gdomain = domain; - int const idim = 0; - if (! bflo[idim]) { gdomain.growLo(idim,1); } - if (! bfhi[idim]) { gdomain.growHi(idim,1); } - - if (gdomain.strictly_contains(vbx)) { return; } - - const int offset = domain.cellCentered() ? 0 : 1; - - const auto dlo = domain.smallEnd(0); - const auto dhi = domain.bigEnd(0); - - Box const& sbox = amrex::grow(vbx,1); - AMREX_HOST_DEVICE_FOR_3D(sbox, i, j, k, - { - if (! gdomain.contains(IntVect(i))) { - if (i == dlo-1 && bflo[0]) - { - a(i,0,0) = a(i+1+offset, j, k); - } - else if (i == dhi+1 && bfhi[0]) - { - a(i,0,0) = a(i-1-offset, j, k); - } - } - }); -} - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Real mlndlap_adotx_c (int i, int, int, Array4 const& x, Real sigma, Array4 const& msk, @@ -335,59 +229,6 @@ void mlndlap_gauss_seidel_with_line_solve_aa(Box const&, Array4 const&, amrex::Abort("mlndlap_gauss_seidel_with_line_solve_aa: not implemented in 1D"); } - -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mlndlap_restriction (int i, int, int, Array4 const& crse, - Array4 const& fine, Array4 const& msk) noexcept -{ - int ii = i*2; - if (msk(ii,0,0)) { - crse(i,0,0) = Real(0.0); - } else { - crse(i,0,0) = Real(1./4.) 
*(fine(ii-1,0,0) - + Real(2.)* fine(ii ,0,0) - + fine(ii+1,0,0)); - } -} - -template -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mlndlap_restriction (int i, int, int, Array4 const& crse, - Array4 const& fine, Array4 const& msk, - Box const& fdom, - GpuArray const& bclo, - GpuArray const& bchi) noexcept - -{ - const int ii = i*rr; - if (msk(ii,0,0)) { - crse(i,0,0) = Real(0.0); - } else { - const auto ndlo = fdom.smallEnd(0); - const auto ndhi = fdom.bigEnd(0); - Real tmp = Real(0.0); - for (int ioff = -rr+1; ioff <= rr-1; ++ioff) { - Real wx = rr - std::abs(ioff); - int itmp = ii + ioff; - if ((itmp < ndlo && (bclo[0] == LinOpBCType::Neumann || - bclo[0] == LinOpBCType::inflow)) || - (itmp > ndhi && (bchi[0] == LinOpBCType::Neumann || - bchi[0] == LinOpBCType::inflow))) { - itmp = ii - ioff; - } - tmp += wx*fine(itmp,0,0); - } - crse(i,0,0) = tmp*(Real(1.0)/Real(rr*rr)); - } -} - -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mlndlap_semi_restriction (int /*i*/, int /*j*/, int /*k*/, Array4 const&, - Array4 const&, Array4 const&, int) noexcept -{ - amrex::Abort("mlndlap_semi_restriction: not implemented in 1D"); -} - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void mlndlap_interpadd_c (int i, int , int, Array4 const& fine, Array4 const& crse, diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_2D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_2D_K.H index 05f02aaa92..db6922c410 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_2D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_2D_K.H @@ -4,127 +4,6 @@ namespace amrex { -// -// masks -// - -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mlndlap_set_nodal_mask (int i, int j, int k, Array4 const& nmsk, - Array4 const& cmsk) noexcept -{ - using namespace nodelap_detail; - - int s = cmsk(i-1,j-1,k) + cmsk(i ,j-1,k) - + cmsk(i-1,j ,k) + cmsk(i ,j ,k); - if (s == 4*crse_cell) { - nmsk(i,j,k) = crse_node; - } - else if (s == 4*fine_cell) { - nmsk(i,j,k) = fine_node; - } else { - nmsk(i,j,k) = crse_fine_node; - } -} - 
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mlndlap_set_dirichlet_mask (Box const& bx, Array4 const& dmsk, - Array4 const& omsk, Box const& dom, - GpuArray const& bclo, - GpuArray const& bchi) noexcept -{ - const auto lo = amrex::lbound(bx); - const auto hi = amrex::ubound(bx); - for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - if (!dmsk(i,j,0)) { - dmsk(i,j,0) = (omsk(i-1,j-1,0) == 1 || omsk(i,j-1,0) == 1 || - omsk(i-1,j ,0) == 1 || omsk(i,j ,0) == 1); - } - }} - - const auto domlo = amrex::lbound(dom); - const auto domhi = amrex::ubound(dom); - - if (bclo[0] == LinOpBCType::Dirichlet && lo.x == domlo.x) { - for (int j = lo.y; j <= hi.y; ++j) { - dmsk(lo.x,j,0) = 1; - } - } - - if (bchi[0] == LinOpBCType::Dirichlet && hi.x == domhi.x) { - for (int j = lo.y; j <= hi.y; ++j) { - dmsk(hi.x,j,0) = 1; - } - } - - if (bclo[1] == LinOpBCType::Dirichlet && lo.y == domlo.y) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - dmsk(i,lo.y,0) = 1; - } - } - - if (bchi[1] == LinOpBCType::Dirichlet && hi.y == domhi.y) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - dmsk(i,hi.y,0) = 1; - } - } -} - -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mlndlap_set_dot_mask (Box const& bx, Array4 const& dmsk, - Array4 const& omsk, Box const& dom, - GpuArray const& bclo, - GpuArray const& bchi) noexcept -{ - const auto lo = amrex::lbound(bx); - const auto hi = amrex::ubound(bx); - for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - dmsk(i,j,0) = static_cast(omsk(i,j,0)); - }} - - const auto domlo = amrex::lbound(dom); - const auto domhi = amrex::ubound(dom); - - if ((bclo[0] == LinOpBCType::Neumann || bclo[0] == LinOpBCType::inflow) - && lo.x == domlo.x) - { - for (int j = lo.y; j <= hi.y; ++j) { - dmsk(lo.x,j,0) *= Real(0.5); - } - } - - if ((bchi[0] == LinOpBCType::Neumann || bchi[0] == LinOpBCType::inflow) - && hi.x == domhi.x) - { - for (int j = lo.y; j <= 
hi.y; ++j) { - dmsk(hi.x,j,0) *= Real(0.5); - } - } - - if ((bclo[1] == LinOpBCType::Neumann || bclo[1] == LinOpBCType::inflow) - && lo.y == domlo.y) - { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - dmsk(i,lo.y,0) *= Real(0.5); - } - } - - if ((bchi[1] == LinOpBCType::Neumann || bchi[1] == LinOpBCType::inflow) - && hi.y == domhi.y) - { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - dmsk(i,hi.y,0) *= Real(0.5); - } - } -} - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void mlndlap_zero_fine (int i, int j, int, Array4 const& phi, Array4 const& msk, int fine_flag) noexcept @@ -177,116 +56,6 @@ void mlndlap_semi_avgdown_coeff (int i, int j, int k, Array4 const& crse, } } -// -// bc -// - -template -void mlndlap_bc_doit (Box const& vbx, Array4 const& a, Box const& domain, - GpuArray const& bflo, - GpuArray const& bfhi) noexcept -{ - Box gdomain = domain; - for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { - if (! bflo[idim]) { gdomain.growLo(idim,1); } - if (! bfhi[idim]) { gdomain.growHi(idim,1); } - } - - if (gdomain.strictly_contains(vbx)) { return; } - - const int offset = domain.cellCentered() ? 0 : 1; - - const auto dlo = amrex::lbound(domain); - const auto dhi = amrex::ubound(domain); - - Box const& sbox = amrex::grow(vbx,1); - AMREX_HOST_DEVICE_FOR_3D(sbox, i, j, k, - { - if (! 
gdomain.contains(IntVect(i,j))) { - // xlo & ylo - if (i == dlo.x-1 && j == dlo.y-1 && (bflo[0] || bflo[1])) - { - if (bflo[0] && bflo[1]) - { - a(i,j,k) = a(i+1+offset, j+1+offset, k); - } - else if (bflo[0]) - { - a(i,j,k) = a(i+1+offset, j, k); - } - else if (bflo[1]) - { - a(i,j,k) = a(i, j+1+offset, k); - } - } - // xhi & ylo - else if (i == dhi.x+1 && j == dlo.y-1 && (bfhi[0] || bflo[1])) - { - if (bfhi[0] && bflo[1]) - { - a(i,j,k) = a(i-1-offset, j+1+offset, k); - } - else if (bfhi[0]) - { - a(i,j,k) = a(i-1-offset, j, k); - } - else if (bflo[1]) - { - a(i,j,k) = a(i, j+1+offset, k); - } - } - // xlo & yhi - else if (i == dlo.x-1 && j == dhi.y+1 && (bflo[0] || bfhi[1])) - { - if (bflo[0] && bfhi[1]) - { - a(i,j,k) = a(i+1+offset, j-1-offset, k); - } - else if (bflo[0]) - { - a(i,j,k) = a(i+1+offset, j, k); - } - else if (bfhi[1]) - { - a(i,j,k) = a(i, j-1-offset, k); - } - } - // xhi & yhi - else if (i == dhi.x+1 && j == dhi.y+1 && (bfhi[0] || bfhi[1])) - { - if (bfhi[0] && bfhi[1]) - { - a(i,j,k) = a(i-1-offset, j-1-offset, k); - } - else if (bfhi[0]) - { - a(i,j,k) = a(i-1-offset, j, k); - } - else if (bfhi[1]) - { - a(i,j,k) = a(i, j-1-offset, k); - } - } - else if (i == dlo.x-1 && bflo[0]) - { - a(i,j,k) = a(i+1+offset, j, k); - } - else if (i == dhi.x+1 && bfhi[0]) - { - a(i,j,k) = a(i-1-offset, j, k); - } - else if (j == dlo.y-1 && bflo[1]) - { - a(i,j,k) = a(i, j+1+offset, k); - } - else if (j == dhi.y+1 && bfhi[1]) - { - a(i,j,k) = a(i, j-1-offset, k); - } - } - }); -} - // // operator // @@ -796,91 +565,6 @@ void mlndlap_gauss_seidel_with_line_solve_aa (Box const& bx, Array4 const& } -// -// restriction -// - -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mlndlap_restriction (int i, int j, int k, Array4 const& crse, - Array4 const& fine, Array4 const& msk) noexcept -{ - int ii = i*2; - int jj = j*2; - int kk = 0; - if (msk(ii,jj,kk)) { - crse(i,j,k) = Real(0.0); - } else { - crse(i,j,k) = Real(1./16.)*(fine(ii-1,jj-1,kk) + Real(2.)*fine(ii 
,jj-1,kk) + fine(ii+1,jj-1,kk) - + Real(2.)*fine(ii-1,jj ,kk) + Real(4.)*fine(ii ,jj ,kk) + Real(2.)*fine(ii+1,jj ,kk) - + fine(ii-1,jj+1,kk) + Real(2.)*fine(ii ,jj+1,kk) + fine(ii+1,jj+1,kk)); - } -} - -template -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mlndlap_restriction (int i, int j, int k, Array4 const& crse, - Array4 const& fine, Array4 const& msk, - Box const& fdom, - GpuArray const& bclo, - GpuArray const& bchi) noexcept -{ - const int ii = i*rr; - const int jj = j*rr; - if (msk(ii,jj,0)) { - crse(i,j,k) = Real(0.0); - } else { - const auto ndlo = amrex::lbound(fdom); - const auto ndhi = amrex::ubound(fdom); - Real tmp = Real(0.0); - for (int joff = -rr+1; joff <= rr-1; ++joff) { - Real wy = rr - std::abs(joff); - for (int ioff = -rr+1; ioff <= rr-1; ++ioff) { - Real wx = rr - std::abs(ioff); - int itmp = ii + ioff; - int jtmp = jj + joff; - if ((itmp < ndlo.x && (bclo[0] == LinOpBCType::Neumann || - bclo[0] == LinOpBCType::inflow)) || - (itmp > ndhi.x && (bchi[0] == LinOpBCType::Neumann || - bchi[0] == LinOpBCType::inflow))) { - itmp = ii - ioff; - } - if ((jtmp < ndlo.y && (bclo[1] == LinOpBCType::Neumann || - bclo[1] == LinOpBCType::inflow)) || - (jtmp > ndhi.y && (bchi[1] == LinOpBCType::Neumann || - bchi[1] == LinOpBCType::inflow))) { - jtmp = jj - joff; - } - tmp += wx*wy*fine(itmp,jtmp,0); - } - } - crse(i,j,k) = tmp*(Real(1.0)/Real(rr*rr*rr*rr)); - } -} - -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mlndlap_semi_restriction (int i, int j, int k, Array4 const& crse, - Array4 const& fine, Array4 const& msk, int idir) noexcept -{ - int kk = 0; - if (idir == 1) { - int ii = i*2; - int jj = j; - if (msk(ii,jj,kk)) { - crse(i,j,k) = Real(0.0); - } else { - crse(i,j,k) = Real(1./4.)*(fine(ii-1,jj,kk) + Real(2.)*fine(ii,jj,kk) + fine(ii+1,jj,kk)); - } - } else if (idir == 0) { - int ii = i; - int jj = j*2; - if (msk(ii,jj,kk)) { - crse(i,j,k) = Real(0.0); - } else { - crse(i,j,k) = Real(1./4.)*(fine(ii,jj-1,kk) + Real(2.)*fine(ii,jj,kk) + 
fine(ii,jj+1,kk)); - } - } -} - // // interpolation // diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_3D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_3D_K.H index 5d31de0271..2ddcecfe37 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_3D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_3D_K.H @@ -4,177 +4,6 @@ namespace amrex { -// -// masks -// - -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mlndlap_set_nodal_mask (int i, int j, int k, Array4 const& nmsk, - Array4 const& cmsk) noexcept -{ - using namespace nodelap_detail; - - int s = cmsk(i-1,j-1,k-1) + cmsk(i ,j-1,k-1) - + cmsk(i-1,j ,k-1) + cmsk(i ,j ,k-1) - + cmsk(i-1,j-1,k ) + cmsk(i ,j-1,k ) - + cmsk(i-1,j ,k ) + cmsk(i ,j ,k ); - if (s == 8*crse_cell) { - nmsk(i,j,k) = crse_node; - } - else if (s == 8*fine_cell) { - nmsk(i,j,k) = fine_node; - } else { - nmsk(i,j,k) = crse_fine_node; - } -} - -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mlndlap_set_dirichlet_mask (Box const& bx, Array4 const& dmsk, - Array4 const& omsk, Box const& dom, - GpuArray const& bclo, - GpuArray const& bchi) noexcept -{ - const auto lo = amrex::lbound(bx); - const auto hi = amrex::ubound(bx); - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - if (!dmsk(i,j,k)) { - dmsk(i,j,k) = (omsk(i-1,j-1,k-1) == 1 || omsk(i,j-1,k-1) == 1 || - omsk(i-1,j ,k-1) == 1 || omsk(i,j ,k-1) == 1 || - omsk(i-1,j-1,k ) == 1 || omsk(i,j-1,k ) == 1 || - omsk(i-1,j ,k ) == 1 || omsk(i,j ,k ) == 1); - } - }}} - - const auto domlo = amrex::lbound(dom); - const auto domhi = amrex::ubound(dom); - - if (bclo[0] == LinOpBCType::Dirichlet && lo.x == domlo.x) { - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - dmsk(lo.x,j,k) = 1; - }} - } - - if (bchi[0] == LinOpBCType::Dirichlet && hi.x == domhi.x) { - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - dmsk(hi.x,j,k) = 1; - }} - } - - if (bclo[1] == 
LinOpBCType::Dirichlet && lo.y == domlo.y) { - for (int k = lo.z; k <= hi.z; ++k) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - dmsk(i,lo.y,k) = 1; - }} - } - - if (bchi[1] == LinOpBCType::Dirichlet && hi.y == domhi.y) { - for (int k = lo.z; k <= hi.z; ++k) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - dmsk(i,hi.y,k) = 1; - }} - } - - if (bclo[2] == LinOpBCType::Dirichlet && lo.z == domlo.z) { - for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - dmsk(i,j,lo.z) = 1; - }} - } - - if (bchi[2] == LinOpBCType::Dirichlet && hi.z == domhi.z) { - for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - dmsk(i,j,hi.z) = 1; - }} - } -} - -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mlndlap_set_dot_mask (Box const& bx, Array4 const& dmsk, - Array4 const& omsk, Box const& dom, - GpuArray const& bclo, - GpuArray const& bchi) noexcept -{ - const auto lo = amrex::lbound(bx); - const auto hi = amrex::ubound(bx); - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - dmsk(i,j,k) = static_cast(omsk(i,j,k)); - }}} - - const auto domlo = amrex::lbound(dom); - const auto domhi = amrex::ubound(dom); - - if ((bclo[0] == LinOpBCType::Neumann || bclo[0] == LinOpBCType::inflow) - && lo.x == domlo.x) - { - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - dmsk(lo.x,j,k) *= Real(0.5); - }} - } - - if ((bchi[0] == LinOpBCType::Neumann || bchi[0] == LinOpBCType::inflow) - && hi.x == domhi.x) - { - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - dmsk(hi.x,j,k) *= Real(0.5); - }} - } - - if ((bclo[1] == LinOpBCType::Neumann || bclo[1] == LinOpBCType::inflow) - && lo.y == domlo.y) - { - for (int k = lo.z; k <= hi.z; ++k) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - dmsk(i,lo.y,k) *= Real(0.5); - }} - } - - if 
((bchi[1] == LinOpBCType::Neumann || bchi[1] == LinOpBCType::inflow) - && hi.y == domhi.y) - { - for (int k = lo.z; k <= hi.z; ++k) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - dmsk(i,hi.y,k) *= Real(0.5); - }} - } - - if ((bclo[2] == LinOpBCType::Neumann || bclo[2] == LinOpBCType::inflow) - && lo.z == domlo.z) - { - for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - dmsk(i,j,lo.z) *= Real(0.5); - }} - } - - if ((bchi[2] == LinOpBCType::Neumann || bchi[2] == LinOpBCType::inflow) - && hi.z == domhi.z) - { - for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - dmsk(i,j,hi.z) *= Real(0.5); - }} - } -} - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void mlndlap_zero_fine (int i, int j, int k, Array4 const& phi, Array4 const& msk, int fine_flag) noexcept @@ -249,507 +78,6 @@ void mlndlap_semi_avgdown_coeff (int i, int j, int k, Array4 const& crse, crse(i,j,k) = cl*cr/(cl+cr); } } -// -// bc -// - -template -inline void mlndlap_bc_doit (Box const& vbx, Array4 const& a, Box const& domain, - GpuArray const& bflo, - GpuArray const& bfhi) noexcept -{ - Box gdomain = domain; - for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { - if (! bflo[idim]) { gdomain.growLo(idim,1); } - if (! bfhi[idim]) { gdomain.growHi(idim,1); } - } - - if (gdomain.strictly_contains(vbx)) { return; } - - const int offset = domain.cellCentered() ? 0 : 1; - - const auto dlo = amrex::lbound(domain); - const auto dhi = amrex::ubound(domain); - - Box const& sbox = amrex::grow(vbx,1); - AMREX_HOST_DEVICE_FOR_3D(sbox, i, j, k, - { - if (! 
gdomain.contains(IntVect(i,j,k))) { - // xlo & ylo & zlo - if (i == dlo.x-1 && j == dlo.y-1 && k == dlo.z-1 && (bflo[0] || bflo[1] || bflo[2])) - { - if (bflo[0] && bflo[1] && bflo[2]) - { - a(i,j,k) = a(i+1+offset, j+1+offset, k+1+offset); - } - else if (bflo[0] && bflo[1]) - { - a(i,j,k) = a(i+1+offset, j+1+offset, k); - } - else if (bflo[0] && bflo[2]) - { - a(i,j,k) = a(i+1+offset, j, k+1+offset); - } - else if (bflo[1] && bflo[2]) - { - a(i,j,k) = a(i, j+1+offset, k+1+offset); - } - else if (bflo[0]) - { - a(i,j,k) = a(i+1+offset, j, k); - } - else if (bflo[1]) - { - a(i,j,k) = a(i, j+1+offset, k); - } - else if (bflo[2]) - { - a(i,j,k) = a(i, j, k+1+offset); - } - } - // xhi & ylo & zlo - else if (i == dhi.x+1 && j == dlo.y-1 && k == dlo.z-1 && (bfhi[0] || bflo[1] || bflo[2])) - { - if (bfhi[0] && bflo[1] && bflo[2]) - { - a(i,j,k) = a(i-1-offset, j+1+offset, k+1+offset); - } - else if (bfhi[0] && bflo[1]) - { - a(i,j,k) = a(i-1-offset, j+1+offset, k); - } - else if (bfhi[0] && bflo[2]) - { - a(i,j,k) = a(i-1-offset, j, k+1+offset); - } - else if (bflo[1] && bflo[2]) - { - a(i,j,k) = a(i, j+1+offset, k+1+offset); - } - else if (bfhi[0]) - { - a(i,j,k) = a(i-1-offset, j, k); - } - else if (bflo[1]) - { - a(i,j,k) = a(i, j+1+offset, k); - } - else if (bflo[2]) - { - a(i,j,k) = a(i, j, k+1+offset); - } - } - // xlo & yhi & zlo - else if (i == dlo.x-1 && j == dhi.y+1 && k == dlo.z-1 && (bflo[0] || bfhi[1] || bflo[2])) - { - if (bflo[0] && bfhi[1] && bflo[2]) - { - a(i,j,k) = a(i+1+offset, j-1-offset, k+1+offset); - } - else if (bflo[0] && bfhi[1]) - { - a(i,j,k) = a(i+1+offset, j-1-offset, k); - } - else if (bflo[0] && bflo[2]) - { - a(i,j,k) = a(i+1+offset, j, k+1+offset); - } - else if (bfhi[1] && bflo[2]) - { - a(i,j,k) = a(i, j-1-offset, k+1+offset); - } - else if (bflo[0]) - { - a(i,j,k) = a(i+1+offset, j, k); - } - else if (bfhi[1]) - { - a(i,j,k) = a(i, j-1-offset, k); - } - else if (bflo[2]) - { - a(i,j,k) = a(i, j, k+1+offset); - } - } - // xhi & yhi & 
zlo - else if (i == dhi.x+1 && j == dhi.y+1 && k == dlo.z-1 && (bfhi[0] || bfhi[1] || bflo[2])) - { - if (bfhi[0] && bfhi[1] && bflo[2]) - { - a(i,j,k) = a(i-1-offset, j-1-offset, k+1+offset); - } - else if (bfhi[0] && bfhi[1]) - { - a(i,j,k) = a(i-1-offset, j-1-offset, k); - } - else if (bfhi[0] && bflo[2]) - { - a(i,j,k) = a(i-1-offset, j, k+1+offset); - } - else if (bfhi[1] && bflo[2]) - { - a(i,j,k) = a(i, j-1-offset, k+1+offset); - } - else if (bfhi[0]) - { - a(i,j,k) = a(i-1-offset, j, k); - } - else if (bfhi[1]) - { - a(i,j,k) = a(i, j-1-offset, k); - } - else if (bflo[2]) - { - a(i,j,k) = a(i, j, k+1+offset); - } - } - // xlo & ylo & zhi - else if (i == dlo.x-1 && j == dlo.y-1 && k == dhi.z+1 && (bflo[0] || bflo[1] || bfhi[2])) - { - if (bflo[0] && bflo[1] && bfhi[2]) - { - a(i,j,k) = a(i+1+offset, j+1+offset, k-1-offset); - } - else if (bflo[0] && bflo[1]) - { - a(i,j,k) = a(i+1+offset, j+1+offset, k); - } - else if (bflo[0] && bfhi[2]) - { - a(i,j,k) = a(i+1+offset, j, k-1-offset); - } - else if (bflo[1] && bfhi[2]) - { - a(i,j,k) = a(i, j+1+offset, k-1-offset); - } - else if (bflo[0]) - { - a(i,j,k) = a(i+1+offset, j, k); - } - else if (bflo[1]) - { - a(i,j,k) = a(i, j+1+offset, k); - } - else if (bfhi[2]) - { - a(i,j,k) = a(i, j, k-1-offset); - } - } - // xhi & ylo & zhi - else if (i == dhi.x+1 && j == dlo.y-1 && k == dhi.z+1 && (bfhi[0] || bflo[1] || bfhi[2])) - { - if (bfhi[0] && bflo[1] && bfhi[2]) - { - a(i,j,k) = a(i-1-offset, j+1+offset, k-1-offset); - } - else if (bfhi[0] && bflo[1]) - { - a(i,j,k) = a(i-1-offset, j+1+offset, k); - } - else if (bfhi[0] && bfhi[2]) - { - a(i,j,k) = a(i-1-offset, j, k-1-offset); - } - else if (bflo[1] && bfhi[2]) - { - a(i,j,k) = a(i, j+1+offset, k-1-offset); - } - else if (bfhi[0]) - { - a(i,j,k) = a(i-1-offset, j, k); - } - else if (bflo[1]) - { - a(i,j,k) = a(i, j+1+offset, k); - } - else if (bfhi[2]) - { - a(i,j,k) = a(i, j, k-1-offset); - } - } - // xlo & yhi & zhi - else if (i == dlo.x-1 && j == dhi.y+1 && k 
== dhi.z+1 && (bflo[0] || bfhi[1] || bfhi[2])) - { - if (bflo[0] && bfhi[1] && bfhi[2]) - { - a(i,j,k) = a(i+1+offset, j-1-offset, k-1-offset); - } - else if (bflo[0] && bfhi[1]) - { - a(i,j,k) = a(i+1+offset, j-1-offset, k); - } - else if (bflo[0] && bfhi[2]) - { - a(i,j,k) = a(i+1+offset, j, k-1-offset); - } - else if (bfhi[1] && bfhi[2]) - { - a(i,j,k) = a(i, j-1-offset, k-1-offset); - } - else if (bflo[0]) - { - a(i,j,k) = a(i+1+offset, j, k); - } - else if (bfhi[1]) - { - a(i,j,k) = a(i, j-1-offset, k); - } - else if (bfhi[2]) - { - a(i,j,k) = a(i, j, k-1-offset); - } - } - // xhi & yhi & zhi - else if (i == dhi.x+1 && j == dhi.y+1 && k == dhi.z+1 && (bfhi[0] || bfhi[1] || bfhi[2])) - { - if (bfhi[0] && bfhi[1] && bfhi[2]) - { - a(i,j,k) = a(i-1-offset, j-1-offset, k-1-offset); - } - else if (bfhi[0] && bfhi[1]) - { - a(i,j,k) = a(i-1-offset, j-1-offset, k); - } - else if (bfhi[0] && bfhi[2]) - { - a(i,j,k) = a(i-1-offset, j, k-1-offset); - } - else if (bfhi[1] && bfhi[2]) - { - a(i,j,k) = a(i, j-1-offset, k-1-offset); - } - else if (bfhi[0]) - { - a(i,j,k) = a(i-1-offset, j, k); - } - else if (bfhi[1]) - { - a(i,j,k) = a(i, j-1-offset, k); - } - else if (bfhi[2]) - { - a(i,j,k) = a(i, j, k-1-offset); - } - } - // xlo & ylo - else if (i == dlo.x-1 && j == dlo.y-1 && (bflo[0] || bflo[1])) - { - if (bflo[0] && bflo[1]) - { - a(i,j,k) = a(i+1+offset, j+1+offset, k); - } - else if (bflo[0]) - { - a(i,j,k) = a(i+1+offset, j, k); - } - else if (bflo[1]) - { - a(i,j,k) = a(i, j+1+offset, k); - } - } - // xhi & ylo - else if (i == dhi.x+1 && j == dlo.y-1 && (bfhi[0] || bflo[1])) - { - if (bfhi[0] && bflo[1]) - { - a(i,j,k) = a(i-1-offset, j+1+offset, k); - } - else if (bfhi[0]) - { - a(i,j,k) = a(i-1-offset, j, k); - } - else if (bflo[1]) - { - a(i,j,k) = a(i, j+1+offset, k); - } - } - // xlo & yhi - else if (i == dlo.x-1 && j == dhi.y+1 && (bflo[0] || bfhi[1])) - { - if (bflo[0] && bfhi[1]) - { - a(i,j,k) = a(i+1+offset, j-1-offset, k); - } - else if (bflo[0]) - { - 
a(i,j,k) = a(i+1+offset, j, k); - } - else if (bfhi[1]) - { - a(i,j,k) = a(i, j-1-offset, k); - } - } - // xhi & yhi - else if (i == dhi.x+1 && j == dhi.y+1 && (bfhi[0] || bfhi[1])) - { - if (bfhi[0] && bfhi[1]) - { - a(i,j,k) = a(i-1-offset, j-1-offset, k); - } - else if (bfhi[0]) - { - a(i,j,k) = a(i-1-offset, j, k); - } - else if (bfhi[1]) - { - a(i,j,k) = a(i, j-1-offset, k); - } - } - // xlo & zlo - else if (i == dlo.x-1 && k == dlo.z-1 && (bflo[0] || bflo[2])) - { - if (bflo[0] && bflo[2]) - { - a(i,j,k) = a(i+1+offset, j, k+1+offset); - } - else if (bflo[0]) - { - a(i,j,k) = a(i+1+offset, j, k); - } - else if (bflo[2]) - { - a(i,j,k) = a(i, j, k+1+offset); - } - } - // xhi & zlo - else if (i == dhi.x+1 && k == dlo.z-1 && (bfhi[0] || bflo[2])) - { - if (bfhi[0] && bflo[2]) - { - a(i,j,k) = a(i-1-offset, j, k+1+offset); - } - else if (bfhi[0]) - { - a(i,j,k) = a(i-1-offset, j, k); - } - else if (bflo[2]) - { - a(i,j,k) = a(i, j, k+1+offset); - } - } - // xlo & zhi - else if (i == dlo.x-1 && k == dhi.z+1 && (bflo[0] || bfhi[2])) - { - if (bflo[0] && bfhi[2]) - { - a(i,j,k) = a(i+1+offset, j, k-1-offset); - } - else if (bflo[0]) - { - a(i,j,k) = a(i+1+offset, j, k); - } - else if (bfhi[2]) - { - a(i,j,k) = a(i, j, k-1-offset); - } - } - // xhi & zhi - else if (i == dhi.x+1 && k == dhi.z+1 && (bfhi[0] || bfhi[2])) - { - if (bfhi[0] && bfhi[2]) - { - a(i,j,k) = a(i-1-offset, j, k-1-offset); - } - else if (bfhi[0]) - { - a(i,j,k) = a(i-1-offset, j, k); - } - else if (bfhi[2]) - { - a(i,j,k) = a(i, j, k-1-offset); - } - } - // ylo & zlo - else if (j == dlo.y-1 && k == dlo.z-1 && (bflo[1] || bflo[2])) - { - if (bflo[1] && bflo[2]) - { - a(i,j,k) = a(i, j+1+offset, k+1+offset); - } - else if (bflo[1]) - { - a(i,j,k) = a(i, j+1+offset, k); - } - else if (bflo[2]) - { - a(i,j,k) = a(i, j, k+1+offset); - } - } - // yhi & zlo - else if (j == dhi.y+1 && k == dlo.z-1 && (bfhi[1] || bflo[2])) - { - if (bfhi[1] && bflo[2]) - { - a(i,j,k) = a(i, j-1-offset, k+1+offset); - } - 
else if (bfhi[1]) - { - a(i,j,k) = a(i, j-1-offset, k); - } - else if (bflo[2]) - { - a(i,j,k) = a(i, j, k+1+offset); - } - } - // ylo & zhi - else if (j == dlo.y-1 && k == dhi.z+1 && (bflo[1] || bfhi[2])) - { - if (bflo[1] && bfhi[2]) - { - a(i,j,k) = a(i, j+1+offset, k-1-offset); - } - else if (bflo[1]) - { - a(i,j,k) = a(i, j+1+offset, k); - } - else if (bfhi[2]) - { - a(i,j,k) = a(i, j, k-1-offset); - } - } - // yhi & zhi - else if (j == dhi.y+1 && k == dhi.z+1 && (bfhi[1] || bfhi[2])) - { - if (bfhi[1] && bfhi[2]) - { - a(i,j,k) = a(i, j-1-offset, k-1-offset); - } - else if (bfhi[1]) - { - a(i,j,k) = a(i, j-1-offset, k); - } - else if (bfhi[2]) - { - a(i,j,k) = a(i, j, k-1-offset); - } - } - else if (i == dlo.x-1 && bflo[0]) - { - a(i,j,k) = a(i+1+offset, j, k); - } - else if (i == dhi.x+1 && bfhi[0]) - { - a(i,j,k) = a(i-1-offset, j, k); - } - else if (j == dlo.y-1 && bflo[1]) - { - a(i,j,k) = a(i, j+1+offset, k); - } - else if (j == dhi.y+1 && bfhi[1]) - { - a(i,j,k) = a(i, j-1-offset, k); - } - else if (k == dlo.z-1 && bflo[2]) - { - a(i,j,k) = a(i, j, k+1+offset); - } - else if (k == dhi.z+1 && bfhi[2]) - { - a(i,j,k) = a(i, j, k-1-offset); - } - } - }); -} // // operator @@ -1587,138 +915,6 @@ void mlndlap_gauss_seidel_with_line_solve_aa (Box const& bx, Array4 const& } } -// -// restriction -// - -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mlndlap_restriction (int i, int j, int k, Array4 const& crse, - Array4 const& fine, Array4 const& msk) noexcept -{ - int ii = i*2; - int jj = j*2; - int kk = k*2; - if (msk(ii,jj,kk)) { - crse(i,j,k) = Real(0.0); - } else { - crse(i,j,k) = Real(1./64.)*(fine(ii-1,jj-1,kk-1)+fine(ii+1,jj-1,kk-1) - +fine(ii-1,jj+1,kk-1)+fine(ii+1,jj+1,kk-1) - +fine(ii-1,jj-1,kk+1)+fine(ii+1,jj-1,kk+1) - +fine(ii-1,jj+1,kk+1)+fine(ii+1,jj+1,kk+1)) - + Real(1./32.)*(fine(ii ,jj-1,kk-1)+fine(ii ,jj+1,kk-1) - +fine(ii ,jj-1,kk+1)+fine(ii ,jj+1,kk+1) - +fine(ii-1,jj ,kk-1)+fine(ii+1,jj ,kk-1) - +fine(ii-1,jj ,kk+1)+fine(ii+1,jj ,kk+1) - 
+fine(ii-1,jj-1,kk )+fine(ii+1,jj-1,kk ) - +fine(ii-1,jj+1,kk )+fine(ii+1,jj+1,kk )) - + Real(1./16.)*(fine(ii-1,jj,kk)+fine(ii+1,jj,kk) - +fine(ii,jj-1,kk)+fine(ii,jj+1,kk) - +fine(ii,jj,kk-1)+fine(ii,jj,kk+1)) - + Real(1./8.)*fine(ii,jj,kk); - } -} - -template -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mlndlap_restriction (int i, int j, int k, Array4 const& crse, - Array4 const& fine, Array4 const& msk, - Box const& fdom, - GpuArray const& bclo, - GpuArray const& bchi) noexcept -{ - const int ii = i*rr; - const int jj = j*rr; - const int kk = k*rr; - if (msk(ii,jj,kk)) { - crse(i,j,k) = Real(0.0); - } else { - const auto ndlo = amrex::lbound(fdom); - const auto ndhi = amrex::ubound(fdom); - Real tmp = Real(0.0); - for (int koff = -rr+1; koff <= rr-1; ++koff) { - Real wz = rr - std::abs(koff); - for (int joff = -rr+1; joff <= rr-1; ++joff) { - Real wy = rr - std::abs(joff); - for (int ioff = -rr+1; ioff <= rr-1; ++ioff) { - Real wx = rr - std::abs(ioff); - int itmp = ii + ioff; - int jtmp = jj + joff; - int ktmp = kk + koff; - if ((itmp < ndlo.x && (bclo[0] == LinOpBCType::Neumann || - bclo[0] == LinOpBCType::inflow)) || - (itmp > ndhi.x && (bchi[0] == LinOpBCType::Neumann || - bchi[0] == LinOpBCType::inflow))) { - itmp = ii - ioff; - } - if ((jtmp < ndlo.y && (bclo[1] == LinOpBCType::Neumann || - bclo[1] == LinOpBCType::inflow)) || - (jtmp > ndhi.y && (bchi[1] == LinOpBCType::Neumann || - bchi[1] == LinOpBCType::inflow))) { - jtmp = jj - joff; - } - if ((ktmp < ndlo.z && (bclo[2] == LinOpBCType::Neumann || - bclo[2] == LinOpBCType::inflow)) || - (ktmp > ndhi.z && (bchi[2] == LinOpBCType::Neumann || - bchi[2] == LinOpBCType::inflow))) { - ktmp = kk - koff; - } - tmp += wx*wy*wz*fine(itmp,jtmp,ktmp); - } - } - } - crse(i,j,k) = tmp/Real(rr*rr*rr*rr*rr*rr); - } -} - -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mlndlap_semi_restriction (int i, int j, int k, Array4 const& crse, - Array4 const& fine, Array4 const& msk, int idir) noexcept -{ - if (idir == 2) - { 
- int ii = i*2; - int jj = j*2; - int kk = k; - if (msk(ii,jj,kk)) { - crse(i,j,k) = Real(0.0); - } else { // use 2-D formula - crse(i,j,k) = Real(1./16.)*(fine(ii-1,jj-1,kk) + Real(2.)*fine(ii ,jj-1,kk) + fine(ii+1,jj-1,kk) - + Real(2.)*fine(ii-1,jj ,kk) + Real(4.)*fine(ii ,jj ,kk) + Real(2.)*fine(ii+1,jj ,kk) - + fine(ii-1,jj+1,kk) + Real(2.)*fine(ii ,jj+1,kk) + fine(ii+1,jj+1,kk)); - } - } - else if (idir == 1) - { - int ii = i*2; - int jj = j; - int kk = k*2; - if (msk(ii,jj,kk)) { - crse(i,j,k) = Real(0.0); - } else { // use 2-D formula - crse(i,j,k) = Real(1./16.)*(fine(ii-1,jj,kk-1) + Real(2.)*fine(ii ,jj,kk-1) + fine(ii+1,jj,kk-1) - + Real(2.)*fine(ii-1,jj ,kk) + Real(4.)*fine(ii ,jj,kk ) + Real(2.)*fine(ii+1,jj,kk ) - + fine(ii-1,jj,kk+1) + Real(2.)*fine(ii ,jj,kk+1) + fine(ii+1,jj,kk+1)); - } - } - else if (idir == 0) - { - int ii = i; - int jj = j*2; - int kk = k*2; - if (msk(ii,jj,kk)) { - crse(i,j,k) = Real(0.0); - } else { // use 2-D formula - crse(i,j,k) = Real(1./16.)*(fine(ii,jj-1,kk-1) + Real(2.)*fine(ii ,jj,kk-1) + fine(ii,jj+1,kk-1) - + Real(2.)*fine(ii,jj-1 ,kk) + Real(4.)*fine(ii ,jj,kk ) + Real(2.)*fine(ii,jj+1,kk ) - + fine(ii,jj-1,kk+1) + Real(2.)*fine(ii ,jj,kk+1) + fine(ii,jj+1,kk+1)); - } - } - else - { - amrex::Abort("mlndlap_semi_restriction semi direction wrong semi-direction. 
"); - } -} - // // interpolation // diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_K.H index 4e76a48689..97f8e07817 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_K.H @@ -7,57 +7,12 @@ #ifdef AMREX_USE_EB #include #endif +#include namespace amrex { namespace nodelap_detail { - struct GetNode { - AMREX_GPU_DEVICE Dim3 operator() (Dim3 const& lo, Dim3 const& len, int& offset) - { - Dim3 node; - constexpr int nsten = AMREX_D_TERM(3,*3,*3); - int icell = offset / nsten; - node.z = icell / (len.x*len.y); - node.y = (icell - node.z*(len.x*len.y)) / len.x; - node.x = (icell - node.z*(len.x*len.y)) - node.y*len.x; - node.x += lo.x; - node.y += lo.y; - node.z += lo.z; - offset -= icell*nsten; - return node; - } - }; - - struct GetNode2 { - AMREX_GPU_DEVICE Dim3 operator() (int offset, Dim3 const& node) - { - // In 2D the offsets are - // 6 7 8 - // 4 0 5 - // 1 2 3 - constexpr int nstenhalf = AMREX_SPACEDIM == 2 ? 
4 : 13; - if (offset == 0) { - return node; - } else { - if (offset <= nstenhalf) { --offset; } - Dim3 node2; - node2.z = offset / 9; - node2.y = (offset - node2.z*9) / 3; - node2.x = (offset - node2.z*9) - node2.y*3; - AMREX_D_TERM(node2.x += node.x-1;, - node2.y += node.y-1;, - node2.z += node.z-1); - return node2; - } - } - }; - - constexpr int crse_cell = 0; // Do NOT change the values - constexpr int fine_cell = 1; - constexpr int crse_node = 0; - constexpr int crse_fine_node = 1; - constexpr int fine_node = 2; #if (BL_USE_FLOAT) constexpr float eps = 1.e-30_rt; #else @@ -123,40 +78,6 @@ mlndlap_unimpose_neumann_bc (Box const& bx, Array4 const& rhs, Box const& namespace amrex { -template -void mlndlap_fillbc_cc (Box const& vbx, Array4 const& sigma, Box const& domain, - GpuArray bclo, - GpuArray bchi) noexcept -{ - GpuArray bflo{{AMREX_D_DECL(bclo[0] != LinOpBCType::Periodic, - bclo[1] != LinOpBCType::Periodic, - bclo[2] != LinOpBCType::Periodic)}}; - GpuArray bfhi{{AMREX_D_DECL(bchi[0] != LinOpBCType::Periodic, - bchi[1] != LinOpBCType::Periodic, - bchi[2] != LinOpBCType::Periodic)}}; - mlndlap_bc_doit(vbx, sigma, domain, bflo, bfhi); -} - -template -void mlndlap_applybc (Box const& vbx, Array4 const& phi, Box const& domain, - GpuArray bclo, - GpuArray bchi) noexcept -{ - GpuArray bflo{{AMREX_D_DECL(bclo[0] == LinOpBCType::Neumann || - bclo[0] == LinOpBCType::inflow, - bclo[1] == LinOpBCType::Neumann || - bclo[1] == LinOpBCType::inflow, - bclo[2] == LinOpBCType::Neumann || - bclo[2] == LinOpBCType::inflow)}}; - GpuArray bfhi{{AMREX_D_DECL(bchi[0] == LinOpBCType::Neumann || - bchi[0] == LinOpBCType::inflow, - bchi[1] == LinOpBCType::Neumann || - bchi[1] == LinOpBCType::inflow, - bchi[2] == LinOpBCType::Neumann || - bchi[2] == LinOpBCType::inflow)}}; - mlndlap_bc_doit(vbx, phi, domain, bflo, bfhi); -} - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void mlndlap_normalize_sten (int i, int j, int k, Array4 const& x, Array4 const& sten, diff --git 
a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp index 38f58b70bb..929d05dc5a 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp @@ -1,6 +1,6 @@ #include -#include +#include #include #include diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_1D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_1D_K.H new file mode 100644 index 0000000000..b842dd81b8 --- /dev/null +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_1D_K.H @@ -0,0 +1,167 @@ +#ifndef AMREX_ML_NODE_LINOP_1D_K_H_ +#define AMREX_ML_NODE_LINOP_1D_K_H_ +#include + +namespace amrex { + +template +void mlndlap_bc_doit (Box const& vbx, Array4 const& a, Box const& domain, + GpuArray const& bflo, + GpuArray const& bfhi) noexcept +{ + Box gdomain = domain; + int const idim = 0; + if (! bflo[idim]) { gdomain.growLo(idim,1); } + if (! bfhi[idim]) { gdomain.growHi(idim,1); } + + if (gdomain.strictly_contains(vbx)) { return; } + + const int offset = domain.cellCentered() ? 0 : 1; + + const auto dlo = domain.smallEnd(0); + const auto dhi = domain.bigEnd(0); + + Box const& sbox = amrex::grow(vbx,1); + AMREX_HOST_DEVICE_FOR_3D(sbox, i, j, k, + { + if (! gdomain.contains(IntVect(i))) { + if (i == dlo-1 && bflo[0]) + { + a(i,0,0) = a(i+1+offset, j, k); + } + else if (i == dhi+1 && bfhi[0]) + { + a(i,0,0) = a(i-1-offset, j, k); + } + } + }); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_restriction (int i, int, int, Array4 const& crse, + Array4 const& fine, Array4 const& msk) noexcept +{ + int ii = i*2; + if (msk(ii,0,0)) { + crse(i,0,0) = Real(0.0); + } else { + crse(i,0,0) = Real(1./4.) 
*(fine(ii-1,0,0) + + Real(2.)* fine(ii ,0,0) + + fine(ii+1,0,0)); + } +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_restriction (int i, int, int, Array4 const& crse, + Array4 const& fine, Array4 const& msk, + Box const& fdom, + GpuArray const& bclo, + GpuArray const& bchi) noexcept + +{ + const int ii = i*rr; + if (msk(ii,0,0)) { + crse(i,0,0) = Real(0.0); + } else { + const auto ndlo = fdom.smallEnd(0); + const auto ndhi = fdom.bigEnd(0); + Real tmp = Real(0.0); + for (int ioff = -rr+1; ioff <= rr-1; ++ioff) { + Real wx = rr - std::abs(ioff); + int itmp = ii + ioff; + if ((itmp < ndlo && (bclo[0] == LinOpBCType::Neumann || + bclo[0] == LinOpBCType::inflow)) || + (itmp > ndhi && (bchi[0] == LinOpBCType::Neumann || + bchi[0] == LinOpBCType::inflow))) { + itmp = ii - ioff; + } + tmp += wx*fine(itmp,0,0); + } + crse(i,0,0) = tmp*(Real(1.0)/Real(rr*rr)); + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_semi_restriction (int /*i*/, int /*j*/, int /*k*/, Array4 const&, + Array4 const&, Array4 const&, int) noexcept +{ + amrex::Abort("mlndlap_semi_restriction: not implemented in 1D"); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_set_nodal_mask (int i, int, int, Array4 const& nmsk, + Array4 const& cmsk) noexcept +{ + using namespace nodelap_detail; + + int s = cmsk(i-1,0,0) + cmsk(i,0,0); + if (s == 2*crse_cell) { + nmsk(i,0,0) = crse_node; + } else if (s == 2*fine_cell) { + nmsk(i,0,0) = fine_node; + } else { + nmsk(i,0,0) = crse_fine_node; + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_set_dirichlet_mask (Box const& bx, Array4 const& dmsk, + Array4 const& omsk, Box const& dom, + GpuArray const& bclo, + GpuArray const& bchi) noexcept +{ + const auto lo = bx.smallEnd(0); + const auto hi = bx.bigEnd(0); + AMREX_PRAGMA_SIMD + for (int i = lo; i <= hi; ++i) { + if (!dmsk(i,0,0)) { + dmsk(i,0,0) = (omsk(i-1,0,0) == 1 || omsk(i,0,0) == 1); + } + } + + const auto domlo = dom.smallEnd(0); + const auto domhi 
= dom.bigEnd(0); + + if (bclo[0] == LinOpBCType::Dirichlet && lo == domlo) { + dmsk(lo,0,0) = 1; + } + + if (bchi[0] == LinOpBCType::Dirichlet && hi == domhi) { + dmsk(hi,0,0) = 1; + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_set_dot_mask (Box const& bx, Array4 const& dmsk, + Array4 const& omsk, Box const& dom, + GpuArray const& bclo, + GpuArray const& bchi) noexcept +{ + const auto lo = bx.smallEnd(0); + const auto hi = bx.bigEnd(0); + + AMREX_PRAGMA_SIMD + for (int i = lo; i <= hi; ++i) { + dmsk(i,0,0) = static_cast(omsk(i,0,0)); + } + + const auto domlo = dom.smallEnd(0); + const auto domhi = dom.bigEnd(0); + + if ((bclo[0] == LinOpBCType::Neumann || bclo[0] == LinOpBCType::inflow) + && lo == domlo) + { + dmsk(lo,0,0) *= Real(0.5); + } + + if ((bchi[0] == LinOpBCType::Neumann || bchi[0] == LinOpBCType::inflow) + && hi == domhi) + { + dmsk(hi,0,0) *= Real(0.5); + } +} + +} + +#endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_2D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_2D_K.H new file mode 100644 index 0000000000..3d8746cf05 --- /dev/null +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_2D_K.H @@ -0,0 +1,317 @@ +#ifndef AMREX_ML_NODE_LINOP_2D_K_H_ +#define AMREX_ML_NODE_LINOP_2D_K_H_ +#include + +namespace amrex { + +template +void mlndlap_bc_doit (Box const& vbx, Array4 const& a, Box const& domain, + GpuArray const& bflo, + GpuArray const& bfhi) noexcept +{ + Box gdomain = domain; + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + if (! bflo[idim]) { gdomain.growLo(idim,1); } + if (! bfhi[idim]) { gdomain.growHi(idim,1); } + } + + if (gdomain.strictly_contains(vbx)) { return; } + + const int offset = domain.cellCentered() ? 0 : 1; + + const auto dlo = amrex::lbound(domain); + const auto dhi = amrex::ubound(domain); + + Box const& sbox = amrex::grow(vbx,1); + AMREX_HOST_DEVICE_FOR_3D(sbox, i, j, k, + { + if (! 
gdomain.contains(IntVect(i,j))) { + // xlo & ylo + if (i == dlo.x-1 && j == dlo.y-1 && (bflo[0] || bflo[1])) + { + if (bflo[0] && bflo[1]) + { + a(i,j,k) = a(i+1+offset, j+1+offset, k); + } + else if (bflo[0]) + { + a(i,j,k) = a(i+1+offset, j, k); + } + else if (bflo[1]) + { + a(i,j,k) = a(i, j+1+offset, k); + } + } + // xhi & ylo + else if (i == dhi.x+1 && j == dlo.y-1 && (bfhi[0] || bflo[1])) + { + if (bfhi[0] && bflo[1]) + { + a(i,j,k) = a(i-1-offset, j+1+offset, k); + } + else if (bfhi[0]) + { + a(i,j,k) = a(i-1-offset, j, k); + } + else if (bflo[1]) + { + a(i,j,k) = a(i, j+1+offset, k); + } + } + // xlo & yhi + else if (i == dlo.x-1 && j == dhi.y+1 && (bflo[0] || bfhi[1])) + { + if (bflo[0] && bfhi[1]) + { + a(i,j,k) = a(i+1+offset, j-1-offset, k); + } + else if (bflo[0]) + { + a(i,j,k) = a(i+1+offset, j, k); + } + else if (bfhi[1]) + { + a(i,j,k) = a(i, j-1-offset, k); + } + } + // xhi & yhi + else if (i == dhi.x+1 && j == dhi.y+1 && (bfhi[0] || bfhi[1])) + { + if (bfhi[0] && bfhi[1]) + { + a(i,j,k) = a(i-1-offset, j-1-offset, k); + } + else if (bfhi[0]) + { + a(i,j,k) = a(i-1-offset, j, k); + } + else if (bfhi[1]) + { + a(i,j,k) = a(i, j-1-offset, k); + } + } + else if (i == dlo.x-1 && bflo[0]) + { + a(i,j,k) = a(i+1+offset, j, k); + } + else if (i == dhi.x+1 && bfhi[0]) + { + a(i,j,k) = a(i-1-offset, j, k); + } + else if (j == dlo.y-1 && bflo[1]) + { + a(i,j,k) = a(i, j+1+offset, k); + } + else if (j == dhi.y+1 && bfhi[1]) + { + a(i,j,k) = a(i, j-1-offset, k); + } + } + }); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_restriction (int i, int j, int k, Array4 const& crse, + Array4 const& fine, Array4 const& msk) noexcept +{ + int ii = i*2; + int jj = j*2; + int kk = 0; + if (msk(ii,jj,kk)) { + crse(i,j,k) = Real(0.0); + } else { + crse(i,j,k) = Real(1./16.)*(fine(ii-1,jj-1,kk) + Real(2.)*fine(ii ,jj-1,kk) + fine(ii+1,jj-1,kk) + + Real(2.)*fine(ii-1,jj ,kk) + Real(4.)*fine(ii ,jj ,kk) + Real(2.)*fine(ii+1,jj ,kk) + + fine(ii-1,jj+1,kk) + 
Real(2.)*fine(ii ,jj+1,kk) + fine(ii+1,jj+1,kk)); + } +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_restriction (int i, int j, int k, Array4 const& crse, + Array4 const& fine, Array4 const& msk, + Box const& fdom, + GpuArray const& bclo, + GpuArray const& bchi) noexcept +{ + const int ii = i*rr; + const int jj = j*rr; + if (msk(ii,jj,0)) { + crse(i,j,k) = Real(0.0); + } else { + const auto ndlo = amrex::lbound(fdom); + const auto ndhi = amrex::ubound(fdom); + Real tmp = Real(0.0); + for (int joff = -rr+1; joff <= rr-1; ++joff) { + Real wy = rr - std::abs(joff); + for (int ioff = -rr+1; ioff <= rr-1; ++ioff) { + Real wx = rr - std::abs(ioff); + int itmp = ii + ioff; + int jtmp = jj + joff; + if ((itmp < ndlo.x && (bclo[0] == LinOpBCType::Neumann || + bclo[0] == LinOpBCType::inflow)) || + (itmp > ndhi.x && (bchi[0] == LinOpBCType::Neumann || + bchi[0] == LinOpBCType::inflow))) { + itmp = ii - ioff; + } + if ((jtmp < ndlo.y && (bclo[1] == LinOpBCType::Neumann || + bclo[1] == LinOpBCType::inflow)) || + (jtmp > ndhi.y && (bchi[1] == LinOpBCType::Neumann || + bchi[1] == LinOpBCType::inflow))) { + jtmp = jj - joff; + } + tmp += wx*wy*fine(itmp,jtmp,0); + } + } + crse(i,j,k) = tmp*(Real(1.0)/Real(rr*rr*rr*rr)); + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_semi_restriction (int i, int j, int k, Array4 const& crse, + Array4 const& fine, Array4 const& msk, int idir) noexcept +{ + int kk = 0; + if (idir == 1) { + int ii = i*2; + int jj = j; + if (msk(ii,jj,kk)) { + crse(i,j,k) = Real(0.0); + } else { + crse(i,j,k) = Real(1./4.)*(fine(ii-1,jj,kk) + Real(2.)*fine(ii,jj,kk) + fine(ii+1,jj,kk)); + } + } else if (idir == 0) { + int ii = i; + int jj = j*2; + if (msk(ii,jj,kk)) { + crse(i,j,k) = Real(0.0); + } else { + crse(i,j,k) = Real(1./4.)*(fine(ii,jj-1,kk) + Real(2.)*fine(ii,jj,kk) + fine(ii,jj+1,kk)); + } + } +} + +// +// masks +// + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_set_nodal_mask (int i, int j, int k, Array4 
const& nmsk, + Array4 const& cmsk) noexcept +{ + using namespace nodelap_detail; + + int s = cmsk(i-1,j-1,k) + cmsk(i ,j-1,k) + + cmsk(i-1,j ,k) + cmsk(i ,j ,k); + if (s == 4*crse_cell) { + nmsk(i,j,k) = crse_node; + } + else if (s == 4*fine_cell) { + nmsk(i,j,k) = fine_node; + } else { + nmsk(i,j,k) = crse_fine_node; + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_set_dirichlet_mask (Box const& bx, Array4 const& dmsk, + Array4 const& omsk, Box const& dom, + GpuArray const& bclo, + GpuArray const& bchi) noexcept +{ + const auto lo = amrex::lbound(bx); + const auto hi = amrex::ubound(bx); + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + if (!dmsk(i,j,0)) { + dmsk(i,j,0) = (omsk(i-1,j-1,0) == 1 || omsk(i,j-1,0) == 1 || + omsk(i-1,j ,0) == 1 || omsk(i,j ,0) == 1); + } + }} + + const auto domlo = amrex::lbound(dom); + const auto domhi = amrex::ubound(dom); + + if (bclo[0] == LinOpBCType::Dirichlet && lo.x == domlo.x) { + for (int j = lo.y; j <= hi.y; ++j) { + dmsk(lo.x,j,0) = 1; + } + } + + if (bchi[0] == LinOpBCType::Dirichlet && hi.x == domhi.x) { + for (int j = lo.y; j <= hi.y; ++j) { + dmsk(hi.x,j,0) = 1; + } + } + + if (bclo[1] == LinOpBCType::Dirichlet && lo.y == domlo.y) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + dmsk(i,lo.y,0) = 1; + } + } + + if (bchi[1] == LinOpBCType::Dirichlet && hi.y == domhi.y) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + dmsk(i,hi.y,0) = 1; + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_set_dot_mask (Box const& bx, Array4 const& dmsk, + Array4 const& omsk, Box const& dom, + GpuArray const& bclo, + GpuArray const& bchi) noexcept +{ + const auto lo = amrex::lbound(bx); + const auto hi = amrex::ubound(bx); + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + dmsk(i,j,0) = static_cast(omsk(i,j,0)); + }} + + const auto domlo = amrex::lbound(dom); + const auto domhi = 
amrex::ubound(dom); + + if ((bclo[0] == LinOpBCType::Neumann || bclo[0] == LinOpBCType::inflow) + && lo.x == domlo.x) + { + for (int j = lo.y; j <= hi.y; ++j) { + dmsk(lo.x,j,0) *= Real(0.5); + } + } + + if ((bchi[0] == LinOpBCType::Neumann || bchi[0] == LinOpBCType::inflow) + && hi.x == domhi.x) + { + for (int j = lo.y; j <= hi.y; ++j) { + dmsk(hi.x,j,0) *= Real(0.5); + } + } + + if ((bclo[1] == LinOpBCType::Neumann || bclo[1] == LinOpBCType::inflow) + && lo.y == domlo.y) + { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + dmsk(i,lo.y,0) *= Real(0.5); + } + } + + if ((bchi[1] == LinOpBCType::Neumann || bchi[1] == LinOpBCType::inflow) + && hi.y == domhi.y) + { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + dmsk(i,hi.y,0) *= Real(0.5); + } + } +} + +} + +#endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_3D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_3D_K.H new file mode 100644 index 0000000000..976a16c7aa --- /dev/null +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_3D_K.H @@ -0,0 +1,810 @@ +#ifndef AMREX_ML_NODE_LINOP_3D_K_H_ +#define AMREX_ML_NODE_LINOP_3D_K_H_ +#include + +namespace amrex { + +template +inline void mlndlap_bc_doit (Box const& vbx, Array4 const& a, Box const& domain, + GpuArray const& bflo, + GpuArray const& bfhi) noexcept +{ + Box gdomain = domain; + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + if (! bflo[idim]) { gdomain.growLo(idim,1); } + if (! bfhi[idim]) { gdomain.growHi(idim,1); } + } + + if (gdomain.strictly_contains(vbx)) { return; } + + const int offset = domain.cellCentered() ? 0 : 1; + + const auto dlo = amrex::lbound(domain); + const auto dhi = amrex::ubound(domain); + + Box const& sbox = amrex::grow(vbx,1); + AMREX_HOST_DEVICE_FOR_3D(sbox, i, j, k, + { + if (! 
gdomain.contains(IntVect(i,j,k))) { + // xlo & ylo & zlo + if (i == dlo.x-1 && j == dlo.y-1 && k == dlo.z-1 && (bflo[0] || bflo[1] || bflo[2])) + { + if (bflo[0] && bflo[1] && bflo[2]) + { + a(i,j,k) = a(i+1+offset, j+1+offset, k+1+offset); + } + else if (bflo[0] && bflo[1]) + { + a(i,j,k) = a(i+1+offset, j+1+offset, k); + } + else if (bflo[0] && bflo[2]) + { + a(i,j,k) = a(i+1+offset, j, k+1+offset); + } + else if (bflo[1] && bflo[2]) + { + a(i,j,k) = a(i, j+1+offset, k+1+offset); + } + else if (bflo[0]) + { + a(i,j,k) = a(i+1+offset, j, k); + } + else if (bflo[1]) + { + a(i,j,k) = a(i, j+1+offset, k); + } + else if (bflo[2]) + { + a(i,j,k) = a(i, j, k+1+offset); + } + } + // xhi & ylo & zlo + else if (i == dhi.x+1 && j == dlo.y-1 && k == dlo.z-1 && (bfhi[0] || bflo[1] || bflo[2])) + { + if (bfhi[0] && bflo[1] && bflo[2]) + { + a(i,j,k) = a(i-1-offset, j+1+offset, k+1+offset); + } + else if (bfhi[0] && bflo[1]) + { + a(i,j,k) = a(i-1-offset, j+1+offset, k); + } + else if (bfhi[0] && bflo[2]) + { + a(i,j,k) = a(i-1-offset, j, k+1+offset); + } + else if (bflo[1] && bflo[2]) + { + a(i,j,k) = a(i, j+1+offset, k+1+offset); + } + else if (bfhi[0]) + { + a(i,j,k) = a(i-1-offset, j, k); + } + else if (bflo[1]) + { + a(i,j,k) = a(i, j+1+offset, k); + } + else if (bflo[2]) + { + a(i,j,k) = a(i, j, k+1+offset); + } + } + // xlo & yhi & zlo + else if (i == dlo.x-1 && j == dhi.y+1 && k == dlo.z-1 && (bflo[0] || bfhi[1] || bflo[2])) + { + if (bflo[0] && bfhi[1] && bflo[2]) + { + a(i,j,k) = a(i+1+offset, j-1-offset, k+1+offset); + } + else if (bflo[0] && bfhi[1]) + { + a(i,j,k) = a(i+1+offset, j-1-offset, k); + } + else if (bflo[0] && bflo[2]) + { + a(i,j,k) = a(i+1+offset, j, k+1+offset); + } + else if (bfhi[1] && bflo[2]) + { + a(i,j,k) = a(i, j-1-offset, k+1+offset); + } + else if (bflo[0]) + { + a(i,j,k) = a(i+1+offset, j, k); + } + else if (bfhi[1]) + { + a(i,j,k) = a(i, j-1-offset, k); + } + else if (bflo[2]) + { + a(i,j,k) = a(i, j, k+1+offset); + } + } + // xhi & yhi & 
zlo + else if (i == dhi.x+1 && j == dhi.y+1 && k == dlo.z-1 && (bfhi[0] || bfhi[1] || bflo[2])) + { + if (bfhi[0] && bfhi[1] && bflo[2]) + { + a(i,j,k) = a(i-1-offset, j-1-offset, k+1+offset); + } + else if (bfhi[0] && bfhi[1]) + { + a(i,j,k) = a(i-1-offset, j-1-offset, k); + } + else if (bfhi[0] && bflo[2]) + { + a(i,j,k) = a(i-1-offset, j, k+1+offset); + } + else if (bfhi[1] && bflo[2]) + { + a(i,j,k) = a(i, j-1-offset, k+1+offset); + } + else if (bfhi[0]) + { + a(i,j,k) = a(i-1-offset, j, k); + } + else if (bfhi[1]) + { + a(i,j,k) = a(i, j-1-offset, k); + } + else if (bflo[2]) + { + a(i,j,k) = a(i, j, k+1+offset); + } + } + // xlo & ylo & zhi + else if (i == dlo.x-1 && j == dlo.y-1 && k == dhi.z+1 && (bflo[0] || bflo[1] || bfhi[2])) + { + if (bflo[0] && bflo[1] && bfhi[2]) + { + a(i,j,k) = a(i+1+offset, j+1+offset, k-1-offset); + } + else if (bflo[0] && bflo[1]) + { + a(i,j,k) = a(i+1+offset, j+1+offset, k); + } + else if (bflo[0] && bfhi[2]) + { + a(i,j,k) = a(i+1+offset, j, k-1-offset); + } + else if (bflo[1] && bfhi[2]) + { + a(i,j,k) = a(i, j+1+offset, k-1-offset); + } + else if (bflo[0]) + { + a(i,j,k) = a(i+1+offset, j, k); + } + else if (bflo[1]) + { + a(i,j,k) = a(i, j+1+offset, k); + } + else if (bfhi[2]) + { + a(i,j,k) = a(i, j, k-1-offset); + } + } + // xhi & ylo & zhi + else if (i == dhi.x+1 && j == dlo.y-1 && k == dhi.z+1 && (bfhi[0] || bflo[1] || bfhi[2])) + { + if (bfhi[0] && bflo[1] && bfhi[2]) + { + a(i,j,k) = a(i-1-offset, j+1+offset, k-1-offset); + } + else if (bfhi[0] && bflo[1]) + { + a(i,j,k) = a(i-1-offset, j+1+offset, k); + } + else if (bfhi[0] && bfhi[2]) + { + a(i,j,k) = a(i-1-offset, j, k-1-offset); + } + else if (bflo[1] && bfhi[2]) + { + a(i,j,k) = a(i, j+1+offset, k-1-offset); + } + else if (bfhi[0]) + { + a(i,j,k) = a(i-1-offset, j, k); + } + else if (bflo[1]) + { + a(i,j,k) = a(i, j+1+offset, k); + } + else if (bfhi[2]) + { + a(i,j,k) = a(i, j, k-1-offset); + } + } + // xlo & yhi & zhi + else if (i == dlo.x-1 && j == dhi.y+1 && k 
== dhi.z+1 && (bflo[0] || bfhi[1] || bfhi[2])) + { + if (bflo[0] && bfhi[1] && bfhi[2]) + { + a(i,j,k) = a(i+1+offset, j-1-offset, k-1-offset); + } + else if (bflo[0] && bfhi[1]) + { + a(i,j,k) = a(i+1+offset, j-1-offset, k); + } + else if (bflo[0] && bfhi[2]) + { + a(i,j,k) = a(i+1+offset, j, k-1-offset); + } + else if (bfhi[1] && bfhi[2]) + { + a(i,j,k) = a(i, j-1-offset, k-1-offset); + } + else if (bflo[0]) + { + a(i,j,k) = a(i+1+offset, j, k); + } + else if (bfhi[1]) + { + a(i,j,k) = a(i, j-1-offset, k); + } + else if (bfhi[2]) + { + a(i,j,k) = a(i, j, k-1-offset); + } + } + // xhi & yhi & zhi + else if (i == dhi.x+1 && j == dhi.y+1 && k == dhi.z+1 && (bfhi[0] || bfhi[1] || bfhi[2])) + { + if (bfhi[0] && bfhi[1] && bfhi[2]) + { + a(i,j,k) = a(i-1-offset, j-1-offset, k-1-offset); + } + else if (bfhi[0] && bfhi[1]) + { + a(i,j,k) = a(i-1-offset, j-1-offset, k); + } + else if (bfhi[0] && bfhi[2]) + { + a(i,j,k) = a(i-1-offset, j, k-1-offset); + } + else if (bfhi[1] && bfhi[2]) + { + a(i,j,k) = a(i, j-1-offset, k-1-offset); + } + else if (bfhi[0]) + { + a(i,j,k) = a(i-1-offset, j, k); + } + else if (bfhi[1]) + { + a(i,j,k) = a(i, j-1-offset, k); + } + else if (bfhi[2]) + { + a(i,j,k) = a(i, j, k-1-offset); + } + } + // xlo & ylo + else if (i == dlo.x-1 && j == dlo.y-1 && (bflo[0] || bflo[1])) + { + if (bflo[0] && bflo[1]) + { + a(i,j,k) = a(i+1+offset, j+1+offset, k); + } + else if (bflo[0]) + { + a(i,j,k) = a(i+1+offset, j, k); + } + else if (bflo[1]) + { + a(i,j,k) = a(i, j+1+offset, k); + } + } + // xhi & ylo + else if (i == dhi.x+1 && j == dlo.y-1 && (bfhi[0] || bflo[1])) + { + if (bfhi[0] && bflo[1]) + { + a(i,j,k) = a(i-1-offset, j+1+offset, k); + } + else if (bfhi[0]) + { + a(i,j,k) = a(i-1-offset, j, k); + } + else if (bflo[1]) + { + a(i,j,k) = a(i, j+1+offset, k); + } + } + // xlo & yhi + else if (i == dlo.x-1 && j == dhi.y+1 && (bflo[0] || bfhi[1])) + { + if (bflo[0] && bfhi[1]) + { + a(i,j,k) = a(i+1+offset, j-1-offset, k); + } + else if (bflo[0]) + { + 
a(i,j,k) = a(i+1+offset, j, k); + } + else if (bfhi[1]) + { + a(i,j,k) = a(i, j-1-offset, k); + } + } + // xhi & yhi + else if (i == dhi.x+1 && j == dhi.y+1 && (bfhi[0] || bfhi[1])) + { + if (bfhi[0] && bfhi[1]) + { + a(i,j,k) = a(i-1-offset, j-1-offset, k); + } + else if (bfhi[0]) + { + a(i,j,k) = a(i-1-offset, j, k); + } + else if (bfhi[1]) + { + a(i,j,k) = a(i, j-1-offset, k); + } + } + // xlo & zlo + else if (i == dlo.x-1 && k == dlo.z-1 && (bflo[0] || bflo[2])) + { + if (bflo[0] && bflo[2]) + { + a(i,j,k) = a(i+1+offset, j, k+1+offset); + } + else if (bflo[0]) + { + a(i,j,k) = a(i+1+offset, j, k); + } + else if (bflo[2]) + { + a(i,j,k) = a(i, j, k+1+offset); + } + } + // xhi & zlo + else if (i == dhi.x+1 && k == dlo.z-1 && (bfhi[0] || bflo[2])) + { + if (bfhi[0] && bflo[2]) + { + a(i,j,k) = a(i-1-offset, j, k+1+offset); + } + else if (bfhi[0]) + { + a(i,j,k) = a(i-1-offset, j, k); + } + else if (bflo[2]) + { + a(i,j,k) = a(i, j, k+1+offset); + } + } + // xlo & zhi + else if (i == dlo.x-1 && k == dhi.z+1 && (bflo[0] || bfhi[2])) + { + if (bflo[0] && bfhi[2]) + { + a(i,j,k) = a(i+1+offset, j, k-1-offset); + } + else if (bflo[0]) + { + a(i,j,k) = a(i+1+offset, j, k); + } + else if (bfhi[2]) + { + a(i,j,k) = a(i, j, k-1-offset); + } + } + // xhi & zhi + else if (i == dhi.x+1 && k == dhi.z+1 && (bfhi[0] || bfhi[2])) + { + if (bfhi[0] && bfhi[2]) + { + a(i,j,k) = a(i-1-offset, j, k-1-offset); + } + else if (bfhi[0]) + { + a(i,j,k) = a(i-1-offset, j, k); + } + else if (bfhi[2]) + { + a(i,j,k) = a(i, j, k-1-offset); + } + } + // ylo & zlo + else if (j == dlo.y-1 && k == dlo.z-1 && (bflo[1] || bflo[2])) + { + if (bflo[1] && bflo[2]) + { + a(i,j,k) = a(i, j+1+offset, k+1+offset); + } + else if (bflo[1]) + { + a(i,j,k) = a(i, j+1+offset, k); + } + else if (bflo[2]) + { + a(i,j,k) = a(i, j, k+1+offset); + } + } + // yhi & zlo + else if (j == dhi.y+1 && k == dlo.z-1 && (bfhi[1] || bflo[2])) + { + if (bfhi[1] && bflo[2]) + { + a(i,j,k) = a(i, j-1-offset, k+1+offset); + } + 
else if (bfhi[1]) + { + a(i,j,k) = a(i, j-1-offset, k); + } + else if (bflo[2]) + { + a(i,j,k) = a(i, j, k+1+offset); + } + } + // ylo & zhi + else if (j == dlo.y-1 && k == dhi.z+1 && (bflo[1] || bfhi[2])) + { + if (bflo[1] && bfhi[2]) + { + a(i,j,k) = a(i, j+1+offset, k-1-offset); + } + else if (bflo[1]) + { + a(i,j,k) = a(i, j+1+offset, k); + } + else if (bfhi[2]) + { + a(i,j,k) = a(i, j, k-1-offset); + } + } + // yhi & zhi + else if (j == dhi.y+1 && k == dhi.z+1 && (bfhi[1] || bfhi[2])) + { + if (bfhi[1] && bfhi[2]) + { + a(i,j,k) = a(i, j-1-offset, k-1-offset); + } + else if (bfhi[1]) + { + a(i,j,k) = a(i, j-1-offset, k); + } + else if (bfhi[2]) + { + a(i,j,k) = a(i, j, k-1-offset); + } + } + else if (i == dlo.x-1 && bflo[0]) + { + a(i,j,k) = a(i+1+offset, j, k); + } + else if (i == dhi.x+1 && bfhi[0]) + { + a(i,j,k) = a(i-1-offset, j, k); + } + else if (j == dlo.y-1 && bflo[1]) + { + a(i,j,k) = a(i, j+1+offset, k); + } + else if (j == dhi.y+1 && bfhi[1]) + { + a(i,j,k) = a(i, j-1-offset, k); + } + else if (k == dlo.z-1 && bflo[2]) + { + a(i,j,k) = a(i, j, k+1+offset); + } + else if (k == dhi.z+1 && bfhi[2]) + { + a(i,j,k) = a(i, j, k-1-offset); + } + } + }); +} + +// +// restriction +// + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_restriction (int i, int j, int k, Array4 const& crse, + Array4 const& fine, Array4 const& msk) noexcept +{ + int ii = i*2; + int jj = j*2; + int kk = k*2; + if (msk(ii,jj,kk)) { + crse(i,j,k) = Real(0.0); + } else { + crse(i,j,k) = Real(1./64.)*(fine(ii-1,jj-1,kk-1)+fine(ii+1,jj-1,kk-1) + +fine(ii-1,jj+1,kk-1)+fine(ii+1,jj+1,kk-1) + +fine(ii-1,jj-1,kk+1)+fine(ii+1,jj-1,kk+1) + +fine(ii-1,jj+1,kk+1)+fine(ii+1,jj+1,kk+1)) + + Real(1./32.)*(fine(ii ,jj-1,kk-1)+fine(ii ,jj+1,kk-1) + +fine(ii ,jj-1,kk+1)+fine(ii ,jj+1,kk+1) + +fine(ii-1,jj ,kk-1)+fine(ii+1,jj ,kk-1) + +fine(ii-1,jj ,kk+1)+fine(ii+1,jj ,kk+1) + +fine(ii-1,jj-1,kk )+fine(ii+1,jj-1,kk ) + +fine(ii-1,jj+1,kk )+fine(ii+1,jj+1,kk )) + + 
Real(1./16.)*(fine(ii-1,jj,kk)+fine(ii+1,jj,kk) + +fine(ii,jj-1,kk)+fine(ii,jj+1,kk) + +fine(ii,jj,kk-1)+fine(ii,jj,kk+1)) + + Real(1./8.)*fine(ii,jj,kk); + } +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_restriction (int i, int j, int k, Array4 const& crse, + Array4 const& fine, Array4 const& msk, + Box const& fdom, + GpuArray const& bclo, + GpuArray const& bchi) noexcept +{ + const int ii = i*rr; + const int jj = j*rr; + const int kk = k*rr; + if (msk(ii,jj,kk)) { + crse(i,j,k) = Real(0.0); + } else { + const auto ndlo = amrex::lbound(fdom); + const auto ndhi = amrex::ubound(fdom); + Real tmp = Real(0.0); + for (int koff = -rr+1; koff <= rr-1; ++koff) { + Real wz = rr - std::abs(koff); + for (int joff = -rr+1; joff <= rr-1; ++joff) { + Real wy = rr - std::abs(joff); + for (int ioff = -rr+1; ioff <= rr-1; ++ioff) { + Real wx = rr - std::abs(ioff); + int itmp = ii + ioff; + int jtmp = jj + joff; + int ktmp = kk + koff; + if ((itmp < ndlo.x && (bclo[0] == LinOpBCType::Neumann || + bclo[0] == LinOpBCType::inflow)) || + (itmp > ndhi.x && (bchi[0] == LinOpBCType::Neumann || + bchi[0] == LinOpBCType::inflow))) { + itmp = ii - ioff; + } + if ((jtmp < ndlo.y && (bclo[1] == LinOpBCType::Neumann || + bclo[1] == LinOpBCType::inflow)) || + (jtmp > ndhi.y && (bchi[1] == LinOpBCType::Neumann || + bchi[1] == LinOpBCType::inflow))) { + jtmp = jj - joff; + } + if ((ktmp < ndlo.z && (bclo[2] == LinOpBCType::Neumann || + bclo[2] == LinOpBCType::inflow)) || + (ktmp > ndhi.z && (bchi[2] == LinOpBCType::Neumann || + bchi[2] == LinOpBCType::inflow))) { + ktmp = kk - koff; + } + tmp += wx*wy*wz*fine(itmp,jtmp,ktmp); + } + } + } + crse(i,j,k) = tmp/Real(rr*rr*rr*rr*rr*rr); + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_semi_restriction (int i, int j, int k, Array4 const& crse, + Array4 const& fine, Array4 const& msk, int idir) noexcept +{ + if (idir == 2) + { + int ii = i*2; + int jj = j*2; + int kk = k; + if (msk(ii,jj,kk)) { + crse(i,j,k) = 
Real(0.0); + } else { // use 2-D formula + crse(i,j,k) = Real(1./16.)*(fine(ii-1,jj-1,kk) + Real(2.)*fine(ii ,jj-1,kk) + fine(ii+1,jj-1,kk) + + Real(2.)*fine(ii-1,jj ,kk) + Real(4.)*fine(ii ,jj ,kk) + Real(2.)*fine(ii+1,jj ,kk) + + fine(ii-1,jj+1,kk) + Real(2.)*fine(ii ,jj+1,kk) + fine(ii+1,jj+1,kk)); + } + } + else if (idir == 1) + { + int ii = i*2; + int jj = j; + int kk = k*2; + if (msk(ii,jj,kk)) { + crse(i,j,k) = Real(0.0); + } else { // use 2-D formula + crse(i,j,k) = Real(1./16.)*(fine(ii-1,jj,kk-1) + Real(2.)*fine(ii ,jj,kk-1) + fine(ii+1,jj,kk-1) + + Real(2.)*fine(ii-1,jj ,kk) + Real(4.)*fine(ii ,jj,kk ) + Real(2.)*fine(ii+1,jj,kk ) + + fine(ii-1,jj,kk+1) + Real(2.)*fine(ii ,jj,kk+1) + fine(ii+1,jj,kk+1)); + } + } + else if (idir == 0) + { + int ii = i; + int jj = j*2; + int kk = k*2; + if (msk(ii,jj,kk)) { + crse(i,j,k) = Real(0.0); + } else { // use 2-D formula + crse(i,j,k) = Real(1./16.)*(fine(ii,jj-1,kk-1) + Real(2.)*fine(ii ,jj,kk-1) + fine(ii,jj+1,kk-1) + + Real(2.)*fine(ii,jj-1 ,kk) + Real(4.)*fine(ii ,jj,kk ) + Real(2.)*fine(ii,jj+1,kk ) + + fine(ii,jj-1,kk+1) + Real(2.)*fine(ii ,jj,kk+1) + fine(ii,jj+1,kk+1)); + } + } + else + { + amrex::Abort("mlndlap_semi_restriction semi direction wrong semi-direction. 
"); + } +} + +// +// masks +// + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_set_nodal_mask (int i, int j, int k, Array4 const& nmsk, + Array4 const& cmsk) noexcept +{ + using namespace nodelap_detail; + + int s = cmsk(i-1,j-1,k-1) + cmsk(i ,j-1,k-1) + + cmsk(i-1,j ,k-1) + cmsk(i ,j ,k-1) + + cmsk(i-1,j-1,k ) + cmsk(i ,j-1,k ) + + cmsk(i-1,j ,k ) + cmsk(i ,j ,k ); + if (s == 8*crse_cell) { + nmsk(i,j,k) = crse_node; + } + else if (s == 8*fine_cell) { + nmsk(i,j,k) = fine_node; + } else { + nmsk(i,j,k) = crse_fine_node; + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_set_dirichlet_mask (Box const& bx, Array4 const& dmsk, + Array4 const& omsk, Box const& dom, + GpuArray const& bclo, + GpuArray const& bchi) noexcept +{ + const auto lo = amrex::lbound(bx); + const auto hi = amrex::ubound(bx); + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + if (!dmsk(i,j,k)) { + dmsk(i,j,k) = (omsk(i-1,j-1,k-1) == 1 || omsk(i,j-1,k-1) == 1 || + omsk(i-1,j ,k-1) == 1 || omsk(i,j ,k-1) == 1 || + omsk(i-1,j-1,k ) == 1 || omsk(i,j-1,k ) == 1 || + omsk(i-1,j ,k ) == 1 || omsk(i,j ,k ) == 1); + } + }}} + + const auto domlo = amrex::lbound(dom); + const auto domhi = amrex::ubound(dom); + + if (bclo[0] == LinOpBCType::Dirichlet && lo.x == domlo.x) { + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + dmsk(lo.x,j,k) = 1; + }} + } + + if (bchi[0] == LinOpBCType::Dirichlet && hi.x == domhi.x) { + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + dmsk(hi.x,j,k) = 1; + }} + } + + if (bclo[1] == LinOpBCType::Dirichlet && lo.y == domlo.y) { + for (int k = lo.z; k <= hi.z; ++k) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + dmsk(i,lo.y,k) = 1; + }} + } + + if (bchi[1] == LinOpBCType::Dirichlet && hi.y == domhi.y) { + for (int k = lo.z; k <= hi.z; ++k) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + 
dmsk(i,hi.y,k) = 1; + }} + } + + if (bclo[2] == LinOpBCType::Dirichlet && lo.z == domlo.z) { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + dmsk(i,j,lo.z) = 1; + }} + } + + if (bchi[2] == LinOpBCType::Dirichlet && hi.z == domhi.z) { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + dmsk(i,j,hi.z) = 1; + }} + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_set_dot_mask (Box const& bx, Array4 const& dmsk, + Array4 const& omsk, Box const& dom, + GpuArray const& bclo, + GpuArray const& bchi) noexcept +{ + const auto lo = amrex::lbound(bx); + const auto hi = amrex::ubound(bx); + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + dmsk(i,j,k) = static_cast(omsk(i,j,k)); + }}} + + const auto domlo = amrex::lbound(dom); + const auto domhi = amrex::ubound(dom); + + if ((bclo[0] == LinOpBCType::Neumann || bclo[0] == LinOpBCType::inflow) + && lo.x == domlo.x) + { + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + dmsk(lo.x,j,k) *= Real(0.5); + }} + } + + if ((bchi[0] == LinOpBCType::Neumann || bchi[0] == LinOpBCType::inflow) + && hi.x == domhi.x) + { + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + dmsk(hi.x,j,k) *= Real(0.5); + }} + } + + if ((bclo[1] == LinOpBCType::Neumann || bclo[1] == LinOpBCType::inflow) + && lo.y == domlo.y) + { + for (int k = lo.z; k <= hi.z; ++k) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + dmsk(i,lo.y,k) *= Real(0.5); + }} + } + + if ((bchi[1] == LinOpBCType::Neumann || bchi[1] == LinOpBCType::inflow) + && hi.y == domhi.y) + { + for (int k = lo.z; k <= hi.z; ++k) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + dmsk(i,hi.y,k) *= Real(0.5); + }} + } + + if ((bclo[2] == LinOpBCType::Neumann || bclo[2] == LinOpBCType::inflow) + && lo.z == domlo.z) + { + for (int 
j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + dmsk(i,j,lo.z) *= Real(0.5); + }} + } + + if ((bchi[2] == LinOpBCType::Neumann || bchi[2] == LinOpBCType::inflow) + && hi.z == domhi.z) + { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + dmsk(i,j,hi.z) *= Real(0.5); + }} + } +} + +} + +#endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_K.H new file mode 100644 index 0000000000..2c98d32a00 --- /dev/null +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_K.H @@ -0,0 +1,107 @@ +#ifndef AMREX_ML_NODE_LINOP_K_H_ +#define AMREX_ML_NODE_LINOP_K_H_ +#include + +#include + +namespace amrex::nodelap_detail { + +#ifdef AMREX_USE_HYPRE + + struct GetNode { + AMREX_GPU_DEVICE Dim3 operator() (Dim3 const& lo, Dim3 const& len, int& offset) + { + Dim3 node; + constexpr int nsten = AMREX_D_TERM(3,*3,*3); + int icell = offset / nsten; + node.z = icell / (len.x*len.y); + node.y = (icell - node.z*(len.x*len.y)) / len.x; + node.x = (icell - node.z*(len.x*len.y)) - node.y*len.x; + node.x += lo.x; + node.y += lo.y; + node.z += lo.z; + offset -= icell*nsten; + return node; + } + }; + + struct GetNode2 { + AMREX_GPU_DEVICE Dim3 operator() (int offset, Dim3 const& node) + { + // In 2D the offsets are + // 6 7 8 + // 4 0 5 + // 1 2 3 + constexpr int nstenhalf = AMREX_SPACEDIM == 2 ? 
4 : 13; + if (offset == 0) { + return node; + } else { + if (offset <= nstenhalf) { --offset; } + Dim3 node2; + node2.z = offset / 9; + node2.y = (offset - node2.z*9) / 3; + node2.x = (offset - node2.z*9) - node2.y*3; + AMREX_D_TERM(node2.x += node.x-1;, + node2.y += node.y-1;, + node2.z += node.z-1); + return node2; + } + } + }; + +#endif /* AMREX_USE_HYPRE */ + + constexpr int crse_cell = 0; // Do NOT change the values + constexpr int fine_cell = 1; + constexpr int crse_node = 0; + constexpr int crse_fine_node = 1; + constexpr int fine_node = 2; +} + +namespace amrex { + +template +void mlndlap_fillbc_cc (Box const& vbx, Array4 const& sigma, Box const& domain, + GpuArray bclo, + GpuArray bchi) noexcept +{ + GpuArray bflo{{AMREX_D_DECL(bclo[0] != LinOpBCType::Periodic, + bclo[1] != LinOpBCType::Periodic, + bclo[2] != LinOpBCType::Periodic)}}; + GpuArray bfhi{{AMREX_D_DECL(bchi[0] != LinOpBCType::Periodic, + bchi[1] != LinOpBCType::Periodic, + bchi[2] != LinOpBCType::Periodic)}}; + mlndlap_bc_doit(vbx, sigma, domain, bflo, bfhi); +} + +template +void mlndlap_applybc (Box const& vbx, Array4 const& phi, Box const& domain, + GpuArray bclo, + GpuArray bchi) noexcept +{ + GpuArray bflo{{AMREX_D_DECL(bclo[0] == LinOpBCType::Neumann || + bclo[0] == LinOpBCType::inflow, + bclo[1] == LinOpBCType::Neumann || + bclo[1] == LinOpBCType::inflow, + bclo[2] == LinOpBCType::Neumann || + bclo[2] == LinOpBCType::inflow)}}; + GpuArray bfhi{{AMREX_D_DECL(bchi[0] == LinOpBCType::Neumann || + bchi[0] == LinOpBCType::inflow, + bchi[1] == LinOpBCType::Neumann || + bchi[1] == LinOpBCType::inflow, + bchi[2] == LinOpBCType::Neumann || + bchi[2] == LinOpBCType::inflow)}}; + mlndlap_bc_doit(vbx, phi, domain, bflo, bfhi); +} + +} + +#if (AMREX_SPACEDIM == 1) +#include +#elif (AMREX_SPACEDIM == 2) +#include +#else +#include +#endif + +#endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp index 59718d7624..e8135aeba1 
100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeTensorLaplacian.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include diff --git a/Src/LinearSolvers/MLMG/Make.package b/Src/LinearSolvers/MLMG/Make.package index 3609164c91..8e8d9b3ac1 100644 --- a/Src/LinearSolvers/MLMG/Make.package +++ b/Src/LinearSolvers/MLMG/Make.package @@ -1,6 +1,9 @@ ifndef AMREX_MLMG_MAKE AMREX_MLMG_MAKE := 1 +USE_LINEAR_SOLVERS_INCFLO ?= TRUE +USE_LINEAR_SOLVERS_EM ?= TRUE + CEXE_sources += AMReX_MLMG.cpp CEXE_headers += AMReX_MLMG.H @@ -16,7 +19,7 @@ CEXE_headers += AMReX_MLLinOp_K.H CEXE_headers += AMReX_MLCellLinOp.H -CEXE_headers += AMReX_MLNodeLinOp.H +CEXE_headers += AMReX_MLNodeLinOp.H AMReX_MLNodeLinOp_K.H AMReX_MLNodeLinOp_$(DIM)D_K.H CEXE_sources += AMReX_MLNodeLinOp.cpp CEXE_headers += AMReX_MLCellABecLap.H @@ -39,60 +42,69 @@ ifeq ($(DIM),3) CEXE_headers += AMReX_MLPoisson_2D_K.H endif -CEXE_headers += AMReX_MLNodeLaplacian.H -CEXE_sources += AMReX_MLNodeLaplacian.cpp -CEXE_sources += AMReX_MLNodeLaplacian_sync.cpp -CEXE_sources += AMReX_MLNodeLaplacian_sten.cpp -CEXE_sources += AMReX_MLNodeLaplacian_misc.cpp -CEXE_headers += AMReX_MLNodeLap_K.H AMReX_MLNodeLap_$(DIM)D_K.H -ifeq ($(USE_EB),TRUE) - CEXE_sources += AMReX_MLNodeLaplacian_eb.cpp +ifneq ($(BL_NO_FORT),TRUE) + CEXE_headers += AMReX_MLLinOp_F.H + F90EXE_sources += AMReX_MLLinOp_nd.F90 endif -ifeq ($(USE_HYPRE),TRUE) - CEXE_sources += AMReX_MLNodeLaplacian_hypre.cpp + +ifeq ($(USE_EB),TRUE) + CEXE_headers += AMReX_MLEBABecLap.H + CEXE_sources += AMReX_MLEBABecLap.cpp + CEXE_sources += AMReX_MLEBABecLap_F.cpp + CEXE_headers += AMReX_MLEBABecLap_K.H + CEXE_headers += AMReX_MLEBABecLap_$(DIM)D_K.H endif -CEXE_headers += AMReX_MLNodeABecLaplacian.H -CEXE_sources += AMReX_MLNodeABecLaplacian.cpp -CEXE_headers += AMReX_MLNodeABecLap_K.H AMReX_MLNodeABecLap_$(DIM)D_K.H +ifneq ($(USE_LINEAR_SOLVERS_INCFLO),FALSE) -CEXE_headers += 
AMReX_MLNodeTensorLaplacian.H -CEXE_sources += AMReX_MLNodeTensorLaplacian.cpp -CEXE_headers += AMReX_MLNodeTensorLap_K.H AMReX_MLNodeTensorLap_$(DIM)D_K.H + CEXE_headers += AMReX_MLNodeABecLaplacian.H + CEXE_sources += AMReX_MLNodeABecLaplacian.cpp + CEXE_headers += AMReX_MLNodeABecLap_K.H AMReX_MLNodeABecLap_$(DIM)D_K.H -CEXE_headers += AMReX_MLTensorOp.H -CEXE_sources += AMReX_MLTensorOp.cpp AMReX_MLTensorOp_grad.cpp -CEXE_headers += AMReX_MLTensor_K.H AMReX_MLTensor_$(DIM)D_K.H + CEXE_headers += AMReX_MLNodeLaplacian.H + CEXE_sources += AMReX_MLNodeLaplacian.cpp + CEXE_sources += AMReX_MLNodeLaplacian_sync.cpp + CEXE_sources += AMReX_MLNodeLaplacian_sten.cpp + CEXE_sources += AMReX_MLNodeLaplacian_misc.cpp + CEXE_headers += AMReX_MLNodeLap_K.H AMReX_MLNodeLap_$(DIM)D_K.H +ifeq ($(USE_EB),TRUE) + CEXE_sources += AMReX_MLNodeLaplacian_eb.cpp +endif +ifeq ($(USE_HYPRE),TRUE) + CEXE_sources += AMReX_MLNodeLaplacian_hypre.cpp +endif -CEXE_headers += AMReX_MLEBNodeFDLaplacian.H -CEXE_sources += AMReX_MLEBNodeFDLaplacian.cpp -CEXE_headers += AMReX_MLEBNodeFDLap_K.H -CEXE_headers += AMReX_MLEBNodeFDLap_$(DIM)D_K.H + CEXE_headers += AMReX_MLTensorOp.H + CEXE_sources += AMReX_MLTensorOp.cpp AMReX_MLTensorOp_grad.cpp + CEXE_headers += AMReX_MLTensor_K.H AMReX_MLTensor_$(DIM)D_K.H ifeq ($(USE_EB),TRUE) -CEXE_headers += AMReX_MLEBABecLap.H -CEXE_sources += AMReX_MLEBABecLap.cpp -CEXE_sources += AMReX_MLEBABecLap_F.cpp -CEXE_headers += AMReX_MLEBABecLap_K.H -CEXE_headers += AMReX_MLEBABecLap_$(DIM)D_K.H - -CEXE_headers += AMReX_MLEBTensorOp.H -CEXE_sources += AMReX_MLEBTensorOp.cpp -CEXE_sources += AMReX_MLEBTensorOp_bc.cpp -CEXE_headers += AMReX_MLEBTensor_K.H AMReX_MLEBTensor_$(DIM)D_K.H + CEXE_headers += AMReX_MLEBTensorOp.H + CEXE_sources += AMReX_MLEBTensorOp.cpp + CEXE_sources += AMReX_MLEBTensorOp_bc.cpp + CEXE_headers += AMReX_MLEBTensor_K.H AMReX_MLEBTensor_$(DIM)D_K.H endif -ifneq ($(BL_NO_FORT),TRUE) - CEXE_headers += AMReX_MLLinOp_F.H - F90EXE_sources += 
AMReX_MLLinOp_nd.F90 -endif +endif # ifneq ($(USE_LINEAR_SOLVERS_INCFLO),FALSE) +ifneq ($(USE_LINEAR_SOLVERS_EM),FALSE) ifneq ($(DIM),1) CEXE_headers += AMReX_MLCurlCurl.H CEXE_sources += AMReX_MLCurlCurl.cpp CEXE_headers += AMReX_MLCurlCurl_K.H endif + CEXE_headers += AMReX_MLEBNodeFDLaplacian.H + CEXE_sources += AMReX_MLEBNodeFDLaplacian.cpp + CEXE_headers += AMReX_MLEBNodeFDLap_K.H + CEXE_headers += AMReX_MLEBNodeFDLap_$(DIM)D_K.H + + CEXE_headers += AMReX_MLNodeTensorLaplacian.H + CEXE_sources += AMReX_MLNodeTensorLaplacian.cpp + CEXE_headers += AMReX_MLNodeTensorLap_K.H AMReX_MLNodeTensorLap_$(DIM)D_K.H + +endif # ifneq ($(USE_LINEAR_SOLVERS_EM),FALSE) + VPATH_LOCATIONS += $(AMREX_HOME)/Src/LinearSolvers/MLMG INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/LinearSolvers/MLMG diff --git a/Src/Particle/AMReX_NeighborParticles.H b/Src/Particle/AMReX_NeighborParticles.H index 5f814805a2..1b7d01d7a9 100644 --- a/Src/Particle/AMReX_NeighborParticles.H +++ b/Src/Particle/AMReX_NeighborParticles.H @@ -467,7 +467,7 @@ protected: ParticleCopyPlan neighbor_copy_plan; amrex::PODVector > snd_buffer; - amrex::Gpu::DeviceVector rcv_buffer; + amrex::PODVector > rcv_buffer; Gpu::PinnedVector pinned_snd_buffer; Gpu::PinnedVector pinned_rcv_buffer; diff --git a/Src/Particle/AMReX_NeighborParticlesGPUImpl.H b/Src/Particle/AMReX_NeighborParticlesGPUImpl.H index 4cc533d901..b4e50bef7b 100644 --- a/Src/Particle/AMReX_NeighborParticlesGPUImpl.H +++ b/Src/Particle/AMReX_NeighborParticlesGPUImpl.H @@ -255,6 +255,12 @@ updateNeighborsGPU (bool boundary_neighbors_only) } clearNeighbors(); + + if (ParallelDescriptor::UseGpuAwareMpi()) { + snd_buffer.setArena(The_Comms_Arena()); + rcv_buffer.setArena(The_Comms_Arena()); + } + packBuffer(*this, neighbor_copy_op, neighbor_copy_plan, snd_buffer); if (ParallelDescriptor::UseGpuAwareMpi()) { diff --git a/Src/Particle/AMReX_ParticleContainer.H b/Src/Particle/AMReX_ParticleContainer.H index 03a2254a10..4b347d283b 100644 --- 
a/Src/Particle/AMReX_ParticleContainer.H +++ b/Src/Particle/AMReX_ParticleContainer.H @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include @@ -1144,7 +1145,8 @@ public: */ ParticleTileType& DefineAndReturnParticleTile (int lev, int grid, int tile) { - m_particles[lev][std::make_pair(grid, tile)].define(NumRuntimeRealComps(), NumRuntimeIntComps()); + m_particles[lev][std::make_pair(grid, tile)].define(NumRuntimeRealComps(), NumRuntimeIntComps(), &m_soa_rdata_names, &m_soa_idata_names); + return ParticlesAt(lev, grid, tile); } @@ -1247,10 +1249,10 @@ public: Long superParticleSize() const { return superparticle_size; } - template ,int> = 0> - void AddRealComp (T communicate=true) + void AddRealComp (std::string const & name, int communicate=1) { + m_soa_rdata_names.push_back(name); + m_runtime_comps_defined = true; m_num_runtime_real++; h_redistribute_real_comp.push_back(communicate); @@ -1270,10 +1272,15 @@ public: } } - template ,int> = 0> - void AddIntComp (T communicate=true) + void AddRealComp (int communicate=1) { + AddRealComp(getDefaultCompNameReal(NArrayReal+m_num_runtime_real), communicate); + } + + void AddIntComp (std::string const & name, int communicate=1) + { + m_soa_idata_names.push_back(name); + m_runtime_comps_defined = true; m_num_runtime_int++; h_redistribute_int_comp.push_back(communicate); @@ -1293,6 +1300,11 @@ public: } } + void AddIntComp (int communicate=1) + { + AddIntComp(getDefaultCompNameInt(NArrayInt+m_num_runtime_int), communicate); + } + int NumRuntimeRealComps () const { return m_num_runtime_real; } int NumRuntimeIntComps () const { return m_num_runtime_int; } @@ -1403,6 +1415,15 @@ public: #include "AMReX_ParticlesHDF5.H" #endif + /** Overwrite the default names for the compile-time SoA components */ + void SetSoACompileTimeNames (std::vector const & rdata_name, std::vector const & idata_name); + + /** Get the names for the real SoA components **/ + std::vector GetRealSoANames () const {return 
m_soa_rdata_names;} + + /** Get the names for the int SoA components **/ + std::vector GetIntSoANames () const {return m_soa_idata_names;} + protected: template @@ -1435,6 +1456,10 @@ private: size_t particle_size, superparticle_size; int num_real_comm_comps, num_int_comm_comps; Vector m_particles; + + // names of both compile-time and runtime Real and Int SoA data + std::vector m_soa_rdata_names; + std::vector m_soa_idata_names; }; template class Allocator, class CellAssignor> diff --git a/Src/Particle/AMReX_ParticleContainerI.H b/Src/Particle/AMReX_ParticleContainerI.H index 74e65b792f..d42d2d5b4b 100644 --- a/Src/Particle/AMReX_ParticleContainerI.H +++ b/Src/Particle/AMReX_ParticleContainerI.H @@ -1,6 +1,10 @@ -#include #include +#include +#include +#include + + template class Allocator, class CellAssignor> void @@ -60,10 +64,40 @@ ParticleContainer_impl(i)); + } + for (int i=0; i(i)); + } + initialized = true; } } +template class Allocator, class CellAssignor> +void +ParticleContainer_impl :: SetSoACompileTimeNames ( + std::vector const & rdata_name, std::vector const & idata_name +) +{ + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(rdata_name.size() == NArrayReal, "rdata_name must be equal to NArrayReal"); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(idata_name.size() == NArrayInt, "idata_name must be equal to NArrayInt"); + + for (int i=0; i class Allocator, class CellAssignor> template @@ -1161,7 +1195,7 @@ ParticleContainer_impl > snd_buffer; - Gpu::DeviceVector rcv_buffer; + amrex::PODVector > rcv_buffer; + + if (ParallelDescriptor::UseGpuAwareMpi()) { + snd_buffer.setArena(The_Comms_Arena()); + rcv_buffer.setArena(The_Comms_Arena()); + } packBuffer(*this, op, plan, snd_buffer); @@ -1498,7 +1538,7 @@ ParticleContainer_impl write_real_comp; Vector tmp_real_comp_names; - int nrc = ParticleType::is_soa_particle ? NStructReal + NumRealComps() - AMREX_SPACEDIM : NStructReal + NumRealComps(); - for (int i = 0; i < nrc; ++i ) + int first_rcomp = ParticleType::is_soa_particle ? 
AMREX_SPACEDIM : 0; + for (int i = first_rcomp; i < NStructReal + NumRealComps(); ++i ) { write_real_comp.push_back(1); if (real_comp_names.empty()) { - std::stringstream ss; - ss << "real_comp" << i; - tmp_real_comp_names.push_back(ss.str()); + tmp_real_comp_names.push_back(getDefaultCompNameReal(i)); } else { - tmp_real_comp_names.push_back(real_comp_names[i]); + tmp_real_comp_names.push_back(real_comp_names[i-first_rcomp]); } } @@ -75,9 +73,7 @@ ParticleContainer_impl(i)); } else { @@ -98,14 +94,12 @@ ParticleContainer_impl write_real_comp; Vector real_comp_names; - int nrc = ParticleType::is_soa_particle ? NStructReal + NumRealComps() - AMREX_SPACEDIM : NStructReal + NumRealComps(); - for (int i = 0; i < nrc; ++i ) + int first_rcomp = ParticleType::is_soa_particle ? AMREX_SPACEDIM : 0; + for (int i = first_rcomp; i < NStructReal + NumRealComps(); ++i ) { write_real_comp.push_back(1); - std::stringstream ss; - ss << "real_comp" << i; - real_comp_names.push_back(ss.str()); + real_comp_names.push_back(getDefaultCompNameReal(i)); } Vector write_int_comp; @@ -113,9 +107,7 @@ ParticleContainer_impl(i)); } WriteBinaryParticleData(dir, name, write_real_comp, write_int_comp, @@ -182,9 +174,7 @@ ParticleContainer_impl int_comp_names; for (int i = 0; i < NStructInt + NumIntComps(); ++i ) { - std::stringstream ss; - ss << "int_comp" << i; - int_comp_names.push_back(ss.str()); + int_comp_names.push_back(getDefaultCompNameInt(i)); } WriteBinaryParticleData(dir, name, @@ -211,20 +201,16 @@ ParticleContainer_impl real_comp_names; - int nrc = ParticleType::is_soa_particle ? NStructReal + NumRealComps() - AMREX_SPACEDIM : NStructReal + NumRealComps(); - for (int i = 0; i < nrc; ++i ) + int first_rcomp = ParticleType::is_soa_particle ? 
AMREX_SPACEDIM : 0; + for (int i = first_rcomp; i < NStructReal + NumRealComps(); ++i ) { - std::stringstream ss; - ss << "real_comp" << i; - real_comp_names.push_back(ss.str()); + real_comp_names.push_back(getDefaultCompNameReal(i)); } Vector int_comp_names; for (int i = 0; i < NStructInt + NumIntComps(); ++i ) { - std::stringstream ss; - ss << "int_comp" << i; - int_comp_names.push_back(ss.str()); + int_comp_names.push_back(getDefaultCompNameInt(i)); } WriteBinaryParticleData(dir, name, write_real_comp, write_int_comp, @@ -259,14 +245,12 @@ ParticleContainer_impl write_real_comp; Vector real_comp_names; - int nrc = ParticleType::is_soa_particle ? NStructReal + NumRealComps() - AMREX_SPACEDIM : NStructReal + NumRealComps(); - for (int i = 0; i < nrc; ++i ) + int first_rcomp = ParticleType::is_soa_particle ? AMREX_SPACEDIM : 0; + for (int i = first_rcomp; i < NStructReal + NumRealComps(); ++i ) { write_real_comp.push_back(1); - std::stringstream ss; - ss << "real_comp" << i; - real_comp_names.push_back(ss.str()); + real_comp_names.push_back(getDefaultCompNameReal(i)); } Vector write_int_comp; @@ -274,9 +258,7 @@ ParticleContainer_impl(i)); } WriteBinaryParticleData(dir, name, write_real_comp, write_int_comp, @@ -345,9 +327,7 @@ ParticleContainer_impl int_comp_names; for (int i = 0; i < NStructInt + NumIntComps(); ++i ) { - std::stringstream ss; - ss << "int_comp" << i; - int_comp_names.push_back(ss.str()); + int_comp_names.push_back(getDefaultCompNameInt(i)); } WriteBinaryParticleData(dir, name, @@ -374,20 +354,16 @@ ParticleContainer_impl real_comp_names; - int nrc = ParticleType::is_soa_particle ? NStructReal + NumRealComps() - AMREX_SPACEDIM : NStructReal + NumRealComps(); - for (int i = 0; i < nrc; ++i ) + int first_rcomp = ParticleType::is_soa_particle ? 
AMREX_SPACEDIM : 0; + for (int i = first_rcomp; i < NStructReal + NumRealComps(); ++i ) { - std::stringstream ss; - ss << "real_comp" << i; - real_comp_names.push_back(ss.str()); + real_comp_names.push_back(getDefaultCompNameReal(i)); } Vector int_comp_names; for (int i = 0; i < NStructInt + NumIntComps(); ++i ) { - std::stringstream ss; - ss << "int_comp" << i; - int_comp_names.push_back(ss.str()); + int_comp_names.push_back(getDefaultCompNameInt(i)); } WriteBinaryParticleData(dir, name, write_real_comp, write_int_comp, diff --git a/Src/Particle/AMReX_ParticleTile.H b/Src/Particle/AMReX_ParticleTile.H index a645330e04..7546ff8a21 100644 --- a/Src/Particle/AMReX_ParticleTile.H +++ b/Src/Particle/AMReX_ParticleTile.H @@ -11,7 +11,10 @@ #include #include +#include #include +#include + namespace amrex { @@ -730,10 +733,15 @@ struct ParticleTile ParticleTile& operator= (ParticleTile &&) noexcept = default; #endif - void define (int a_num_runtime_real, int a_num_runtime_int) + void define ( + int a_num_runtime_real, + int a_num_runtime_int, + std::vector* soa_rdata_names=nullptr, + std::vector* soa_idata_names=nullptr + ) { m_defined = true; - GetStructOfArrays().define(a_num_runtime_real, a_num_runtime_int); + GetStructOfArrays().define(a_num_runtime_real, a_num_runtime_int, soa_rdata_names, soa_idata_names); m_runtime_r_ptrs.resize(a_num_runtime_real); m_runtime_i_ptrs.resize(a_num_runtime_int); m_runtime_r_cptrs.resize(a_num_runtime_real); diff --git a/Src/Particle/AMReX_ParticleUtil.H b/Src/Particle/AMReX_ParticleUtil.H index b09f1d3583..f6f2506ec7 100644 --- a/Src/Particle/AMReX_ParticleUtil.H +++ b/Src/Particle/AMReX_ParticleUtil.H @@ -883,6 +883,26 @@ void PermutationForDeposition (Gpu::DeviceVector& perm, index_type n }); } +template +std::string getDefaultCompNameReal (const int i) { + int first_r_name = 0; + if constexpr (P::is_soa_particle) { + if (i < AMREX_SPACEDIM) { + constexpr int x_in_ascii = 120; + std::string const name{char(x_in_ascii+i)}; + return 
name; + } + first_r_name = AMREX_SPACEDIM; + } + std::string const name{("real_comp" + std::to_string(i-first_r_name))}; + return name; +} + +template +std::string getDefaultCompNameInt (const int i) { + std::string const name{("int_comp" + std::to_string(i))}; + return name; +} #ifdef AMREX_USE_HDF5_ASYNC void async_vol_es_wait_particle(); diff --git a/Src/Particle/AMReX_StructOfArrays.H b/Src/Particle/AMReX_StructOfArrays.H index 4de35e085c..46d18a1715 100644 --- a/Src/Particle/AMReX_StructOfArrays.H +++ b/Src/Particle/AMReX_StructOfArrays.H @@ -6,7 +6,11 @@ #include #include +#include #include +#include +#include + namespace amrex { @@ -19,11 +23,18 @@ struct StructOfArrays { using RealVector = amrex::PODVector >; using IntVector = amrex::PODVector >; - void define (int a_num_runtime_real, int a_num_runtime_int) + void define ( + int a_num_runtime_real, + int a_num_runtime_int, + std::vector* soa_rdata_names=nullptr, + std::vector* soa_idata_names=nullptr + ) { m_defined = true; m_runtime_rdata.resize(a_num_runtime_real); m_runtime_idata.resize(a_num_runtime_int ); + m_rdata_names = soa_rdata_names; + m_idata_names = soa_idata_names; } [[nodiscard]] int NumRealComps () const noexcept { return NReal + m_runtime_rdata.size(); } @@ -41,6 +52,28 @@ struct StructOfArrays { /** Get access to the particle Int Arrays (only compile-time components) */ [[nodiscard]] const std::array< IntVector, NInt>& GetIntData () const { return m_idata; } + /** Get the names for the real SoA components **/ + [[nodiscard]] std::vector GetRealNames () const + { + if (m_rdata_names) { + return *m_rdata_names; + } + else { + return std::vector(); + } + } + + /** Get the names for the int SoA components **/ + [[nodiscard]] std::vector GetIntNames () const + { + if (m_idata_names) { + return *m_idata_names; + } + else { + return std::vector(); + } + } + /** Get access to a particle Real component Array (compile-time and runtime component) * * @param index component with 0...NReal-1 
compile-time and NReal... runtime arguments @@ -79,6 +112,32 @@ struct StructOfArrays { } } + /** Get access to a particle Real component Array (compile-time and runtime component) + * + * @param name named component component with 0...NReal-1 compile-time and NReal... runtime arguments + */ + [[nodiscard]] RealVector& GetRealData (std::string const & name) { + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_rdata_names != nullptr, "SoA Real names were not defined."); + auto const pos = std::find(m_rdata_names->begin(), m_rdata_names->end(), name); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(pos != m_rdata_names->end(), "Soa Real name='" + name + "' was not found components"); + + int const index = std::distance(m_rdata_names->begin(), pos); + return GetRealData(index); + } + + /** Get access to a particle Real component Array (compile-time and runtime component) + * + * @param name named component component with 0...NReal-1 compile-time and NReal... runtime arguments + */ + [[nodiscard]] const RealVector& GetRealData (std::string const & name) const { + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_rdata_names != nullptr, "SoA Real names were not defined."); + auto const pos = std::find(m_rdata_names->begin(), m_rdata_names->end(), name); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(pos != m_rdata_names->end(), "Soa Real name='" + name + "' was not found components"); + + int const index = std::distance(m_rdata_names->begin(), pos); + return GetRealData(index); + } + /** Get access to a particle Int component Array (compile-time and runtime component) * * @param index component with 0...NInt-1 compile-time and NInt... runtime arguments @@ -118,6 +177,34 @@ struct StructOfArrays { } } + /** Get access to a particle Int component Array (compile-time and runtime component) + * + * @param index component with 0...NInt-1 compile-time and NInt... 
runtime arguments + * @return + */ + [[nodiscard]] IntVector& GetIntData (std::string const & name) { + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_idata_names != nullptr, "SoA Int names were not defined."); + auto const pos = std::find(m_idata_names->begin(), m_idata_names->end(), name); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(pos != m_idata_names->end(), "Soa Int name='" + name + "' was not found components"); + + int const index = std::distance(m_idata_names->begin(), pos); + return GetIntData(index); + } + + /** Get access to a particle Int component Array (compile-time and runtime component) + * + * @param index component with 0...NInt-1 compile-time and NInt... runtime arguments + * @return + */ + [[nodiscard]] const IntVector& GetIntData (std::string const & name) const { + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_idata_names != nullptr, "SoA Int names were not defined."); + auto const pos = std::find(m_idata_names->begin(), m_idata_names->end(), name); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(pos != m_idata_names->end(), "Soa Int name='" + name + "' was not found components"); + + int const index = std::distance(m_idata_names->begin(), pos); + return GetIntData(index); + } + /** * \brief Returns the total number of particles (real and neighbor) * @@ -226,13 +313,20 @@ struct StructOfArrays { int m_num_neighbor_particles{0}; private: + // compile-time data IdCPU m_idcpu; std::array m_rdata; std::array< IntVector, NInt> m_idata; + // runtime data std::vector m_runtime_rdata; std::vector m_runtime_idata; + // names of both compile-time and runtime Real and Int data + std::vector* m_rdata_names = nullptr; + std::vector* m_idata_names = nullptr; + + //! 
whether the runtime components are sized correctly bool m_defined{false}; }; diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt index a5ec9e1cf4..9d6afba857 100644 --- a/Tests/CMakeLists.txt +++ b/Tests/CMakeLists.txt @@ -117,13 +117,17 @@ if (AMReX_TEST_TYPE STREQUAL "Small") add_subdirectory("LinearSolvers/ABecLaplacian_C") endif() + if (AMReX_FFT) + add_subdirectory("FFT/Poisson") + endif() + else() # # List of subdirectories to search for CMakeLists. # set( AMREX_TESTS_SUBDIRS Amr AsyncOut CLZ CTOParFor DeviceGlobal Enum MultiBlock MultiPeriod ParmParse Parser Parser2 Reinit - RoundoffDomain) + RoundoffDomain SmallMatrix) if (AMReX_PARTICLES) list(APPEND AMREX_TESTS_SUBDIRS Particles) @@ -137,6 +141,10 @@ else() list(APPEND AMREX_TESTS_SUBDIRS LinearSolvers) endif () + if (AMReX_FFT) + list(APPEND AMREX_TESTS_SUBDIRS FFT) + endif () + if (AMReX_HDF5) list(APPEND AMREX_TESTS_SUBDIRS HDF5Benchmark) endif () diff --git a/Tests/FFT/Poisson/CMakeLists.txt b/Tests/FFT/Poisson/CMakeLists.txt new file mode 100644 index 0000000000..21a9d3b268 --- /dev/null +++ b/Tests/FFT/Poisson/CMakeLists.txt @@ -0,0 +1,10 @@ +foreach(D IN LISTS AMReX_SPACEDIM) + set(_sources main.cpp) + + set(_input_files) + + setup_test(${D} _sources _input_files) + + unset(_sources) + unset(_input_files) +endforeach() diff --git a/Tests/FFT/Poisson/GNUmakefile b/Tests/FFT/Poisson/GNUmakefile new file mode 100644 index 0000000000..93376f4485 --- /dev/null +++ b/Tests/FFT/Poisson/GNUmakefile @@ -0,0 +1,26 @@ +AMREX_HOME := ../../.. 
+ +DEBUG = FALSE + +DIM = 3 + +COMP = gcc + +USE_MPI = TRUE +USE_OMP = FALSE +USE_CUDA = FALSE +USE_HIP = FALSE +USE_SYCL = FALSE + +USE_FFT = TRUE + +BL_NO_FORT = TRUE + +TINY_PROFILE = FALSE + +include $(AMREX_HOME)/Tools/GNUMake/Make.defs + +include ./Make.package +include $(AMREX_HOME)/Src/Base/Make.package + +include $(AMREX_HOME)/Tools/GNUMake/Make.rules diff --git a/Tests/FFT/Poisson/Make.package b/Tests/FFT/Poisson/Make.package new file mode 100644 index 0000000000..6b4b865e8f --- /dev/null +++ b/Tests/FFT/Poisson/Make.package @@ -0,0 +1 @@ +CEXE_sources += main.cpp diff --git a/Tests/FFT/Poisson/main.cpp b/Tests/FFT/Poisson/main.cpp new file mode 100644 index 0000000000..1286d80dad --- /dev/null +++ b/Tests/FFT/Poisson/main.cpp @@ -0,0 +1,148 @@ +#include // Put this at the top for testing + +#include +#include +#include +#include + +using namespace amrex; + +int main (int argc, char* argv[]) +{ + amrex::Initialize(argc, argv); + { + BL_PROFILE("main"); + + AMREX_D_TERM(int n_cell_x = 64;, + int n_cell_y = 32;, + int n_cell_z = 128); + + AMREX_D_TERM(int max_grid_size_x = 32;, + int max_grid_size_y = 32;, + int max_grid_size_z = 32); + + AMREX_D_TERM(Real prob_lo_x = 0.;, + Real prob_lo_y = 0.;, + Real prob_lo_z = 0.); + AMREX_D_TERM(Real prob_hi_x = 1.;, + Real prob_hi_y = 1.;, + Real prob_hi_z = 1.); + + { + ParmParse pp; + AMREX_D_TERM(pp.query("n_cell_x", n_cell_x);, + pp.query("n_cell_y", n_cell_y);, + pp.query("n_cell_z", n_cell_z)); + AMREX_D_TERM(pp.query("max_grid_size_x", max_grid_size_x);, + pp.query("max_grid_size_y", max_grid_size_y);, + pp.query("max_grid_size_z", max_grid_size_z)); + } + + Box domain(IntVect(0),IntVect(AMREX_D_DECL(n_cell_x-1,n_cell_y-1,n_cell_z-1))); + BoxArray ba(domain); + ba.maxSize(IntVect(AMREX_D_DECL(max_grid_size_x, + max_grid_size_y, + max_grid_size_z))); + DistributionMapping dm(ba); + + Geometry geom; + { + geom.define(domain, + RealBox(AMREX_D_DECL(prob_lo_x,prob_lo_y,prob_lo_z), + 
AMREX_D_DECL(prob_hi_x,prob_hi_y,prob_hi_z)), + CoordSys::cartesian, {AMREX_D_DECL(1,1,1)}); + } + auto const& dx = geom.CellSizeArray(); + + MultiFab rhs(ba,dm,1,0); + MultiFab soln(ba,dm,1,0); + auto const& rhsma = rhs.arrays(); + ParallelFor(rhs, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k) + { + AMREX_D_TERM(Real x = (i+0.5_rt) * dx[0] - 0.5_rt;, + Real y = (j+0.5_rt) * dx[1] - 0.5_rt;, + Real z = (k+0.5_rt) * dx[2] - 0.5_rt); + rhsma[b](i,j,k) = std::exp(-10._rt* + (AMREX_D_TERM(x*x*1.05_rt, + y*y*0.90_rt, + z*z))); + }); + + // Shift rhs so that its sum is zero. + auto rhosum = rhs.sum(0); + rhs.plus(-rhosum/geom.Domain().d_numPts(), 0, 1); + +#if (AMREX_SPACEDIM == 3) + Array solvers{0,1}; +#else + Array solvers{0}; +#endif + + for (int solver_type : solvers) { + double tsetup, tsolve; + if (solver_type == 0) { + auto t0 = amrex::second(); + FFT::Poisson fft_poisson(geom); + auto t1 = amrex::second(); + tsetup = t1-t0; + + for (int n = 0; n < 2; ++n) { + auto ta = amrex::second(); + fft_poisson.solve(soln, rhs); + auto tb = amrex::second(); + tsolve = tb-ta; + } + } else { + auto t0 = amrex::second(); + FFT::PoissonHybrid fft_poisson(geom); + auto t1 = amrex::second(); + tsetup = t1-t0; + + for (int n = 0; n < 2; ++n) { + auto ta = amrex::second(); + fft_poisson.solve(soln, rhs); + auto tb = amrex::second(); + tsolve = tb-ta; + } + } + + amrex::Print() << " AMReX FFT setup time: " << tsetup + << ", solve time " << tsolve << "\n"; + + MultiFab phi(soln.boxArray(), soln.DistributionMap(), 1, 1); + MultiFab res(soln.boxArray(), soln.DistributionMap(), 1, 0); + MultiFab::Copy(phi, soln, 0, 0, 1, 0); + phi.FillBoundary(geom.periodicity()); + auto const& res_ma = res.arrays(); + auto const& phi_ma = phi.const_arrays(); + auto const& rhs_ma = rhs.const_arrays(); + ParallelFor(res, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k) + { + auto const& phia = phi_ma[b]; + auto lap = (phia(i-1,j,k)-2._rt*phia(i,j,k)+phia(i+1,j,k)) / (dx[0]*dx[0]); +#if 
(AMREX_SPACEDIM >= 2) + lap += (phia(i,j-1,k)-2._rt*phia(i,j,k)+phia(i,j+1,k)) / (dx[1]*dx[1]); +#endif +#if (AMREX_SPACEDIM == 3) + if ((solver_type == 1) && (k == 0)) { // Neumann + lap += (-phia(i,j,k)+phia(i,j,k+1)) / (dx[2]*dx[2]); + } else if ((solver_type == 1) && ((k+1) == n_cell_z)) { // Neumann + lap += (phia(i,j,k-1)-phia(i,j,k)) / (dx[2]*dx[2]); + } else { + lap += (phia(i,j,k-1)-2._rt*phia(i,j,k)+phia(i,j,k+1)) / (dx[2]*dx[2]); + } +#endif + res_ma[b](i,j,k) = rhs_ma[b](i,j,k) - lap; + }); + auto bnorm = rhs.norminf(); + auto rnorm = res.norminf(); + amrex::Print() << " rhs inf norm " << bnorm << "\n" + << " res inf norm " << rnorm << "\n"; +#ifdef AMREX_USE_FLOAT + auto eps = 2.e-3f; +#else + auto eps = 1.e-11; +#endif + AMREX_ALWAYS_ASSERT(rnorm < eps*bnorm); + } + } + amrex::Finalize(); +} diff --git a/Tests/FFT/R2C/CMakeLists.txt b/Tests/FFT/R2C/CMakeLists.txt new file mode 100644 index 0000000000..21a9d3b268 --- /dev/null +++ b/Tests/FFT/R2C/CMakeLists.txt @@ -0,0 +1,10 @@ +foreach(D IN LISTS AMReX_SPACEDIM) + set(_sources main.cpp) + + set(_input_files) + + setup_test(${D} _sources _input_files) + + unset(_sources) + unset(_input_files) +endforeach() diff --git a/Tests/FFT/R2C/GNUmakefile b/Tests/FFT/R2C/GNUmakefile new file mode 100644 index 0000000000..93376f4485 --- /dev/null +++ b/Tests/FFT/R2C/GNUmakefile @@ -0,0 +1,26 @@ +AMREX_HOME := ../../.. 
+ +DEBUG = FALSE + +DIM = 3 + +COMP = gcc + +USE_MPI = TRUE +USE_OMP = FALSE +USE_CUDA = FALSE +USE_HIP = FALSE +USE_SYCL = FALSE + +USE_FFT = TRUE + +BL_NO_FORT = TRUE + +TINY_PROFILE = FALSE + +include $(AMREX_HOME)/Tools/GNUMake/Make.defs + +include ./Make.package +include $(AMREX_HOME)/Src/Base/Make.package + +include $(AMREX_HOME)/Tools/GNUMake/Make.rules diff --git a/Tests/FFT/R2C/Make.package b/Tests/FFT/R2C/Make.package new file mode 100644 index 0000000000..6b4b865e8f --- /dev/null +++ b/Tests/FFT/R2C/Make.package @@ -0,0 +1 @@ +CEXE_sources += main.cpp diff --git a/Tests/FFT/R2C/main.cpp b/Tests/FFT/R2C/main.cpp new file mode 100644 index 0000000000..7103038575 --- /dev/null +++ b/Tests/FFT/R2C/main.cpp @@ -0,0 +1,126 @@ +#include // Put this at the top for testing + +#include +#include +#include +#include + +using namespace amrex; + +int main (int argc, char* argv[]) +{ + amrex::Initialize(argc, argv); + { + BL_PROFILE("main"); + + AMREX_D_TERM(int n_cell_x = 128;, + int n_cell_y = 32;, + int n_cell_z = 64); + + AMREX_D_TERM(int max_grid_size_x = 32;, + int max_grid_size_y = 32;, + int max_grid_size_z = 32); + + AMREX_D_TERM(Real prob_lo_x = 0.;, + Real prob_lo_y = 0.;, + Real prob_lo_z = 0.); + AMREX_D_TERM(Real prob_hi_x = 1.;, + Real prob_hi_y = 1.;, + Real prob_hi_z = 1.); + + { + ParmParse pp; + AMREX_D_TERM(pp.query("n_cell_x", n_cell_x);, + pp.query("n_cell_y", n_cell_y);, + pp.query("n_cell_z", n_cell_z)); + AMREX_D_TERM(pp.query("max_grid_size_x", max_grid_size_x);, + pp.query("max_grid_size_y", max_grid_size_y);, + pp.query("max_grid_size_z", max_grid_size_z)); + } + + Box domain(IntVect(0),IntVect(AMREX_D_DECL(n_cell_x-1,n_cell_y-1,n_cell_z-1))); + BoxArray ba(domain); + ba.maxSize(IntVect(AMREX_D_DECL(max_grid_size_x, + max_grid_size_y, + max_grid_size_z))); + DistributionMapping dm(ba); + + Geometry geom; + { + geom.define(domain, + RealBox(AMREX_D_DECL(prob_lo_x,prob_lo_y,prob_lo_z), + AMREX_D_DECL(prob_hi_x,prob_hi_y,prob_hi_z)), + 
CoordSys::cartesian, {AMREX_D_DECL(1,1,1)}); + } + auto const& dx = geom.CellSizeArray(); + + MultiFab mf(ba,dm,1,0); + auto const& ma = mf.arrays(); + ParallelFor(mf, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k) + { + AMREX_D_TERM(Real x = (i+0.5_rt) * dx[0] - 0.5_rt;, + Real y = (j+0.5_rt) * dx[1] - 0.5_rt;, + Real z = (k+0.5_rt) * dx[2] - 0.5_rt); + ma[b](i,j,k) = std::exp(-10._rt* + (AMREX_D_TERM(x*x*1.05_rt, + y*y*0.90_rt, + z*z))); + }); + + MultiFab mf2(ba,dm,1,0); + + auto scaling = Real(1) / Real(geom.Domain().d_numPts()); + + { + cMultiFab cmf(ba,dm,1,0); + + // forward + { + FFT::R2C r2c(geom.Domain()); + r2c.forward(mf,cmf); + } + + // backward + { + FFT::R2C r2c(geom.Domain()); + r2c.backward(cmf,mf2); + } + + auto const& ma2 = mf2.arrays(); + ParallelFor(mf2, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k) + { + ma2[b](i,j,k) = ma[b](i,j,k) - ma2[b](i,j,k)*scaling; + }); + + auto error = mf2.norminf(); + amrex::Print() << " Expected to be close to zero: " << error << "\n"; +#ifdef AMREX_USE_FLOAT + auto eps = 1.e-6f; +#else + auto eps = 1.e-13; +#endif + AMREX_ALWAYS_ASSERT(error < eps); + } + + mf2.setVal(std::numeric_limits::max()); + + { // forward and backward + FFT::R2C r2c(geom.Domain()); + r2c.forwardThenBackward(mf, mf2, + [=] AMREX_GPU_DEVICE (int, int, int, auto& sp) + { + sp *= scaling; + }); + + MultiFab::Subtract(mf2, mf, 0, 0, 1, 0); + + auto error = mf2.norminf(); + amrex::Print() << " Expected to be close to zero: " << error << "\n"; +#ifdef AMREX_USE_FLOAT + auto eps = 1.e-6f; +#else + auto eps = 1.e-13; +#endif + AMREX_ALWAYS_ASSERT(error < eps); + } + } + amrex::Finalize(); +} diff --git a/Tests/LinearSolvers/CurlCurl/CMakeLists.txt b/Tests/LinearSolvers/CurlCurl/CMakeLists.txt index 9dacdeb2fe..d7b1a912ed 100644 --- a/Tests/LinearSolvers/CurlCurl/CMakeLists.txt +++ b/Tests/LinearSolvers/CurlCurl/CMakeLists.txt @@ -1,5 +1,5 @@ foreach(D IN LISTS AMReX_SPACEDIM) - if (D EQUAL 1) + if (D EQUAL 1 OR NOT AMReX_LINEAR_SOLVERS_EM) 
return() endif () diff --git a/Tests/LinearSolvers/NodalPoisson/CMakeLists.txt b/Tests/LinearSolvers/NodalPoisson/CMakeLists.txt index d15b7d8e64..f42bd1fecc 100644 --- a/Tests/LinearSolvers/NodalPoisson/CMakeLists.txt +++ b/Tests/LinearSolvers/NodalPoisson/CMakeLists.txt @@ -1,5 +1,5 @@ foreach(D IN LISTS AMReX_SPACEDIM) - if(D EQUAL 1) + if(D EQUAL 1 OR NOT AMReX_LINEAR_SOLVERS_INCFLO) continue() endif() diff --git a/Tests/LinearSolvers/Nodal_Projection_EB/CMakeLists.txt b/Tests/LinearSolvers/Nodal_Projection_EB/CMakeLists.txt index 3a8b331e45..d244b7573a 100644 --- a/Tests/LinearSolvers/Nodal_Projection_EB/CMakeLists.txt +++ b/Tests/LinearSolvers/Nodal_Projection_EB/CMakeLists.txt @@ -1,4 +1,4 @@ -if ( (NOT AMReX_EB) OR NOT (3 IN_LIST AMReX_SPACEDIM)) +if ( (NOT AMReX_EB) OR (NOT AMReX_LINEAR_SOLVERS_INCFLO) OR NOT (3 IN_LIST AMReX_SPACEDIM)) return() endif () diff --git a/Tests/LinearSolvers/NodeTensorLap/CMakeLists.txt b/Tests/LinearSolvers/NodeTensorLap/CMakeLists.txt index 956ea25072..4d40669a0c 100644 --- a/Tests/LinearSolvers/NodeTensorLap/CMakeLists.txt +++ b/Tests/LinearSolvers/NodeTensorLap/CMakeLists.txt @@ -1,6 +1,6 @@ if (AMReX_GPU_BACKEND STREQUAL NONE) foreach(D IN LISTS AMReX_SPACEDIM) - if(D EQUAL 1) + if(D EQUAL 1 OR NOT AMReX_LINEAR_SOLVERS_EM) continue() endif() diff --git a/Tests/Particles/NamedSoAComponents/CMakeLists.txt b/Tests/Particles/NamedSoAComponents/CMakeLists.txt new file mode 100644 index 0000000000..e14ddd6897 --- /dev/null +++ b/Tests/Particles/NamedSoAComponents/CMakeLists.txt @@ -0,0 +1,10 @@ +foreach(D IN LISTS AMReX_SPACEDIM) + set(_sources main.cpp) + #set(_input_files) + #set(_input_files inputs) + + setup_test(${D} _sources _input_files NTHREADS 2) + + unset(_sources) + unset(_input_files) +endforeach() diff --git a/Tests/Particles/NamedSoAComponents/GNUmakefile b/Tests/Particles/NamedSoAComponents/GNUmakefile new file mode 100644 index 0000000000..9f49d3ec02 --- /dev/null +++ 
b/Tests/Particles/NamedSoAComponents/GNUmakefile @@ -0,0 +1,22 @@ +AMREX_HOME = ../../../ + +DEBUG = FALSE + +DIM = 3 + +COMP = gcc + +USE_MPI = TRUE +USE_OMP = FALSE +USE_CUDA = FALSE + +#TINY_PROFILE = TRUE +USE_PARTICLES = TRUE + +include $(AMREX_HOME)/Tools/GNUMake/Make.defs + +include ./Make.package +include $(AMREX_HOME)/Src/Base/Make.package +include $(AMREX_HOME)/Src/Particle/Make.package + +include $(AMREX_HOME)/Tools/GNUMake/Make.rules diff --git a/Tests/Particles/NamedSoAComponents/Make.package b/Tests/Particles/NamedSoAComponents/Make.package new file mode 100644 index 0000000000..6b4b865e8f --- /dev/null +++ b/Tests/Particles/NamedSoAComponents/Make.package @@ -0,0 +1 @@ +CEXE_sources += main.cpp diff --git a/Tests/Particles/NamedSoAComponents/main.cpp b/Tests/Particles/NamedSoAComponents/main.cpp new file mode 100644 index 0000000000..39b6a70ff5 --- /dev/null +++ b/Tests/Particles/NamedSoAComponents/main.cpp @@ -0,0 +1,139 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using namespace amrex; + +void addParticles () +{ + using PC = ParticleContainerPureSoA; + int is_per[AMREX_SPACEDIM]; + for (int & d : is_per) { + d = 1; + } + + RealBox real_box; + for (int n = 0; n < AMREX_SPACEDIM; n++) + { + real_box.setLo(n, 0.0); + real_box.setHi(n, 100.0); + } + + IntVect domain_lo(AMREX_D_DECL(0, 0, 0)); + IntVect domain_hi(AMREX_D_DECL(127, 127, 127)); + const Box base_domain(domain_lo, domain_hi); + + Geometry geom(base_domain, &real_box, CoordSys::cartesian, is_per); + BoxArray ba(base_domain); + ba.maxSize(64); + + DistributionMapping dm(ba); + + PC pc(geom, dm, ba); + + amrex::Print() << "Original Real SoA component names are: "; + for (auto& n : pc.GetRealSoANames()) { + amrex::Print() << n << ", "; + } + amrex::Print() << "\n"; + + amrex::Print() << "Original Int SoA component names are: "; + for (auto& n : pc.GetIntSoANames()) { + amrex::Print() << n << ", "; + } + amrex::Print() << "\n"; + + 
amrex::Print() << "Adding runtime comps. \n"; + pc.AddRealComp("real_comp1"); + pc.AddRealComp(); // without name - should be real_comp2 + pc.AddIntComp(); // without name - should be int_comp0 + + amrex::Print() << "New Real SoA component names are: "; + for (auto& n : pc.GetRealSoANames()) { + amrex::Print() << n << ", "; + } + amrex::Print() << "\n"; + + amrex::Print() << "New Int SoA component names are: "; + for (auto& n : pc.GetIntSoANames()) { + amrex::Print() << n << ", "; + } + amrex::Print() << "\n"; + + amrex::Print() << "Reset compile-time SoA names \n"; + pc.SetSoACompileTimeNames({AMREX_D_DECL("x", "y", "z"), "w"}, {"i1", "i2"}); + + amrex::Print() << "New Real SoA component names are: "; + for (auto& n : pc.GetRealSoANames()) { + amrex::Print() << n << ", "; + } + amrex::Print() << "\n"; + + amrex::Print() << "New Int SoA component names are: "; + for (auto& n : pc.GetIntSoANames()) { + amrex::Print() << n << ", "; + } + amrex::Print() << "\n"; + + int const NArrayReal = PC::NArrayReal; + int const NArrayInt = PC::NArrayInt; + using ParticleType = typename PC::ParticleType; + + const int add_num_particles = 5; + auto& ptile1 = pc.DefineAndReturnParticleTile(0, 0, 0); + ptile1.resize(add_num_particles); + + for (int i = 0; i < add_num_particles; ++i) + { + for (int d = 0; d < AMREX_SPACEDIM; d++) { + ptile1.pos(i, d) = 12.0; + } + ptile1.getParticleTileData().rdata(AMREX_SPACEDIM)[i] = 1.2; // w + + ptile1.push_back_int(0, int(ParticleType::NextID())); + ptile1.push_back_int(1, amrex::ParallelDescriptor::MyProc()); + } + + int lev=0; + using MyParIter = ParIter_impl; + for (MyParIter pti(pc, lev); pti.isValid(); ++pti) { + auto& soa = pti.GetStructOfArrays(); + AMREX_D_TERM( + auto *xp = soa.GetRealData("x").data();, + auto *yp = soa.GetRealData("y").data();, + auto *zp = soa.GetRealData("z").data(); + ); + auto *wp = soa.GetRealData("w").data(); + + const int np = pti.numParticles(); + ParallelFor( np, [=] AMREX_GPU_DEVICE (long ip) + { + 
AMREX_D_TERM( + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(xp[ip] == 12_prt, + "pos attribute expected to be 12");, + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(yp[ip] == 12_prt, + "pos attribute expected to be 12");, + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(zp[ip] == 12_prt, + "pos attribute expected to be 12"); + ); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(wp[ip] == 1.2_prt, + "pos attribute expected to be 1.2"); + }); + } +} + +int main (int argc, char* argv[]) + { + amrex::Initialize(argc,argv); + { + addParticles(); + } + amrex::Finalize(); + } diff --git a/Tests/SmallMatrix/CMakeLists.txt b/Tests/SmallMatrix/CMakeLists.txt new file mode 100644 index 0000000000..224c4563c8 --- /dev/null +++ b/Tests/SmallMatrix/CMakeLists.txt @@ -0,0 +1,9 @@ +foreach(D IN LISTS AMReX_SPACEDIM) + set(_sources main.cpp) + set(_input_files) + + setup_test(${D} _sources _input_files) + + unset(_sources) + unset(_input_files) +endforeach() diff --git a/Tests/SmallMatrix/GNUmakefile b/Tests/SmallMatrix/GNUmakefile new file mode 100644 index 0000000000..d0d895ff52 --- /dev/null +++ b/Tests/SmallMatrix/GNUmakefile @@ -0,0 +1,24 @@ +AMREX_HOME := ../.. 
+ +DEBUG = FALSE + +DIM = 3 + +COMP = gcc + +USE_MPI = FALSE +USE_OMP = FALSE +USE_CUDA = FALSE +USE_HIP = FALSE +USE_SYCL = FALSE + +BL_NO_FORT = TRUE + +TINY_PROFILE = FALSE + +include $(AMREX_HOME)/Tools/GNUMake/Make.defs + +include ./Make.package +include $(AMREX_HOME)/Src/Base/Make.package + +include $(AMREX_HOME)/Tools/GNUMake/Make.rules diff --git a/Tests/SmallMatrix/Make.package b/Tests/SmallMatrix/Make.package new file mode 100644 index 0000000000..6b4b865e8f --- /dev/null +++ b/Tests/SmallMatrix/Make.package @@ -0,0 +1 @@ +CEXE_sources += main.cpp diff --git a/Tests/SmallMatrix/main.cpp b/Tests/SmallMatrix/main.cpp new file mode 100644 index 0000000000..505ae50bd6 --- /dev/null +++ b/Tests/SmallMatrix/main.cpp @@ -0,0 +1,306 @@ +#include +#include +#include +#include +#include + +using namespace amrex; + +int main (int argc, char* argv[]) +{ + static_assert(Order::C == Order::RowMajor && + Order::F == Order::ColumnMajor); + + amrex::Initialize(argc, argv); + // 0-based indexing + { + SmallMatrix m34{}; + for (int j = 0; j < 4; ++j) { + for (int i = 0; i < 3; ++i) { + AMREX_ALWAYS_ASSERT(m34(i,j) == 0.0_rt); + } + } + } + { + SmallVector cv{}; + SmallRowVector rv{}; + SmallVector cv2{1,2,3}; + SmallRowVector rv2{0,10,20}; + SmallVector cv3{0,1,2}; + for (int j = 0; j < 3; ++j) { + AMREX_ALWAYS_ASSERT(cv(j) == 0.0_rt && + rv(j) == 0.0_rt && + cv2(j) == j+1 && + rv2(j) == j*10 && + cv3(j) == j); + } + AMREX_ALWAYS_ASSERT(cv3(3) == 0 && cv3(4) == 0); + } + { + SmallMatrix m34{{0,3,6,9}, + {1,4,7,10}, + {2,5,8,11}}; + int v = 0; + for (int j = 0; j < 4; ++j) { + for (int i = 0; i < 3; ++i) { + AMREX_ALWAYS_ASSERT(m34(i,j) == v++); + } + } + std::cout << m34; + } + { + SmallMatrix m34{{0,1,2,3}, + {4,5,6,7}, + {8,9,10,11}}; + int v = 0; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 4; ++j) { + AMREX_ALWAYS_ASSERT(m34(i,j) == v++); + } + } + } + { + auto v3 = SmallVector::Zero(); + v3[0] = 1.; + v3(1) = 2.; + v3[2] = 3.; + auto m33 = 
SmallMatrix::Identity(); + auto r = m33*v3; + AMREX_ALWAYS_ASSERT(almostEqual(r[0],v3[0]) && + almostEqual(r[1],v3[1]) && + almostEqual(r[2],v3[2])); + } + { + SmallMatrix A{{1, 0, 1}, + {2, 1, 1}, + {0, 1, 1}, + {1, 1, 2}}; + SmallMatrix B{{1, 2, 1}, + {2, 3, 1}, + {4, 2, 2}}; + SmallMatrix C{10, 8, 6}; + auto ABC = A*B*C; + AMREX_ALWAYS_ASSERT(ABC(0,0) == 100 && + ABC(1,0) == 182 && + ABC(2,0) == 118 && + ABC(3,0) == 218); + } + { + SmallMatrix A{{1, 2, 0, 1}, + {0, 1, 1, 1}, + {1, 1, 1, 2}}; + SmallMatrix B{{1, 2, 4}, + {2, 3, 2}, + {1, 1, 2}}; + SmallMatrix C{10, 8, 6}; + auto ABC = A.transpose()*B.transposeInPlace()*C.transpose(); + AMREX_ALWAYS_ASSERT(ABC(0,0) == 100 && + ABC(1,0) == 182 && + ABC(2,0) == 118 && + ABC(3,0) == 218); + } + { + SmallMatrix m; + m.setVal(2); + using M = decltype(m); + AMREX_ALWAYS_ASSERT(m.product() == Math::powi(2)); + AMREX_ALWAYS_ASSERT(m.sum() == 2*m.row_size*m.column_size); + } + { + SmallMatrix m{{1.0, 3.4, 4.5, 5.6, 6.7}, + {1.3, 2.0, 4.5, 5.6, 6.7}, + {1.3, 1.0, 3.0, 5.6, 6.7}, + {1.3, 1.4, 4.5, 4.0, 6.7}, + {1.3, 1.0, 4.5, 5.6, 5.0}}; + AMREX_ALWAYS_ASSERT(m.trace() == double(1+2+3+4+5)); + } + { + SmallMatrix a{{+1, +2, +3}, + {+7, +8, +9}}; + SmallMatrix b{{-1, -2, -3}, + {-7, -8, -9}}; + auto c = a*2 + 2*b; + for (auto const& x : c) { + AMREX_ALWAYS_ASSERT(x == 0); + } + } + { + SmallMatrix a{{+1, +2, +3}, + {+7, +8, +9}}; + SmallMatrix b{{-1, -2, -3}, + {-7, -8, -9}}; + auto c = -a - b; + for (auto const& x : c) { + AMREX_ALWAYS_ASSERT(x == 0); + } + } + { + SmallMatrix a{{+1, +2, +3}, + {+7, +8, +9}}; + SmallMatrix b; + b.setVal(-1); + AMREX_ALWAYS_ASSERT(a.dot(b) == -30); + } + { + SmallVector v{10,20,30}; + auto const& [x,y,z] = v; + AMREX_ALWAYS_ASSERT(x == 10 && y == 20 && z == 30); + + auto& [a,b,c] = v; + a = 100; b = 200; c = 300; + AMREX_ALWAYS_ASSERT(v[0] == 100 && v[1] == 200 && v[2] == 300); + + auto const [i,j,k] = v; + AMREX_ALWAYS_ASSERT(i == 100 && j == 200 && k == 300); + + auto [d,e,f] = v; + 
AMREX_ALWAYS_ASSERT(d == 100 && e == 200 && f == 300); + } + + // 1-based indexing + { + SmallMatrix m34{}; + for (int j = 1; j <= 4; ++j) { + for (int i = 1; i <= 3; ++i) { + AMREX_ALWAYS_ASSERT(m34(i,j) == 0.0_rt); + } + } + } + { + SmallVector cv{}; + SmallRowVector rv{}; + SmallVector cv2{1,2,3}; + SmallRowVector rv2{0,10,20}; + SmallVector cv3{0,1,2}; + for (int j = 0; j < 3; ++j) { + AMREX_ALWAYS_ASSERT(cv(j+1) == 0.0_rt && + rv(j+1) == 0.0_rt && + cv2(j+1) == j+1 && + rv2(j+1) == j*10 && + cv3(j+1) == j); + } + AMREX_ALWAYS_ASSERT(cv3(4) == 0 && cv3(5) == 0); + } + { + SmallMatrix m34{{0,3,6,9}, + {1,4,7,10}, + {2,5,8,11}}; + int v = 0; + for (int j = 1; j <= 4; ++j) { + for (int i = 1; i <= 3; ++i) { + AMREX_ALWAYS_ASSERT(m34(i,j) == v++); + } + } + std::cout << m34; + } + { + SmallMatrix m34{{0,1,2,3}, + {4,5,6,7}, + {8,9,10,11}}; + int v = 0; + for (int i = 1; i <= 3; ++i) { + for (int j = 1; j <= 4; ++j) { + AMREX_ALWAYS_ASSERT(m34(i,j) == v++); + } + } + } + { + auto v3 = SmallVector::Zero(); + v3[1] = 1.; + v3(2) = 2.; + v3[3] = 3.; + auto m33 = SmallMatrix::Identity(); + auto r = m33*v3; + AMREX_ALWAYS_ASSERT(almostEqual(r[1],v3[1]) && + almostEqual(r[2],v3[2]) && + almostEqual(r[3],v3[3])); + } + { + SmallMatrix A{{1, 0, 1}, + {2, 1, 1}, + {0, 1, 1}, + {1, 1, 2}}; + SmallMatrix B{{1, 2, 1}, + {2, 3, 1}, + {4, 2, 2}}; + SmallMatrix C{10, 8, 6}; + auto ABC = A*B*C; + AMREX_ALWAYS_ASSERT(ABC(1,1) == 100 && + ABC(2,1) == 182 && + ABC(3,1) == 118 && + ABC(4,1) == 218); + } + { + SmallMatrix A{{1, 2, 0, 1}, + {0, 1, 1, 1}, + {1, 1, 1, 2}}; + SmallMatrix B{{1, 2, 4}, + {2, 3, 2}, + {1, 1, 2}}; + SmallMatrix C{10, 8, 6}; + auto ABC = A.transpose()*B.transposeInPlace()*C.transpose(); + AMREX_ALWAYS_ASSERT(ABC(1,1) == 100 && + ABC(2,1) == 182 && + ABC(3,1) == 118 && + ABC(4,1) == 218); + } + { + SmallMatrix m; + m.setVal(2); + using M = decltype(m); + AMREX_ALWAYS_ASSERT(m.product() == Math::powi(2)); + AMREX_ALWAYS_ASSERT(m.sum() == 
2*m.row_size*m.column_size); + } + { + SmallMatrix m{{1.0, 3.4, 4.5, 5.6, 6.7}, + {1.3, 2.0, 4.5, 5.6, 6.7}, + {1.3, 1.0, 3.0, 5.6, 6.7}, + {1.3, 1.4, 4.5, 4.0, 6.7}, + {1.3, 1.0, 4.5, 5.6, 5.0}}; + AMREX_ALWAYS_ASSERT(m.trace() == double(1+2+3+4+5)); + } + { + SmallMatrix a{{+1, +2, +3}, + {+7, +8, +9}}; + SmallMatrix b{{-1, -2, -3}, + {-7, -8, -9}}; + auto c = a*2 + 2*b; + for (auto const& x : c) { + AMREX_ALWAYS_ASSERT(x == 0); + } + } + { + SmallMatrix a{{+1, +2, +3}, + {+7, +8, +9}}; + SmallMatrix b{{-1, -2, -3}, + {-7, -8, -9}}; + auto c = -a - b; + for (auto const& x : c) { + AMREX_ALWAYS_ASSERT(x == 0); + } + } + { + SmallMatrix a{{+1, +2, +3}, + {+7, +8, +9}}; + SmallMatrix b; + b.setVal(-1); + AMREX_ALWAYS_ASSERT(a.dot(b) == -30); + } + { + SmallVector v{10,20,30}; + auto const& [x,y,z] = v; + AMREX_ALWAYS_ASSERT(x == 10 && y == 20 && z == 30); + + auto& [a,b,c] = v; + a = 100; b = 200; c = 300; + AMREX_ALWAYS_ASSERT(v[1] == 100 && v[2] == 200 && v[3] == 300); + + auto const [i,j,k] = v; + AMREX_ALWAYS_ASSERT(i == 100 && j == 200 && k == 300); + + auto [d,e,f] = v; + AMREX_ALWAYS_ASSERT(d == 100 && e == 200 && f == 300); + } + + amrex::Finalize(); +} diff --git a/Tools/CMake/AMReXConfig.cmake.in b/Tools/CMake/AMReXConfig.cmake.in index f5045b715c..96fb12cbf7 100644 --- a/Tools/CMake/AMReXConfig.cmake.in +++ b/Tools/CMake/AMReXConfig.cmake.in @@ -74,6 +74,9 @@ set(AMReX_AMRLEVEL_FOUND @AMReX_AMRLEVEL@) set(AMReX_EB_FOUND @AMReX_EB@) set(AMReX_FINTERFACES_FOUND @AMReX_FORTRAN_INTERFACES@) set(AMReX_LSOLVERS_FOUND @AMReX_LINEAR_SOLVERS@) +set(AMReX_LSOLVERS_INCFLO_FOUND @AMReX_LINEAR_SOLVERS_INCFLO@) +set(AMReX_LSOLVERS_EM_FOUND @AMReX_LINEAR_SOLVERS_EM@) +set(AMReX_FFT_FOUND @AMReX_FFT@) set(AMReX_AMRDATA_FOUND @AMReX_AMRDATA@) set(AMReX_PARTICLES_FOUND @AMReX_PARTICLES@) set(AMReX_P@AMReX_PARTICLES_PRECISION@_FOUND ON) @@ -129,6 +132,9 @@ set(AMReX_AMRLEVEL @AMReX_AMRLEVEL@) set(AMReX_EB @AMReX_EB@) set(AMReX_FINTERFACES @AMReX_FORTRAN_INTERFACES@) 
set(AMReX_LSOLVERS @AMReX_LINEAR_SOLVERS@) +set(AMReX_LSOLVERS_INCFLO @AMReX_LINEAR_SOLVERS_INCFLO@) +set(AMReX_LSOLVERS_EM @AMReX_LINEAR_SOLVERS_EM@) +set(AMReX_FFT @AMReX_FFT@) set(AMReX_AMRDATA @AMReX_AMRDATA@) set(AMReX_PARTICLES @AMReX_PARTICLES@) set(AMReX_PARTICLES_PRECISION @AMReX_PARTICLES_PRECISION@) @@ -212,6 +218,12 @@ if (@AMReX_CONDUIT@) find_dependency(Conduit REQUIRED) endif () +if (@AMReX_FFT@) + if (@AMReX_GPU_BACKEND@ STREQUAL NONE) + find_dependency(AMReXFFTW REQUIRED) + endif() +endif() + if (@AMReX_HDF5@) find_dependency(HDF5 REQUIRED) endif () diff --git a/Tools/CMake/AMReXOptions.cmake b/Tools/CMake/AMReXOptions.cmake index 3e5d4c8bdb..a7863f125e 100644 --- a/Tools/CMake/AMReXOptions.cmake +++ b/Tools/CMake/AMReXOptions.cmake @@ -284,6 +284,19 @@ print_option(AMReX_FORTRAN_INTERFACES) option( AMReX_LINEAR_SOLVERS "Build AMReX Linear solvers" ON ) print_option( AMReX_LINEAR_SOLVERS ) +cmake_dependent_option( AMReX_LINEAR_SOLVERS_INCFLO + "Build AMReX Linear solvers useful for incompressible flow codes" ON + "AMReX_LINEAR_SOLVERS" OFF) +print_option( AMReX_LINEAR_SOLVERS_INCFLO ) + +cmake_dependent_option( AMReX_LINEAR_SOLVERS_EM + "Build AMReX Linear solvers useful for electromagnetic codes" ON + "AMReX_LINEAR_SOLVERS" OFF) +print_option( AMReX_LINEAR_SOLVERS_EM ) + +option( AMReX_FFT "Build AMReX FFT" OFF ) +print_option( AMReX_FFT ) + option( AMReX_AMRDATA "Build data services" OFF ) print_option( AMReX_AMRDATA ) diff --git a/Tools/CMake/AMReXThirdPartyLibraries.cmake b/Tools/CMake/AMReXThirdPartyLibraries.cmake index abe62a2ebc..b8ad503e83 100644 --- a/Tools/CMake/AMReXThirdPartyLibraries.cmake +++ b/Tools/CMake/AMReXThirdPartyLibraries.cmake @@ -1,3 +1,27 @@ +# +# FFT +# +if (AMReX_FFT) + if (AMReX_CUDA) + find_package(CUDAToolkit REQUIRED) + foreach(D IN LISTS AMReX_SPACEDIM) + target_link_libraries(amrex_${D}d PUBLIC CUDA::cufft) + endforeach() + elseif (AMReX_HIP) + find_package(rocfft REQUIRED) + foreach(D IN LISTS AMReX_SPACEDIM) + 
target_link_libraries(amrex_${D}d PUBLIC roc::rocfft) + endforeach() + elseif (AMReX_SYCL) + # nothing to do + else() + find_package(AMReXFFTW REQUIRED) + foreach(D IN LISTS AMReX_SPACEDIM) + target_link_libraries(amrex_${D}d PUBLIC AMReX::FFTW) + endforeach() + endif() +endif() + # # HDF5 -- here it would be best to create an imported target # diff --git a/Tools/CMake/AMReX_Config_ND.H.in b/Tools/CMake/AMReX_Config_ND.H.in index 3296a403ff..07e3b7fd63 100644 --- a/Tools/CMake/AMReX_Config_ND.H.in +++ b/Tools/CMake/AMReX_Config_ND.H.in @@ -39,6 +39,7 @@ #cmakedefine BL_FORT_USE_LOWERCASE #cmakedefine BL_FORT_USE_UPPERCASE #cmakedefine BL_NO_FORT +#cmakedefine AMREX_USE_FFT #cmakedefine AMREX_USE_SENSEI_INSITU #cmakedefine AMREX_NO_SENSEI_AMR_INST #cmakedefine AMREX_USE_CONDUIT diff --git a/Tools/CMake/FindAMReXFFTW.cmake b/Tools/CMake/FindAMReXFFTW.cmake new file mode 100644 index 0000000000..678743a08b --- /dev/null +++ b/Tools/CMake/FindAMReXFFTW.cmake @@ -0,0 +1,51 @@ +#[=======================================================================[: +FindAMReXFFTW +------- + +Finds the FFTW library. + +Imported Targets +^^^^^^^^^^^^^^^^ + +This module provides the following imported target, if found: + +``AMReX::FFTW`` + The FFTW library + +Result Variables +^^^^^^^^^^^^^^^^ + +This will define the following variables: + +``AMReXFFTW_FOUND`` + True if the FFTW library has been found. +``FFTW_INCLUDES`` + Include directories needed to use FFTW. +``FFTW_LIBRARIES`` + Libraries needed to link to FFTW. + +This will also create an imported target, AMReX::FFTW. 
+#]=======================================================================] + +if (NOT FFTW_INCLUDES) + find_path(FFTW_INCLUDES NAMES "fftw3.h" HINTS ${FFTW_ROOT}/include) +endif() + +if (NOT FFTW_LIBRARIES) + find_library(FFTW_LIBRARY NAMES "fftw3" HINTS ${FFTW_ROOT}/lib) + find_library(FFTWF_LIBRARY NAMES "fftw3f" HINTS ${FFTW_ROOT}/lib) + set(FFTW_LIBRARIES ${FFTW_LIBRARY} ${FFTWF_LIBRARY}) +endif() + +include(FindPackageHandleStandardArgs) + +find_package_handle_standard_args(AMReXFFTW + REQUIRED_VARS FFTW_LIBRARIES FFTW_INCLUDES) + +mark_as_advanced(FFTW_LIBRARIES FFTW_INCLUDES) + +# Create imported target +add_library(AMReX::FFTW INTERFACE IMPORTED GLOBAL) +target_link_libraries(AMReX::FFTW INTERFACE ${FFTW_LIBRARIES}) +set_target_properties(AMReX::FFTW PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${FFTW_INCLUDES}") diff --git a/Tools/F_scripts/dep.py b/Tools/F_scripts/dep.py index 151d1a9f9a..e5eb74cb40 100755 --- a/Tools/F_scripts/dep.py +++ b/Tools/F_scripts/dep.py @@ -28,7 +28,7 @@ import preprocess # modules to ignore in the dependencies -IGNORES = ["iso_c_binding", "iso_fortran_env", "omp_lib", "mpi", "cudafor", "openacc", "hdf"] +IGNORES = ["iso_c_binding", "iso_fortran_env", "omp_lib", "mpi", "cudafor", "openacc", "hdf", "hdf5"] # regular expression for "{}module{}name", where {} can be any number # of spaces. We use 4 groups here, denoted by (), so the name of the diff --git a/Tools/GNUMake/Make.defs b/Tools/GNUMake/Make.defs index f33911ed3a..66de8ec6a5 100644 --- a/Tools/GNUMake/Make.defs +++ b/Tools/GNUMake/Make.defs @@ -112,6 +112,12 @@ else DEBUG := FALSE endif +ifdef USE_FFT + USE_FFT := $(strip $(USE_FFT)) +else + USE_FFT := FALSE +endif + ifdef PROFILE PROFILE := $(strip $(PROFILE)) else @@ -604,6 +610,28 @@ else DebugSuffix := endif +ifeq ($(USE_FFT),TRUE) + include $(AMREX_HOME)/Src/FFT/Make.package + ifeq ($(USE_CUDA),TRUE) + LIBRARIES += -lcufft + else ifeq ($(USE_HIP),TRUE) + # Use rocFFT. 
ROC_PATH is defined in hip.mak + SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/rocfft/include + LIBRARY_LOCATIONS += $(ROC_PATH)/rocfft/lib + LIBRARIES += -Wl,--rpath=$(ROC_PATH)/rocfft/lib -lrocfft + else ifeq ($(USE_SYCL),TRUE) + # nothing + else + FFTW_HOME ?= NOT_SET + ifneq ($(FFTW_HOME),NOT_SET) + SYSTEM_INCLUDE_LOCATIONS += $(FFTW_HOME)/include + LIBRARY_LOCATIONS += $(FFTW_HOME)/lib + LIBRARIES += -Wl,--rpath=$(FFTW_HOME)/lib + endif + LIBRARIES += -lfftw3f -lfftw3 + endif +endif + ifeq ($(USE_PROFPARSER),TRUE) PROFILE := TRUE TRACE_PROFILE := TRUE @@ -760,10 +788,6 @@ ifeq ($(USE_HIP),TRUE) GPUSuffix := .HIP - ifeq ($(HIP_INDIRECT_FUNCTION),TRUE) - DEFINES += -DAMREX_HIP_INDIRECT_FUNCTION - endif - ifeq ($(USE_MPI),TRUE) # Make sure that the C/C++ MPI # wrappers are calling hipcc to compile the code. @@ -918,6 +942,10 @@ ifeq ($(USE_PARTICLES),TRUE) DEFINES += -DAMREX_PARTICLES endif +ifeq ($(USE_FFT),TRUE) + DEFINES += -DAMREX_USE_FFT +endif + ifeq ($(USE_EB),TRUE) DEFINES += -DAMREX_USE_EB endif diff --git a/Tools/GNUMake/comps/hip.mak b/Tools/GNUMake/comps/hip.mak index 87bb3e93f5..26dff7f94f 100644 --- a/Tools/GNUMake/comps/hip.mak +++ b/Tools/GNUMake/comps/hip.mak @@ -119,8 +119,8 @@ ifeq ($(HIP_COMPILER),clang) endif # Generic HIP info - ROC_PATH=$(realpath $(dir $(HIP_PATH))) - SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include $(HIP_PATH)/include + ROC_PATH=$(realpath $(HIP_PATH)) + SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include # rocRand SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include/hiprand $(ROC_PATH)/include/rocrand diff --git a/Tools/libamrex/configure.py b/Tools/libamrex/configure.py index 1545f86dfb..873b575fe4 100755 --- a/Tools/libamrex/configure.py +++ b/Tools/libamrex/configure.py @@ -57,10 +57,22 @@ def configure(argv): help="Enable AMReX Fortran API [default=yes]", choices=["yes","no"], default="yes") + parser.add_argument("--enable-fft", + help="Enable AMReX FFT [default=no]", + choices=["yes","no"], + default="no") 
parser.add_argument("--enable-linear-solver", help="Enable AMReX linear solvers [default=yes]", choices=["yes","no"], default="yes") + parser.add_argument("--enable-linear-solver-incflo", + help="Enable AMReX linear solvers for incompressible flow codes [default=yes]", + choices=["yes","no"], + default="yes") + parser.add_argument("--enable-linear-solver-em", + help="Enable AMReX linear solvers for electromagnetic codes [default=yes]", + choices=["yes","no"], + default="yes") parser.add_argument("--enable-hypre", help="Enable Hypre as an option for bottom solver of AMReX linear solvers [default=no]", choices=["yes","no"], @@ -143,7 +155,10 @@ def configure(argv): f.write("DEBUG = {}\n".format("TRUE" if args.debug == "yes" else "FALSE")) f.write("USE_PARTICLES = {}\n".format("FALSE" if args.enable_particle == "no" else "TRUE")) f.write("USE_FORTRAN_INTERFACE = {}\n".format("FALSE" if args.enable_fortran_api == "no" else "TRUE")) + f.write("USE_FFT = {}\n".format("TRUE" if args.enable_fft == "yes" else "FALSE")) f.write("USE_LINEAR_SOLVERS = {}\n".format("FALSE" if args.enable_linear_solver == "no" else "TRUE")) + f.write("USE_LINEAR_SOLVERS_INCFLO = {}\n".format("FALSE" if args.enable_linear_solver_incflo == "no" else "TRUE")) + f.write("USE_LINEAR_SOLVERS_EM = {}\n".format("FALSE" if args.enable_linear_solver_em == "no" else "TRUE")) f.write("USE_HYPRE = {}\n".format("TRUE" if args.enable_hypre == "yes" else "FALSE")) f.write("USE_PETSC = {}\n".format("TRUE" if args.enable_petsc == "yes" else "FALSE")) f.write("USE_EB = {}\n".format("TRUE" if args.enable_eb == "yes" else "FALSE"))