diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml index c84d19850a7..1ed5240164f 100644 --- a/.github/workflows/clang.yml +++ b/.github/workflows/clang.yml @@ -44,6 +44,7 @@ jobs: -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DCMAKE_INSTALL_PREFIX=/tmp/my-amrex \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_FORTRAN=ON \ -DAMReX_MPI=OFF \ @@ -104,6 +105,7 @@ jobs: cmake .. \ -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -158,6 +160,7 @@ jobs: cmake .. \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=OFF \ @@ -200,7 +203,7 @@ jobs: export CCACHE_LOGFILE=${{ github.workspace }}/ccache.log.txt ccache -z - ./configure --dim 2 --with-fortran no --comp llvm --with-mpi no + ./configure --dim 2 --with-fortran no --comp llvm --with-mpi no --enable-fft yes make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS="-fno-operator-names" \ CCACHE=ccache make install diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 9e96aefac5e..927e99ded40 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -38,6 +38,7 @@ jobs: cmake -S . -B build \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=OFF \ @@ -97,6 +98,7 @@ jobs: cmake -S . -B build \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DAMReX_MPI=OFF \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=OFF \ @@ -153,6 +155,7 @@ jobs: -DCMAKE_VERBOSE_MAKEFILE=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_TEST_TYPE=Small \ + -DAMReX_FFT=ON \ -DAMReX_FORTRAN=ON \ -DAMReX_FORTRAN_INTERFACES=ON \ -DAMReX_GPU_BACKEND=CUDA \ @@ -196,7 +199,7 @@ jobs: ccache -z export PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} - ./configure --dim 3 --with-cuda yes --enable-eb yes --enable-xsdk-defaults yes --with-fortran no + ./configure --dim 3 --with-cuda yes --enable-eb yes --enable-xsdk-defaults yes --with-fortran no --enable-fft yes # # /home/runner/work/amrex/amrex/Src/Base/AMReX_GpuLaunchGlobal.H:16:41: error: unused parameter ‘f0’ [-Werror=unused-parameter] # 16 | AMREX_GPU_GLOBAL void launch_global (L f0) { f0(); } diff --git a/.github/workflows/dependencies/dependencies.sh b/.github/workflows/dependencies/dependencies.sh index 07e461f577f..c7cde496519 100755 --- a/.github/workflows/dependencies/dependencies.sh +++ b/.github/workflows/dependencies/dependencies.sh @@ -16,6 +16,7 @@ sudo apt-get update sudo apt-get install -y --no-install-recommends\ build-essential \ + libfftw3-dev \ g++ gfortran \ libopenmpi-dev \ openmpi-bin diff --git a/.github/workflows/dependencies/dependencies_clang.sh b/.github/workflows/dependencies/dependencies_clang.sh index 2e96b5196d8..4c329321b68 100755 --- a/.github/workflows/dependencies/dependencies_clang.sh +++ b/.github/workflows/dependencies/dependencies_clang.sh @@ -16,5 +16,6 @@ sudo apt-get update sudo apt-get install -y --no-install-recommends \ build-essential \ + libfftw3-dev \ gfortran \ clang-$1 diff --git a/.github/workflows/dependencies/dependencies_gcc.sh b/.github/workflows/dependencies/dependencies_gcc.sh index 2a576c0b52d..93d9aa27ec4 100755 --- a/.github/workflows/dependencies/dependencies_gcc.sh +++ b/.github/workflows/dependencies/dependencies_gcc.sh @@ -17,6 +17,7 @@ sudo apt-get update sudo apt-get install -y --no-install-recommends \ build-essential \ + libfftw3-dev \ g++-$1 gfortran-$1 \ libopenmpi-dev \ openmpi-bin diff --git a/.github/workflows/dependencies/dependencies_hip.sh b/.github/workflows/dependencies/dependencies_hip.sh index ab5185ce0ae..df4f274ef36 100755 --- a/.github/workflows/dependencies/dependencies_hip.sh +++ b/.github/workflows/dependencies/dependencies_hip.sh @@ -56,6 +56,7 @@ sudo apt-get install -y --no-install-recommends \ roctracer-dev \ rocprofiler-dev \ rocrand-dev \ + rocfft-dev \ rocprim-dev # hiprand-dev is a new package that does not exist in old versions diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml index b64dbd3b5f0..88fe47c988c 100644 --- a/.github/workflows/gcc.yml +++ b/.github/workflows/gcc.yml @@ -42,6 +42,7 @@ jobs: mkdir build cd build cmake .. \ + -DAMReX_FFT=ON \ -DAMReX_FORTRAN=ON \ -DAMReX_PLOTFILE_TOOLS=ON \ -DCMAKE_VERBOSE_MAKEFILE=ON \ @@ -99,6 +100,7 @@ jobs: cmake -S . -B build \ -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -147,6 +149,7 @@ jobs: cmake -S . -B build \ -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -196,6 +199,7 @@ jobs: cmake -S . -B build \ -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=OFF \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -248,6 +252,7 @@ jobs: -DCMAKE_VERBOSE_MAKEFILE=ON \ -DAMReX_ASSERTIONS=ON \ -DAMReX_TESTING=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=OFF \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_BOUND_CHECK=ON \ @@ -310,6 +315,7 @@ jobs: -DAMReX_TESTING=ON \ -DAMReX_BOUND_CHECK=ON \ -DAMReX_FPE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -371,6 +377,7 @@ jobs: -DAMReX_TESTING=ON \ -DAMReX_BOUND_CHECK=ON \ -DAMReX_FPE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=OFF \ @@ -457,7 +464,7 @@ jobs: export CCACHE_LOGFILE=${{ github.workspace }}/ccache.log.txt ccache -z - ./configure --dim 3 --enable-eb yes --enable-xsdk-defaults yes + ./configure --dim 3 --enable-eb yes --enable-xsdk-defaults yes --enable-fft yes make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ CCACHE=ccache make install @@ -497,7 +504,8 @@ jobs: export CCACHE_LOGFILE=${{ github.workspace }}/ccache.log.txt ccache -z - ./configure --dim 3 --enable-eb no --enable-xsdk-defaults no --single-precision yes --single-precision-particles yes --enable-tiny-profile yes + ./configure --dim 3 --enable-eb no --enable-xsdk-defaults no --single-precision yes \ + --single-precision-particles yes --enable-tiny-profile yes --enable-fft yes make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ CCACHE=ccache make install @@ -623,6 +631,7 @@ jobs: -DAMReX_OMP=ON \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DAMReX_ENABLE_TESTS=ON \ + -DAMReX_FFT=ON \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache make -j 4 diff --git a/.github/workflows/hip.yml b/.github/workflows/hip.yml index 345d7c468b5..22154d6b012 100644 --- a/.github/workflows/hip.yml +++ b/.github/workflows/hip.yml @@ -48,6 +48,7 @@ jobs: cmake -S . -B build \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -103,6 +104,7 @@ jobs: cmake -S . -B build_full_legacywrapper \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=OFF \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -145,7 +147,9 @@ jobs: export CCACHE_MAXSIZE=100M ccache -z - ./configure --dim 2 --with-hip yes --enable-eb yes --enable-xsdk-defaults yes --with-mpi no --with-omp no --single-precision yes --single-precision-particles yes + ./configure --dim 2 --with-hip yes --enable-eb yes --enable-xsdk-defaults yes \ + --with-mpi no --with-omp no --single-precision yes \ + --single-precision-particles yes --enable-fft yes make -j4 WARN_ALL=TRUE AMD_ARCH=gfx90a CCACHE=ccache make install diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml index 227b0f97380..15c7bbda587 100644 --- a/.github/workflows/intel.yml +++ b/.github/workflows/intel.yml @@ -41,6 +41,7 @@ jobs: set -e cmake -S . -B build \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=OFF \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -89,6 +90,7 @@ jobs: set -e cmake -S . -B build \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=OFF \ diff --git a/GNUmakefile.in b/GNUmakefile.in index b85c2e0c35e..67c789d97ca 100644 --- a/GNUmakefile.in +++ b/GNUmakefile.in @@ -26,6 +26,9 @@ ifeq ($(USE_LINEAR_SOLVERS),TRUE) Pdirs += F_Interfaces/LinearSolvers endif endif +ifeq ($(USE_FFT),TRUE) + Pdirs += FFT +endif ifeq ($(USE_EB),TRUE) Pdirs += EB endif diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H index 69970d6401f..a67a72f0a38 100644 --- a/Src/Base/AMReX_FabArray.H +++ b/Src/Base/AMReX_FabArray.H @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -3679,6 +3680,8 @@ FabArray::norminf (FabArray const& mask, int comp, int ncomp, return nm0; } +using cMultiFab = FabArray > >; + } #endif /*BL_FABARRAY_H*/ diff --git a/Src/Base/AMReX_GpuError.H b/Src/Base/AMReX_GpuError.H index ce3ac188a85..65457c8f4e7 100644 --- a/Src/Base/AMReX_GpuError.H +++ b/Src/Base/AMReX_GpuError.H @@ -84,6 +84,16 @@ namespace Gpu { std::string errStr(std::string("CURAND error in file ") + __FILE__ \ + " line " + std::to_string(__LINE__)); \ amrex::Abort(errStr); }} while(0) + +#define AMREX_CUFFT_SAFE_CALL(call) { \ + cufftResult_t amrex_i_err = call; \ + if (CUFFT_SUCCESS != amrex_i_err) { \ + std::string errStr(std::string("CUFFT error ")+std::to_string(amrex_i_err) \ + + std::string(" in file ") + __FILE__ \ + + " line " + std::to_string(__LINE__)); \ + amrex::Abort(errStr); \ + }} + #endif #ifdef AMREX_USE_HIP @@ -100,6 +110,16 @@ namespace Gpu { std::string errStr(std::string("HIPRAND error in file ") + __FILE__ \ + " line " + std::to_string(__LINE__)); \ amrex::Abort(errStr); }} while(0) + +#define AMREX_ROCFFT_SAFE_CALL(call) { \ + auto amrex_i_err = call; \ + if (rocfft_status_success != amrex_i_err) { \ + std::string errStr(std::string("rocFFT error ")+std::to_string(amrex_i_err) \ + + std::string(" in file ") + __FILE__ \ + + " line " + std::to_string(__LINE__)); \ + amrex::Abort(errStr); \ + }} + #endif #define AMREX_GPU_ERROR_CHECK() amrex::Gpu::ErrorCheck(__FILE__, __LINE__) diff --git a/Src/CMakeLists.txt b/Src/CMakeLists.txt index 6e8af043e0d..25455d72636 100644 --- a/Src/CMakeLists.txt +++ b/Src/CMakeLists.txt @@ -136,6 +136,10 @@ if (AMReX_PARTICLES) add_subdirectory(Particle) endif () +if (AMReX_FFT) + add_subdirectory(FFT) +endif () + # # Optional external components # diff --git a/Src/FFT/AMReX_FFT.H b/Src/FFT/AMReX_FFT.H new file mode 100644 index 00000000000..758ac8cd153 --- /dev/null +++ b/Src/FFT/AMReX_FFT.H @@ -0,0 +1,609 @@ +#ifndef AMREX_FFT_H_ +#define AMREX_FFT_H_ +#include + +#include +#include +#include +#include + +#if defined(AMREX_USE_CUDA) +# include +# include +#elif defined(AMREX_USE_HIP) +# if __has_include() // ROCm 5.3+ +# include +# else +# include +# endif +# include +#elif defined(AMREX_USE_SYCL) +# include +#else +# include +#endif + +namespace amrex::FFT +{ + +enum struct Scaling { full, symmetric, none }; +enum struct Direction { forward, backward }; + +template +class R2C +{ +public: + using MF = std::conditional_t, + MultiFab, FabArray > >; + using cMF = FabArray > >; + + R2C (Box const& domain); + + ~R2C (); + + R2C (R2C const&) = delete; + R2C (R2C &&) = delete; + R2C& operator= (R2C const&) = delete; + R2C& operator= (R2C &&) = delete; + + template + void forwardThenBackward (MF const& inmf, MF& outmf, F const& post_forward) + { + this->forward_doit(inmf); + this->post_forward_doit(post_forward); + this->backward_doit(outmf); + } + + struct Swap01 + { + [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept + { + return {i.y, i.x, i.z}; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept + { + return {i.y, i.x, i.z}; + } + + [[nodiscard]] IndexType operator() (IndexType it) const noexcept + { + return it; + } + + [[nodiscard]] IndexType Inverse (IndexType it) const noexcept + { + return it; + } + }; + + struct Swap02 + { + [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept + { + return {i.z, i.y, i.x}; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept + { + return {i.z, i.y, i.x}; + } + + [[nodiscard]] IndexType operator() (IndexType it) const noexcept + { + return it; + } + + [[nodiscard]] IndexType Inverse (IndexType it) const noexcept + { + return it; + } + }; + +public: // public for cuda + + template + void post_forward_doit (F const& post_forward); + +private: + +#if defined(AMREX_USE_CUDA) + using FFTPlan = cufftHandle; + using FFTPlan2 = FFTPlan; + using FFTComplex = std::conditional_t, + cuComplex, cuDoubleComplex>; +#elif defined(AMREX_USE_HIP) + using FFTPlan = rocfft_plan; + using FFTPlan2 = FFTPlan; + using FFTComplex = std::conditional_t, + float2, double2>; +#elif defined(AMREX_USE_SYCL) + using FFTPlan = oneapi::mkl::dft::descriptor< + std::is_same_v ? oneapi::mkl::dft::precision::SINGLE + : oneapi::mkl::dft::precision::DOUBLE, + oneapi::mkl::dft::domain::REAL> *; + using FFTPlan2 = oneapi::mkl::dft::descriptor< + std::is_same_v ? oneapi::mkl::dft::precision::SINGLE + : oneapi::mkl::dft::precision::DOUBLE, + oneapi::mkl::dft::domain::COMPLEX> *; + using FFTComplex = GpuComplex; +#else + using FFTPlan = std::conditional_t, + fftwf_plan, fftw_plan>; + using FFTPlan2 = FFTPlan; + using FFTComplex = std::conditional_t, + fftwf_complex, fftw_complex>; +#endif + + void forward_doit (MF const& inmf, Scaling scaling = Scaling::none); + void backward_doit (MF& outmf, Scaling scaling = Scaling::none); + + static void exec_r2c (FFTPlan plan, MF& in, cMF& out); + static void exec_c2r (FFTPlan plan, cMF& in, MF& out); + template + static void exec_c2c (FFTPlan2 plan, cMF& inout); + + template + static void destroy_plan (P plan); + static std::pair make_c2c_plans (cMF& inout); + + Box m_real_domain; + Box m_spectral_domain_x; + Box m_spectral_domain_y; + Box m_spectral_domain_z; + + // assuming it's double for now + FFTPlan m_fft_fwd_x; + FFTPlan m_fft_bwd_x; + FFTPlan2 m_fft_fwd_y; + FFTPlan2 m_fft_bwd_y; + FFTPlan2 m_fft_fwd_z; + FFTPlan2 m_fft_bwd_z; + + // Comm meta-data. In the forward phase, we start with (x,y,z), + // transpose to (y,x,z) and then (z,x,y). In the backward phase, we + // perform inverse transpose. + Swap01 m_dtos_x2y{}; + std::unique_ptr m_cmd_x2y; + // + Swap01 m_dtos_y2x{}; + std::unique_ptr m_cmd_y2x; + // + Swap02 m_dtos_y2z{}; + std::unique_ptr m_cmd_y2z; + // + Swap02 m_dtos_z2y{}; + std::unique_ptr m_cmd_z2y; + + // Optionally we need to copy from m_cz to user provided cMultiFab. xxxxx todo + + MF m_rx; + cMF m_cx; + cMF m_cy; + cMF m_cz; +}; + +template +R2C::R2C (Box const& domain) + : m_real_domain(domain), + m_spectral_domain_x(IntVect(0), IntVect(AMREX_D_DECL(domain.length(0)/2, + domain.bigEnd(1), + domain.bigEnd(2)))), + m_spectral_domain_y(IntVect(0), IntVect(AMREX_D_DECL(domain.bigEnd(1), + domain.length(0)/2, + domain.bigEnd(2)))), + m_spectral_domain_z(IntVect(0), IntVect(AMREX_D_DECL(domain.bigEnd(2), + domain.length(0)/2, + domain.bigEnd(1)))) +{ + static_assert(std::is_same_v || std::is_same_v); + AMREX_ALWAYS_ASSERT(m_real_domain.smallEnd() == 0 && m_real_domain.cellCentered()); + + int myproc = ParallelDescriptor::MyProc(); + int nprocs = ParallelDescriptor::NProcs(); + + // xxxxx todo: need to handle cases there are more processes than 2d cells + // xxxxx todo: 1d & 2d + + auto bax = amrex::decompose(m_real_domain, nprocs, {false,true,true}); + Vector pmx(bax.size()); + std::iota(pmx.begin(), pmx.end(), 0); + DistributionMapping dmx(std::move(pmx)); + m_rx.define(bax, dmx, 1, 0); + + { + BoxList bl = bax.boxList(); + for (auto & b : bl) { + b.setBig(0, m_spectral_domain_x.bigEnd(0)); + } + BoxArray cbax(std::move(bl)); + m_cx.define(cbax, dmx, 1, 0); + } + + // plans for x-direction + { + Box const local_box = m_rx.boxArray()[myproc]; + int n = local_box.length(0); + int howmany = local_box.length(1) * local_box.length(2); + +#if defined(AMREX_USE_CUDA) + + cufftType fwd_type = std::is_same_v ? CUFFT_R2C : CUFFT_D2Z; + cufftType bwd_type = std::is_same_v ? CUFFT_C2R : CUFFT_Z2D; + AMREX_CUFFT_SAFE_CALL + (cufftPlanMany(&m_fft_fwd_x, 1, &n, nullptr, 1, m_real_domain.length(0), + nullptr, 1, m_spectral_domain_x.length(0), + fwd_type, howmany)); + AMREX_CUFFT_SAFE_CALL(cufftSetStream(m_fft_fwd_x, Gpu::gpuStream())); + AMREX_CUFFT_SAFE_CALL + (cufftPlanMany(&m_fft_bwd_x, 1, &n, nullptr, 1, m_spectral_domain_x.length(0), + nullptr, 1, m_real_domain.length(0), + bwd_type, howmany)); + AMREX_CUFFT_SAFE_CALL(cufftSetStream(m_fft_bwd_x, Gpu::gpuStream())); + +#elif defined(AMREX_USE_HIP) + + auto prec = std::is_same_v ? rocfft_precision_single : rocfft_precision_double; + const std::size_t length = n; + AMREX_ROCFFT_SAFE_CALL + (rocfft_plan_create(&m_fft_fwd_x, rocfft_placement_notinplace, + rocfft_transform_type_real_forward, prec, 1, &length, howmany, + nullptr)); + AMREX_ROCFFT_SAFE_CALL + (rocfft_plan_create(&m_fft_bwd_x, rocfft_placement_notinplace, + rocfft_transform_type_real_inverse, prec, 1, &length, howmany, + nullptr)); + +#elif defined(AMREX_USE_SYCL) + + m_fft_fwd_x = new std::remove_pointer_t(n); + m_fft_fwd_x->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_NOT_INPLACE); + m_fft_fwd_x->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, + howmany); + m_fft_fwd_x->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, + m_real_domain.length(0)); + m_fft_fwd_x->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, + m_spectral_domain_x.length(0)); + std::array strides{0,1}; + m_fft_fwd_x->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, + strides.data()); + m_fft_fwd_x->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, + strides.data()); + m_fft_fwd_x->set_value(oneapi::mkl::dft::config_param::WORKSPACE, + oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL); + m_fft_fwd_x->commit(amrex::Gpu::Device::streamQueue()); + + m_fft_bwd_x = m_fft_fwd_x; + +#else /* FFTW */ + + auto* in = m_rx[myproc].dataPtr(); + auto* out = (FFTComplex*)(m_cx[myproc].dataPtr()); + + if constexpr (std::is_same_v) { + m_fft_fwd_x = fftwf_plan_many_dft_r2c + (1, &n, howmany, in, nullptr, 1, m_real_domain.length(0), + out, nullptr, 1, m_spectral_domain_x.length(0), + FFTW_ESTIMATE | FFTW_DESTROY_INPUT); + + m_fft_bwd_x = fftwf_plan_many_dft_c2r + (1, &n, howmany, out, nullptr, 1, m_spectral_domain_x.length(0), + in, nullptr, 1, m_real_domain.length(0), + FFTW_ESTIMATE | FFTW_DESTROY_INPUT); + } else { + m_fft_fwd_x = fftw_plan_many_dft_r2c + (1, &n, howmany, in, nullptr, 1, m_real_domain.length(0), + out, nullptr, 1, m_spectral_domain_x.length(0), + FFTW_ESTIMATE | FFTW_DESTROY_INPUT); + + m_fft_bwd_x = fftw_plan_many_dft_c2r + (1, &n, howmany, out, nullptr, 1, m_spectral_domain_x.length(0), + in, nullptr, 1, m_real_domain.length(0), + FFTW_ESTIMATE | FFTW_DESTROY_INPUT); + } +#endif + } + + auto cbay = amrex::decompose(m_spectral_domain_y, nprocs, {false,true,true}); + DistributionMapping cdmy = dmx; // xxxxx todo + m_cy.define(cbay, cdmy, 1, 0); + + std::tie(m_fft_fwd_y, m_fft_bwd_y) = make_c2c_plans(m_cy); + + // comm meta-data between x and y phases + m_cmd_x2y = std::make_unique + (m_cy.boxArray(), m_cy.DistributionMap(), m_spectral_domain_y, + m_cx.boxArray(), m_cx.DistributionMap(), IntVect(0), m_dtos_x2y); + m_cmd_y2x = std::make_unique + (m_cx.boxArray(), m_cx.DistributionMap(), m_spectral_domain_x, + m_cy.boxArray(), m_cy.DistributionMap(), IntVect(0), m_dtos_y2x); + + auto cbaz = amrex::decompose(m_spectral_domain_z, nprocs, {false,true,true}); + DistributionMapping cdmz = dmx; // xxxxx todo + m_cz.define(cbaz, cdmz, 1, 0); + + std::tie(m_fft_fwd_z, m_fft_bwd_z) = make_c2c_plans(m_cz); + + // comm meta-data between y and z phases + m_cmd_y2z = std::make_unique + (m_cz.boxArray(), m_cz.DistributionMap(), m_spectral_domain_z, + m_cy.boxArray(), m_cy.DistributionMap(), IntVect(0), m_dtos_y2z); + m_cmd_z2y = std::make_unique + (m_cy.boxArray(), m_cy.DistributionMap(), m_spectral_domain_y, + m_cz.boxArray(), m_cz.DistributionMap(), IntVect(0), m_dtos_z2y); +} + +template +template +void R2C::destroy_plan (P plan) +{ +#if defined(AMREX_USE_CUDA) + AMREX_CUFFT_SAFE_CALL(cufftDestroy(plan)); +#elif defined(AMREX_USE_HIP) + AMREX_ROCFFT_SAFE_CALL(rocfft_plan_destroy(plan)); +#elif defined(AMREX_USE_SYCL) + delete plan; +#else + if constexpr (std::is_same_v) { + fftwf_destroy_plan(plan); + } else { + fftw_destroy_plan(plan); + } +#endif +} + +template +R2C::~R2C () +{ + destroy_plan(m_fft_fwd_x); + destroy_plan(m_fft_fwd_y); + destroy_plan(m_fft_fwd_z); +#if !defined(AMREX_USE_SYCL) + // For SYCL the same plans are used for both forward and backward transforms. + destroy_plan(m_fft_bwd_x); + destroy_plan(m_fft_bwd_y); + destroy_plan(m_fft_bwd_z); +#endif +} + +#ifdef AMREX_USE_HIP +namespace detail { void hip_execute (rocfft_plan plan, void **in, void **out); } +#endif + +#ifdef AMREX_USE_SYCL +namespace detail +{ +template +void sycl_execute (P plan, TI* in, TO* out) +{ + std::size_t workspaceSize = 0; + plan->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES, + &workspaceSize); + auto* buffer = (T*)amrex::The_Arena()->alloc(workspaceSize); + plan->set_workspace(buffer); + sycl::event r; + if (std::is_same_v) { + amrex::ignore_unused(in); + if constexpr (direction == Direction::forward) { + r = oneapi::mkl::dft::compute_forward(*plan, out); + } else { + r = oneapi::mkl::dft::compute_backward(*plan, out); + } + } else { + if constexpr (direction == Direction::forward) { + r = oneapi::mkl::dft::compute_forward(*plan, in, out); + } else { + r = oneapi::mkl::dft::compute_backward(*plan, in, out); + } + } + r.wait(); + amrex::The_Arena()->free(buffer); +} +} +#endif + +template +void R2C::exec_r2c (FFTPlan plan, MF& in, cMF& out) +{ +#if defined(AMREX_USE_GPU) + auto* pin = in[ParallelDescriptor::MyProc()].dataPtr(); + auto* pout = out[ParallelDescriptor::MyProc()].dataPtr(); +#else + amrex::ignore_unused(in,out); +#endif + +#if defined(AMREX_USE_CUDA) + if constexpr (std::is_same_v) { + AMREX_CUFFT_SAFE_CALL(cufftExecR2C(plan, pin, (FFTComplex*)pout)); + } else { + AMREX_CUFFT_SAFE_CALL(cufftExecD2Z(plan, pin, (FFTComplex*)pout)); + } +#elif defined(AMREX_USE_HIP) + detail::hip_execute(plan, (void**)&pin, (void**)&pout); +#elif defined(AMREX_USE_SYCL) + detail::sycl_execute(plan, pin, (std::complex*)pout); +#else + if constexpr (std::is_same_v) { + fftwf_execute(plan); + } else { + fftw_execute(plan); + } +#endif +} + +template +void R2C::exec_c2r (FFTPlan plan, cMF& in, MF& out) +{ +#if defined(AMREX_USE_GPU) + auto* pin = in[ParallelDescriptor::MyProc()].dataPtr(); + auto* pout = out[ParallelDescriptor::MyProc()].dataPtr(); +#else + amrex::ignore_unused(in,out); +#endif + +#if defined(AMREX_USE_CUDA) + if constexpr (std::is_same_v) { + AMREX_CUFFT_SAFE_CALL(cufftExecC2R(plan, (FFTComplex*)pin, pout)); + } else { + AMREX_CUFFT_SAFE_CALL(cufftExecZ2D(plan, (FFTComplex*)pin, pout)); + } +#elif defined(AMREX_USE_HIP) + detail::hip_execute(plan, (void**)&pin, (void**)&pout); +#elif defined(AMREX_USE_SYCL) + detail::sycl_execute(plan, (std::complex*)pin, pout); +#else + if constexpr (std::is_same_v) { + fftwf_execute(plan); + } else { + fftw_execute(plan); + } +#endif +} + +template +template +void R2C::exec_c2c (FFTPlan2 plan, cMF& inout) +{ + amrex::ignore_unused(inout); +#if defined(AMREX_USE_GPU) + auto* p = inout[ParallelDescriptor::MyProc()].dataPtr(); +#endif + +#if defined(AMREX_USE_CUDA) + auto cufft_direction = (direction == Direction::forward) ? CUFFT_FORWARD : CUFFT_INVERSE; + if constexpr (std::is_same_v) { + AMREX_CUFFT_SAFE_CALL(cufftExecC2C(plan, (FFTComplex*)p, (FFTComplex*)p, + cufft_direction)); + } else { + AMREX_CUFFT_SAFE_CALL(cufftExecZ2Z(plan, (FFTComplex*)p, (FFTComplex*)p, + cufft_direction)); + } +#elif defined(AMREX_USE_HIP) + detail::hip_execute(plan, (void**)&p, (void**)&p); +#elif defined(AMREX_USE_SYCL) + detail::sycl_execute(plan, (std::complex*)p, (std::complex*)p); +#else + if constexpr (std::is_same_v) { + fftwf_execute(plan); + } else { + fftw_execute(plan); + } +#endif +} + +template +void R2C::forward_doit (MF const& inmf, Scaling /*scaling*/) +{ + m_rx.ParallelCopy(inmf, 0, 0, 1); + exec_r2c(m_fft_fwd_x, m_rx, m_cx); + + ParallelCopy(m_cy, m_cx, *m_cmd_x2y, 0, 0, 1, m_dtos_x2y); + exec_c2c(m_fft_fwd_y, m_cy); + + ParallelCopy(m_cz, m_cy, *m_cmd_y2z, 0, 0, 1, m_dtos_y2z); + exec_c2c(m_fft_fwd_z, m_cz); +} + +template +void R2C::backward_doit (MF& outmf, Scaling /*scaling*/) +{ + exec_c2c(m_fft_bwd_z, m_cz); + ParallelCopy(m_cy, m_cz, *m_cmd_z2y, 0, 0, 1, m_dtos_z2y); + + exec_c2c(m_fft_bwd_y, m_cy); + ParallelCopy(m_cx, m_cy, *m_cmd_y2x, 0, 0, 1, m_dtos_y2x); + + exec_c2r(m_fft_bwd_x, m_cx, m_rx); + outmf.ParallelCopy(m_rx, 0, 0, 1); +} + +template +std::pair::FFTPlan2, typename R2C::FFTPlan2> +R2C::make_c2c_plans (cMF& inout) +{ + auto& fab = inout[ParallelDescriptor::MyProc()]; + Box const& local_box = fab.box(); + + int n = local_box.length(0); + int howmany = local_box.length(1) * local_box.length(2); + + FFTPlan2 fwd; + FFTPlan2 bwd; + +#if defined(AMREX_USE_CUDA) + + cufftType fwd_type = std::is_same_v ? CUFFT_C2C : CUFFT_Z2Z; + cufftType bwd_type = std::is_same_v ? CUFFT_C2C : CUFFT_Z2Z; + AMREX_CUFFT_SAFE_CALL + (cufftPlanMany(&fwd, 1, &n, nullptr, 1, n, nullptr, 1, n, fwd_type, howmany)); + AMREX_CUFFT_SAFE_CALL(cufftSetStream(fwd, Gpu::gpuStream())); + AMREX_CUFFT_SAFE_CALL + (cufftPlanMany(&bwd, 1, &n, nullptr, 1, n, nullptr, 1, n, bwd_type, howmany)); + AMREX_CUFFT_SAFE_CALL(cufftSetStream(bwd, Gpu::gpuStream())); + +#elif defined(AMREX_USE_HIP) + + auto prec = std::is_same_v ? rocfft_precision_single : rocfft_precision_double; + const std::size_t length = n; + AMREX_ROCFFT_SAFE_CALL + (rocfft_plan_create(&fwd, rocfft_placement_inplace, + rocfft_transform_type_complex_forward, prec, 1, &length, howmany, + nullptr)); + AMREX_ROCFFT_SAFE_CALL + (rocfft_plan_create(&bwd, rocfft_placement_inplace, + rocfft_transform_type_complex_inverse, prec, 1, &length, howmany, + nullptr)); + +#elif defined(AMREX_USE_SYCL) + + fwd = new std::remove_pointer_t(n); + fwd->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_INPLACE); + fwd->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, + howmany); + fwd->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, n); + fwd->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, n); + std::array strides{0,1}; + fwd->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data()); + fwd->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides.data()); + fwd->set_value(oneapi::mkl::dft::config_param::WORKSPACE, + oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL); + fwd->commit(amrex::Gpu::Device::streamQueue()); + + bwd = fwd; + +#else + auto* pinout = (FFTComplex*)fab.dataPtr(); + + if constexpr (std::is_same_v) { + fwd = fftwf_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n, + pinout, nullptr, 1, n, -1, FFTW_ESTIMATE); + bwd = fftwf_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n, + pinout, nullptr, 1, n, +1, FFTW_ESTIMATE); + } else { + fwd = fftw_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n, + pinout, nullptr, 1, n, -1, FFTW_ESTIMATE); + bwd = fftw_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n, + pinout, nullptr, 1, n, +1, FFTW_ESTIMATE); + } +#endif + + return {fwd,bwd}; +} + +template +template +void R2C::post_forward_doit (F const& post_forward) +{ + auto& spectral_fab = m_cz[ParallelDescriptor::MyProc()]; + auto const& a = spectral_fab.array(); // m_cz's ordering is z,x,y + ParallelFor(spectral_fab.box(), [=] AMREX_GPU_DEVICE (int iz, int jx, int ky) + { + post_forward(jx,ky,iz,a(iz,jx,ky)); + }); +} + +} + +#endif diff --git a/Src/FFT/AMReX_FFT.cpp b/Src/FFT/AMReX_FFT.cpp new file mode 100644 index 00000000000..e6204454f75 --- /dev/null +++ b/Src/FFT/AMReX_FFT.cpp @@ -0,0 +1,32 @@ +#include + +namespace amrex::FFT +{ + +#ifdef AMREX_USE_HIP +namespace detail +{ +void hip_execute (rocfft_plan plan, void **in, void **out) +{ + rocfft_execution_info execinfo = nullptr; + AMREX_ROCFFT_SAFE_CALL(rocfft_execution_info_create(&execinfo)); + + std::size_t buffersize = 0; + AMREX_ROCFFT_SAFE_CALL(rocfft_plan_get_work_buffer_size(plan, &buffersize)); + + auto* buffer = (void*)amrex::The_Arena()->alloc(buffersize); + AMREX_ROCFFT_SAFE_CALL(rocfft_execution_info_set_work_buffer(execinfo, buffer, buffersize)); + + AMREX_ROCFFT_SAFE_CALL(rocfft_execution_info_set_stream(execinfo, amrex::Gpu::gpuStream())); + + AMREX_ROCFFT_SAFE_CALL(rocfft_execute(plan, in, out, execinfo)); + + amrex::Gpu::streamSynchronize(); + amrex::The_Arena()->free(buffer); + + AMREX_ROCFFT_SAFE_CALL(rocfft_execution_info_destroy(execinfo)); +} +} +#endif + +} diff --git a/Src/FFT/CMakeLists.txt b/Src/FFT/CMakeLists.txt new file mode 100644 index 00000000000..509092da568 --- /dev/null +++ b/Src/FFT/CMakeLists.txt @@ -0,0 +1,10 @@ +foreach(D IN LISTS AMReX_SPACEDIM) + target_include_directories(amrex_${D}d PUBLIC $) + + target_sources(amrex_${D}d + PRIVATE + AMReX_FFT.H + AMReX_FFT.cpp + ) + +endforeach() diff --git a/Src/FFT/Make.package b/Src/FFT/Make.package new file mode 100644 index 00000000000..54322566b71 --- /dev/null +++ b/Src/FFT/Make.package @@ -0,0 +1,10 @@ +ifndef AMREX_FFT_MAKE + AMREX_FFT_MAKE := 1 + +CEXE_headers += AMReX_FFT.H +CEXE_sources += AMReX_FFT.cpp + +VPATH_LOCATIONS += $(AMREX_HOME)/Src/FFT +INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/FFT + +endif diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt index 90730c24fb6..9d6afba857a 100644 --- a/Tests/CMakeLists.txt +++ b/Tests/CMakeLists.txt @@ -117,6 +117,10 @@ if (AMReX_TEST_TYPE STREQUAL "Small") add_subdirectory("LinearSolvers/ABecLaplacian_C") endif() + if (AMReX_FFT) + add_subdirectory("FFT/Poisson") + endif() + else() # # List of subdirectories to search for CMakeLists. @@ -137,6 +141,10 @@ else() list(APPEND AMREX_TESTS_SUBDIRS LinearSolvers) endif () + if (AMReX_FFT) + list(APPEND AMREX_TESTS_SUBDIRS FFT) + endif () + if (AMReX_HDF5) list(APPEND AMREX_TESTS_SUBDIRS HDF5Benchmark) endif () diff --git a/Tests/FFT/Poisson/CMakeLists.txt b/Tests/FFT/Poisson/CMakeLists.txt new file mode 100644 index 00000000000..1fb1be42ec1 --- /dev/null +++ b/Tests/FFT/Poisson/CMakeLists.txt @@ -0,0 +1,14 @@ +foreach(D IN LISTS AMReX_SPACEDIM) + if (NOT D EQUAL 3) # 3d only for now + return() + endif() + + set(_sources main.cpp) + + set(_input_files) + + setup_test(${D} _sources _input_files) + + unset(_sources) + unset(_input_files) +endforeach() diff --git a/Tests/FFT/Poisson/GNUmakefile b/Tests/FFT/Poisson/GNUmakefile new file mode 100644 index 00000000000..4c15ced1883 --- /dev/null +++ b/Tests/FFT/Poisson/GNUmakefile @@ -0,0 +1,26 @@ +AMREX_HOME := ../../.. + +DEBUG = TRUE + +DIM = 3 + +COMP = gcc + +USE_MPI = TRUE +USE_OMP = FALSE +USE_CUDA = FALSE +USE_HIP = FALSE +USE_SYCL = FALSE + +USE_FFT = TRUE + +BL_NO_FORT = TRUE + +TINY_PROFILE = FALSE + +include $(AMREX_HOME)/Tools/GNUMake/Make.defs + +include ./Make.package +include $(AMREX_HOME)/Src/Base/Make.package + +include $(AMREX_HOME)/Tools/GNUMake/Make.rules diff --git a/Tests/FFT/Poisson/Make.package b/Tests/FFT/Poisson/Make.package new file mode 100644 index 00000000000..6b4b865e8fc --- /dev/null +++ b/Tests/FFT/Poisson/Make.package @@ -0,0 +1 @@ +CEXE_sources += main.cpp diff --git a/Tests/FFT/Poisson/main.cpp b/Tests/FFT/Poisson/main.cpp new file mode 100644 index 00000000000..ee2175ffa98 --- /dev/null +++ b/Tests/FFT/Poisson/main.cpp @@ -0,0 +1,123 @@ +#include // Put this at the top for testing + +#include +#include +#include +#include + +using namespace amrex; + +int main (int argc, char* argv[]) +{ + amrex::Initialize(argc, argv); + { + BL_PROFILE("main"); + + int n_cell_x = 64; + int n_cell_y = 64; + int n_cell_z = 64; + + Real prob_lo_x = 0.; + Real prob_lo_y = 0.; + Real prob_lo_z = 0.; + Real prob_hi_x = 1.; + Real prob_hi_y = 1.; + Real prob_hi_z = 1.; + + { + ParmParse pp; + pp.query("n_cell_x", n_cell_x); + pp.query("n_cell_y", n_cell_y); + pp.query("n_cell_z", n_cell_z); + } + + Box domain(IntVect(0),IntVect(AMREX_D_DECL(n_cell_x-1,n_cell_y-1,n_cell_z-1))); + BoxArray ba = amrex::decompose(domain, ParallelDescriptor::NProcs()); + DistributionMapping dm(ba); + + Geometry geom; + { + geom.define(domain, + RealBox({AMREX_D_DECL(prob_lo_x,prob_lo_y,prob_lo_z)}, + {AMREX_D_DECL(prob_hi_x,prob_hi_y,prob_hi_z)}), + CoordSys::cartesian, {AMREX_D_DECL(1,1,1)}); + } + auto const& dx = geom.CellSizeArray(); + + MultiFab rhs(ba,dm,1,0); + MultiFab soln(ba,dm,1,0); + auto const& rhsma = rhs.arrays(); + ParallelFor(rhs, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k) + { + Real x = (i+0.5_rt) * dx[0]; + Real y = (AMREX_SPACEDIM>=2) ? (j+0.5_rt) * dx[1] : 0._rt; + Real z = (AMREX_SPACEDIM==3) ? (k+0.5_rt) * dx[2] : 0._rt; + rhsma[b](i,j,k) = std::exp(-10._rt*((x-0.5_rt)*(x-0.5_rt)*1.05_rt + + (y-0.5_rt)*(y-0.5_rt)*0.90_rt + + (z-0.5_rt)*(z-0.5_rt))); + }); + + // Shift rhs so that its sum is zero. + auto rhosum = rhs.sum(0); + rhs.plus(-rhosum/geom.Domain().d_numPts(), 0, 1); + + Real facx = 2._rt*Math::pi()/std::abs(prob_hi_x-prob_lo_x); + Real facy = 2._rt*Math::pi()/std::abs(prob_hi_y-prob_lo_y); + Real facz = 2._rt*Math::pi()/std::abs(prob_hi_z-prob_lo_z); + Real scale = 1._rt/(Real(n_cell_z)*Real(n_cell_y)*Real(n_cell_z)); + + auto post_forward = [=] AMREX_GPU_DEVICE (int i, int j, int k, + GpuComplex& spectral_data) + { + amrex::ignore_unused(j,k); + // the values in the upper-half of the spectral array in y and z + // are here interpreted as negative wavenumbers + AMREX_D_TERM(Real a = facx*i;, + Real b = (j < n_cell_y/2) ? facy*j : facy*(n_cell_y-j);, + Real c = (k < n_cell_z/2) ? facz*k : facz*(n_cell_z-k)); + Real k2 = AMREX_D_TERM(2._rt*(std::cos(a*dx[0])-1._rt)/(dx[0]*dx[0]), + +2._rt*(std::cos(b*dx[1])-1._rt)/(dx[1]*dx[1]), + +2._rt*(std::cos(c*dx[2])-1._rt)/(dx[2]*dx[2])); + if (k2 != 0._rt) { + spectral_data /= k2; + } else { + // interpretation here is that the average value of the + // solution is zero + spectral_data *= 0._rt; + } + spectral_data *= scale; + }; + + auto t0 = amrex::second(); + + FFT::R2C fft(geom.Domain()); + fft.forwardThenBackward(rhs, soln, post_forward); + + auto t1 = amrex::second(); + amrex::Print() << " AMReX FFT time: " << t1-t0 << "\n"; + + { + MultiFab phi(soln.boxArray(), soln.DistributionMap(), 1, 1); + MultiFab res(soln.boxArray(), soln.DistributionMap(), 1, 0); + MultiFab::Copy(phi, soln, 0, 0, 1, 0); + phi.FillBoundary(geom.periodicity()); + auto const& res_ma = res.arrays(); + auto const& phi_ma = phi.const_arrays(); + auto const& rhs_ma = rhs.const_arrays(); + ParallelFor(res, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k) + { + auto const& phia = phi_ma[b]; + auto lap = (phia(i-1,j,k)-2.*phia(i,j,k)+phia(i+1,j,k)) / (dx[0]*dx[0]) + + (phia(i,j-1,k)-2.*phia(i,j,k)+phia(i,j+1,k)) / (dx[1]*dx[1]) + + (phia(i,j,k-1)-2.*phia(i,j,k)+phia(i,j,k+1)) / (dx[2]*dx[2]); + res_ma[b](i,j,k) = rhs_ma[b](i,j,k) - lap; + }); + amrex::Print() << " rhs.min & max: " << rhs.min(0) << " " << rhs.max(0) << "\n" + << " res.min & max: " << res.min(0) << " " << res.max(0) << "\n"; + VisMF::Write(soln, "soln"); + VisMF::Write(rhs, "rhs"); + VisMF::Write(res, "res"); + } + } + amrex::Finalize(); +} diff --git a/Tools/CMake/AMReXConfig.cmake.in b/Tools/CMake/AMReXConfig.cmake.in index 6ae09ba0a00..dae386cc9f4 100644 --- a/Tools/CMake/AMReXConfig.cmake.in +++ b/Tools/CMake/AMReXConfig.cmake.in @@ -76,6 +76,7 @@ set(AMReX_FINTERFACES_FOUND @AMReX_FORTRAN_INTERFACES@) set(AMReX_LSOLVERS_FOUND @AMReX_LINEAR_SOLVERS@) set(AMReX_LSOLVERS_INCFLO_FOUND @AMReX_LINEAR_SOLVERS_INCFLO@) set(AMReX_LSOLVERS_EM_FOUND @AMReX_LINEAR_SOLVERS_EM@) +set(AMReX_FFT_FOUND @AMReX_FFT@) set(AMReX_AMRDATA_FOUND @AMReX_AMRDATA@) set(AMReX_PARTICLES_FOUND @AMReX_PARTICLES@) set(AMReX_P@AMReX_PARTICLES_PRECISION@_FOUND ON) @@ -133,6 +134,7 @@ set(AMReX_FINTERFACES @AMReX_FORTRAN_INTERFACES@) set(AMReX_LSOLVERS @AMReX_LINEAR_SOLVERS@) set(AMReX_LSOLVERS_INCFLO @AMReX_LINEAR_SOLVERS_INCFLO@) set(AMReX_LSOLVERS_EM @AMReX_LINEAR_SOLVERS_EM@) +set(AMReX_FFT @AMReX_FFT@) set(AMReX_AMRDATA @AMReX_AMRDATA@) set(AMReX_PARTICLES @AMReX_PARTICLES@) set(AMReX_PARTICLES_PRECISION @AMReX_PARTICLES_PRECISION@) @@ -216,6 +218,12 @@ if (@AMReX_CONDUIT@) find_dependency(Conduit REQUIRED) endif () +if (@AMReX_FFT@) + if (@AMReX_GPU_BACKEND@ STREQUAL NONE) + find_dependency(AMReXFFTW3 REQUIRED) + endif() +endif() + if (@AMReX_HDF5@) find_dependency(HDF5 REQUIRED) endif () diff --git a/Tools/CMake/AMReXOptions.cmake b/Tools/CMake/AMReXOptions.cmake index 382459e38f1..a7863f125e3 100644 --- a/Tools/CMake/AMReXOptions.cmake +++ b/Tools/CMake/AMReXOptions.cmake @@ -294,6 +294,9 @@ cmake_dependent_option( AMReX_LINEAR_SOLVERS_EM "AMReX_LINEAR_SOLVERS" OFF) print_option( AMReX_LINEAR_SOLVERS_EM ) +option( AMReX_FFT "Build AMReX FFT" OFF ) +print_option( AMReX_FFT ) + option( AMReX_AMRDATA "Build data services" OFF ) print_option( AMReX_AMRDATA ) diff --git a/Tools/CMake/AMReXThirdPartyLibraries.cmake b/Tools/CMake/AMReXThirdPartyLibraries.cmake index abe62a2ebc9..b8ad503e83f 100644 --- a/Tools/CMake/AMReXThirdPartyLibraries.cmake +++ b/Tools/CMake/AMReXThirdPartyLibraries.cmake @@ -1,3 +1,27 @@ +# +# FFT +# +if (AMReX_FFT) + if (AMReX_CUDA) + find_package(CUDAToolkit REQUIRED) + foreach(D IN LISTS AMReX_SPACEDIM) + target_link_libraries(amrex_${D}d PUBLIC CUDA::cufft) + endforeach() + elseif (AMReX_HIP) + find_package(rocfft REQUIRED) + foreach(D IN LISTS AMReX_SPACEDIM) + target_link_libraries(amrex_${D}d PUBLIC roc::rocfft) + endforeach() + elseif (AMReX_SYCL) + # nothing to do + else() + find_package(AMReXFFTW REQUIRED) + foreach(D IN LISTS AMReX_SPACEDIM) + target_link_libraries(amrex_${D}d PUBLIC AMReX::FFTW) + endforeach() + endif() +endif() + # # HDF5 -- here it would be best to create an imported target # diff --git a/Tools/CMake/FindAMReXFFTW.cmake b/Tools/CMake/FindAMReXFFTW.cmake new file mode 100644 index 00000000000..678743a08bb --- /dev/null +++ b/Tools/CMake/FindAMReXFFTW.cmake @@ -0,0 +1,51 @@ +#[=======================================================================[: +FindAMReXFFTW +------- + +Finds the FFTW library. + +Imported Targets +^^^^^^^^^^^^^^^^ + +This module provides the following imported target, if found: + +``FFTW`` + The FFTW library + +Result Variables +^^^^^^^^^^^^^^^^ + +This will define the following variables: + +``AMReXFFTW_FOUND`` + True if the hypre library has been found. +``FFTW_INCLUDES`` + Include directories needed to use FFTW. +``FFTW_LIBRARIES`` + Libraries needed to link to FFTW. + +This will also create an imported target, AMReX::FFTW. +#]=======================================================================] + +if (NOT FFTW_INCLUDES) + find_path(FFTW_INCLUDES NAMES "fftw3.h" HINTS ${FFTW_ROOT}/include) +endif() + +if (NOT FFTW_LIBRARIES) + find_library(FFTW_LIBRARY NAMES "fftw3" HINTS ${FFTW_ROOT}/lib) + find_library(FFTWF_LIBRARY NAMES "fftw3f" HINTS ${FFTW_ROOT}/lib) + set(FFTW_LIBRARIES ${FFTW_LIBRARY} ${FFTWF_LIBRARY}) +endif() + +include(FindPackageHandleStandardArgs) + +find_package_handle_standard_args(AMReXFFTW + REQUIRED_VARS FFTW_LIBRARIES FFTW_INCLUDES) + +mark_as_advanced(FFTW_LIBRARIES FFTW_INCLUDES) + +# Create imported target +add_library(AMReX::FFTW INTERFACE IMPORTED GLOBAL) +target_link_libraries(AMReX::FFTW INTERFACE ${FFTW_LIBRARIES}) +set_target_properties(AMReX::FFTW PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${FFTW_INCLUDES}") diff --git a/Tools/GNUMake/Make.defs b/Tools/GNUMake/Make.defs index f33911ed3aa..4ec9c9262ed 100644 --- a/Tools/GNUMake/Make.defs +++ b/Tools/GNUMake/Make.defs @@ -112,6 +112,12 @@ else DEBUG := FALSE endif +ifdef USE_FFT + USE_FFT := $(strip $(USE_FFT)) +else + USE_FFT := FALSE +endif + ifdef PROFILE PROFILE := $(strip $(PROFILE)) else @@ -604,6 +610,28 @@ else DebugSuffix := endif +ifeq ($(USE_FFT),TRUE) + include $(AMREX_HOME)/Src/FFT/Make.package + ifeq ($(USE_CUDA),TRUE) + LIBRARIES += -lcufft + else ifeq ($(USE_HIP),TRUE) + # Use rocFFT. ROC_PATH is defined in hip.mak + SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/rocfft/include + LIBRARY_LOCATIONS += $(ROC_PATH)/rocfft/lib + LIBRARIES += -Wl,--rpath=$(ROC_PATH)/rocfft/lib -lrocfft + else ifeq ($(USE_SYCL),TRUE) + # nothing + else + FFTW_HOME ?= NOT_SET + ifneq ($(FFTW_HOME),NOT_SET) + SYSTEM_INCLUDE_LOCATIONS += $(FFTW_HOME)/include + LIBRARY_LOCATIONS += $(FFTW_HOME)/lib + LIBRARIES += -Wl,--rpath=$(FFTW_HOME)/lib + endif + LIBRARIES += -lfftw3f -lfftw3 + endif +endif + ifeq ($(USE_PROFPARSER),TRUE) PROFILE := TRUE TRACE_PROFILE := TRUE diff --git a/Tools/GNUMake/comps/hip.mak b/Tools/GNUMake/comps/hip.mak index 87bb3e93f59..26dff7f94ff 100644 --- a/Tools/GNUMake/comps/hip.mak +++ b/Tools/GNUMake/comps/hip.mak @@ -119,8 +119,8 @@ ifeq ($(HIP_COMPILER),clang) endif # Generic HIP info - ROC_PATH=$(realpath $(dir $(HIP_PATH))) - SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include $(HIP_PATH)/include + ROC_PATH=$(realpath $(HIP_PATH)) + SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include # rocRand SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include/hiprand $(ROC_PATH)/include/rocrand diff --git a/Tools/libamrex/configure.py b/Tools/libamrex/configure.py index 8988c18b965..873b575fe4f 100755 --- a/Tools/libamrex/configure.py +++ b/Tools/libamrex/configure.py @@ -57,6 +57,10 @@ def configure(argv): help="Enable AMReX Fortran API [default=yes]", choices=["yes","no"], default="yes") + parser.add_argument("--enable-fft", + help="Enable AMReX FFT [default=no]", + choices=["yes","no"], + default="no") parser.add_argument("--enable-linear-solver", help="Enable AMReX linear solvers [default=yes]", choices=["yes","no"], @@ -151,6 +155,7 @@ def configure(argv): f.write("DEBUG = {}\n".format("TRUE" if args.debug == "yes" else "FALSE")) f.write("USE_PARTICLES = {}\n".format("FALSE" if args.enable_particle == "no" else "TRUE")) f.write("USE_FORTRAN_INTERFACE = {}\n".format("FALSE" if args.enable_fortran_api == "no" else "TRUE")) + f.write("USE_FFT = {}\n".format("TRUE" if args.enable_fft == "yes" else "FALSE")) f.write("USE_LINEAR_SOLVERS = {}\n".format("FALSE" if args.enable_linear_solver == "no" else "TRUE")) f.write("USE_LINEAR_SOLVERS_INCFLO = {}\n".format("FALSE" if args.enable_linear_solver_incflo == "no" else "TRUE")) f.write("USE_LINEAR_SOLVERS_EM = {}\n".format("FALSE" if args.enable_linear_solver_em == "no" else "TRUE"))