diff --git a/ExampleCodes/heFFTe/Poisson2/GNUmakefile b/ExampleCodes/heFFTe/Poisson2/GNUmakefile
new file mode 100644
index 0000000..3b4e52f
--- /dev/null
+++ b/ExampleCodes/heFFTe/Poisson2/GNUmakefile
@@ -0,0 +1,35 @@
+AMREX_HOME ?= ../../../../amrex
+HEFFTE_HOME ?= ../../../../heffte/build
+
+DEBUG = FALSE
+DIM = 3
+COMP = gcc
+TINY_PROFILE = FALSE
+USE_MPI = TRUE
+USE_CUDA = FALSE
+USE_HIP = FALSE
+
+BL_NO_FORT = TRUE
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.defs
+include $(AMREX_HOME)/Src/Base/Make.package
+include Make.package
+
+VPATH_LOCATIONS += $(HEFFTE_HOME)/include
+INCLUDE_LOCATIONS += $(HEFFTE_HOME)/include
+LIBRARY_LOCATIONS += $(HEFFTE_HOME)/lib
+
+libraries += -lheffte
+
+ifeq ($(USE_CUDA),TRUE)
+  libraries += -lcufft
+else ifeq ($(USE_HIP),TRUE)
+  # Use rocFFT.  ROC_PATH is defined in amrex.
+  INCLUDE_LOCATIONS += $(ROC_PATH)/rocfft/include
+  LIBRARY_LOCATIONS += $(ROC_PATH)/rocfft/lib
+  LIBRARIES += -L$(ROC_PATH)/rocfft/lib -lrocfft
+else
+  libraries += -lfftw3_mpi -lfftw3f -lfftw3
+endif
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.rules
diff --git a/ExampleCodes/heFFTe/Poisson2/Make.package b/ExampleCodes/heFFTe/Poisson2/Make.package
new file mode 100644
index 0000000..6b4b865
--- /dev/null
+++ b/ExampleCodes/heFFTe/Poisson2/Make.package
@@ -0,0 +1 @@
+CEXE_sources += main.cpp
diff --git a/ExampleCodes/heFFTe/Poisson2/inputs b/ExampleCodes/heFFTe/Poisson2/inputs
new file mode 100644
index 0000000..9b88870
--- /dev/null
+++ b/ExampleCodes/heFFTe/Poisson2/inputs
@@ -0,0 +1,11 @@
+n_cell_x = 64
+n_cell_y = 64
+n_cell_z = 64
+
+prob_lo_x = 0.
+prob_lo_y = 0.
+prob_lo_z = 0.
+
+prob_hi_x = 1.
+prob_hi_y = 1.
+prob_hi_z = 1.
diff --git a/ExampleCodes/heFFTe/Poisson2/main.cpp b/ExampleCodes/heFFTe/Poisson2/main.cpp
new file mode 100644
index 0000000..3472702
--- /dev/null
+++ b/ExampleCodes/heFFTe/Poisson2/main.cpp
@@ -0,0 +1,314 @@
+#include <heffte.h>
+
+#include <AMReX.H>
+#include <AMReX_GpuComplex.H>
+#include <AMReX_MultiFab.H>
+#include <AMReX_ParmParse.H>
+#include <AMReX_PlotFileUtil.H>
+
+using namespace amrex;
+
+static_assert(AMREX_SPACEDIM == 3);
+
+int main (int argc, char* argv[])
+{
+    amrex::Initialize(argc, argv); {
+
+    BL_PROFILE("main");
+
+    // **********************************
+    // DECLARE SIMULATION PARAMETERS
+    // **********************************
+
+    // number of cells on each side of the domain
+    int n_cell_x;
+    int n_cell_y;
+    int n_cell_z;
+
+    // physical dimensions of the domain
+    Real prob_lo_x = 0.;
+    Real prob_lo_y = 0.;
+    Real prob_lo_z = 0.;
+    Real prob_hi_x = 1.;
+    Real prob_hi_y = 1.;
+    Real prob_hi_z = 1.;
+
+    // **********************************
+    // READ PARAMETER VALUES FROM INPUTS FILE
+    // **********************************
+    {
+        // ParmParse is a way of reading inputs from the inputs file
+        // pp.get means we require the inputs file to have it
+        // pp.query means the inputs file may optionally have it - but you should supply a default value above
+
+        ParmParse pp;
+
+        pp.get("n_cell_x",n_cell_x);
+        pp.get("n_cell_y",n_cell_y);
+        pp.get("n_cell_z",n_cell_z);
+
+        pp.query("prob_lo_x",prob_lo_x);
+        pp.query("prob_lo_y",prob_lo_y);
+        pp.query("prob_lo_z",prob_lo_z);
+
+        pp.query("prob_hi_x",prob_hi_x);
+        pp.query("prob_hi_y",prob_hi_y);
+        pp.query("prob_hi_z",prob_hi_z);
+    }
+
+    // Determine the domain length in each direction
+    Real L_x = std::abs(prob_hi_x - prob_lo_x);
+    Real L_y = std::abs(prob_hi_y - prob_lo_y);
+
+    // define lower and upper indices of domain
+    IntVect dom_lo(AMREX_D_DECL(         0,          0,          0));
+    IntVect dom_hi(AMREX_D_DECL(n_cell_x-1, n_cell_y-1, n_cell_z-1));
+
+    // Make a single box that is the entire domain
+    Box domain(dom_lo, dom_hi);
+
+    // Initialize the boxarray "ba" from the single box "domain".  There are
+    // exactly nprocs boxes.  The domain decomposition is done in the x- and
+    // y-directions, but not the z-direction.
+    BoxArray ba = amrex::decompose(domain, ParallelDescriptor::NProcs(),
+                                   {AMREX_D_DECL(true,true,false)});
+
+    // How Boxes are distributed among MPI processes
+    DistributionMapping dm(ba);
+
+    // This defines the physical box size in each direction
+    RealBox real_box({ AMREX_D_DECL(prob_lo_x, prob_lo_y, prob_lo_z)},
+                     { AMREX_D_DECL(prob_hi_x, prob_hi_y, prob_hi_z)} );
+
+    // periodic in all directions
+    Array<int,AMREX_SPACEDIM> is_periodic{AMREX_D_DECL(1,1,1)};
+
+    // geometry object for real data
+    Geometry geom(domain, real_box, CoordSys::cartesian, is_periodic);
+
+    // extract dx from the geometry object
+    GpuArray<Real,AMREX_SPACEDIM> dx = geom.CellSizeArray();
+
+    MultiFab rhs(ba,dm,1,0);
+    MultiFab soln(ba,dm,1,0);
+
+    // check to make sure each MPI rank has exactly 1 box
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(rhs.local_size() == 1, "Must have one Box per MPI process");
+
+    for (MFIter mfi(rhs); mfi.isValid(); ++mfi) {
+
+        Array4<Real> const& rhs_ptr = rhs.array(mfi);
+
+        const Box& bx = mfi.fabbox();
+
+        amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+        {
+
+            // **********************************
+            // SET VALUES FOR EACH CELL
+            // **********************************
+
+            Real x = (i+0.5) * dx[0];
+            Real y = (AMREX_SPACEDIM>=2) ? (j+0.5) * dx[1] : 0.;
+            Real z = (AMREX_SPACEDIM==3) ? (k+0.5) * dx[2] : 0.;
+
+            rhs_ptr(i,j,k) = std::exp(-10.*((x-0.5)*(x-0.5)+(y-0.5)*(y-0.5)+(z-0.5)*(z-0.5)));
+
+        });
+    }
+
+    // Shift rhs so that its sum is zero.
+    auto rhosum = rhs.sum(0);
+    rhs.plus(-rhosum/geom.Domain().d_numPts(), 0, 1);
+
+    // since there is 1 MPI rank per box, here each MPI rank obtains its local box and the associated boxid
+    Box local_box;
+    int local_boxid;
+    {
+        for (int i = 0; i < ba.size(); ++i) {
+            Box b = ba[i];
+            // each MPI rank has its own local_box Box and local_boxid ID
+            if (ParallelDescriptor::MyProc() == dm[i]) {
+                local_box = b;
+                local_boxid = i;
+            }
+        }
+    }
+
+    // now each MPI rank works on its own box
+    // for real->complex fft's, the fft is stored in an (nx/2+1) x ny x nz dataset
+
+    // start by coarsening each box by 2 in the x-direction
+    Box c_local_box = amrex::coarsen(local_box, IntVect(AMREX_D_DECL(2,1,1)));
+
+    // if the coarsened box's high-x index is even, we shrink the size by 1 in x
+    // this avoids overlap between coarsened boxes
+    if (c_local_box.bigEnd(0) * 2 == local_box.bigEnd(0)) {
+        c_local_box.setBig(0,c_local_box.bigEnd(0)-1);
+    }
+    // for any boxes that touch the hi-x domain we
+    // increase the size of boxes by 1 in x
+    // this makes the overall fft dataset have size (Nx/2+1 x Ny x Nz)
+    if (local_box.bigEnd(0) == geom.Domain().bigEnd(0)) {
+        c_local_box.growHi(0,1);
+    }
+
+    // each MPI rank gets storage for its piece of the fft
+    BaseFab<GpuComplex<Real> > spectral_field(c_local_box, 1, The_Device_Arena());
+
+    // create real->complex fft objects with the appropriate backend and data about
+    // the domain size and its local box size
+    using fft_r2c_t =
+#ifdef AMREX_USE_CUDA
+        heffte::fft2d_r2c<heffte::backend::cufft>;
+#elif defined(AMREX_USE_HIP)
+        heffte::fft2d_r2c<heffte::backend::rocfft>;
+#else
+        heffte::fft2d_r2c<heffte::backend::fftw>;
+#endif
+
+    auto lo = amrex::lbound(local_box);
+    auto hi = amrex::ubound(local_box);
+    auto len = amrex::length(local_box);
+    auto clo = amrex::lbound(c_local_box);
+    auto chi = amrex::ubound(c_local_box);
+    auto clen = amrex::length(c_local_box);
+
+    auto fft = std::make_unique<fft_r2c_t>
+        (heffte::box3d<int>({ lo.x,  lo.y, 0},
+                            { hi.x,  hi.y, 0}),
+         heffte::box3d<int>({clo.x, clo.y, 0},
+                            {chi.x, chi.y, 0}),
+         0, ParallelDescriptor::Communicator());
+    Real start_step = static_cast<Real>(ParallelDescriptor::second());
+    using heffte_complex = typename heffte::fft_output<Real>::type;
+    heffte_complex* spectral_data = (heffte_complex*) spectral_field.dataPtr();
+
+    int batch_size = n_cell_z;
+    Gpu::DeviceVector<heffte_complex> workspace(fft->size_workspace()*batch_size);
+
+    { BL_PROFILE("HEFFTE-total");
+    {
+        BL_PROFILE("ForwardTransform");
+        fft->forward(batch_size, rhs[local_boxid].dataPtr(), spectral_data,
+                     workspace.data());
+    }
+
+    // Now solve in spectral space: for each (i,j) wavenumber pair we have an
+    // independent tridiagonal system in the z-direction
+    Array4< GpuComplex<Real> > spectral = spectral_field.array();
+
+    FArrayBox tridiag_workspace(c_local_box,4);
+    auto const& ald = tridiag_workspace.array(0);
+    auto const& bd = tridiag_workspace.array(1);
+    auto const& cud = tridiag_workspace.array(2);
+    auto const& scratch = tridiag_workspace.array(3);
+
+    Gpu::DeviceVector<Real> delzv(n_cell_z, dx[2]);
+    auto const* delz = delzv.data();
+
+    auto xybox = amrex::makeSlab(c_local_box, 2, 0);
+    ParallelFor(xybox, [=] AMREX_GPU_DEVICE(int i, int j, int)
+    {
+        Real a = 2.*M_PI*i / L_x;
+        Real b = 2.*M_PI*j / L_y;
+
+        // the values in the upper-half of the spectral array in y are here interpreted as negative wavenumbers
+        if (j >= n_cell_y/2) b = 2.*M_PI*(n_cell_y-j) / L_y;
+
+        Real k2 = 2*(std::cos(a*dx[0])-1.)/(dx[0]*dx[0]) + 2*(std::cos(b*dx[1])-1.)/(dx[1]*dx[1]);
+
+        // Tridiagonal solve in z with homogeneous Neumann boundaries
+        // set up the coefficients: ald = lower diagonal, bd = diagonal, cud = upper diagonal
+        for (int k = 0; k < n_cell_z; k++) {
+            if (k == 0) {
+                ald(i,j,k) = 0.;
+                cud(i,j,k) = 2.0 / (delz[k]*(delz[k]+delz[k+1]));
+                bd(i,j,k) = k2 - ald(i,j,k) - cud(i,j,k);
+            } else if (k == n_cell_z-1) {
+                ald(i,j,k) = 2.0 / (delz[k]*(delz[k]+delz[k-1]));
+                cud(i,j,k) = 0.;
+                bd(i,j,k) = k2 - ald(i,j,k) - cud(i,j,k);
+                // the (i,j) = (0,0) mode is singular for a mean-zero rhs;
+                // modify the last pivot to select a particular solution
+                if (i == 0 && j == 0) {
+                    bd(i,j,k) *= 2.0;
+                }
+            } else {
+                ald(i,j,k) = 2.0 / (delz[k]*(delz[k]+delz[k-1]));
+                cud(i,j,k) = 2.0 / (delz[k]*(delz[k]+delz[k+1]));
+                bd(i,j,k) = k2 - ald(i,j,k) - cud(i,j,k);
+            }
+        }
+
+        // forward elimination (Thomas algorithm)
+        scratch(i,j,0) = cud(i,j,0) / bd(i,j,0);
+        spectral(i,j,0) = spectral(i,j,0) / bd(i,j,0);
+        for (int k = 1; k < n_cell_z; k++) {
+            if (k < n_cell_z-1) {
+                scratch(i,j,k) = cud(i,j,k) / (bd(i,j,k) - ald(i,j,k)*scratch(i,j,k-1));
+            }
+            spectral(i,j,k) = (spectral(i,j,k) - ald(i,j,k)*spectral(i,j,k-1))
+                / (bd(i,j,k) - ald(i,j,k)*scratch(i,j,k-1));
+        }
+
+        // back substitution
+        for (int k = n_cell_z-2; k >= 0; k--) {
+            spectral(i,j,k) -= scratch(i,j,k) * spectral(i,j,k + 1);
+        }
+    });
+    Gpu::streamSynchronize();
+
+    {
+        BL_PROFILE("BackwardTransform");
+        fft->backward(batch_size, spectral_data, soln[local_boxid].dataPtr(),
+                      heffte::scale::full);
+    }
+    }
+
+    Real end_step = static_cast<Real>(ParallelDescriptor::second());
+    // amrex::Print() << "TIME IN SOLVE " << end_step - start_step << std::endl;
+
+    // storage for variables to write to plotfile
+    MultiFab plotfile(ba, dm, 2, 0);
+
+    // copy rhs and soln into plotfile
+    MultiFab::Copy(plotfile, rhs , 0, 0, 1, 0);
+    MultiFab::Copy(plotfile, soln, 0, 1, 1, 0);
+
+    // time and step are dummy variables required to WriteSingleLevelPlotfile
+    Real time = 0.;
+    int step = 0;
+
+    // arguments
+    // 1: name of plotfile
+    // 2: MultiFab containing data to plot
+    // 3: variables names
+    // 4: geometry object
+    // 5: "time" of plotfile; not relevant in this example
+    // 6: "time step" of plotfile; not relevant in this example
+    WriteSingleLevelPlotfile("plt", plotfile, {"rhs", "soln"}, geom, time, step);
+
+    {
+        MultiFab phi(soln.boxArray(), soln.DistributionMap(), 1, 1);
+        MultiFab res(soln.boxArray(), soln.DistributionMap(), 1, 0);
+        MultiFab::Copy(phi, soln, 0, 0, 1, 0);
+        phi.FillBoundary(geom.periodicity());
+        auto const& res_ma = res.arrays();
+        auto const& phi_ma = phi.const_arrays();
+        auto const& rhs_ma = rhs.const_arrays();
+        ParallelFor(res, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k)
+        {
+            auto const& phia = phi_ma[b];
+            auto lap = (phia(i-1,j,k)-2.*phia(i,j,k)+phia(i+1,j,k)) / (dx[0]*dx[0])
+                +      (phia(i,j-1,k)-2.*phia(i,j,k)+phia(i,j+1,k)) / (dx[1]*dx[1]);
+            if (k == 0) {
+                lap += (-phia(i,j,k)+phia(i,j,k+1)) / (dx[2]*dx[2]);
+            } else if (k == n_cell_z-1) {
+                lap += (phia(i,j,k-1)-phia(i,j,k)) / (dx[2]*dx[2]);
+            } else {
+                lap += (phia(i,j,k-1)-2.*phia(i,j,k)+phia(i,j,k+1)) / (dx[2]*dx[2]);
+            }
+            res_ma[b](i,j,k) = rhs_ma[b](i,j,k) - lap;
+        });
+        amrex::Print() << " rhs.min & max: " << rhs.min(0) << " " << rhs.max(0) << "\n"
<< " res.min & max: " << res.min(0) << " " << res.max(0) << "\n"; + } + + } amrex::Finalize(); +}