diff --git a/src/CartLatticeAccess.hpp.Rt b/src/CartLatticeAccess.hpp.Rt index 5a72e1c24..0b1f2d071 100644 --- a/src/CartLatticeAccess.hpp.Rt +++ b/src/CartLatticeAccess.hpp.Rt @@ -262,21 +262,23 @@ class CartLatticeAccess { const z_t z; const flag_t nt; const range_int<0,1,0,1> nx, ny, nz; + const CartLatticeContainer* container; public: - CudaDeviceFunction CartLatticeAccess(const int& x_, const int& y_, const int& z_) : + CudaDeviceFunction CartLatticeAccess(const int& x_, const int& y_, const int& z_, const CartLatticeContainer& container_) : x(x_),y(y_),z(z_), - nt(fetchNodeType(constContainer, x, y, z)), - nx(constContainer.nx),ny(constContainer.ny),nz(constContainer.nz) + nt(fetchNodeType(container_, x, y, z)), + nx(container_.nx),ny(container_.ny),nz(container_.nz), + container(&container_) { } - CudaDeviceFunction real_t getX() const { return constContainer.px + x; } - CudaDeviceFunction real_t getY() const { return constContainer.py + y; } - CudaDeviceFunction real_t getZ() const { return constContainer.pz + z; } + CudaDeviceFunction real_t getX() const { return container->px + x; } + CudaDeviceFunction real_t getY() const { return container->py + y; } + CudaDeviceFunction real_t getZ() const { return container->pz + z; } CudaDeviceFunction flag_t getNodeType() const { return nt; } CudaDeviceFunction inline cut_t getQ(const int& d) const { - if (constContainer.Q == NULL) return NO_CUT; + if (!container->Q) return NO_CUT; size_t i = ((((size_t)d)*nz+z)*ny+y)*nx+x; - return constContainer.Q[i]; + return container->Q[i]; } @@ -299,7 +301,7 @@ template CudaDeviceFunction real_t CartLatticeAccess< x_t, y_t, z_t >::load_ (const dx_t & dx, const dy_t & dy, const dz_t & dz) const { storage_t ret; in"); p = PV(c("x","y","z")); dp = PV(c("dx","dy","dz")); if (f$minx == f$maxx) dp[1] = f$minx @@ -318,7 +320,7 @@ template CudaDeviceFunction void CartLatticeAccess< x_t, y_t, z_t >::pop(N & node) const { storage_t val; in",pocket=TRUE); for (d in rows(Density)[s$load.densities]) { f = rows(Fields)[[match(d$field, Fields$name)]] dp = c(-d$dx, -d$dy, -d$dz) @@ -332,7 +334,7 @@ template CudaDeviceFunction void CartLatticeAccess< x_t, y_t, z_t >::push(N & node) const { storage_t val; out",pocket=TRUE); for (f in rows(Fields)[s$save.fields]) { ?> val = ; template CudaDeviceFunction void CartLatticeAccess< x_t, y_t, z_t >::push_adj(N & node) const { adjout",pocket=TRUE); for (d in rows(Density)[s$load.densities]) { f = rows(Fields)[[match(d$field, Fields$name)]] dp = c(-d$dx, -d$dy, -d$dz) @@ -357,7 +359,7 @@ template < class x_t, class y_t, class z_t > template CudaDeviceFunction void CartLatticeAccess< x_t, y_t, z_t >::pop_adj(N & node) const { adjin",pocket=TRUE); for (f in rows(Fields)[s$save.fields]) { val = paste("node",f$adjoint_name,sep=".") con = field.access( d=val, f, p, pattern="put", access="getsum", MContext=con) diff --git a/src/CartLatticeExecutor.hpp.Rt b/src/CartLatticeExecutor.hpp.Rt deleted file mode 100644 index bfe9d74be..000000000 --- a/src/CartLatticeExecutor.hpp.Rt +++ /dev/null @@ -1,327 +0,0 @@ - -/** \file CartLatticeExecutor.hpp -*/ - -#include "CartLatticeExecutor.h" -#include "CartLatticeAccess.hpp" -#include "Node.hpp" -#include "GetThreads.h" - -using CartLatticeAccessAll = AccessComposite< CartLatticeAccess< range_int<0,0,-1,1>, range_int<0,0,-1,1>, range_int<0,0,-1,1> > >; -using CartLatticeAccessInterior = AccessComposite< CartLatticeAccess< - range_int< ,0,,1>, - range_int< ,0,,1>, - range_int< ,0,,1> > >; - -template -static void CopyObjToConst(const T& host_obj, T& const_symbol) { - CudaCopyToConstant("copy to GPU constant memory", const_symbol, &host_obj, sizeof(T)); -} - -void CartLatticeExecutor::CopyToConst() const { - CopyObjToConst(container, constContainer); - CopyObjToConst(data, constData); -} - -/// Main Kernel -/** - iterates over all elements and runs them with RunElement function. - constContainer.dx/dy is to calculate only internal nodes -*/ -template < eOperationType I, eCalculateGlobals G, eStage S > -struct InteriorExecutor { - CudaDeviceFunction void Execute() const { - using LA = CartLatticeAccessInterior; - using N = Node; - int x_ = CudaThread.x + CudaBlock.z*CudaNumberOfThreads.x + ; - int y_ = CudaThread.y + CudaBlock.x*CudaNumberOfThreads.y + ; - int z_ = CudaBlock.y + ; - if (y_ < constContainer.ny - ) { -#ifndef GRID3D - for (; x_ < constContainer.nx; x_ += CudaNumberOfThreads.x) -#endif - { - LA acc(x_,y_,z_); - N now(acc); - now.RunElement(); - } - } - } -}; - -/// Border Kernel -/** - iterates over border elements and runs them with RunElement function -*/ -template < eOperationType I, eCalculateGlobals G, eStage S > -struct BorderExecutor { - CudaDeviceFunction void Execute() const { - using LA = CartLatticeAccessAll; - using N = Node; - int x_ = CudaThread.x + CudaBlock.z*CudaNumberOfThreads.x + ; - int a_ = CudaThread.y + CudaBlock.x*CudaNumberOfThreads.y; - int y_,z_; - switch (CudaBlock.y) { BorderMargin$min[2]) for (y in BorderMargin$min[2]:BorderMargin$max[2]) if (y != 0) { ?> - case : - z_ = a_; 0) { ?> - y_ = ; - y_ = constContainer.ny - ; - if (z_ >= constContainer.nz) return; - break; BorderMargin$min[3]) for (z in BorderMargin$min[3]:BorderMargin$max[3]) if (z != 0) { ?> - case : - y_ = a_ + ; 0) { ?> - z_ = ; - z_ = constContainer.nz - ; - if (y_ >= constContainer.ny - ) return; - break; - default: - assert(CudaThread.y < ); - y_ = 0;z_ = 0; - break; - } - -#ifndef GRID3D - for (; x_ < constContainer.nx; x_ += CudaNumberOfThreads.x) -#endif - { - LA acc(x_,y_,z_); - N now(acc); - now.RunElement(); - } - } -}; - -template -CudaGlobalFunction void Kernel() { - E e; - e.Execute(); -} - -/// Run the interior kernel -/** - Dispatch the kernel running RunElement on all interior elements of the lattice - \param stream CUDA Stream to which add the kernel run -*/ -template -void RunInteriorImpl(const CartLatticeContainer& container, CudaStream_t stream) { - const int nx = container.nx; - const int ny = container.ny; - const int nz = container.nz; - dim3 thr = ThreadNumber< EX >::threads(); - dim3 blx; -#ifdef GRID3D - blx.z = nx/thr.x; -#else - blx.z = 1; -#endif - int totx = ny - ; - blx.x = ceiling_div(totx, thr.y); - int toty = nz - ; - blx.y = toty; - CudaKernelRunNoWait(Kernel< EX >, blx, thr, stream); -}; - -/// Run the border kernel -/** - Dispatch the kernel running RunElement on all border elements of the Lattice - \param stream CUDA Stream to which add the kernel run -*/ -template -void RunBorderImpl(const CartLatticeContainer& container, CudaStream_t stream) { - const int nx = container.nx; - const int ny = container.ny; - const int nz = container.nz; - 0) { -?> - dim3 thr = ThreadNumber< EX >::threads(); - dim3 blx; -#ifdef GRID3D - blx.z = nx/thr.x; -#else - blx.z = 1; -#endif - int totx = ; - blx.x = ceiling_div(totx, thr.y); - blx.y = ; - CudaKernelRunNoWait(Kernel< EX >, blx, thr, stream); - -}; - -template < eOperationType I, eCalculateGlobals G, eStage S > -void CartLatticeExecutor::RunInterior(CudaStream_t stream) const { - RunInteriorImpl< InteriorExecutor< I, G, S > >(container, stream); -} - -template < eOperationType I, eCalculateGlobals G, eStage S > -void CartLatticeExecutor::RunBorder(CudaStream_t stream) const { - RunBorderImpl< BorderExecutor< I, G, S > >(container, stream); -} - -/// Old function for graphics output -/** - calculates the color for one node -*/ -template < eOperationType I, eCalculateGlobals G, eStage S > -CudaDeviceFunction void CartLatticeExecutor::NodeToColor(int x, int y, int z, uchar4 *optr) { - using LA = CartLatticeAccessAll; - using N = Node; - - int offset = x + y * constContainer.nx; - float l = 0.0; float w = 0.0; - int r = 0,g = 0,b = 0; - if (x < 0 || y < 0 || z < 0) return; - if (x >= constContainer.nx || y >= constContainer.ny || z >= constContainer.nz) return; - - { - LA acc(x,y,z); - N now(acc); - acc.pop(now); - float2 v = now.Color(); - l = v.x; - w = v.y; - } - - if (ISFINITE(l)) { - l = l * 111; - if ( (l <-111)) {r = 255; g = 255; b = 255; } - if ((l >= -111) && (l < -11)) {r = 255*(-l-11)/100; g = 255; b = 255; } - if ((l >= -11) && (l < -1)) {r = 0; g = (255*(-l-1))/10; b = 255; } - if ((l >= -1) && (l < 0)) {r = 0; g = 0; b = 255*(-l); } - if ((l >= 0) && (l < 1)) {r = 255*l; g = 0; b = 0; } - if ((l >= 1) && (l < 11)) {r = 255; g = 255*(l-1)/10; b = 0; } - if ((l >= 11) && (l < 111)) {r = 255; g = 255; b = 255*(l-11)/100; } - if ((l >= 111) ) {r = 255; g = 255; b = 255; } - r=r*w; - g=g*w + (1-w)*255; - b=b*w; - } else { - r = 255; - g = 0; - b = 255; - } - optr[offset].x = r; - optr[offset].y = g; - optr[offset].z = b; - optr[offset].w = 255; -} - -/// Kernel for graphics output -CudaGlobalFunction void ColorKernel(uchar4 *optr, int z) -{ - CartLatticeExecutor::NodeToColor< Primal, NoGlobals, Get >( - CudaThread.x+CudaBlock.x*CudaNumberOfThreads.x, - CudaBlock.y, - z, - optr); -} - -/// Runs kernel for rendering graphics -/** - Runs the kernel for rendering graphics - \param optr 4-component graphics buffer -*/ -void CartLatticeExecutor::Color(uchar4 *optr) { - CopyToConst(); - CudaKernelRun(ColorKernel, dim3(floor(container.nx / X_BLOCK), container.ny, 1), dim3(X_BLOCK), optr, container.nz / 2); -} - -// Quantity getters - -/// Calculate quantity [] kernel -/** - Kernel to calculate quantity () over a region - \param r Lattice region to calculate the quantity - \param tab buffor to put the calculated result - \param scale Scale to rescale the result (for units) -*/ -CudaGlobalFunction void get(lbRegion r, * tab, real_t scale) -{ - using LA = CartLatticeAccessAll; - int x = CudaBlock.x+r.dx; - int y = CudaBlock.y+r.dy; - int z = CudaBlock.z+r.dz; - LA acc(x,y,z); - Node< LA, Adjoint, NoGlobals, Get > now(acc); - Node< LA, Primal, NoGlobals, Get > now(acc); - w; -// if (now.NodeType) { - acc.pop(now); - acc.pop_adj(now); - w = now.get(); - w. *= scale; - w *= scale; -// } else { -// w. = nan(""); -// w = nan(""); -// } - tab[r.offset(x,y,z)] = w; -} - - - -/// Init Settings with 0 in GPU constant memory -void initSettings() { - real_t val = 0; - - CudaCopyToConstant("", , &val, sizeof(real_t)); -} - -/// Set Setting in GPU constant memory -/** - Sets a Setting in the constant memory of GPU - \param i Index of the Setting - \param tmp value of the Setting -*/ -void setConstSetting(int i, real_t tmp) { - switch (i) { - - case : - CudaCopyToConstant("", , &tmp, sizeof(real_t)); - break; - } -} diff --git a/src/CartLatticeExecutor.h.Rt b/src/CartLatticeLauncher.h.Rt similarity index 54% rename from src/CartLatticeExecutor.h.Rt rename to src/CartLatticeLauncher.h.Rt index 213186dad..1b4eb99c7 100644 --- a/src/CartLatticeExecutor.h.Rt +++ b/src/CartLatticeLauncher.h.Rt @@ -2,7 +2,7 @@ source("conf.R") c_header(); ?> -/** \file CartLatticeExecutor.h +/** \file CartLatticeLauncher.h */ #ifndef CARTLATTICEEXECUTOR_H @@ -11,7 +11,7 @@ c_header(); #include "CartLatticeContainer.h" #include "LatticeData.hpp" -struct CartLatticeExecutor { +struct CartLatticeLauncher { CartLatticeContainer container; LatticeData data; @@ -19,15 +19,7 @@ struct CartLatticeExecutor { void RunBorder(CudaStream_t stream) const; template < eOperationType I, eCalculateGlobals G, eStage S > void RunInterior(CudaStream_t stream) const; - void Color(uchar4 *out_ptr); - - void CopyToConst() const; - - template < eOperationType I, eCalculateGlobals G, eStage S > - static CudaDeviceFunction void NodeToColor(int x, int y, int z, uchar4 *optr); -}; - -CudaGlobalFunction void ColorKernel(uchar4 *optr, int z); + void Color(uchar4 *out_ptr) const; -CudaGlobalFunction void get(lbRegion r, * tab, real_t scale); -CudaGlobalFunction void get_(lbRegion , *, int); + void GetQuantity(lbRegion r, * tab, real_t scale) const; + void SampleQuantity(lbRegion r, * tab, real_t scale) const; +}; #endif // CARTLATTICEEXECUTOR_H diff --git a/src/CartLatticeLauncher.hpp.Rt b/src/CartLatticeLauncher.hpp.Rt new file mode 100644 index 000000000..0fbad39ff --- /dev/null +++ b/src/CartLatticeLauncher.hpp.Rt @@ -0,0 +1,340 @@ + +/** \file CartLatticeLauncher.hpp +*/ + +#include "CartLatticeLauncher.h" +#include "CartLatticeAccess.hpp" +#include "Node.hpp" +#include "GetThreads.h" + +using CartLatticeAccessAll = AccessComposite< CartLatticeAccess< range_int<0,0,-1,1>, range_int<0,0,-1,1>, range_int<0,0,-1,1> > >; +using CartLatticeAccessInterior = AccessComposite< CartLatticeAccess< + range_int< ,0,,1>, + range_int< ,0,,1>, + range_int< ,0,,1> > >; + +/// Main Kernel +/** + iterates over all elements and runs them with RunElement function. + container.dx/dy is to calculate only internal nodes +*/ +template < eOperationType I, eCalculateGlobals G, eStage S > +class CartInteriorExecutor { + CartLatticeContainer container; + LatticeData data; + +public: + CartInteriorExecutor(const CartLatticeContainer& container_, const LatticeData& data_) : container(container_), data(data_) {} + + CudaHostFunction LaunchParams ComputeLaunchParams(dim3 thr) const { + const int nx = container.nx; + const int ny = container.ny; + const int nz = container.nz; + dim3 blx; +#ifdef GRID3D + blx.z = nx/thr.x; +#else + blx.z = 1; +#endif + int totx = ny - ; + blx.x = ceiling_div(totx, thr.y); + int toty = nz - ; + blx.y = toty; + return LaunchParams{blx, thr}; + } + + CudaDeviceFunction void Execute() const { + using LA = CartLatticeAccessInterior; + using N = Node; + int x_ = CudaThread.x + CudaBlock.z*CudaNumberOfThreads.x + ; + int y_ = CudaThread.y + CudaBlock.x*CudaNumberOfThreads.y + ; + int z_ = CudaBlock.y + ; + if (y_ < container.ny - ) { +#ifndef GRID3D + for (; x_ < container.nx; x_ += CudaNumberOfThreads.x) +#endif + { + LA acc(x_, y_, z_, container); + N now(acc, data); + now.RunElement(); + } + } + } +}; + +/// Border Kernel +/** + iterates over border elements and runs them with RunElement function +*/ +template < eOperationType I, eCalculateGlobals G, eStage S > +class CartBorderExecutor { + CartLatticeContainer container; + LatticeData data; + +public: + CartBorderExecutor(const CartLatticeContainer& container_, const LatticeData& data_) : container(container_), data(data_) {} + + CudaHostFunction LaunchParams ComputeLaunchParams(dim3 thr) const { + const int nx = container.nx; + const int ny = container.ny; + const int nz = container.nz; + 0) { +?> + dim3 blx; +#ifdef GRID3D + blx.z = nx/thr.x; +#else + blx.z = 1; +#endif + int totx = ; + blx.x = ceiling_div(totx, thr.y); + blx.y = ; + return LaunchParams{blx, thr}; + + } + + CudaDeviceFunction void Execute() const { + int x_ = CudaThread.x + CudaBlock.z*CudaNumberOfThreads.x + ; + int a_ = CudaThread.y + CudaBlock.x*CudaNumberOfThreads.y; + int y_,z_; + switch (CudaBlock.y) { BorderMargin$min[2]) for (y in BorderMargin$min[2]:BorderMargin$max[2]) if (y != 0) { ?> + case : + z_ = a_; 0) { ?> + y_ = ; + y_ = container.ny - ; + if (z_ >= container.nz) return; + break; BorderMargin$min[3]) for (z in BorderMargin$min[3]:BorderMargin$max[3]) if (z != 0) { ?> + case : + y_ = a_ + ; 0) { ?> + z_ = ; + z_ = container.nz - ; + if (y_ >= container.ny - ) return; + break; + default: + assert(CudaThread.y < ); + y_ = 0;z_ = 0; + break; + } + + using LA = CartLatticeAccessAll; + using N = Node; + +#ifndef GRID3D + for (; x_ < container.nx; x_ += CudaNumberOfThreads.x) +#endif + { + LA acc(x_, y_, z_, container); + N now(acc, data); + now.RunElement(); + } + } +}; + +template < eOperationType I, eCalculateGlobals G, eStage S > +void CartLatticeLauncher::RunInterior(CudaStream_t stream) const { + const CartInteriorExecutor< I, G, S > executor(container, data); + LaunchExecutorAsync(executor, stream); +} + +template < eOperationType I, eCalculateGlobals G, eStage S > +void CartLatticeLauncher::RunBorder(CudaStream_t stream) const { + const CartBorderExecutor< I, G, S > executor(container, data); + LaunchExecutorAsync(executor, stream); +} + + +template < eOperationType I, eCalculateGlobals G, eStage S > +class CartColorExecutor{ + CartLatticeContainer container; + LatticeData data; + + uchar4* optr; + int z; + +public: + CartColorExecutor(const CartLatticeContainer& container_, const LatticeData& data_, uchar4* optr_, int z_) + : container(container_), data(data_), optr(optr_), z(z_) + {} + + CudaHostFunction LaunchParams ComputeLaunchParams(dim3) const { + return LaunchParams{dim3(floor(container.nx / X_BLOCK), container.ny, 1), dim3(X_BLOCK)}; + } + + CudaDeviceFunction void Execute() const { + using LA = CartLatticeAccessAll; + using N = Node; + + const int x = CudaThread.x + CudaBlock.x * CudaNumberOfThreads.x; + const int y = CudaBlock.y; + + int offset = x + y * container.nx; + float l = 0.; float w = 0.; + int r = 0, g = 0, b = 0; + if (x < 0 || y < 0 || z < 0 || x >= container.nx || y >= container.ny || z >= container.nz) + return; + + LA acc(x, y, z, container); + N now(acc, data); + acc.pop(now); + float2 v = now.Color(); + l = v.x; + w = v.y; + + if (ISFINITE(l)) { + l = l * 111; + if ( (l <-111)) {r = 255; g = 255; b = 255; } + if ((l >= -111) && (l < -11)) {r = 255*(-l-11)/100; g = 255; b = 255; } + if ((l >= -11) && (l < -1)) {r = 0; g = (255*(-l-1))/10; b = 255; } + if ((l >= -1) && (l < 0)) {r = 0; g = 0; b = 255*(-l); } + if ((l >= 0) && (l < 1)) {r = 255*l; g = 0; b = 0; } + if ((l >= 1) && (l < 11)) {r = 255; g = 255*(l-1)/10; b = 0; } + if ((l >= 11) && (l < 111)) {r = 255; g = 255; b = 255*(l-11)/100; } + if ((l >= 111) ) {r = 255; g = 255; b = 255; } + r=r*w; + g=g*w + (1-w)*255; + b=b*w; + } else { + r = 255; + g = 0; + b = 255; + } + + optr[offset].x = r; + optr[offset].y = g; + optr[offset].z = b; + optr[offset].w = 255; + } +}; + +/// Runs kernel for rendering graphics +/** + Runs the kernel for rendering graphics + \param optr 4-component graphics buffer +*/ +void CartLatticeLauncher::Color(uchar4 *optr) const { + const CartColorExecutor< Primal, NoGlobals, Get > executor(container, data, optr, container.nz / 2); + LaunchExecutor(executor); +} + +// Quantity getters + + +class GetQuantityExecutorBase { +protected: + CartLatticeContainer container; + LatticeData data; + + lbRegion small; + * buf; + real_t scale; + +public: + GetQuantityExecutorBase(const CartLatticeContainer& container_, const LatticeData& data_, const lbRegion& small_, * buf_, real_t scale_) + : container(container_), data(data_), small(small_), buf(buf_), scale(scale_) + {} + + CudaDeviceFunction void Execute() const { + using LA = CartLatticeAccessAll; + int x = CudaBlock.x + small.dx; + int y = CudaBlock.y + small.dy; + int z = CudaBlock.z + small.dz; + LA acc(x, y, z, container); + Node< LA, Adjoint, NoGlobals, Get > now(acc, data); + Node< LA, Primal, NoGlobals, Get > now(acc, data); + w; +// if (now.NodeType) { + acc.pop(now); + acc.pop_adj(now); + w = now.get(); + w. *= scale; + w *= scale; +// } else { +// w. = nan(""); +// w = nan(""); +// } + buf[small.offset(x,y,z)] = w; + } +}; + +class GetQuantityExecutor : public GetQuantityExecutorBase { +public: + template + GetQuantityExecutor(Args&&... args) : GetQuantityExecutorBase(std::forward(args)...) {} + + CudaHostFunction LaunchParams ComputeLaunchParams(dim3) const { + return LaunchParams{dim3(small.nx, small.ny, small.nz), dim3(1)}; + } +}; + +class GetQuantitySampleExecutor : public GetQuantityExecutorBase { +public: + template + GetQuantitySampleExecutor(Args&&... args) : GetQuantityExecutorBase(std::forward(args)...) {} + + CudaHostFunction LaunchParams ComputeLaunchParams(dim3) const { + return LaunchParams{dim3(small.nx, small.ny), dim3(1)}; + } +}; + +/// Calculate quantity [] kernel +/** + Kernel to calculate quantity () over a region + \param r Lattice region to calculate the quantity + \param tab buffer to put the calculated result + \param scale Scale to rescale the result (for units) +*/ +void CartLatticeLauncher::GetQuantity(lbRegion r, * tab, real_t scale) const { + const GetQuantityExecutor executor(container, data, r, tab, scale); + LaunchExecutor(executor); +} + +/// Sample quantity [] kernel +/** + Kernel to sample quantity () over a region + \param r Lattice region to sample the quantity + \param tab buffer to put the sampled result + \param scale Scale to rescale the result (for units) +*/ +void CartLatticeLauncher::SampleQuantity(lbRegion r, * tab, real_t scale) const { + const GetQuantitySampleExecutor executor(container, data, r, tab, scale); + LaunchExecutor(executor); +} + + diff --git a/src/GetThreads.h b/src/GetThreads.h index c35c19cdf..0e2f7e8fc 100644 --- a/src/GetThreads.h +++ b/src/GetThreads.h @@ -1,8 +1,7 @@ #include "Global.h" -#include #include "cross.h" -template CudaGlobalFunction void Kernel(); +#include std::string cxx_demangle(std::string str); @@ -10,17 +9,23 @@ inline int ceiling_div(const int & x, const int & y) { return x / y + (x % y != 0); } -/// Get maximal number of threads for all the kernels on runtime -template < class T > int GetThreads() { - dim3 ret; - CudaFuncAttributes * attr = new CudaFuncAttributes; - CudaFuncGetAttributes(attr, Kernel) ; - debug1( "[%d] Constant mem:%ld\n", D_MPI_RANK, attr->constSizeBytes); - debug1( "[%d] Local mem:%ld\n", D_MPI_RANK, attr->localSizeBytes); - debug1( "[%d] Max threads:%d\n", D_MPI_RANK, attr->maxThreadsPerBlock); - debug1( "[%d] Reg Number:%d\n", D_MPI_RANK, attr->numRegs); - debug1( "[%d] Shared mem:%ld\n", D_MPI_RANK, attr->sharedSizeBytes); - return attr->maxThreadsPerBlock; +template +CudaGlobalFunction void Kernel(const EX executor) { + executor.Execute(); +} + +/// Get maximum number of threads for all kernels at runtime +template < class EX > +int GetThreads() { + CudaFuncAttributes attr; + auto attr_ptr = &attr; + CudaFuncGetAttributes(attr_ptr, Kernel< EX >) ; + debug1( "[%d] Constant mem:%ld\n", D_MPI_RANK, attr.constSizeBytes); + debug1( "[%d] Local mem:%ld\n", D_MPI_RANK, attr.localSizeBytes); + debug1( "[%d] Max threads:%d\n", D_MPI_RANK, attr.maxThreadsPerBlock); + debug1( "[%d] Reg Number:%d\n", D_MPI_RANK, attr.numRegs); + debug1( "[%d] Shared mem:%ld\n", D_MPI_RANK, attr.sharedSizeBytes); + return attr.maxThreadsPerBlock; } class ThreadNumberCalculatorBase { @@ -43,11 +48,11 @@ class ThreadNumberCalculatorBase { void print(); }; -template < class T > class ThreadNumberCalculator : public ThreadNumberCalculatorBase { +template < class EX > class ThreadNumberCalculator : public ThreadNumberCalculatorBase { public: virtual void Init() { - name = cxx_demangle(typeid(T).name()); - maxthr = GetThreads< T >(); + name = cxx_demangle(typeid(EX).name()); + maxthr = GetThreads< EX >(); thr.z = 1; int val = maxthr; if (maxthr < X_BLOCK) { @@ -63,15 +68,37 @@ template < class T > class ThreadNumberCalculator : public ThreadNumberCalculato }; }; -template < class T > class ThreadNumber { - typedef ThreadNumberCalculator< T > calc_t; +template < class EX > class ThreadNumber { + typedef ThreadNumberCalculator< EX > calc_t; static calc_t calc; public: static inline dim3 threads() { return calc.threads(); } }; -template < class T > ThreadNumberCalculator ThreadNumber::calc; +template < class EX > +ThreadNumberCalculator< EX > ThreadNumber< EX >::calc; /// Initialize Thread/Block number variables int InitDim(); +struct LaunchParams{ + dim3 blx, thr; +}; + +template +LaunchParams ComputeLaunchParams(const EX& executor) { + const auto threads = ThreadNumber< EX >::threads(); + return executor.ComputeLaunchParams(threads); +} + +template +void LaunchExecutor(const EX& executor) { + const auto exec_params = ComputeLaunchParams(executor); + CudaKernelRun(Kernel< EX >, exec_params.blx, exec_params.thr, executor); +} + +template +void LaunchExecutorAsync(const EX& executor, CudaStream_t stream) { + const auto exec_params = ComputeLaunchParams(executor); + CudaKernelRunAsync(Kernel< EX >, exec_params.blx, exec_params.thr, stream, executor); +} diff --git a/src/Lattice.cu.Rt b/src/Lattice.cpp.Rt similarity index 88% rename from src/Lattice.cu.Rt rename to src/Lattice.cpp.Rt index ff0354b02..a87bdeb3b 100644 --- a/src/Lattice.cu.Rt +++ b/src/Lattice.cpp.Rt @@ -57,9 +57,9 @@ void Lattice::setPosition(double px_, double py_, double pz_) px = px_; py = py_; pz = pz_; - executor.container.px = px + 0.5 + region.dx; - executor.container.py = py + 0.5 + region.dy; - executor.container.pz = pz + 0.5 + region.dz; + launcher.container.px = px + 0.5 + region.dx; + launcher.container.py = py + 0.5 + region.dy; + launcher.container.pz = pz + 0.5 + region.dz; } /// Calculation of the offset from X, Y and Z @@ -95,8 +95,8 @@ Lattice::Lattice(lbRegion _region, MPIInfo mpi_, int ns) : zSet(ZONESETTINGS, ZO { DEBUG_M; model = &my_model; - AllocContainer(executor.container, _region.nx, _region.ny, _region.nz); - executor.data.Alloc(); + AllocContainer(launcher.container, _region.nx, _region.ny, _region.nz); + launcher.data.Alloc(); reverse_save=0; Record_Iter = 0; Iter = 0; @@ -108,8 +108,8 @@ Lattice::Lattice(lbRegion _region, MPIInfo mpi_, int ns) : zSet(ZONESETTINGS, ZO Snaps = new FTabs[nSnaps]; iSnaps = new int[maxSnaps]; setPosition(0.0,0.0,0.0); - executor.data.iter = 0; - executor.data.reset_iter = 0; + launcher.data.iter = 0; + launcher.data.reset_iter = 0; DEBUG_M; for (int i=0; i < nSnaps; i++) { Snaps[i].PreAlloc(_region.nx,_region.ny,_region.nz); @@ -128,10 +128,10 @@ Lattice::Lattice(lbRegion _region, MPIInfo mpi_, int ns) : zSet(ZONESETTINGS, ZO CudaAllocFinalize(); DEBUG_M; - executor.container.in = Snaps[0]; - executor.container.out = Snaps[1]; + launcher.container.in = Snaps[0]; + launcher.container.out = Snaps[1]; #ifdef ADJOINT - executor.container.adjout = aSnaps[0]; + launcher.container.adjout = aSnaps[0]; #endif aSnap = 0; for (int i=0; i < SETTINGS; i++) { @@ -140,17 +140,17 @@ Lattice::Lattice(lbRegion _region, MPIInfo mpi_, int ns) : zSet(ZONESETTINGS, ZO CudaStreamCreate(&kernelStream); CudaStreamCreate(&inStream); CudaStreamCreate(&outStream); - executor.data.ZoneSettings = zSet.gpuTab; - executor.data.ConstZoneSettings = zSet.gpuConst; - executor.data.ZoneIndex = 0; + launcher.data.ZoneSettings = zSet.gpuTab; + launcher.data.ConstZoneSettings = zSet.gpuConst; + launcher.data.ZoneIndex = 0; // zSet.set(, -1, units.alt("")); ZoneIter = 0; particle_data_size_max = 0; - SC.InitFinder(executor.data.solidfinder); - executor.data.particle_data = NULL; - executor.data.particle_data_size = 0; + SC.InitFinder(launcher.data.solidfinder); + launcher.data.particle_data = NULL; + launcher.data.particle_data_size = 0; SC.balls = &RFI; RFI.name = "TCLB"; } @@ -405,28 +405,28 @@ void Lattice::CopyInParticles() { RFI.SendParticles(); DEBUG_PROF_POP(); if (RFI.size() > particle_data_size_max) { - if (executor.data.particle_data != NULL) CudaFree(executor.data.particle_data); + if (launcher.data.particle_data != NULL) CudaFree(launcher.data.particle_data); particle_data_size_max = RFI.size(); - CudaMalloc(&executor.data.particle_data, RFI.mem_size()); + CudaMalloc(&launcher.data.particle_data, RFI.mem_size()); } - executor.data.particle_data_size = RFI.size(); + launcher.data.particle_data_size = RFI.size(); for (size_t i=0; i 0) { - CudaMemcpyAsync(executor.data.particle_data, RFI.Particles(), RFI.mem_size(), CudaMemcpyHostToDevice, kernelStream); + CudaMemcpyAsync(launcher.data.particle_data, RFI.Particles(), RFI.mem_size(), CudaMemcpyHostToDevice, kernelStream); } DEBUG_PROF_PUSH("Tree Build"); SC.Build(); DEBUG_PROF_POP(); - SC.CopyToGPU(executor.data.solidfinder, kernelStream); + SC.CopyToGPU(launcher.data.solidfinder, kernelStream); } void Lattice::CopyOutParticles() { if (RFI.mem_size() > 0) { - CudaMemcpyAsync(RFI.Particles(), executor.data.particle_data, RFI.mem_size(), CudaMemcpyDeviceToHost, kernelStream); + CudaMemcpyAsync(RFI.Particles(), launcher.data.particle_data, RFI.mem_size(), CudaMemcpyDeviceToHost, kernelStream); } CudaStreamSynchronize(kernelStream); DEBUG_PROF_PUSH("Testing particles for NaNs"); @@ -452,15 +452,15 @@ void Lattice::SetFirstTabs(int tab0, int tab1) { to = mpi.node[mpi.rank].; if (mpi.rank != to) { gpuin[i] = Snaps[tab1].; - gpuout[i] = executor.container.out. = gpubuf[i]; + gpuout[i] = launcher.container.out. = gpubuf[i]; nodeout[i] = to; nodein[i] = from; i ++; } else { - executor.container.out. = Snaps[tab1].; + launcher.container.out. = Snaps[tab1].; } - executor.container.in = Snaps[tab0]; + launcher.container.in = Snaps[tab0]; } @@ -481,8 +481,8 @@ void Lattice::(int tab0, int tab1, int iter_type) ZoneIter = (Iter + Record_Iter) % zSet.getLen(); debug1("ZoneIter: %d (in )\n", ZoneIter); - executor.data.ZoneIndex = ZoneIter; - executor.data.MaxZones = zSet.MaxZones; + launcher.data.ZoneIndex = ZoneIter; + launcher.data.MaxZones = zSet.MaxZones; SetFirstTabs(tab0, tab1); (int tab0, int tab1, int iter_type) 0) { ?> MPIStream_B(); CudaDeviceSynchronize(); - executor.container.in = Snaps[tab1]; - executor.CopyToConst(); DEBUG_PROF_PUSH("Calculation"); switch(iter_type & ITER_INTEG){ case ITER_NO: - executor.RunBorder< Primal, NoGlobals, > (kernelStream); break; + launcher.RunBorder< Primal, NoGlobals, > (kernelStream); break; case ITER_GLOBS: - executor.RunBorder< Primal, IntegrateGlobals, >(kernelStream); break; + launcher.RunBorder< Primal, IntegrateGlobals, >(kernelStream); break; #ifdef ADJOINT case ITER_OBJ: - executor.RunBorder< Primal, OnlyObjective, >(kernelStream); break; + launcher.RunBorder< Primal, OnlyObjective, >(kernelStream); break; #endif } CudaStreamSynchronize(kernelStream); MPIStream_A(); switch(iter_type & ITER_INTEG){ case ITER_NO: - executor.RunInterior< Primal, NoGlobals, > (kernelStream); break; + launcher.RunInterior< Primal, NoGlobals, > (kernelStream); break; case ITER_GLOBS: - executor.RunInterior< Primal, IntegrateGlobals, >(kernelStream); break; + launcher.RunInterior< Primal, IntegrateGlobals, >(kernelStream); break; #ifdef ADJOINT case ITER_OBJ: - executor.RunInterior< Primal, OnlyObjective, >(kernelStream); break; + launcher.RunInterior< Primal, OnlyObjective, >(kernelStream); break; #endif } DEBUG_PROF_POP(); @@ -555,8 +554,8 @@ inline void Lattice::_Adj(int tab0, int tab1, int adjtab0, int int i=0; debug1("[%d] Iteration_Adj %d -> %d type: %d\n", D_MPI_RANK, adjtab0, adjtab1, iter_type); ZoneIter = (Iter + Record_Iter) % zSet.getLen(); - executor.data.ZoneIndex = ZoneIter; - executor.data.MaxZones = zSet.MaxZones; + launcher.data.ZoneIndex = ZoneIter; + launcher.data.MaxZones = zSet.MaxZones; debug1("ZoneIter: %d (in _Adj)\n", ZoneIter); _Adj(int tab0, int tab1, int adjtab0, int from = mpi.node[mpi.rank].; if (mpi.rank != to) { gpuout[i] = aSnaps[adjtab0].; - gpuin[i] = executor.container.adjin. = gpubuf[i]; + gpuin[i] = launcher.container.adjin. = gpubuf[i]; nodeout[i] = to; nodein[i] = from; i ++; } else { - executor.container.adjin. = aSnaps[adjtab0].; + launcher.container.adjin. = aSnaps[adjtab0].; } - executor.container.in. = Snaps[tab0].; - executor.container.adjout. = aSnaps[adjtab1].; + launcher.container.in. = Snaps[tab0].; + launcher.container.adjout. = aSnaps[adjtab1].; - - executor.CopyToConst(); - MPIStream_A(); switch(iter_type & ITER_INTEG){ case ITER_NO: - executor.RunInterior< Adjoint, NoGlobals, > (kernelStream); break; + launcher.RunInterior< Adjoint, NoGlobals, > (kernelStream); break; case ITER_GLOBS: - executor.RunInterior< Adjoint, IntegrateGlobals, > (kernelStream); break; + launcher.RunInterior< Adjoint, IntegrateGlobals, > (kernelStream); break; case ITER_NO | ITER_STEADY: - executor.RunInterior< SteadyAdjoint, NoGlobals, > (kernelStream); break; + launcher.RunInterior< SteadyAdjoint, NoGlobals, > (kernelStream); break; case ITER_GLOBS | ITER_STEADY: - executor.RunInterior< SteadyAdjoint, IntegrateGlobals, > (kernelStream); break; + launcher.RunInterior< SteadyAdjoint, IntegrateGlobals, > (kernelStream); break; } MPIStream_B(); @@ -602,13 +598,13 @@ inline void Lattice::_Adj(int tab0, int tab1, int adjtab0, int DEBUG_M; switch(iter_type & ITER_INTEG){ case ITER_NO: - executor.RunBorder< Adjoint, NoGlobals, > (kernelStream); break; + launcher.RunBorder< Adjoint, NoGlobals, > (kernelStream); break; case ITER_GLOBS: - executor.RunBorder< Adjoint, IntegrateGlobals, > (kernelStream); break; + launcher.RunBorder< Adjoint, IntegrateGlobals, > (kernelStream); break; case ITER_NO | ITER_STEADY: - executor.RunBorder< SteadyAdjoint, NoGlobals, > (kernelStream); break; + launcher.RunBorder< SteadyAdjoint, NoGlobals, > (kernelStream); break; case ITER_GLOBS | ITER_STEADY: - executor.RunBorder< SteadyAdjoint, IntegrateGlobals, > (kernelStream); break; + launcher.RunBorder< SteadyAdjoint, IntegrateGlobals, > (kernelStream); break; } DEBUG_M; CudaDeviceSynchronize(); @@ -635,8 +631,8 @@ inline void Lattice::_Opt(int tab0, int tab1, int adjtab0, int #ifdef ADJOINT (tab0, tab1, iter_type); _Adj(tab0, tab1, adjtab0, adjtab1, iter_type | ITER_STEADY); - executor.RunInterior< Optimize, NoGlobals, > (kernelStream); - executor.RunBorder< Optimize, NoGlobals, > (kernelStream); + launcher.RunInterior< Optimize, NoGlobals, > (kernelStream); + launcher.RunBorder< Optimize, NoGlobals, > (kernelStream); CudaDeviceSynchronize(); #else ERROR("This model doesn't have adjoint!\n"); @@ -791,7 +787,7 @@ Lattice::~Lattice() { RFI.Close(); CudaAllocFreeAll(); - FreeContainer(executor.container); + FreeContainer(launcher.container); for (int i=0; i - /// Get [] /** Retrive the values of the Quantity () @@ -1223,20 +1218,20 @@ void Lattice::GetQuantity(int quant, lbRegion over, real_t * tab, real_t scale) */ void Lattice::Get(lbRegion over, * tab, real_t scale) { - executor.container.in = Snaps[Snap]; - executor.container.adjin = aSnaps[aSnap]; - executor.CopyToConst(); + launcher.container.in = Snaps[Snap]; + launcher.container.adjin = aSnaps[aSnap]; lbRegion inter = region.intersect(over); if (inter.size()==0) return; * buf=NULL; CudaMalloc((void**)&buf, inter.sizeL()*sizeof()); - { lbRegion small = inter; - small.dx -= region.dx; - small.dy -= region.dy; - small.dz -= region.dz; - CudaKernelRun( get , dim3(small.nx,small.ny,small.nz) , dim3(1) , small, buf, scale); - CudaMemcpy(tab, buf, small.sizeL()*sizeof(), CudaMemcpyDeviceToHost); + { + lbRegion small = inter; + small.dx -= region.dx; + small.dy -= region.dy; + small.dz -= region.dz; + launcher.GetQuantity(small, buf, scale); + CudaMemcpy(tab, buf, small.sizeL()*sizeof(), CudaMemcpyDeviceToHost); } CudaFree(buf); } @@ -1298,7 +1293,7 @@ void Lattice::pop_settings() */ void Lattice::getGlobals(real_t * tab) { real_t tabl[ GLOBALS ]; - executor.data.getGlobals(tabl); MPI_Reduce( &tabl[], @@ -1331,7 +1326,7 @@ void Lattice::calcGlobals() { ?> tab[] += obj; for (int i=0; i< GLOBALS ; i++) globals[i] += tab[i]; - executor.data.clearGlobals(); + launcher.data.clearGlobals(); } /// Clear the internal globals table @@ -1359,12 +1354,19 @@ double Lattice::getObjective() { /** Set a specific Setting \param i Index of the Setting - \param tmp Value of the Setting + \param val Value of the Setting +*/ +void Lattice::setSetting(int i, real_t val) { + launcher.data.settings[i] = val; +} + +/// Set a Setting +/** + Get a specific Setting + \param i Index of the Setting */ -void Lattice::setSetting(int i, real_t tmp) { - settings[i] = tmp; - debug1("[%d] Settings %d to %f\n", D_MPI_RANK,i, tmp); - setConstSetting(i,tmp); +real_t Lattice::getSetting(int i) { + return launcher.data.settings[i]; } void Lattice::SetSetting(const Model::Setting& set, real_t val) { @@ -1378,7 +1380,7 @@ void Lattice::SetSetting(const Model::Setting& set, real_t val) { } void Lattice::resetAverage() { - executor.data.reset_iter = executor.data.iter; + launcher.data.reset_iter = launcher.data.iter; CudaMemset(&Snaps[Snap].block14[*region.sizeL()],0,region.sizeL()*sizeof(real_t)); @@ -1386,11 +1388,11 @@ void Lattice::resetAverage() { void Lattice::GetSample(lbRegion over, real_t scale,real_t* buf) { - executor.container.in = Snaps[Snap]; - executor.container.adjin = aSnaps[aSnap]; - executor.CopyToConst(); - lbRegion small = region.intersect(over); - CudaKernelRun( get , dim3(small.nx,small.ny) , dim3(1) , small, (*) buf, scale); + launcher.container.in = Snaps[Snap]; + + launcher.container.adjin = aSnaps[aSnap]; + lbRegion small = region.intersect(over); + launcher.SampleQuantity(small, (*)buf, scale); } void Lattice::updateAllSamples(){ @@ -1401,7 +1403,7 @@ void Lattice::updateAllSamples(){ if (sample->quant->in("")) { double v = sample->units.alt(""); - GetSample(sample->spoints[j].location,1/v,&sample->gpu_buffer[sample->location[""]+(executor.data.iter - sample->startIter)*sample->size + sample->totalIter*j*sample->size]); + GetSample(sample->spoints[j].location,1/v,&sample->gpu_buffer[sample->location[""]+(launcher.data.iter - sample->startIter)*sample->size + sample->totalIter*j*sample->size]); } } } diff --git a/src/Lattice.h.Rt b/src/Lattice.h.Rt index ef2d15f99..14eddf4d8 100644 --- a/src/Lattice.h.Rt +++ b/src/Lattice.h.Rt @@ -12,7 +12,7 @@ #include "Sampler.h" #include "SolidContainer.h" #include "Lists.h" -#include "CartLatticeExecutor.h" +#include "CartLatticeLauncher.h" class lbRegion; @@ -39,7 +39,7 @@ const int maxSnaps=33; class Lattice { private: Model_m my_model; - CartLatticeExecutor executor; ///< Main execution context, including the lattice container, data, etc. + CartLatticeLauncher launcher; ///< Main execution context, including the lattice container, data, etc. storage_t *mpiin[27], *mpiout[27]; ///< MPI Buffers storage_t *gpuin[27], *gpuout[27], *gpubuf[27], *gpubuf2[27]; ///< GPU Buffers size_t bufsize[27]; ///< Sizes of the Buffers @@ -187,6 +187,7 @@ void GetQuantity(int quant, lbRegion over, real_t * tab, real_t scale); double getObjective(); void resetAverage(); void setSetting(int i, real_t tmp); + real_t getSetting(int i); void SetSetting(const Model::Setting& set, real_t val); void GenerateST(); }; diff --git a/src/LatticeData.hpp.Rt b/src/LatticeData.hpp.Rt index 4bb1ae89f..a8d3f75c1 100644 --- a/src/LatticeData.hpp.Rt +++ b/src/LatticeData.hpp.Rt @@ -10,21 +10,11 @@ #include "SyntheticTurbulence.h" #include "SolidContainer.h" - -#ifndef SETTINGS_H - - CudaExternConstantMemory(real_t ); - void initSettings(); - void setConstSetting(int i, real_t tmp); - -#define SETTINGS_H 1 -#endif - #include "Consts.h" #include "cross.h" +#include + struct LatticeData { size_t particle_data_size; real_t *particle_data; @@ -37,31 +27,30 @@ struct LatticeData { real_t **ZoneSettings; real_t *ConstZoneSettings; STWaveSet ST; + real_t settings[SETTINGS]; ///< settings initialized to zero - void CopyToConst() const; - - CudaDeviceFunction real_t ZoneSetting(const int &s, const int &z) { + CudaDeviceFunction real_t ZoneSetting(const int &s, const int &z) const { const int i = s + ZONESETTINGS * z; const real_t *w = ZoneSettings[i]; if (w == NULL) return ConstZoneSettings[i]; return w[ZoneIndex]; } - CudaDeviceFunction real_t ZoneSetting_DT(const int &s, const int &z) { + CudaDeviceFunction real_t ZoneSetting_DT(const int &s, const int &z) const { const int i = s + ZONESETTINGS * z; const real_t *w = ZoneSettings[i + DT_OFFSET]; if (w == NULL) return 0; return w[ZoneIndex]; } - CudaDeviceFunction real_t *ZoneSettingGrad(const int &s, const int &z) { + CudaDeviceFunction real_t *ZoneSettingGrad(const int &s, const int &z) const { const int i = s + ZONESETTINGS * z; real_t *w = ZoneSettings[i + GRAD_OFFSET]; if (w == NULL) return &ConstZoneSettings[i + GRAD_OFFSET]; return &w[ZoneIndex]; } - CudaDeviceFunction vector_t getST(real_t x, real_t y, real_t z) { + CudaDeviceFunction vector_t getST(real_t x, real_t y, real_t z) const { return calc(ST, x, y, z); } diff --git a/src/Node.hpp.Rt b/src/Node.hpp.Rt index 5f91bb01b..71e833601 100644 --- a/src/Node.hpp.Rt +++ b/src/Node.hpp.Rt @@ -7,6 +7,8 @@ Defines Node based on supplied Dynamics */ +#include "Particle.hpp" + iter)")) + AddMacro("SyntheticTurbulence(x__,y__,z__)","data->getST(x__,y__,z__)") + AddMacro("average_iter","(data->iter - data->reset_iter)") P = expand.grid(x=0:2,y=0:2,z=0:2) Q = paste("Q",P$x,P$y,P$z,sep="")[-1] AddMacro(Q, paste0_s("(acc.getQ(",seq_along(Q)-1,"))")) s = ZoneSettings[! as.logical(ZoneSettings$preload),,drop=FALSE] - AddMacro(s$name, paste0_s("constData.ZoneSetting(", s$Index, ", acc.getNodeType() >> ZONE_SHIFT)")) + AddMacro(s$name, paste0_s("data->ZoneSetting(", s$Index, ", acc.getNodeType() >> ZONE_SHIFT)")) s = ZoneSettings - AddMacro(paste0_s(s$name,"_DT"), paste0_s("constData.ZoneSetting_DT(", s$Index, ", acc.getNodeType() >> ZONE_SHIFT)")) + AddMacro(paste0_s(s$name,"_DT"), paste0_s("data->ZoneSetting_DT(", s$Index, ", acc.getNodeType() >> ZONE_SHIFT)")) g = Globals g$opstr = ifelse(g$op == "SUM", "Add", "Max") AddMacro(paste0_s("AddTo", g$name, "(x__)"), paste0_s("glob.", g$opstr, "ToGlobal<", g$Index, ">(x__, acc.getNodeType())")) @@ -37,6 +39,8 @@ AddMacro(paste0_s("Iam",n$name),paste0_s("((acc.getNodeType() & ", n$groupIndex, ") == ", n$Index,")")) n = NodeTypeGroups AddMacro(paste0_s("Iam",n$name),paste0_s("(acc.getNodeType() & ", n$Index, ")")) + s = Settings[is.na(Settings$derived), ] + AddMacro(s$name, paste0_s("data->settings[",s$Index,"]")) ?> template @@ -44,6 +48,7 @@ struct CalculateGlobals {}; template <> struct CalculateGlobals { + CudaDeviceFunction CalculateGlobals(const LatticeData&) {} template CudaDeviceFunction inline void AddToGlobal(const real_t& x, const flag_t& NodeType) {} template @@ -53,17 +58,18 @@ struct CalculateGlobals { template <> struct CalculateGlobals { - real_t globals[GLOBALS]; - CudaDeviceFunction CalculateGlobals() { - for (int i=0; i CudaDeviceFunction inline void AddToGlobal(const real_t& x, const flag_t& NodeType) { globals[I] = globals[I] + x; if (I < SUM_GLOBALS) { - globals[GLOBALS_Objective] = globals[GLOBALS_Objective] + constData.ZoneSetting(I + IN_OBJ_OFFSET, NodeType >> ZONE_SHIFT); + globals[GLOBALS_Objective] = globals[GLOBALS_Objective] + data->ZoneSetting(I + IN_OBJ_OFFSET, NodeType >> ZONE_SHIFT); } } template @@ -73,9 +79,9 @@ struct CalculateGlobals { CudaDeviceFunction void inline Glob() { for (int i=0; iGlobals[i], globals[i]); } else { - CudaAtomicMaxReduceWarp(&constData.Globals[i], globals[i]); + CudaAtomicMaxReduceWarp(&data->Globals[i], globals[i]); } } } @@ -84,25 +90,28 @@ struct CalculateGlobals { template <> struct CalculateGlobals { real_t obj; - CudaDeviceFunction CalculateGlobals() { - obj = 0.0; - } + const LatticeData* data; + + CudaDeviceFunction CalculateGlobals(const LatticeData& data_) : obj(0.), data(&data_) {} template CudaDeviceFunction inline void AddToGlobal(const real_t& x, const flag_t& NodeType) { if (I < SUM_GLOBALS) { - obj = obj + constData.ZoneSetting(I + IN_OBJ_OFFSET, NodeType >> ZONE_SHIFT); + obj = obj + data->ZoneSetting(I + IN_OBJ_OFFSET, NodeType >> ZONE_SHIFT); } } template CudaDeviceFunction inline void MaxToGlobal(const real_t& x, const flag_t& NodeType) {} CudaDeviceFunction void inline Glob() { - CudaAtomicAddReduceWarp(&constData.Globals[GLOBALS_Objective], obj); + CudaAtomicAddReduceWarp(&data->Globals[GLOBALS_Objective], obj); } }; struct CalculateGlobalsAdjoint { real_t duals[ZONESETTINGS]; real_t duals_dt[ZONESETTINGS]; + const LatticeData* data; + + CudaDeviceFunction CalculateGlobalsAdjoint(const LatticeData& data_) : data(&data_) {} CudaDeviceFunction CalculateGlobalsAdjoint() { for (int i=0; i> ZONE_SHIFT; - for (int nz = 0; nz < constData.MaxZones; nz++) if (CudaSyncWarpOr(nz == z)) { + for (int nz = 0; nz < data->MaxZones; nz++) if (CudaSyncWarpOr(nz == z)) { for (int i=0; iZoneSettingGrad( i , nz), val); val = (nz == z) ? duals_dt[i] : 0.0f; - CudaAtomicAddReduceWarp(constData.ZoneSettingGrad( i + DT_OFFSET, nz), val); + CudaAtomicAddReduceWarp(data->ZoneSettingGrad( i + DT_OFFSET, nz), val); } } } }; +template +CudaDeviceFunction T ParticleIteratorXBlockT(real_t x, real_t y, real_t z, const LatticeData* data) { + real_t mar = PART_MAR; + real_t point[3] = {x,y,z}; + real_t lower[3] = {x-CudaThread.x-mar,y-mar,z-mar}; + real_t upper[3] = {x-CudaThread.x+CudaNumberOfThreads.x-1.0f+mar,y+mar,z+mar}; + return T(data->solidfinder, data->particle_data, point, lower, upper); +} + +template +CudaDeviceFunction T ParticleIteratorT(real_t x, real_t y, real_t z, const LatticeData* data) { + real_t mar = PART_MAR; + real_t point[3] = {x,y,z}; + real_t lower[3] = {x-mar,y-mar,z-mar}; + real_t upper[3] = {x+mar,y+mar,z+mar}; + return T(data->solidfinder, data->particle_data, point, lower, upper); +} #ifndef NODE_HPP #define NODE_HPP template < class LA, eOperationType I, eCalculateGlobals G, eStage S> struct Node { CudaDeviceFunction inline void RunElement(){}; - CudaDeviceFunction inline Node(const LA& acc_) {}; + CudaDeviceFunction inline Node(const LA& acc_, const LatticeData&) {}; }; template < class LA > struct Node < LA, , , > { const LA& acc; + const LatticeData* data; CalculateGlobals< > glob; real_t ; - CudaDeviceFunction inline Node(const LA& acc_):acc(acc_) { + CudaDeviceFunction Node(const LA& acc_, const LatticeData& data_) : acc(acc_), data(&data_), glob(data_) { int z = NodeType >> ZONE_SHIFT; - = constData.ZoneSetting(, z); = data->ZoneSetting(, z); }; CudaDeviceFunction void inline Glob() { @@ -273,7 +300,21 @@ template < class LA > struct Node < LA, , , > { ExecElement(); SaveElement(); Glob(); - } (x, y, z, data); + } +#else + CudaDeviceFunction set_found_t_s SyncParticleIterator(real_t x, real_t y, real_t z) { + return ParticleIteratorXBlockT< set_found_t_s >(x, y, z, data); + } +#endif + + CudaDeviceFunction set_found_t_i ParticleIterator(real_t x, real_t y, real_t z) { + return ParticleIteratorT< set_found_t_i >(x, y, z, data); + } + }; struct ParticleS : ParticleI { vector_t force; vector_t moment; - CudaDeviceFunction ParticleS(const size_t& i_, const real_t node[3]) : ParticleI(i_,node) { + CudaDeviceFunction ParticleS(real_t* particle_data_, const size_t& i_, const real_t node[3]) : ParticleI(particle_data_, i_, node) { force.x = 0; force.y = 0; force.z = 0; @@ -62,37 +63,37 @@ struct ParticleS : ParticleI { template <> CudaDeviceFunction ParticleS< NO_SYNC >::~ParticleS() { - CudaAtomicAdd(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+0],force.x); - CudaAtomicAdd(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+1],force.y); - CudaAtomicAdd(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+2],force.z); - CudaAtomicAdd(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+0],moment.x); - CudaAtomicAdd(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+1],moment.y); - CudaAtomicAdd(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+2],moment.z); + CudaAtomicAdd(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+0],force.x); + CudaAtomicAdd(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+1],force.y); + CudaAtomicAdd(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+2],force.z); + CudaAtomicAdd(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+0],moment.x); + CudaAtomicAdd(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+1],moment.y); + CudaAtomicAdd(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+2],moment.z); } template <> CudaDeviceFunction ParticleS< WARP_SYNC >::~ParticleS() { real_t val[6] = {force.x,force.y,force.z,moment.x,moment.y,moment.z}; - CudaAtomicAddReduceWarpArr<6>(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE],val); + CudaAtomicAddReduceWarpArr<6>(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE],val); } #ifdef CROSS_HAS_ADDOPP template <> CudaDeviceFunction ParticleS< OPP_SYNC >::~ParticleS() { real_t val[6] = {force.x,force.y,force.z,moment.x,moment.y,moment.z}; - CudaAtomicAddReduceOppArr<6>(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE],val); + CudaAtomicAddReduceOppArr<6>(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE],val); } #endif template <> CudaDeviceFunction ParticleS< BLOCK_SYNC >::~ParticleS() { - CudaAtomicAddReduce(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+0],force.x); - CudaAtomicAddReduce(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+1],force.y); - CudaAtomicAddReduce(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+2],force.z); - CudaAtomicAddReduce(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+0],moment.x); - CudaAtomicAddReduce(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+1],moment.y); - CudaAtomicAddReduce(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+2],moment.z); + CudaAtomicAddReduce(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+0],force.x); + CudaAtomicAddReduce(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+1],force.y); + CudaAtomicAddReduce(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+2],force.z); + CudaAtomicAddReduce(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+0],moment.x); + CudaAtomicAddReduce(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+1],moment.y); + CudaAtomicAddReduce(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+2],moment.z); } @@ -104,46 +105,10 @@ CudaDeviceFunction ParticleS< BLOCK_SYNC >::~ParticleS() { #ifdef SOLID_CACHE typedef typename solidcontainer_t::cache_set_found_t< ParticleS< PART_SYNC >, SOLID_CACHE > set_found_t; -#else - typedef typename solidcontainer_t::set_found_t< ParticleS< PART_SYNC > > set_found_t; -#endif - -template -CudaDeviceFunction T ParticleIteratorXBlockT(real_t x, real_t y, real_t z) { - real_t mar = PART_MAR; - real_t point[3] = {x,y,z}; - real_t lower[3] = {x-CudaThread.x-mar,y-mar,z-mar}; - real_t upper[3] = {x-CudaThread.x+CudaNumberOfThreads.x-1.0f+mar,y+mar,z+mar}; - return T(constData.solidfinder, point, lower, upper); -} - -template -CudaDeviceFunction T ParticleIteratorT(real_t x, real_t y, real_t z) { - real_t mar = PART_MAR; - real_t point[3] = {x,y,z}; - real_t lower[3] = {x-mar,y-mar,z-mar}; - real_t upper[3] = {x+mar,y+mar,z+mar}; - return T(constData.solidfinder, point, lower, upper); -} - -#ifdef SOLID_CACHE - typedef typename solidcontainer_t::cache_set_found_t< ParticleS< PART_SYNC >, SOLID_CACHE > set_found_t_s; + typedef typename solidcontainer_t::cache_set_found_t< ParticleS< PART_SYNC >, SOLID_CACHE > set_found_t_s; typedef typename solidcontainer_t::cache_set_found_t< ParticleI, SOLID_CACHE > set_found_t_i; #else - typedef typename solidcontainer_t::set_found_t< ParticleS< PART_SYNC > > set_found_t_s; - typedef typename solidcontainer_t::set_found_t< ParticleI > set_found_t_i; -#endif - -#ifdef USE_ADDOPP - CudaDeviceFunction set_found_t_s SyncParticleIterator(real_t x, real_t y, real_t z) { - return ParticleIteratorT< set_found_t_s >(x,y,z); - } -#else - CudaDeviceFunction set_found_t_s SyncParticleIterator(real_t x, real_t y, real_t z) { - return ParticleIteratorXBlockT< set_found_t_s >(x,y,z); - } + typedef typename solidcontainer_t::set_found_t< ParticleS< PART_SYNC > > set_found_t; + typedef typename solidcontainer_t::set_found_t< ParticleS< PART_SYNC > > set_found_t_s; + typedef typename solidcontainer_t::set_found_t< ParticleI > set_found_t_i; #endif - -CudaDeviceFunction set_found_t_i ParticleIterator(real_t x, real_t y, real_t z) { - return ParticleIteratorT< set_found_t_i >(x,y,z); -} diff --git a/src/Region.h b/src/Region.h index 9103b82fd..4ab72f2df 100644 --- a/src/Region.h +++ b/src/Region.h @@ -25,16 +25,16 @@ class lbRegion { if (ret.nx <= 0 || ret.ny <= 0 || ret.nz <= 0) { ret.nx = ret.ny = ret.nz = 0; }; return ret; }; - inline int offset(int x,int y) { + int offset(int x,int y) const { return (x-dx) + (y-dy) * nx; }; inline void print() { printf("Region: %dx%dx%d + %d,%d,%d\n", nx,ny,nz,dx,dy,dz); }; - CudaHostFunction CudaDeviceFunction inline int offset(int x,int y,int z) { + CudaHostFunction CudaDeviceFunction int offset(int x,int y,int z) const { return (x-dx) + (y-dy) * nx + (z-dz) * nx * ny; }; - CudaHostFunction CudaDeviceFunction inline size_t offsetL(int x,int y,int z) { + CudaHostFunction CudaDeviceFunction size_t offsetL(int x,int y,int z) const { return (x-dx) + (y-dy) * nx + (z-dz) * nx * ny; }; diff --git a/src/SolidAll.h b/src/SolidAll.h index 7126b6d2e..e8b16a767 100644 --- a/src/SolidAll.h +++ b/src/SolidAll.h @@ -9,6 +9,7 @@ class SolidAll { template class set_found_t { const finder_t& finder; + real_t* particle_data; real_t point[3]; class iterator_t { const set_found_t * set; @@ -16,7 +17,7 @@ class SolidAll { CudaDeviceFunction iterator_t(const set_found_t& set_, const addr_t& i_) : set(&set_), i(i_) { }; friend class set_found_t; public: - CudaDeviceFunction T operator* () { return T(i,set->point); } + CudaDeviceFunction T operator* () { return T(set->particle_data, i, set->point); } CudaDeviceFunction iterator_t& operator++() { ++i; return *this; @@ -27,7 +28,7 @@ class SolidAll { typedef iterator_t iterator; CudaDeviceFunction inline iterator begin() { return iterator(*this, 0); }; CudaDeviceFunction inline iterator end() { return iterator(*this, finder.size); }; - CudaDeviceFunction inline set_found_t(const finder_t& finder_, const real_t point_[], const real_t lower[], const real_t upper[]): finder(finder_) { + CudaDeviceFunction inline set_found_t(const finder_t& finder_, real_t* particle_data_, const real_t point_[], const real_t lower[], const real_t upper[]) : finder(finder_), particle_data(particle_data_) { for (int i=0;i<3;i++) { point[i]=point_[i]; } }; }; diff --git a/src/SolidGrid.h b/src/SolidGrid.h index 0ce721077..fa345b74d 100644 --- a/src/SolidGrid.h +++ b/src/SolidGrid.h @@ -19,6 +19,7 @@ class SolidGrid { template class set_found_t { const finder_t& finder; + real_t* particle_data; real_t point[3]; int mins[3]; int maxs[3]; @@ -62,7 +63,7 @@ class SolidGrid { CudaDeviceFunction iterator_t() : set(NULL) { balli = -1; }; friend class set_found_t; public: - CudaDeviceFunction T operator* () { return T(balli,set->point); } + CudaDeviceFunction T operator* () { return T(set->particle_data, balli, set->point); } CudaDeviceFunction iterator_t& operator++() { ++d; go(); @@ -74,7 +75,7 @@ class SolidGrid { typedef iterator_t iterator; CudaDeviceFunction inline iterator begin() { return iterator(*this); }; CudaDeviceFunction inline iterator end() { return iterator(); }; - CudaDeviceFunction inline set_found_t(const finder_t& finder_, const real_t point_[], const real_t lower[], const real_t upper[]): finder(finder_) { + CudaDeviceFunction inline set_found_t(const finder_t& finder_, real_t* particle_data_, const real_t point_[], const real_t lower[], const real_t upper[]): finder(finder_), particle_data(particle_data_) { for (int i=0;i<3;i++) { point[i]=point_[i]; } real_t d = 0.5*finder.delta; for (int k=0; k<3; k++) { @@ -87,6 +88,7 @@ class SolidGrid { }; template class cache_set_found_t { + real_t* particle_data; real_t point[3]; size_t cache_size; tr_addr_t cache[MAX_CACHE]; @@ -96,7 +98,7 @@ class SolidGrid { CudaDeviceFunction iterator_t(const cache_set_found_t& set_, const size_t& i_) : set(&set_), i(i_) { }; friend class cache_set_found_t; public: - CudaDeviceFunction T operator* () { return T(set->cache[i],set->point); } + CudaDeviceFunction T operator* () { return T(set->particle_data, set->cache[i], set->point); } CudaDeviceFunction iterator_t& operator++() { ++i; return *this; @@ -107,7 +109,7 @@ class SolidGrid { typedef iterator_t iterator; CudaDeviceFunction inline iterator begin() { return iterator(*this, 0); }; CudaDeviceFunction inline iterator end() { return iterator(*this, cache_size); }; - CudaDeviceFunction inline cache_set_found_t(const finder_t& finder, const real_t point_[], const real_t lower[], const real_t upper[]) { + CudaDeviceFunction inline cache_set_found_t(const finder_t& finder, real_t* particle_data_, const real_t point_[], const real_t lower[], const real_t upper[]) : particle_data(particle_data_) { for (int i=0;i<3;i++) { point[i]=point_[i]; } int mins[3]; int maxs[3]; diff --git a/src/SolidTree.cpp b/src/SolidTree.cpp deleted file mode 100644 index 9a0238b17..000000000 --- a/src/SolidTree.cpp +++ /dev/null @@ -1,3 +0,0 @@ -#include "SolidTree.hpp" -#include "RemoteForceInterface.hpp" - diff --git a/src/SolidTree.h b/src/SolidTree.h index 4f04f8e0b..079ecb8bf 100644 --- a/src/SolidTree.h +++ b/src/SolidTree.h @@ -28,6 +28,7 @@ class SolidTree { template class set_found_t { const finder_t& finder; + real_t* particle_data; real_t lower[3]; real_t upper[3]; real_t point[3]; @@ -56,7 +57,7 @@ class SolidTree { CudaDeviceFunction iterator_t(): set(nullptr) { nodei = -1; return; }; friend class set_found_t; public: - CudaDeviceFunction T operator* () { return T(i,set->point); } + CudaDeviceFunction T operator* () { return T(set->particle_data, i, set->point); } CudaDeviceFunction iterator_t& operator++() { if (nodei != -1) { nodei = set->finder.data[nodei].back; @@ -70,12 +71,13 @@ class SolidTree { typedef iterator_t iterator; CudaDeviceFunction inline iterator begin() { return iterator(*this); }; CudaDeviceFunction inline iterator end() { return iterator(); }; - CudaDeviceFunction inline set_found_t(const finder_t& finder_, const real_t point_[], const real_t lower_[], const real_t upper_[]): finder(finder_) { + CudaDeviceFunction inline set_found_t(const finder_t& finder_, real_t* particle_data_, const real_t point_[], const real_t lower_[], const real_t upper_[]): finder(finder_), particle_data(particle_data_) { for (int i=0;i<3;i++) { lower[i]=lower_[i]; upper[i]=upper_[i]; point[i]=point_[i]; } }; }; template class cache_set_found_t { + real_t* particle_data; real_t point[3]; size_t cache_size; tr_addr_t cache[MAX_CACHE]; @@ -85,7 +87,7 @@ class SolidTree { CudaDeviceFunction iterator_t(const cache_set_found_t& set_, const size_t& i_) : set(&set_), i(i_) { }; friend class cache_set_found_t; public: - CudaDeviceFunction T operator* () { return T(set->cache[i],set->point); } + CudaDeviceFunction T operator* () { return T(set->particle_data, set->cache[i], set->point); } CudaDeviceFunction iterator_t& operator++() { ++i; return *this; @@ -96,7 +98,7 @@ class SolidTree { typedef iterator_t iterator; CudaDeviceFunction inline iterator begin() { return iterator(*this, 0); }; CudaDeviceFunction inline iterator end() { return iterator(*this, cache_size); }; - CudaDeviceFunction inline cache_set_found_t(const finder_t& finder, const real_t point_[], const real_t lower[], const real_t upper[]) { + CudaDeviceFunction inline cache_set_found_t(const finder_t& finder, real_t* particle_data_, const real_t point_[], const real_t lower[], const real_t upper[]) : particle_data(particle_data_) { for (int i=0;i<3;i++) { point[i]=point_[i]; } tr_addr_t nodei = 0; bool go_left = true; diff --git a/src/Solver.cpp.Rt b/src/Solver.cpp.Rt index 383db170c..11ef3ae8d 100644 --- a/src/Solver.cpp.Rt +++ b/src/Solver.cpp.Rt @@ -165,8 +165,7 @@ void MainFree( Solver *d); */ int Solver::writeLog(const char * filename) { - FILE * f = NULL; - double v; + FILE * f = NULL; double * glob = lattice->globals; if (mpi.rank == 0) { int j=0; @@ -174,22 +173,21 @@ void MainFree( Solver *d); assert( f != NULL ); fprintf(f,"%d, %.13le, %.13le, %d",iter, LogScales[SETTINGS+GLOBALS+ZONESETTINGS+SCALES_dt] * iter, get_walltime(), opt_iter); for (int i=0; i< SETTINGS; i++) { - v = lattice->settings[i]; + const double v = lattice->getSetting(i); fprintf(f,", %.13le, %.13le",v,v*LogScales[j]); j++; } for (int i=0; i< ZONESETTINGS; i++) { - for (std::map::iterator it = geometry->SettingZones.begin(); it != geometry->SettingZones.end(); it++) { + for (const auto& map_entry : geometry->SettingZones) { int ind = lattice->ZoneIter; - int zone = it->second; - v = lattice->zSet.get(i, zone, ind); -// v = lattice->settings[i]; - fprintf(f,", %.13le, %.13le",v,v*LogScales[j]); - } + int zone = map_entry.second; + const double v = lattice->zSet.get(i, zone, ind); + fprintf(f,", %.13le, %.13le",v,v*LogScales[j]); + } j++; } for (int i=0; i< GLOBALS; i++) { - v = glob[i]; + const double v = glob[i]; fprintf(f,", %.13le, %.13le",v,v*LogScales[j]); j++; } @@ -391,9 +389,6 @@ void MainFree( Solver *d); lattice->zSet.set(, -1, units.alt("")); - // Setting global variables - initSettings(); - geometry = new Geometry(region, mpi.totalregion, units); return 0; diff --git a/src/config.mk.in b/src/config.mk.in index aa5084468..3b83b0f56 100644 --- a/src/config.mk.in +++ b/src/config.mk.in @@ -14,7 +14,7 @@ EMBEDED_PYTHON = @EMBEDED_PYTHON@ WITH_LAMMPS = @WITH_LAMMPS@ -SOURCE_CU=Global.cu Lattice.cu main.cu vtkLattice.cu vtkOutput.cu cross.cu cuda.cu CartLatticeContainer.cu Dynamics.c +SOURCE_CU=Global.cu main.cu vtkLattice.cu vtkOutput.cu cross.cu cuda.cu CartLatticeContainer.cu Dynamics.c SOURCE=$(SOURCE_CU) HEADERS=Global.h gpu_anim.h CartLatticeContainer.h Lattice.h Region.h vtkLattice.h vtkOutput.h cross.h gl_helper.h Dynamics.h types.h pugixml.hpp pugiconfig.hpp diff --git a/src/cross.h b/src/cross.h index 7349cc07d..5d5b424e7 100644 --- a/src/cross.h +++ b/src/cross.h @@ -56,16 +56,16 @@ #ifndef CROSS_HIP #define CudaKernelRun(a__,b__,c__,...) a__<<>>(__VA_ARGS__); HANDLE_ERROR( cudaDeviceSynchronize()); HANDLE_ERROR( cudaGetLastError() ) #ifdef CROSS_SYNC - #define CudaKernelRunNoWait(a__,b__,c__,e__,...) a__<<>>(__VA_ARGS__); HANDLE_ERROR( cudaDeviceSynchronize()); HANDLE_ERROR( cudaGetLastError() ); + #define CudaKernelRunAsync(a__,b__,c__,e__,...) a__<<>>(__VA_ARGS__); HANDLE_ERROR( cudaDeviceSynchronize()); HANDLE_ERROR( cudaGetLastError() ); #else - #define CudaKernelRunNoWait(a__,b__,c__,e__,...) a__<<>>(__VA_ARGS__); + #define CudaKernelRunAsync(a__,b__,c__,e__,...) a__<<>>(__VA_ARGS__); #endif #else #define CudaKernelRun(a__,b__,c__,...) a__<<>>(__VA_ARGS__); HANDLE_ERROR( hipDeviceSynchronize()); HANDLE_ERROR( hipGetLastError() ) #ifdef CROSS_SYNC - #define CudaKernelRunNoWait(a__,b__,c__,e__,...) a__<<>>(__VA_ARGS__); HANDLE_ERROR( hipDeviceSynchronize()); HANDLE_ERROR( hipGetLastError() ); + #define CudaKernelRunAsync(a__,b__,c__,e__,...) a__<<>>(__VA_ARGS__); HANDLE_ERROR( hipDeviceSynchronize()); HANDLE_ERROR( hipGetLastError() ); #else - #define CudaKernelRunNoWait(a__,b__,c__,e__,...) a__<<>>(__VA_ARGS__); + #define CudaKernelRunAsync(a__,b__,c__,e__,...) a__<<>>(__VA_ARGS__); #endif #endif #define CudaBlock blockIdx @@ -174,7 +174,7 @@ #define CudaGetDeviceCount(a__) HANDLE_ERROR( hipGetDeviceCount( a__ ) ) #define CudaDeviceReset() HANDLE_ERROR( hipDeviceReset( ) ) #define CudaFuncAttributes hipFuncAttributes - #define CudaFuncGetAttributes(a__,b__) HANDLE_ERROR( hipFuncGetAttributes(a__, reinterpret_cast(&b__)) ) + #define CudaFuncGetAttributes(a__,b__) HANDLE_ERROR( hipFuncGetAttributes(a__, reinterpret_cast(b__)) ) #endif // cudaError_t cudaPreAlloc(void ** ptr, size_t size); // cudaError_t cudaAllocFinalize(); @@ -182,6 +182,7 @@ CudaError HandleError( CudaError err, const char *file, int line ); #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ )) #define ISFINITE(l__) isfinite(l__) + #else #include #include @@ -281,8 +282,7 @@ extern uint3 CpuThread; extern uint3 CpuSize; - #include - + #include template inline void CPUKernelRun(F &&func, const dim3& blocks, P &&... args) { #pragma omp parallel for collapse(3) schedule(static) @@ -302,7 +302,7 @@ } template - inline void CudaKernelRunNoWait(F &&func, const dim3& blocks, const dim3& threads, CudaStream_t stream, P &&... args) { + inline void CudaKernelRunAsync(F &&func, const dim3& blocks, const dim3& threads, CudaStream_t stream, P &&... args) { CPUKernelRun(func, blocks, std::forward

(args)...); } diff --git a/src/cuda.cu.Rt b/src/cuda.cu.Rt index ac9d9e9d3..8c18426a8 100644 --- a/src/cuda.cu.Rt +++ b/src/cuda.cu.Rt @@ -24,39 +24,27 @@ if (exists('Extra_Dynamics_C_Header')) { #include "cross.h" #include "cross.hpp" +#ifdef ADJOINT + #include "ADTools.cu" +#endif + +#include "CartLatticeLauncher.hpp" + +/* #include CudaDeviceFunction real_t rise_nan() { assert(false); return NAN; } #define RISENAN rise_nan() - -#ifdef ADJOINT - #include "ADTools.cu" -#endif - -/// Constant memory variables -#include "CartLatticeContainer.h" -#include "LatticeData.hpp" - -CudaDeviceFunction CudaConstantMemory CartLatticeContainer constContainer; -CudaDeviceFunction CudaConstantMemory LatticeData constData; - -// Settings -/// GPU Constant memory variable for [] -CudaDeviceFunction CudaConstantMemory real_t = 0.0f; - - -// These headers depend on contant memory variables -#include "Particle.hpp" -#include "CartLatticeExecutor.hpp" +*/ -template void CartLatticeExecutor::RunBorder < > (CudaStream_t stream) const; -template void CartLatticeExecutor::RunInterior < > (CudaStream_t stream) const; > (CudaStream_t stream) const; +template void CartLatticeLauncher::RunInterior < > (CudaStream_t stream) const; diff --git a/src/makefile.main.Rt b/src/makefile.main.Rt index 3573cf9ba..0ab532390 100644 --- a/src/makefile.main.Rt +++ b/src/makefile.main.Rt @@ -17,16 +17,16 @@ all: .PHONY: all clean dummy -SOURCE_PLAN+=Global.cpp Lattice.cu vtkLattice.cpp vtkOutput.cpp cross.cu cuda.cu CartLatticeAccess.hpp +SOURCE_PLAN+=Global.cpp Lattice.cpp vtkLattice.cpp vtkOutput.cpp cross.cu cuda.cu CartLatticeAccess.hpp SOURCE_PLAN+=Dynamics.c Dynamics_sp.c Solver.cpp pugixml.cpp Geometry.cpp def.cpp unit.cpp SOURCE_PLAN+=ZoneSettings.cpp SyntheticTurbulence.cpp Sampler.cpp SOURCE_PLAN+=main.cpp -SOURCE_PLAN+=Global.h gpu_anim.h CartLatticeContainer.h LatticeData.hpp CartLatticeExecutor.h CartLatticeExecutor.hpp Lattice.h Region.h vtkLattice.h vtkOutput.h cross.h cross.hpp +SOURCE_PLAN+=Global.h gpu_anim.h CartLatticeContainer.h LatticeData.hpp CartLatticeLauncher.h CartLatticeLauncher.hpp Lattice.h Region.h vtkLattice.h vtkOutput.h cross.h cross.hpp SOURCE_PLAN+=gl_helper.h Dynamics.h types.h Consts.h Solver.h pugixml.hpp pugiconfig.hpp Node.hpp SOURCE_PLAN+=Geometry.h def.h utils.h unit.h ZoneSettings.h SyntheticTurbulence.h Sampler.h spline.h TCLBForceGroupCommon.h SOURCE_PLAN+=RemoteForceInterface.cpp RemoteForceInterface.h RemoteForceInterface.hpp SOURCE_PLAN+=TCLBForceGroupCommon.h MPMD.hpp empty.cpp Particle.hpp lammps.cpp -SOURCE_PLAN+=SolidTree.h SolidTree.hpp SolidTree.cpp SolidAll.h SolidGrid.h SolidGrid.hpp +SOURCE_PLAN+=SolidTree.h SolidTree.hpp SolidAll.h SolidGrid.h SolidGrid.hpp SOURCE_PLAN+=SolidContainer.h SOURCE_PLAN+=xpath_modification.cpp xpath_modification.h SOURCE_PLAN+=hdf5Lattice.cpp hdf5Lattice.h diff --git a/src/solver.vcproj b/src/solver.vcproj index 183fa71fd..b59555b38 100644 --- a/src/solver.vcproj +++ b/src/solver.vcproj @@ -359,7 +359,7 @@ > set_found_t; real_t lower[3] = {pos[0]-offset,pos[1]-offset,pos[2]-offset}; real_t upper[3] = {pos[0]+offset,pos[1]+offset,pos[2]+offset}; - for (auto part : set_found_t(finder, pos, lower, upper)) { + for (auto part : set_found_t(finder, nullptr, pos, lower, upper)) { if (part.dist <= part.rad+offset) ret.push_back(part.balli); } std::sort(ret.begin(),ret.end());