diff --git a/src/CudaHelp.hh b/src/CudaHelp.hh index de6b389..9326593 100644 --- a/src/CudaHelp.hh +++ b/src/CudaHelp.hh @@ -6,11 +6,14 @@ #define MIN_CTAS_PER_SM 4 #define MAX_REDUCTION_CTAS 1024 -#ifdef __CUDACC__ +#ifdef USE_CUDA #include #include "legion.h" +#ifndef __CUDA_HD__ #define __CUDA_HD__ __host__ __device__ +#endif +#ifdef __CUDACC__ template __device__ __forceinline__ void reduce_double(Legion::DeferredReduction result, double value) @@ -39,6 +42,7 @@ void reduce_double(Legion::DeferredReduction result, double value) __threadfence_system(); } } +#endif #else #define __CUDA_HD__ diff --git a/src/Hydro.cc b/src/Hydro.cc index 5f9d40e..383cad5 100644 --- a/src/Hydro.cc +++ b/src/Hydro.cc @@ -217,8 +217,8 @@ Hydro::Hydro( tts = new TTS(inp, this); qcs = new QCS(inp, this); - const double2 vfixx = double2(1., 0.); - const double2 vfixy = double2(0., 1.); + const double2 vfixx = make_double2(1., 0.); + const double2 vfixy = make_double2(0., 1.); if (mesh->parallel) { for (int i = 0; i < bcx.size(); ++i) bcs.push_back(new HydroBC(mesh, vfixx, bcx[i], true/*xplane*/)); @@ -303,7 +303,7 @@ void Hydro::init() { if (uinitradial != 0.) initRadialVel(uinitradial, pfirst, plast); else - fill(&pu[pfirst], &pu[plast], double2(0., 0.)); + fill(&pu[pfirst], &pu[plast], make_double2(0., 0.)); } // for pch LogicalRegion& lrp = mesh->lrp; @@ -411,7 +411,7 @@ void Hydro::initParallel() { } else { - const double2 zero2(0., 0.); + const double2 zero2 = make_double2(0., 0.); FillLauncher launcher(lrp, lrp, TaskArgument(&zero2,sizeof(zero2))); launcher.add_field(FID_PU); runtime->fill_fields(ctx, launcher); @@ -432,7 +432,7 @@ void Hydro::initRadialVel( if (pmag > eps) pu[p] = vel * px[p] / pmag; else - pu[p] = double2(0., 0.); + pu[p] = make_double2(0., 0.); } } @@ -473,7 +473,7 @@ Future Hydro::doCycle( launchffd.argument = TaskArgument(ffdargs, sizeof(ffdargs)); launchffd.predicate = p_not_done; - double2 ffd2args[] = { double2(0., 0.) 
}; + double2 ffd2args[] = { make_double2(0., 0.) }; IndexFillLauncher launchffd2; launchffd2.launch_space = ispc; launchffd2.projection = 0; @@ -1908,7 +1908,7 @@ void Hydro::initRadialVelTask( if (pmag > args->eps) acc_pu[*itr] = args->vel * px / pmag; else - acc_pu[*itr] = double2(0., 0.); + acc_pu[*itr] = make_double2(0., 0.); } } diff --git a/src/Mesh.cc b/src/Mesh.cc index 2cddea3..cc7e84e 100644 --- a/src/Mesh.cc +++ b/src/Mesh.cc @@ -157,7 +157,7 @@ const int SumOp::identity = 0; template <> const double SumOp::identity = 0.; template <> -const double2 SumOp::identity = double2(0., 0.); +const double2 SumOp::identity = make_double2(0., 0.); template <> const double MinOp::identity = DBL_MAX; template <> @@ -1268,7 +1268,7 @@ void Mesh::calcCtrsTask( const IndexSpace& isz = task->regions[1].region.get_index_space(); for (PointIterator itr(runtime, isz); itr(); itr++) - acc_zx[*itr] = double2(0., 0.); + acc_zx[*itr] = make_double2(0., 0.); const IndexSpace& iss = task->regions[0].region.get_index_space(); for (PointIterator itr(runtime, iss); itr(); itr++) @@ -1314,7 +1314,7 @@ void Mesh::calcCtrsOMPTask( const Rect<1> rectz = runtime->get_index_space_domain(isz); #pragma omp parallel for for (coord_t z = rectz.lo[0]; z <= rectz.hi[0]; z++) - acc_zx[z] = double2(0., 0.); + acc_zx[z] = make_double2(0., 0.); const IndexSpace& iss = task->regions[0].region.get_index_space(); // This will assert if it is not dense @@ -1738,7 +1738,7 @@ void Mesh::calcCtrs( int zfirst = mapsz[sfirst]; int zlast = (slast < nums ? 
mapsz[slast] : numz); - fill(&zx[zfirst], &zx[zlast], double2(0., 0.)); + fill(&zx[zfirst], &zx[zlast], make_double2(0., 0.)); for (int s = sfirst; s < slast; ++s) { int p1 = mapsp1[s]; diff --git a/src/QCS.cc b/src/QCS.cc index b382ca4..b17e5e8 100644 --- a/src/QCS.cc +++ b/src/QCS.cc @@ -146,7 +146,7 @@ void QCS::setCornerDivTask( // [1] Compute a zone-centered velocity const IndexSpace& isz = task->regions[1].region.get_index_space(); for (PointIterator itz(runtime, isz); itz(); itz++) - acc_zuc[*itz] = double2(0., 0.); + acc_zuc[*itz] = make_double2(0., 0.); const IndexSpace& iss = task->regions[0].region.get_index_space(); for (PointIterator its(runtime, iss); its(); its++) @@ -488,7 +488,7 @@ void QCS::setCornerDivOMPTask( const Rect<1> rectz = runtime->get_index_space_domain(isz); #pragma omp parallel for for (coord_t z = rectz.lo[0]; z <= rectz.hi[0]; z++) - acc_zuc[z] = double2(0., 0.); + acc_zuc[z] = make_double2(0., 0.); const IndexSpace& iss = task->regions[0].region.get_index_space(); // This will assert if it is not dense diff --git a/src/Vec2.hh b/src/Vec2.hh index 523fd7e..7f96ca4 100644 --- a/src/Vec2.hh +++ b/src/Vec2.hh @@ -20,7 +20,7 @@ // This struct is defined with all functions inline, // to give the compiler maximum opportunity to optimize. 
-#ifndef __CUDACC__ +#ifndef USE_CUDA struct double2 { typedef double value_type; @@ -37,38 +37,10 @@ struct double2 return(*this); } - inline double2& operator+=(const double2& v2) - { - x += v2.x; - y += v2.y; - return(*this); - } - - inline double2& operator-=(const double2& v2) - { - x -= v2.x; - y -= v2.y; - return(*this); - } - - inline double2& operator*=(const double& r) - { - x *= r; - y *= r; - return(*this); - } - - inline double2& operator/=(const double& r) - { - x /= r; - y /= r; - return(*this); - } - }; // double2 -#endif // __CUDACC__ +#endif // USE_CUDA -#ifndef __CUDACC__ +#ifndef USE_CUDA // Already has a declaration in cuda inline double2 make_double2(double x_, double y_) { return(double2(x_, y_)); @@ -119,6 +91,14 @@ inline double2 operator+(const double2& v1, const double2& v2) return make_double2(v1.x + v2.x, v1.y + v2.y); } +__CUDA_HD__ +inline double2& operator+=(double2& v1, const double2& v2) +{ + v1.x += v2.x; + v1.y += v2.y; + return v1; +} + // subtract __CUDA_HD__ inline double2 operator-(const double2& v1, const double2& v2) @@ -126,6 +106,14 @@ inline double2 operator-(const double2& v1, const double2& v2) return make_double2(v1.x - v2.x, v1.y - v2.y); } +__CUDA_HD__ +inline double2& operator-=(double2& v1, const double2& v2) +{ + v1.x -= v2.x; + v1.y -= v2.y; + return v1; +} + // multiply vector by scalar __CUDA_HD__ inline double2 operator*(const double2& v, const double& r) @@ -133,6 +121,14 @@ inline double2 operator*(const double2& v, const double& r) return make_double2(v.x * r, v.y * r); } +__CUDA_HD__ +inline double2& operator*=(double2& v, const double& r) +{ + v.x *= r; + v.y *= r; + return v; +} + // multiply scalar by vector __CUDA_HD__ inline double2 operator*(const double& r, const double2& v) @@ -148,6 +144,14 @@ inline double2 operator/(const double2& v, const double& r) return make_double2(v.x * rinv, v.y * rinv); } +__CUDA_HD__ +inline double2& operator/=(double2& v, const double& r) +{ + double rinv = (double)
1. / r; + v.x *= rinv; + v.y *= rinv; + return v; +} // other vector operations: