From 08f1f8e3e1b00e0876de98015d4746af431ed4ae Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Wed, 8 Feb 2017 14:30:49 -0700 Subject: [PATCH 1/3] Sub domains (#13) * Use pointers to grids to improve code-gen in some stencils. Grids as member vars instead of pointers prevented the compiler from optimizing addr calculations, causing >3x code bloat in AWP stencil and breaking vectorization in FSG. Also: Fixed uninitialized vars in grids. Fixed some g++ compilation warnings. Ensured cache-line-boundary alignment in allocation. * Bugfix typo (prevented compilation). (#10) (#12) --- Makefile | 2 +- gen-layouts.pl | 4 +- src/foldBuilder/Cpp.hpp | 8 +- src/foldBuilder/Print.cpp | 84 +++++++------- src/foldBuilder/main.cpp | 6 +- src/realv_grids.hpp | 237 ++++++++++++++++++++++---------------- src/stencil.hpp | 6 +- src/stencil_calc.cpp | 61 +++++++--- src/stencil_calc.hpp | 15 ++- src/stencil_main.cpp | 6 +- src/utils.cpp | 8 +- src/utils.hpp | 11 +- 12 files changed, 261 insertions(+), 187 deletions(-) diff --git a/Makefile b/Makefile index cca7f616..2e3c79ae 100644 --- a/Makefile +++ b/Makefile @@ -336,7 +336,7 @@ endif # gen-loops.pl args: # Rank loops break up the whole rank into smaller regions. 
-# In order for tempral wavefronts to operate properly, the +# In order for temporal wavefronts to operate properly, the # order of spatial dimensions may be changed, but traversal # paths that do not have strictly incrementing indices (e.g., # grouped, serpentine, square-wave) may not be used here when diff --git a/gen-layouts.pl b/gen-layouts.pl index f8fd84bd..84ca7f77 100755 --- a/gen-layouts.pl +++ b/gen-layouts.pl @@ -176,10 +176,10 @@ ($$$$) " Layout_$name() { }\n\n", " Layout_$name($cargs) : ${basename}($cvars) { }\n\n", " // Return 1-D offset from $n-D 'j' indices.\n", - " virtual idx_t layout($margs) const\n", + " virtual idx_t layout($margs) const final\n", " { return ", makeLayout(\@p, \@jvars, \@dvars), "; }\n\n", " // set $n 'j' indices based on 1-D 'ai' input.\n", - " virtual void unlayout(idx_t ai, $uargs) const\n", + " virtual void unlayout(idx_t ai, $uargs) const final\n", " { ", makeUnlayout(\@p, \@jvars, \@dvars, "; "), "; }\n", "};\n"; diff --git a/src/foldBuilder/Cpp.hpp b/src/foldBuilder/Cpp.hpp index 2ab28091..e5f67bd6 100644 --- a/src/foldBuilder/Cpp.hpp +++ b/src/foldBuilder/Cpp.hpp @@ -64,7 +64,7 @@ class CppPrintHelper : public PrintHelper { // Return a parameter reference. virtual string readFromParam(ostream& os, const GridPoint& pp) { - string str = "context." + pp.getName() + "(" + pp.makeValStr() + ")"; + string str = "(*context." + pp.getName() + ")(" + pp.makeValStr() + ")"; return str; } @@ -73,7 +73,7 @@ class CppPrintHelper : public PrintHelper { virtual string makePointCall(const GridPoint& gp, const string& fname, string optArg = "") const { ostringstream oss; - oss << "context." << gp.getName() << "." << fname << "("; + oss << "context." << gp.getName() << "->" << fname << "("; if (optArg.length()) oss << optArg << ", "; oss << gp.makeDimValOffsetStr() << ", __LINE__)"; return oss.str(); @@ -120,7 +120,7 @@ class CppVecPrintHelper : public VecPrintHelper { // Return a parameter reference. 
virtual string readFromParam(ostream& os, const GridPoint& pp) { - string str = "context." + pp.getName() + "(" + pp.makeValStr() + ")"; + string str = "(*context." + pp.getName() + ")(" + pp.makeValStr() + ")"; return str; } @@ -140,7 +140,7 @@ class CppVecPrintHelper : public VecPrintHelper { const string& firstArg, const string& lastArg, bool isNorm) const { - os << " context." << gp.getName() << "." << funcName << "("; + os << " context." << gp.getName() << "->" << funcName << "("; if (firstArg.length()) os << firstArg << ", "; if (isNorm) diff --git a/src/foldBuilder/Print.cpp b/src/foldBuilder/Print.cpp index 343f8d4a..b7bebb00 100644 --- a/src/foldBuilder/Print.cpp +++ b/src/foldBuilder/Print.cpp @@ -656,6 +656,7 @@ void YASKCppPrinter::printCode(ostream& os) { // Type name. // Name in kernel is 'Grid_' followed by dimensions. string typeName = "Grid_"; + string templStr; for (auto* dim : gp->getDims()) { // Add dim suffix. @@ -667,20 +668,27 @@ void YASKCppPrinter::printCode(ostream& os) { string sdvar = grid + "_alloc_" + *dim; int sdval = _settings._stepAlloc > 0 ? _settings._stepAlloc : gp->getStepDimSize(); - os << " const idx_t " << sdvar << " = " << sdval << + os << " static const idx_t " << sdvar << " = " << sdval << "; // total allocation required in '" << *dim << "' dimension.\n"; - ctorCode += " " + grid + ".set_tdim(" + sdvar + ");\n"; + templStr = "<" + sdvar + ">"; } } + typeName += templStr; // Actual grid declaration. - os << " " << typeName << " " << grid << ";\n"; - - // Ctor list. - if (ctorList.length()) ctorList += ", "; - ctorList += grid + "(\"" + grid + "\")\n"; + os << " " << typeName << "* " << grid << ";\n"; + + // Grid init. 
+ ctorCode += "\n // Init grid '" + grid + "'.\n" + + " " + grid + " = new " + typeName + "(\"" + grid + "\");\n" + + " gridPtrs.push_back(" + grid + ");\n" + + " gridNames.insert(\"" + grid + "\");\n"; + if (_eqGroups.getOutputGrids().count(gp)) { + ctorCode += " outputGridPtrs.push_back(" + grid + ");\n" + + " outputGridNames.insert(\"" + grid + "\");\n"; + } - // Init code. + // Halo-setting code. for (auto* dim : gp->getDims()) { // non-step dimension. @@ -692,7 +700,7 @@ void YASKCppPrinter::printCode(ostream& os) { _settings._haloSize : gp->getHaloSize(*dim); os << " const idx_t " << hvar << " = " << hval << "; // halo allocation required in '" << *dim << "' dimension.\n"; - ctorCode += " " + grid + ".set_halo_" + *dim + + ctorCode += " " + grid + "->set_halo_" + *dim + "(" + hvar + ");\n"; // Update max halo across grids. @@ -720,56 +728,42 @@ void YASKCppPrinter::printCode(ostream& os) { os << "\n // The " << pp->getNumDims() << "D '" << param << "' parameter.\n"; - // Actual declaration. // Type-name in kernel is 'GenericGridNd'. - os << " GenericGrid" << pp->size() << "dsize() << "dsize()) { - os << ", Layout_"; + oss << ", Layout_"; // Traditional C layout, e.g., 321. for (int dn = pp->size(); dn > 0; dn--) - os << dn; + oss << dn; } - os << ">" << param << ";\n"; + oss << ">"; + string ptype = oss.str(); - // Ctor list. + // Actual declaration. + os << " " << ptype << "* " << param << ";\n"; + + // Param init. string dimArg = pp->makeValStr(); - if (ctorList.length()) ctorList += ", "; - ctorList += param + "(" + dimArg + ")\n"; + ctorCode += "\n // Init parameter '" + param + "'.\n" + + " " + param + " = new " + ptype + "(" + dimArg + ");\n" + + " paramPtrs.push_back(" + param + ");\n" + + " paramNames.insert(\"" + param + "\");\n"; } // Ctor. 
{ - os << endl << - " // Constructor.\n" << + os << "\n // Constructor.\n" << " " << _context_base << "(StencilSettings& settings) :" - " StencilContext(settings), " << ctorList << + " StencilContext(settings)" << ctorList << " {\n name = \"" << _stencil.getName() << "\";\n"; - - // Init grid ptrs. - for (auto gp : _grids) { - string grid = gp->getName(); - os << "\n // '" << grid << "' grid.\n" << - " gridPtrs.push_back(&" << grid << ");\n" << - " gridNames.insert(\"" << grid << "\");\n"; - - // I/O grids. - if (_eqGroups.getOutputGrids().count(gp)) { - os << " outputGridPtrs.push_back(&" << grid << ");" << endl; - os << " outputGridNames.insert(\"" << grid << "\");" << endl; - } - } - - // Init param ptrs. - for (auto pp : _params) { - string param = pp->getName(); - os << "\n // '" << param << "' parameter.\n" << - " paramPtrs.push_back(&" << param << ");\n" << - " paramNames.insert(\"" << param << "\");\n"; - } + os << "\n // Create grids and parameters.\n" << + ctorCode; + // Init halo sizes. - os << "\n // Halo sizes.\n" << ctorCode; + os << "\n // Rounded halo sizes.\n"; for (auto dim : maxHalos.getDims()) os << " h" << *dim << " = ROUND_UP(max_halo_" << *dim << ", VLEN_" << allCaps(*dim) << ");" << endl; @@ -812,13 +806,13 @@ void YASKCppPrinter::printCode(ostream& os) { if (eq.getOutputGrids().size()) { os << "\n // The following grids are written by " << egsName << endl; for (auto gp : eq.getOutputGrids()) - os << " outputGridPtrs.push_back(&context." << gp->getName() << ");" << endl; + os << " outputGridPtrs.push_back(context." << gp->getName() << ");" << endl; } if (eq.getInputGrids().size()) { os << "\n // The following grids are read by " << egsName << endl; for (auto gp : eq.getInputGrids()) if (!gp->isParam()) - os << " inputGridPtrs.push_back(&context." << gp->getName() << ");" << endl; + os << " inputGridPtrs.push_back(context." << gp->getName() << ");" << endl; } os << " } // Ctor." 
<< endl; } diff --git a/src/foldBuilder/main.cpp b/src/foldBuilder/main.cpp index 4bd57f5c..f96dad7c 100644 --- a/src/foldBuilder/main.cpp +++ b/src/foldBuilder/main.cpp @@ -404,18 +404,18 @@ int main(int argc, const char* argv[]) { stencilFunc->define(dims._allDims); // Check for illegal dependencies within equations for scalar size. - cout << "Checking equation(s) in a scalar context...\n" + cout << "Checking equation(s) with scalar operations...\n" " If this fails, review stencil equation(s) for illegal dependencies.\n"; grids.checkDeps(dims._scalar, dims._stepDim); // Check for illegal dependencies within equations for vector size. - cout << "Checking equation(s) in a single folded-vector context...\n" + cout << "Checking equation(s) with folded-vector operations...\n" " If this fails, the fold dimensions are not compatible with all equations.\n"; grids.checkDeps(dims._fold, dims._stepDim); // Check for illegal dependencies within equations for cluster sizes and // also create equation groups based on legal dependencies. - cout << "Checking equation(s) in a cluster-of-vectors context...\n" + cout << "Checking equation(s) with clusters of vectors...\n" " If this fails, the cluster dimensions are not compatible with all equations.\n"; EqGroups eqGroups(eq_group_basename_default, dims); eqGroups.findEqGroups(grids, eqGroupTargets, dims._clusterPts); diff --git a/src/realv_grids.hpp b/src/realv_grids.hpp index 5767f907..c2256b2d 100644 --- a/src/realv_grids.hpp +++ b/src/realv_grids.hpp @@ -50,20 +50,17 @@ namespace yask { std::string _name; RealVecGrid* _gp; - // time allocation. - idx_t _tdim=1; - // real_t sizes for up to 4 spatial dims. - idx_t _dn=1, _dx=1, _dy=1, _dz=1; // domain sizes. + idx_t _dn=VLEN_N, _dx=VLEN_X, _dy=VLEN_Y, _dz=VLEN_Z; // domain sizes. idx_t _hn=0, _hx=0, _hy=0, _hz=0; // halo sizes. idx_t _pn=0, _px=0, _py=0, _pz=0; // halo + extra-pad sizes. idx_t _on=0, _ox=0, _oy=0, _oz=0; // offsets into global problem domain. 
// real_vec_t sizes for up to 4 spatial dims. // halo vector-sizes are not given here, because they are not rounded up. - idx_t _dnv, _dxv, _dyv, _dzv; - idx_t _pnv, _pxv, _pyv, _pzv; - idx_t _onv, _oxv, _oyv, _ozv; + idx_t _dnv=1, _dxv=1, _dyv=1, _dzv=1; + idx_t _pnv=0, _pxv=0, _pyv=0, _pzv=0; + idx_t _onv=0, _oxv=0, _oyv=0, _ozv=0; // Normalize element indices to vector indices and element offsets. ALWAYS_INLINE @@ -85,6 +82,30 @@ namespace yask { elem_ofs = padded_index % vec_len; } + // Adjust logical spatial vector index to 0-based internal index by + // adding padding and removing offset. TODO: currently, the + // compiler isn't able to eliminate some common sub-expressions in + // addr calculation when these functions are used. Until this is + // resolved, alternative code is used in derived classes if the + // macro USE_GET_INDEX is not set. + ALWAYS_INLINE idx_t get_index(idx_t vec_index, + idx_t vec_pad, + idx_t vec_ofs) const { + return vec_index + vec_pad - vec_ofs; + } + ALWAYS_INLINE idx_t get_index_n(idx_t vec_index) const { + return get_index(vec_index, _pnv, _onv); + } + ALWAYS_INLINE idx_t get_index_x(idx_t vec_index) const { + return get_index(vec_index, _pxv, _oxv); + } + ALWAYS_INLINE idx_t get_index_y(idx_t vec_index) const { + return get_index(vec_index, _pyv, _oyv); + } + ALWAYS_INLINE idx_t get_index_z(idx_t vec_index) const { + return get_index(vec_index, _pzv, _ozv); + } + // Resize the underlying grid based on the current settings. virtual void resize_g() =0; @@ -112,7 +133,7 @@ namespace yask { } // Get temporal allocation. - virtual inline idx_t get_tdim() const { return _tdim; } + virtual inline idx_t get_tdim() const { return 1; } // Get domain-size for this rank after round-up. inline idx_t get_dn() const { return _dn; } @@ -147,7 +168,6 @@ namespace yask { inline idx_t get_last_z() const { return _oz + _dz - 1; } // Set domain-size for this rank and round-up. 
- inline void set_tdim(idx_t tdim) { _tdim = tdim; resize_g(); } inline void set_dn(idx_t dn) { _dn = ROUND_UP(dn, VLEN_N); _dnv = _dn / VLEN_N; resize_g(); } inline void set_dx(idx_t dx) { @@ -291,55 +311,7 @@ namespace yask { return _gp; } - protected: - - // Adjust logical time index to 0-based index - // using temporal allocation size. - ALWAYS_INLINE - idx_t get_index_t(idx_t t) const { - - // Index wraps in tdim. - // Examples based on tdim == 2: - // t_idx => return value. - // -2 => 0. - // -1 => 1. - // 0 => 0. - // 1 => 1. - // 2 => 0. - - // Avoid discontinuity caused by negative time by adding a large - // offset to the t index. So, t can be negative, but not so - // much that it would still be negaive after adding the offset. - // This should not be a practical restriction. - t += 0x100 * _tdim; - assert(t >= 0); - assert(t % CPTS_T == 0); - idx_t t_idx = t / idx_t(CPTS_T); - return t_idx % _tdim; - } - - // Adjust logical spatial vector index to 0-based internal index by - // adding padding and removing offset. - ALWAYS_INLINE idx_t get_index(idx_t vec_index, - idx_t vec_pad, - idx_t vec_ofs) const { - return vec_index + vec_pad - vec_ofs; - } - ALWAYS_INLINE idx_t get_index_n(idx_t vec_index) const { - return get_index(vec_index, _pnv, _onv); - } - ALWAYS_INLINE idx_t get_index_x(idx_t vec_index) const { - return get_index(vec_index, _pxv, _oxv); - } - ALWAYS_INLINE idx_t get_index_y(idx_t vec_index) const { - return get_index(vec_index, _pyv, _oyv); - } - ALWAYS_INLINE idx_t get_index_z(idx_t vec_index) const { - return get_index(vec_index, _pzv, _ozv); - } - }; - // A 3D (x, y, z) collection of real_vec_t elements. // Supports symmetric padding in each dimension. @@ -379,9 +351,15 @@ namespace yask { #endif // adjust for padding and offset. 
+#if USE_GET_INDEX xv = get_index_x(xv); yv = get_index_y(yv); zv = get_index_z(zv); +#else + xv += _pxv - _oxv; + yv += _pyv - _oyv; + zv += _pzv - _ozv; +#endif #ifdef TRACE_MEM if (checkBounds) @@ -554,9 +532,8 @@ namespace yask { void printElem(std::ostream& os, const std::string& m, idx_t x, idx_t y, idx_t z, real_t e, int line) const { - printElem_TNXYZ(0, 0, xv, yv, zv, e, line); + printElem_TNXYZ(0, 0, x, y, z, e, line); } - }; // A 4D (n, x, y, z) collection of real_vec_t elements. @@ -599,10 +576,17 @@ namespace yask { #endif // adjust for padding and offset. +#if USE_GET_INDEX nv = get_index_n(nv); xv = get_index_x(xv); yv = get_index_y(yv); zv = get_index_z(zv); +#else + nv += _pnv - _onv; + xv += _pxv - _oxv; + yv += _pyv - _oyv; + zv += _pzv - _ozv; +#endif #ifdef TRACE_MEM if (checkBounds) @@ -770,14 +754,58 @@ namespace yask { void printElem(std::ostream& os, const std::string& m, idx_t n, idx_t x, idx_t y, idx_t z, real_t e, int line) const { - printElem_TNXYZ(0, n, xv, yv, zv, e, line); + printElem_TNXYZ(0, n, x, y, z, e, line); + } + }; + + // Base class that adds a templated temporal size for + // index-calculation efficiency. + template + class RealVecGridTemplate : public RealVecGridBase { + + protected: + + // Adjust logical time index to 0-based index + // using temporal allocation size. + ALWAYS_INLINE + idx_t get_index_t(idx_t t) const { + + // Index wraps in tdim. + // Examples based on tdim == 2: + // t_idx => return value. + // -2 => 0. + // -1 => 1. + // 0 => 0. + // 1 => 1. + // 2 => 0. + + // Avoid discontinuity caused by negative time by adding a large + // offset to the t index. So, t can be negative, but not so + // much that it would still be negative after adding the offset. + // This should not be a practical restriction.
+ t += 256 * _tdim; + assert(t >= 0); + assert(t % CPTS_T == 0); + idx_t t_idx = t / idx_t(CPTS_T); + return t_idx % _tdim; + } + + public: + + RealVecGridTemplate(const std::string& name, + RealVecGrid* gp) : + RealVecGridBase(name, gp) { } + + // Get temporal allocation. + virtual inline idx_t get_tdim() const final { + return _tdim; } }; // A 4D (t, x, y, z) collection of real_vec_t elements. // Supports symmetric padding in each dimension. - template class RealVecGrid_TXYZ : - public RealVecGridBase { + template class RealVecGrid_TXYZ : + public RealVecGridTemplate<_tdim> { protected: @@ -785,19 +813,19 @@ namespace yask { virtual void resize_g() { _data.set_d1(_tdim); - _data.set_d2(_dxv + - 2 * _pxv); - _data.set_d3(_dyv + - 2 * _pyv); - _data.set_d4(_dzv + - 2 * _pzv); + _data.set_d2(this->_dxv + + 2 * this->_pxv); + _data.set_d3(this->_dyv + + 2 * this->_pyv); + _data.set_d4(this->_dzv + + 2 * this->_pzv); } public: // Ctor. RealVecGrid_TXYZ(const std::string& name) : - RealVecGridBase(name, &_data) { } + RealVecGridTemplate<_tdim>(name, &_data) { } // Determine what dims are defined. virtual bool got_t() const { return true; } @@ -817,10 +845,16 @@ namespace yask { #endif // adjust for padding and offset. - t = get_index_t(t); - xv = get_index_x(xv); - yv = get_index_y(yv); - zv = get_index_z(zv); + t = this->get_index_t(t); +#if USE_GET_INDEX + xv = this->get_index_x(xv); + yv = this->get_index_y(yv); + zv = this->get_index_z(zv); +#else + xv += this->_pxv - this->_oxv; + yv += this->_pyv - this->_oyv; + zv += this->_pzv - this->_ozv; +#endif #ifdef TRACE_MEM if (checkBounds) @@ -846,9 +880,9 @@ namespace yask { const real_t* getElemPtr(idx_t t, idx_t x, idx_t y, idx_t z, bool checkBounds=true) const { idx_t xv, ie, yv, je, zv, ke; - normalize_x(x, xv, ie); - normalize_y(y, yv, je); - normalize_z(z, zv, ke); + this->normalize_x(x, xv, ie); + this->normalize_y(y, yv, je); + this->normalize_z(z, zv, ke); // Get vector. 
const real_vec_t* vp = getVecPtrNorm(t, xv, yv, zv, checkBounds); @@ -993,14 +1027,14 @@ namespace yask { idx_t t, idx_t x, idx_t y, idx_t z, real_t e, int line) const { - printElem_TNXYZ(t, 0, xv, yv, zv, e, line); + printElem_TNXYZ(t, 0, x, y, z, e, line); } }; // A 5D (t, n, x, y, z) collection of real_vec_t elements. // Supports symmetric padding in each dimension. - template class RealVecGrid_TNXYZ : - public RealVecGridBase { + template class RealVecGrid_TNXYZ : + public RealVecGridTemplate<_tdim> { protected: @@ -1008,21 +1042,21 @@ namespace yask { virtual void resize_g() { _data.set_d1(_tdim); - _data.set_d2(_dnv + - 2 * _pnv); - _data.set_d3(_dxv + - 2 * _pxv); - _data.set_d4(_dyv + - 2 * _pyv); - _data.set_d5(_dzv + - 2 * _pzv); + _data.set_d2(this->_dnv + + 2 * this->_pnv); + _data.set_d3(this->_dxv + + 2 * this->_pxv); + _data.set_d4(this->_dyv + + 2 * this->_pyv); + _data.set_d5(this->_dzv + + 2 * this->_pzv); } public: // Ctor. RealVecGrid_TNXYZ(const std::string& name) : - RealVecGridBase(name, &_data) { } + RealVecGridTemplate<_tdim>(name, &_data) { } // Determine what dims are defined. virtual bool got_t() const { return true; } @@ -1043,11 +1077,18 @@ namespace yask { #endif // adjust for padding and offset.
- t = get_index_t(t); - nv = get_index_n(nv); - xv = get_index_x(xv); - yv = get_index_y(yv); - zv = get_index_z(zv); + t = this->get_index_t(t); +#if USE_GET_INDEX + nv = this->get_index_n(nv); + xv = this->get_index_x(xv); + yv = this->get_index_y(yv); + zv = this->get_index_z(zv); +#else + nv += this->_pnv - this->_onv; + xv += this->_pxv - this->_oxv; + yv += this->_pyv - this->_oyv; + zv += this->_pzv - this->_ozv; +#endif #ifdef TRACE_MEM if (checkBounds) @@ -1073,10 +1114,10 @@ namespace yask { const real_t* getElemPtr(idx_t t, idx_t n, idx_t x, idx_t y, idx_t z, bool checkBounds=true) const { idx_t nv, ne, xv, ie, yv, je, zv, ke; - normalize_n(n, nv, ne); - normalize_x(x, xv, ie); - normalize_y(y, yv, je); - normalize_z(z, zv, ke); + this->normalize_n(n, nv, ne); + this->normalize_x(x, xv, ie); + this->normalize_y(y, yv, je); + this->normalize_z(z, zv, ke); // Get vector. const real_vec_t* vp = getVecPtrNorm(t, nv, xv, yv, zv, checkBounds); @@ -1217,7 +1258,7 @@ namespace yask { idx_t t, idx_t n, idx_t x, idx_t y, idx_t z, real_t e, int line) const { - printElem_TNXYZ(t, n, xv, yv, zv, e, line); + printElem_TNXYZ(t, n, x, y, z, e, line); } }; diff --git a/src/stencil.hpp b/src/stencil.hpp index 77e5a86b..d151dacd 100644 --- a/src/stencil.hpp +++ b/src/stencil.hpp @@ -124,9 +124,11 @@ namespace yask { // RealVecGrids using layouts defined above. using Grid_XYZ = RealVecGrid_XYZ; - using Grid_TXYZ = RealVecGrid_TXYZ; using Grid_NXYZ = RealVecGrid_NXYZ; - using Grid_TNXYZ = RealVecGrid_TNXYZ; + template + using Grid_TXYZ = RealVecGrid_TXYZ; + template + using Grid_TNXYZ = RealVecGrid_TNXYZ; // RealGrids using traditional C layout. typedef GenericGrid3d RealGrid_XYZ; diff --git a/src/stencil_calc.cpp b/src/stencil_calc.cpp index 5940f4f4..4aad1aae 100644 --- a/src/stencil_calc.cpp +++ b/src/stencil_calc.cpp @@ -70,6 +70,14 @@ namespace yask { global_barrier(); } + // Copy env settings from another context. 
+ void StencilContext::copyEnv(const StencilContext& src) { + comm = src.comm; + my_rank = src.my_rank; + num_ranks = src.num_ranks; + _ostr = src._ostr; + } + // Set ostr to given stream if provided. // If not provided, set to cout if my_rank == msg_rank // or a null stream otherwise. @@ -527,7 +535,7 @@ namespace yask { auto& gname = gp->get_name(); // Size of buffer in each direction: if dist to neighbor is zero - // (i.e., is perpendicular to this rank), use full size; + // (i.e., is perpendicular to this rank), use domain size; // otherwise, use halo size. idx_t bsn = ROUND_UP((rdn == 0) ? _opts->dn : gp->get_halo_n(), VLEN_N); idx_t bsx = ROUND_UP((rdx == 0) ? _opts->dx : gp->get_halo_x(), VLEN_X); @@ -535,7 +543,7 @@ namespace yask { idx_t bsz = ROUND_UP((rdz == 0) ? _opts->dz : gp->get_halo_z(), VLEN_Z); if (bsn * bsx * bsy * bsz == 0) { - os << " No halo exchange for grid '" << gname << + os << " No halo exchange needed for grid '" << gname << "' with rank " << rn << '.' << endl; } else { @@ -564,11 +572,13 @@ namespace yask { } // Allocate memory for grids, params, and MPI bufs. - // If 'do_distrib' is true, distribute already-allocated memory. // TODO: allow different types of memory for different grids, MPI bufs, etc. - void StencilContext::allocData(bool do_distrib) { + void StencilContext::allocData() { ostream& os = get_ostr(); + // if '_data_buf' is null, allocate memory and call recursively to distribute. + // If '_data_buf' is not null, distribute already-allocated memory. + // Determine how many bytes are needed. size_t nbytes = 0, gbytes = 0, pbytes = 0, bbytes = 0; @@ -590,26 +600,32 @@ namespace yask { gp->set_ofs_z(ofs_z); // set storage if requested. - if (do_distrib) { + if (_data_buf) { gp->set_storage(_data_buf, nbytes); gp->print_info(os); } // determine size used (also offset to next location). 
gbytes += gp->get_num_bytes(); - nbytes += gp->get_num_bytes() + _data_buf_pad; + nbytes += ROUND_UP(gp->get_num_bytes() + _data_buf_pad, + CACHELINE_BYTES); + TRACE_MSG("grid '" << gp->get_name() << "' needs " << + gp->get_num_bytes() << " bytes"); } // Params. for (auto pp : paramPtrs) { // set storage if requested. - if (do_distrib) + if (_data_buf) pp->set_storage(_data_buf, nbytes); // determine size used (also offset to next location). pbytes += pp->get_num_bytes(); - nbytes += pp->get_num_bytes() + _data_buf_pad; + nbytes += ROUND_UP(pp->get_num_bytes() + _data_buf_pad, + CACHELINE_BYTES); + TRACE_MSG("param needs " << + pp->get_num_bytes() << " bytes"); } // MPI buffers. @@ -622,16 +638,22 @@ namespace yask { Grid_NXYZ* rcvBuf) { if (sendBuf) { - if (do_distrib) + if (_data_buf) sendBuf->set_storage(_data_buf, nbytes); bbytes += sendBuf->get_num_bytes(); - nbytes += sendBuf->get_num_bytes() + _data_buf_pad; + nbytes += ROUND_UP(sendBuf->get_num_bytes() + _data_buf_pad, + CACHELINE_BYTES); + TRACE_MSG("send buf '" << sendBuf->get_name() << "' needs " << + sendBuf->get_num_bytes() << " bytes"); } if (rcvBuf) { - if (do_distrib) + if (_data_buf) rcvBuf->set_storage(_data_buf, nbytes); bbytes += rcvBuf->get_num_bytes(); - nbytes += rcvBuf->get_num_bytes() + _data_buf_pad; + nbytes += ROUND_UP(rcvBuf->get_num_bytes() + _data_buf_pad, + CACHELINE_BYTES); + TRACE_MSG("rcv buf '" << rcvBuf->get_name() << "' needs " << + rcvBuf->get_num_bytes() << " bytes"); } } ); } @@ -641,12 +663,12 @@ namespace yask { nbytes -= _data_buf_pad; // Allocate and distribute data. 
- if (!do_distrib) { + if (!_data_buf) { os << "Allocating " << printWithPow2Multiplier(nbytes) << "B for all grids, parameters, and other buffers with a " << printWithPow2Multiplier(_data_buf_alignment) << "B alignment...\n" << flush; int ret = posix_memalign(&_data_buf, _data_buf_alignment, nbytes); - if (ret) { + if (ret || !_data_buf) { cerr << "Error: unable to allocate memory.\n"; exit_yask(1); } @@ -659,7 +681,7 @@ namespace yask { "B for inter-data padding.\n"; // Distribute this allocation w/a recursive call. - allocData(true); + allocData(); } } @@ -1342,15 +1364,18 @@ namespace yask { idx_t dn, idx_t dx, idx_t dy, idx_t dz, const std::string& name) { - // NB: there may be an existing buffer here left - // over from a shallow copy. Just ignore it and make a new one. - auto gp = getBuf(bd, nn, nx, ny, nz); + TRACE_MSG0(cout, "making MPI buffer '" << name << "' at " << + nn << ", " << nx << ", " << ny << ", " << nz << " with size " << + dn << " * " << dx << " * " << dy << " * " << dz); + auto** gp = getBuf(bd, nn, nx, ny, nz); *gp = new Grid_NXYZ(name); assert(*gp); (*gp)->set_dn(dn); (*gp)->set_dx(dx); (*gp)->set_dy(dy); (*gp)->set_dz(dz); + TRACE_MSG0(cout, "MPI buffer '" << name << "' size: " << + (*gp)->get_num_bytes()); return *gp; } diff --git a/src/stencil_calc.hpp b/src/stencil_calc.hpp index 2f1d2840..4e52fe9b 100644 --- a/src/stencil_calc.hpp +++ b/src/stencil_calc.hpp @@ -177,6 +177,15 @@ namespace yask { // The context's BB encompasses all eq-group BBs. class StencilContext : public BoundingBox { + private: + // Disallow copying. + StencilContext(const StencilContext& src) { + exit_yask(1); + } + void operator=(const StencilContext& src) { + exit_yask(1); + } + protected: // Output stream for messages. @@ -277,6 +286,9 @@ namespace yask { // This is normally called very early in the program. virtual void initEnv(int* argc, char*** argv); + // Copy env settings from another context. 
+ virtual void copyEnv(const StencilContext& src); + // Set ostr to given stream if provided. // If not provided, set to cout if my_rank == msg_rank // or a null stream otherwise. @@ -300,9 +312,8 @@ namespace yask { virtual void setupRank(); // Allocate grid, param, and MPI memory. - // If '_distrib' is true, distribute already-allocated memory. // Called from allocAll(), so it doesn't normally need to be called from user code. - virtual void allocData(bool _distrib = false); + virtual void allocData(); // Allocate grids, params, MPI bufs, etc. // Initialize some other data structures. diff --git a/src/stencil_main.cpp b/src/stencil_main.cpp index a21b5167..5cc8f1aa 100644 --- a/src/stencil_main.cpp +++ b/src/stencil_main.cpp @@ -282,10 +282,10 @@ int main(int argc, char** argv) os << endl << divLine << "Setup for validation...\n"; - // Make a reference context for comparisons w/new grids: - // Copy the settings from context, then re-alloc grids. - YASK_STENCIL_CONTEXT ref_context = context; + // Make a reference context for comparisons w/new grids. + YASK_STENCIL_CONTEXT ref_context(opts); ref_context.name += "-reference"; + ref_context.copyEnv(context); ref_context.allocAll(); // init to same value used in context. diff --git a/src/utils.cpp b/src/utils.cpp index e9635c69..e0292fb2 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -184,7 +184,7 @@ namespace yask { // Format help message to fit in width. pos = 0; for (size_t i = 0; i < words.size(); i++) { - if (i == 0 || pos + words[i].length() > width) { + if (i == 0 || pos + words[i].length() > size_t(width)) { os << endl << _help_leader; pos = _help_leader.length(); } @@ -216,9 +216,9 @@ namespace yask { // On failure, print msg using string from args[argi-1] and exit. // On success, increment argi and return value. 
idx_t CommandLineParser::OptionBase::_idx_val(vector& args, - int& argi) + int& argi) { - if (argi >= args.size() || args[argi].length() == 0) { + if (size_t(argi) >= args.size() || args[argi].length() == 0) { cerr << "Error: no argument for option '" << args[argi - 1] << "'." << endl; exit(1); } @@ -335,7 +335,7 @@ namespace yask { vector non_args; // Loop through strings in args. - for (int argi = 0; argi < args.size(); ) { + for (int argi = 0; argi < int(args.size()); ) { // Compare against all registered options. bool matched = false; diff --git a/src/utils.hpp b/src/utils.hpp index 8473cebd..3361d3f1 100644 --- a/src/utils.hpp +++ b/src/utils.hpp @@ -85,11 +85,12 @@ IN THE SOFTWARE. #ifdef _OPENMP #include #else -#define omp_get_num_procs() (1) -#define omp_get_num_threads() (1) -#define omp_get_max_threads() (1) -#define omp_get_thread_num() (0) -#define omp_set_num_threads(n) (void(0)) +inline int omp_get_num_procs() { return 1; } +inline int omp_get_num_threads() { return 1; } +inline int omp_get_max_threads() { return 1; } +inline int omp_get_thread_num() { return 0; } +inline void omp_set_num_threads(int n) { } +inline void omp_set_nested(int n) { } #endif // rounding macros for integer types. From a4d6874bc714a7e6becd6ac88fdbdf5b1a72184e Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Wed, 8 Feb 2017 14:38:06 -0700 Subject: [PATCH 2/3] Sub domains (#14) * Use pointers to grids to improve code-gen in some stencils. Grids as member vars instead of pointers prevented the compiler from optimizing addr calculations, causing >3x code bloat in AWP stencil and breaking vectorization in FSG. Also: Fixed uninitialized vars in grids. Fixed some g++ compilation warnings. Ensured cache-line-boundary alignment in allocation. * Bugfix typo (prevented compilation). 
(#10) (#12) From e6798b4e34a65fc5af8bbc562e072bc5a32a75f6 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Wed, 8 Feb 2017 14:40:42 -0700 Subject: [PATCH 3/3] Sub domains (#15) * Use pointers to grids to improve code-gen in some stencils. Grids as member vars instead of pointers prevented the compiler from optimizing addr calculations, causing >3x code bloat in AWP stencil and breaking vectorization in FSG. Also: Fixed uninitialized vars in grids. Fixed some g++ compilation warnings. Ensured cache-line-boundary alignment in allocation. * Bugfix typo (prevented compilation). (#10) (#12)