diff --git a/include/yask_common_api.hpp b/include/yask_common_api.hpp index 1891fda6..fdcb662a 100644 --- a/include/yask_common_api.hpp +++ b/include/yask_common_api.hpp @@ -33,6 +33,7 @@ IN THE SOFTWARE. #define YASK_COMMON_API #include +#include #include #include #include @@ -204,6 +205,55 @@ namespace yask { virtual ~yask_null_output() {} }; + /// Create finite-difference (FD) coefficients for the standard center form. + /** + Find FD coefficients with `radius` sample points to both the left and right + of the center sample and evaluation point on a uniformly-spaced grid. + The FD has `radius * 2`-order accuracy. + @returns `radius * 2 + 1` FD coefficients. + */ + std::vector + get_center_fd_coefficients(int derivative_order + /**< [in] `1` for 1st derivative, `2` for 2nd, etc. */, + int radius + /**< [in] Number of points to either side of the center point. */ ); + + /// Create finite-difference (FD) coefficients for the standard forward form. + /** + Find FD coefficients with `accuracy_order` sample points to the right + of the center sample and evaluation point on a uniformly-spaced grid. + @returns `accuracy_order + 1` FD coefficients. + */ + std::vector + get_forward_fd_coefficients(int derivative_order + /**< [in] `1` for 1st derivative, `2` for 2nd, etc. */, + int accuracy_order + /**< [in] Number of points to the right of the center point. */ ); + + /// Create finite-difference (FD) coefficients for the standard backward form. + /** + Find FD coefficients with `accuracy_order` sample points to the left + of the center sample and evaluation point on a uniformly-spaced grid. + @returns `accuracy_order + 1` FD coefficients. + */ + std::vector + get_backward_fd_coefficients(int derivative_order + /**< [in] `1` for 1st derivative, `2` for 2nd, etc. */, + int accuracy_order + /**< [in] Number of points to the left of the center point. */ ); + + /// Create finite-difference (FD) coefficients at arbitrary evaluation and sample points. 
+ /** + @returns `sample_points` FD coefficients. + */ + std::vector + get_arbitrary_fd_coefficients(int derivative_order + /**< [in] `1` for 1st derivative, `2` for 2nd, etc. */, + double eval_point + /**< [in] Location of evaluation point. */, + const std::vector sample_points + /**< [in] Locations of sampled points. Must have at least 2. */ ); + /** @}*/ } // namespace yask. diff --git a/src/common/common.mk b/src/common/common.mk index 7a7d18d3..2b9faac9 100644 --- a/src/common/common.mk +++ b/src/common/common.mk @@ -55,7 +55,8 @@ endif # Common source. COMM_DIR := $(SRC_DIR)/common -COMM_SRC_NAMES := output common_utils tuple combo +COMM_SRC_NAMES := output common_utils tuple combo fd_coeff fd_coeff2 +COEFF_DIR := $(SRC_DIR)/contrib/coefficients # YASK stencil compiler. # This is here because both the compiler and kernel diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp index 512781c9..aaa42ab5 100644 --- a/src/common/common_utils.cpp +++ b/src/common/common_utils.cpp @@ -43,7 +43,7 @@ namespace yask { // for numbers above 9 (at least up to 99). // Format: "major.minor.patch". 
- const string version = "2.20.00"; + const string version = "2.21.00"; string yask_get_version_string() { return version; diff --git a/src/common/fd_coeff2.cpp b/src/common/fd_coeff2.cpp new file mode 100644 index 00000000..6d0334d0 --- /dev/null +++ b/src/common/fd_coeff2.cpp @@ -0,0 +1,75 @@ +/***************************************************************************** + +YASK: Yet Another Stencil Kernel +Copyright (c) 2014-2019, Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +* The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. + +*****************************************************************************/ + +///////// FD coefficients implementation. + +#include "yask_common_api.hpp" +#include "fd_coeff.hpp" +#include "common_utils.hpp" + +using namespace std; + +namespace yask { + + // C++-style interface. 
+    // Build the sample-point locations `first, first+1, ..., last` on a
+    // unit-spaced grid. Callers guarantee `first <= last`.
+    static vector<double> make_unit_grid_points(int first, int last) {
+        vector<double> pts;
+        pts.reserve(size_t(last - first + 1));
+        for (int i = first; i <= last; i++)
+            pts.push_back(double(i));
+        return pts;
+    }
+
+    // C++-style interface.
+    // Validates the arguments, then delegates to fd_coeff() to fill one
+    // coefficient per entry of 'sample_points' for the derivative of order
+    // 'derivative_order' evaluated at 'eval_point'.
+    // Returns a vector with the same number of elements as 'sample_points'.
+    // Throws a YASK exception if 'derivative_order' is less than 1 or if
+    // fewer than 2 sample points are given.
+    vector<double> get_arbitrary_fd_coefficients(int derivative_order, double eval_point, const vector<double> sample_points) {
+        if (derivative_order < 1)
+            THROW_YASK_EXCEPTION("Error: get_arbitrary_fd_coefficients() called with derivative-order less than 1");
+        int n = int(sample_points.size());
+        if (n < 2)
+            THROW_YASK_EXCEPTION("Error: get_arbitrary_fd_coefficients() called with fewer than 2 sample points");
+        vector<double> coeffs(n);
+        fd_coeff(coeffs.data(), eval_point, derivative_order, sample_points.data(), n);
+        return coeffs;
+    }
+
+    // Common FD forms for uniform grid spacing.
+    // Each one builds unit-spaced sample points and evaluates at point 0.
+
+    // Center form: sample points are -radius .. +radius,
+    // so 'radius * 2 + 1' coefficients are returned.
+    // Throws a YASK exception if 'radius' is less than 1.
+    vector<double> get_center_fd_coefficients(int derivative_order, int radius) {
+        if (radius < 1)
+            THROW_YASK_EXCEPTION("get_center_fd_coefficients() called with less than radius 1");
+        vector<double> pts = make_unit_grid_points(-radius, radius);
+
+        // Check the number of points, not the size of the vector object
+        // (sizeof(pts) is a compile-time constant unrelated to 'radius').
+        assert(pts.size() == size_t(radius * 2 + 1));
+        return get_arbitrary_fd_coefficients(derivative_order, 0, pts);
+    }
+
+    // Forward form: sample points are 0 .. +accuracy_order,
+    // so 'accuracy_order + 1' coefficients are returned.
+    // Throws a YASK exception if 'accuracy_order' is less than 1.
+    vector<double> get_forward_fd_coefficients(int derivative_order, int accuracy_order) {
+        if (accuracy_order < 1)
+            THROW_YASK_EXCEPTION("get_forward_fd_coefficients() called with less than order-of-accuracy 1");
+        vector<double> pts = make_unit_grid_points(0, accuracy_order);
+        return get_arbitrary_fd_coefficients(derivative_order, 0, pts);
+    }
+
+    // Backward form: sample points are -accuracy_order .. 0,
+    // so 'accuracy_order + 1' coefficients are returned.
+    // Throws a YASK exception if 'accuracy_order' is less than 1.
+    vector<double> get_backward_fd_coefficients(int derivative_order, int accuracy_order) {
+        if (accuracy_order < 1)
+            THROW_YASK_EXCEPTION("get_backward_fd_coefficients() called with less than order-of-accuracy 1");
+        vector<double> pts = make_unit_grid_points(-accuracy_order, 0);
+        return get_arbitrary_fd_coefficients(derivative_order, 0, pts);
+    }
+
+} // yask namespace.
diff --git a/src/compiler/Makefile b/src/compiler/Makefile index 99b1051c..a2b78b45 100644 --- a/src/compiler/Makefile +++ b/src/compiler/Makefile @@ -50,7 +50,7 @@ YC_SRC_NAMES := Expr ExprUtils Grid Eqs Print Vec Cpp CppIntrin YaskKernel Soln YC_STENCIL_NAMES:= $(notdir $(patsubst %.cpp,%,$(wildcard $(YC_STENCIL_DIR)/*.cpp))) YC_OBJS := $(addprefix $(YC_OBJ_DIR)/,$(addsuffix .o,$(YC_SRC_NAMES) $(COMM_SRC_NAMES))) YC_STENCIL_OBJS := $(addprefix $(YC_OBJ_DIR)/,$(addsuffix .o,$(YC_STENCIL_NAMES))) -YC_INC_DIRS := $(INC_DIR) $(YC_LIB_SRC_DIR) $(COMM_DIR) +YC_INC_DIRS := $(INC_DIR) $(YC_LIB_SRC_DIR) $(COMM_DIR) $(COEFF_DIR) YC_INC_GLOB := $(wildcard $(addsuffix /*.hpp,$(YC_INC_DIRS))) YC_STENCIL_INC_GLOB := $(wildcard $(YC_STENCIL_DIR)/*.hpp $(YC_STENCIL_DIR)/*/*.hpp) @@ -87,6 +87,11 @@ $(YC_OBJ_DIR)/%.o: $(COMM_DIR)/%.cpp $(YC_INC_GLOB) $(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) -x c++ -fPIC -c -o $@ $< @ls -l $@ +$(YC_OBJ_DIR)/%.o: $(COEFF_DIR)/%.cpp $(YC_INC_GLOB) + $(MKDIR) $(YC_OBJ_DIR) + $(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) -x c++ -fPIC -c -o $@ $< + @ls -l $@ + ######## Primary targets. default: compiler @@ -120,7 +125,7 @@ $(YC_SWIG_OUT_DIR)/yask_compiler_api_wrap.cpp: $(YC_SWIG_DIR)/yask*.i $(INC_DIR) $(SWIG) -version $(MKDIR) $(YC_SWIG_OUT_DIR) $(PY_OUT_DIR) $(SWIG) -v -DYC_MODULE=$(YC_MODULE) -cppext cpp \ - -I$(INC_DIR) -I$(COMM_DIR) -I$(COMM_DIR)/swig \ + -I$(INC_DIR) -I$(COMM_DIR) -I$(COMM_DIR)/swig -I$(COEFF_DIR) \ -c++ -python -o $@ -outdir $(PY_OUT_DIR) -builtin $< # Turn off asserts to work around known SWIG issue: diff --git a/src/contrib/coefficients/fd_coeff.cpp b/src/contrib/coefficients/fd_coeff.cpp index c23b61d2..d969b5c6 100644 --- a/src/contrib/coefficients/fd_coeff.cpp +++ b/src/contrib/coefficients/fd_coeff.cpp @@ -28,80 +28,60 @@ IN THE SOFTWARE. #include #include +#include #include "fd_coeff.hpp" -#define MIN(x, y) (((x) < (y)) ? (x): (y)) -#define MAX(x, y) (((x) > (y)) ? 
(x): (y)) using namespace std; -/*input:coeff=empty coefficient array (or one that can be overwritten) - eval_point=point at which the derivative is approximated - order=order of the derivative to approximate (e.g. f'' corresponds to order = 2) - points=array of points from which to construct the approximation of the derivative. usually an equi-spaced array with points [-radius*h, -(radius-1)*h,...0, ... radius*h] - num_points=number of elements in points[], e.g. the number of points used to approximate the derivative. - Note: if num_points < order+1, then the coefficients will all be 0 - - - output:void, fills the coefficient array such that -f^(m)[eval_point] ~~ sum of coeff[i]*f[point[i]] from i = 0 to num_points-1 -*/ - -void fd_coeff(float *coeff, const float eval_point, const int order, float *points, const int num_points) -{ - float c1, c2, c3; - float x_0=eval_point; - float center=0; - - - - -// float* d = (float*) malloc((order+1)*num_points*num_points*sizeof(float)); - float d[(order+1)*num_points*num_points]; - int m_idx = (order+1)*num_points; - int n_idx = num_points; - - //array initializer 1 - /* - memset(d, 0.f, sizeof(d)); - */ - - //array initializer 2 - int sizeofd = (order+1)*(num_points)*(num_points)*sizeof(float); - memset(d, 0.f, sizeofd); - - - //array initializer 3 - /* - for(int m=0; m <= order; ++m){ - for(int n=0; n< num_points; ++n){ - for(int v=0; v #include #include "fd_coeff.hpp" -#define MIN(x, y) (((x) < (y)) ? (x): (y)) -#define MAX(x, y) (((x) > (y)) ? (x): (y)) using namespace std; @@ -41,52 +39,45 @@ int main() const int order = 2; //set the evaluation point e.g. 
we want to approximate some derivative f^(m)[eval_point] - //for most application, this is 0 - float eval_point = 0; + //for most applications, this is 0 + double eval_point = 0.0; const int radius = 2; const int num_points = 2*radius+1; - float coeff[num_points]; - memset(coeff, 0.0, sizeof(coeff)); - - //float* coeff = (float*) malloc(num_points*sizeof(float)); - //memset(coeff, 0.0, num_points*sizeof(float)); + double coeff[num_points]; //Construct a set of points (-h*radius, -h*(radius-1), .. 0, h,..., h*radius) //Could pass any arbitrary array grid_points = {x_0, x_1, ... x_n} - //float* grid_points = (float*) malloc(num_points*sizeof(float)); - float grid_points[num_points]; + double grid_points[num_points]; cout << "Approximating derivative from grid points: " ; for(int i=0; i= 0); + assert(count <= VLEN); + if (count == 0) + res.u = b.u; + else if (count == VLEN) + res.u = a.u; + else { + #if defined(NO_INTRINSICS) - // must make temp copies in case &res == &a or &b. - real_vec_t tmpa = a, tmpb = b; - for (int i = 0; i < VLEN-count; i++) - res.u.r[i] = tmpb.u.r[i + count]; - for (int i = VLEN-count; i < VLEN; i++) - res.u.r[i] = tmpa.u.r[i + count - VLEN]; + // must make temp copies in case &res == &a or &b. + real_vec_t tmpa = a, tmpb = b; + for (int i = 0; i < VLEN-count; i++) + res.u.r[i] = tmpb.u.r[i + count]; + for (int i = VLEN-count; i < VLEN; i++) + res.u.r[i] = tmpa.u.r[i + count - VLEN]; + // For AVX2, use 8-bit op per 128-bit lane w/count*REAL_BYTES. +#elif defined(USE_AVX2) + // See https://software.intel.com/en-us/blogs/2015/01/13/programming-using-avx2-permutations. + // Each nybble of ctrl is + // 0: lo part of A. + // 1: hi part of A. + // 2: lo part of B. + // 3: hi part of B. + auto tmp = _mm256_permute2x128_si256(b.u.mi, a.u.mi, 0x21); +#ifdef TRACE_INTRINSICS + std::cout << " tmp: "; + real_vec_t tmpa; + tmpa.u.mi = tmp; + tmpa.print_reals(std::cout); +#endif + // count must be 1..VLEN-1. 
+ if (count == VLEN/2) + res.u.mi = tmp; + else if (count < VLEN/2) + res.u.mi = _mm256_alignr_epi8(tmp, b.u.mi, count*REAL_BYTES); + else + res.u.mi = _mm256_alignr_epi8(a.u.mi, tmp, (count-(VLEN/2))*REAL_BYTES); + + // For AVX but not AVX2. #elif defined(USE_INTRIN256) - // Not really an intrinsic, but not element-wise, either. - // Put the 2 parts in a local array, then extract the desired part - // using an unaligned load. - typedef real_t R2[VLEN * 2] CACHE_ALIGNED; - R2 r2; - *((real_vec_t*)(&r2[0])) = b; - *((real_vec_t*)(&r2[VLEN])) = a; - real_vec_t* p = (real_vec_t*)(&r2[count]); // not usually aligned. - res.u.mr = INAME(loadu)((imem_t const*)p); - + // Not really an intrinsic, but not element-wise, either. + // Put the 2 parts in a local array, then extract the desired part + // using an unaligned load. + // The Intel compiler converts this into an efficient sequence + // using vmovup*, vshufp*, vinsertf128, and/or vunpcklp* instrs. + typedef real_t R2[VLEN * 2] CACHE_ALIGNED; + R2 r2; + *((real_vec_t*)(&r2[0])) = b; + *((real_vec_t*)(&r2[VLEN])) = a; + real_vec_t* p = (real_vec_t*)(&r2[count]); // not usually aligned. + res.u.mr = INAME(loadu)((imem_t const*)p); + + // For DP on KNC, use 32-bit op w/2x count. #elif REAL_BYTES == 8 && defined(ARCH_KNC) && defined(USE_INTRIN512) - // For KNC, for 64-bit align, use the 32-bit op w/2x count. - res.u.mi = _mm512_alignr_epi32(a.u.mi, b.u.mi, count*2); + res.u.mi = _mm512_alignr_epi32(a.u.mi, b.u.mi, count*2); + // Everything else. 
#else - res.u.mi = INAMEI(alignr)(a.u.mi, b.u.mi, count); + res.u.mi = INAMEI(alignr)(a.u.mi, b.u.mi, count); #endif - + } + #ifdef TRACE_INTRINSICS std::cout << " res: "; res.print_reals(std::cout); diff --git a/src/kernel/lib/soln_apis.cpp b/src/kernel/lib/soln_apis.cpp index 4ca599a3..28a4b815 100644 --- a/src/kernel/lib/soln_apis.cpp +++ b/src/kernel/lib/soln_apis.cpp @@ -239,7 +239,7 @@ namespace yask { #ifdef SHOW_GROUPS os << " sub-block-group-size: " << opts->_sub_block_group_sizes.makeDimValStr(" * ") << endl << - " block-group-size: " << opts->_block_group_sizes.makeDimValStr(" * ") << endl << + " block-group-size: " << opts->_block_group_sizes.makeDimValStr(" * ") << endl; #endif os << "\nOther settings:\n" " yask-version: " << yask_get_version_string() << endl << diff --git a/src/stencils/Iso3dfdStencil.cpp b/src/stencils/Iso3dfdStencil.cpp index eeb6892a..38ef0746 100644 --- a/src/stencils/Iso3dfdStencil.cpp +++ b/src/stencils/Iso3dfdStencil.cpp @@ -38,20 +38,16 @@ class Iso3dfdStencil : public StencilRadiusBase { MAKE_DOMAIN_INDEX(x); // spatial dim. MAKE_DOMAIN_INDEX(y); // spatial dim. MAKE_DOMAIN_INDEX(z); // spatial dim. - MAKE_MISC_INDEX(r); // to index the coefficients. // Grids. MAKE_GRID(pressure, t, x, y, z); // time-varying 3D pressure grid. MAKE_GRID(vel, x, y, z); // constant 3D vel grid (c(x,y,z)^2 * delta_t^2). - MAKE_ARRAY(coeff, r); // FD coefficients. public: // For this stencil, the 'radius' is the number of FD coefficients on // either side of center in each spatial dimension. For example, - // radius=8 implements a 16th-order accurate FD stencil. To obtain the - // correct result, the 'coeff' array should be initialized with the - // corresponding central FD coefficients, adjusted for grid spacing. + // radius=8 implements a 16th-order accurate FD stencil. // The accuracy in time is fixed at 2nd order. 
Iso3dfdStencil(StencilList& stencils, string suffix="", int radius=8) : StencilRadiusBase("iso3dfd" + suffix, stencils, radius) { } @@ -60,15 +56,36 @@ class Iso3dfdStencil : public StencilRadiusBase { // Define RHS expression for pressure at t+1 based on values from vel and pressure at t. virtual GridValue get_next_p() { + // Grid spacing. + // In this implementation, it's a constant. + // Could make this a YASK variable to allow setting at run-time. + double delta_xyz = 50.0; + double d2 = delta_xyz * delta_xyz; + + // Spatial FD coefficients for 2nd derivative. + auto coeff = get_center_fd_coefficients(2, _radius); + size_t c0i = _radius; // index of center sample. + + for (size_t i = 0; i < coeff.size(); i++) { + + // Need 3 copies of center sample for x, y, and z FDs. + if (i == c0i) + coeff[i] *= 3.0; + + // Divide each by delta_xyz^2. + coeff[i] /= d2; + } + + // Calculate FDx + FDy + FDz. // Start with center value multiplied by coeff 0. - GridValue next_p = pressure(t, x, y, z) * coeff(0); + GridValue fd_sum = pressure(t, x, y, z) * coeff[c0i]; // Add values from x, y, and z axes multiplied by the // coeff for the given radius. for (int r = 1; r <= _radius; r++) { // Add values from axes at radius r. - next_p += ( + fd_sum += ( // x-axis. pressure(t, x-r, y, z) + pressure(t, x+r, y, z) + @@ -81,13 +98,32 @@ class Iso3dfdStencil : public StencilRadiusBase { pressure(t, x, y, z-r) + pressure(t, x, y, z+r) - ) * coeff(r); + ) * coeff[c0i + r]; // R & L coeffs are identical. } - // Finish equation, including t-1 and velocity components. - next_p = (2.0 * pressure(t, x, y, z)) - - pressure(t-1, x, y, z) // subtract pressure from t-1. - + (next_p * vel(x, y, z)); // add next_p * velocity. + // Temporal FD coefficients. + // For this implementation, just check the known values to + // simplify the solution. + // But we could parameterize by accuracy-order in time as well. 
+ int torder = 2; + auto tcoeff = get_forward_fd_coefficients(2, torder); + assert(tcoeff[0] == 1.0); // pressure(t+1). + assert(tcoeff[1] == -2.0); // -2 * pressure(t+1). + assert(tcoeff[2] == 1.0); // pressure(t-1). + + // Wave equation is: + // 2nd time derivative(p) = c^2 * laplacian(p). + // See https://en.wikipedia.org/wiki/Wave_equation. + + // So, wave equation with FD approximations is: + // (p(t+1) - 2 * p(t) + p(t-1)) / delta_t^2 = c^2 * fd_sum. + + // Solve wave equation for p(t+1): + // p(t+1) = 2 * p(t) - p(t-1) + c^2 * fd_sum * delta_t^2. + + // Let vel = c^2 * delta_t^2 for each grid point. + GridValue next_p = (2.0 * pressure(t, x, y, z)) - + pressure(t-1, x, y, z) + (fd_sum * vel(x, y, z)); return next_p; } diff --git a/utils/bin/gen_loops.pl b/utils/bin/gen_loops.pl index ad519071..527681a9 100755 --- a/utils/bin/gen_loops.pl +++ b/utils/bin/gen_loops.pl @@ -738,7 +738,7 @@ ($) } # use serpentine path in next loop if possible. - elsif (lc $tok eq 'broken_serpentine') { + elsif (lc $tok eq 'serpentine') { $features |= $bSerp; } @@ -967,17 +967,19 @@ () if (!$command_line || $OPT{help} || @ARGV < 1) { print "Outputs C++ code to scan N-D grids.\n", "Usage: $script [options] \n", - "The contains optionally-nested scans across the given", + "The contains optionally-nested scans across the given\n", " indices between 0 and N-1 indicated by 'loop()'\n", "Indices may be specified as a comma-separated list or range,\n", " using the variable 'N' as needed.\n", "Inner loops should contain call statements that generate calls to calculation functions.\n", "A loop statement with more than one argument will generate a single collapsed loop.\n", "Optional loop modifiers:\n", - " omp: generate an OpenMP for loop (distribute work across SW threads).\n", + " omp: generate an OpenMP for loop (distribute work across SW threads).*\n", " grouped: generate grouped scan within a collapsed loop.\n", - ## broken: " serpentine: generate reverse scan when enclosing loop 
dimension is odd.\n", - " square_wave: generate 2D square-wave scan for two innermost dimensions of a collapsed loop.\n", + " serpentine: generate reverse scan when enclosing loop dimension is odd.*\n", + " square_wave: generate 2D square-wave scan for two innermost dimensions of a collapsed loop.*\n", + " * Do not use these modifiers for YASK rank or block loops because they must\n", + " execute with strictly-increasing indices when using temporal tiling.\n", "A 'ScanIndices' var must be defined in C++ code prior to including the generated code.\n", " This struct contains the following 'Indices' elements:\n", " 'begin': [in] first index to scan in each dim.\n", @@ -1001,7 +1003,7 @@ () " $script -ndims 3 'omp loop(0,1) { loop(2) { call(f); } }'\n", " $script -ndims 3 'omp loop(0) { loop(1,2) { call(f); } }'\n", " $script -ndims 3 'grouped omp loop(0..N-1) { call(f); }'\n", - " $script -ndims 3 'omp loop(0) { square loop(1..N-1) { call(f); } }'\n", + " $script -ndims 3 'omp loop(0) { square_wave loop(1..N-1) { call(f); } }'\n", " $script -ndims 4 'omp loop(0..N+1) { loop(N+2,N-1) { call(f); } }'\n"; exit 1; } diff --git a/utils/bin/get_loop_stats.pl b/utils/bin/get_loop_stats.pl index 676a6f63..7781b673 100755 --- a/utils/bin/get_loop_stats.pl +++ b/utils/bin/get_loop_stats.pl @@ -27,6 +27,8 @@ # report some stats on them. use strict; +use File::Basename; + my $minInstrs = 2; my $printAsm = 0; my $targetLabel = ""; @@ -65,6 +67,8 @@ sub usage { my $fname = $arg; my %files; # map from file index to source file-name. + my %dirs; # map from file index to source dir-name. + my %dirIndices; # map from dir-name to dir index. my %loopLabels; my %astats; # arg stats. my %istats; # instr stats. @@ -79,8 +83,21 @@ sub usage { my ($locInfo, $srcFile, $curFn); # strings describing current location. my @lines; # lines to print. + # Header. 
+ if (!$pass) { + print "\n'$fname'...\n"; + } else { + my %id; + for my $dir (keys %dirIndices) { + $id{$dirIndices{$dir}} = $dir; + } + print "\nDirectory key:\n"; + for my $di (sort { $a <=> $b } keys %id) { + print " = $id{$di}\n"; + } + } + open F, "<$fname" or usage("error: cannot open '$fname'"); - print "\n'$fname'...\n" if !$pass; while () { chomp; @@ -88,7 +105,12 @@ sub usage { # .file 40 "src/stencil_block_loops.hpp" if (/^\s*\.file\s+(\d+)\s+"(.*)"/) { my ($fi, $fn) = ($1, $2); - $files{$fi} = $fn; + $files{$fi} = basename($fn); + my $dir = dirname($fn); + $dirs{$fi} = dirname($fn); + if ($dir && !exists($dirIndices{$dir})) { + $dirIndices{$dir} = scalar keys %dirIndices; + } } # location, e.g., @@ -98,6 +120,10 @@ sub usage { if (exists $files{$fi}) { $srcFile = $files{$fi}; $locInfo = "$srcFile:$info"; + my $srcDir = $dirs{$fi}; + if ($srcDir && exists($dirIndices{$srcDir})) { + $locInfo = "/$locInfo"; + } } else { $srcFile = ""; $locInfo = ""; diff --git a/utils/bin/yask_tuner.pl b/utils/bin/yask_tuner.pl index 0a3da276..a5969cf4 100755 --- a/utils/bin/yask_tuner.pl +++ b/utils/bin/yask_tuner.pl @@ -371,9 +371,10 @@ sub usage { ('123', '132', '213', '231', '312', '321'); # Possible space-filling curve modifiers. +my @pathNamesIncreasing = + ('', 'grouped'); my @pathNames = - ('', 'square_wave', 'grouped'); -## ('', 'serpentine', 'square_wave serpentine', 'grouped'); + (@pathNamesIncreasing, 'serpentine', 'square_wave serpentine', 'square_wave'); # List of folds. if ( !@folds ) { @@ -471,14 +472,17 @@ sub usage { # Loops, from the list above. # Each loop consists of index order and path mods. + # Block and rank paths require increasing indices. 
[ 0, $#loopOrders, 1, 'subBlockOrder' ], [ 0, $#pathNames, 1, 'subBlockPath' ], [ 0, $#loopOrders, 1, 'miniBlockOrder' ], [ 0, $#pathNames, 1, 'miniBlockPath' ], [ 0, $#loopOrders, 1, 'blockOrder' ], - [ 0, $#pathNames, 1, 'blockPath' ], + [ 0, $#pathNamesIncreasing, 1, 'blockPath' ], [ 0, $#loopOrders, 1, 'regionOrder' ], [ 0, $#pathNames, 1, 'regionPath' ], + [ 0, $#loopOrders, 1, 'rankOrder' ], + [ 0, $#pathNamesIncreasing, 1, 'rankPath' ], # how to shape vectors, from the list above. [ 0, $#folds, 1, 'fold' ], @@ -495,7 +499,7 @@ sub usage { # other build options. [ 0, $#schedules, 1, 'ompRegionSchedule' ], # OMP schedule for region loop. - [ 0, $#schedules, 1, 'ompBlockSchedule' ], # OMP schedule for block loop. + [ 0, $#schedules, 1, 'ompBlockSchedule' ], # OMP schedule for mini-block loop. ); } @@ -660,7 +664,7 @@ ($$$) $val = $fixedVals{$key}; return $val if defined $val; - # return default value for build var if disabled. + # return dummy value for build var if disabled. return 1 if (!$doBuild && $isBuildVar); die "internal error: value for gene '$key' not provided.\n"; @@ -731,11 +735,16 @@ ($$) my $makeCmd = "echo 'build disabled'"; if ($doBuild) { - $tag .= "_".md5_hex($macros, $margs, $makeArgs, $realBytes, $radius); + my $tagPrefix = $tag."_p".$$."_"; + $tag = $tagPrefix.md5_hex($macros, $margs, $makeArgs, $realBytes, $radius); + + # Remove binaries that haven't been used in a while. + system "find bin lib -name '*$tagPrefix*' -amin +15 | xargs --no-run-if-empty rm"; # Already exists? 
- if (-x "bin/yask_kernel.$tag.$arch.exe") { - $makeCmd = "echo 'binary exists'"; + if (-x "bin/yask_kernel.$tag.$arch.exe" && + -x "lib/libyask_kernel.$tag.$arch.so") { + $makeCmd = "echo 'binary & library exist'"; } else { $makeCmd = @@ -797,16 +806,13 @@ ($$$) push @cmdOut, $line; # E.g., - # 5-D grid (t=2 * tidx=2 * x=12 * y=12 * z=42) 't_grids' with data at 0x7fa476600000 containing 1.47656MiB (24.192K SIMD FP element(s) of 64 byte(s) each) - # 4-D grid (t=2 * x=5 * y=19 * z=19) 'pressure' with data at 0x65cbc0 containing 112.812KiB (3.61K SIMD FP element(s) of 32 byte(s) each) - # 3-D grid (x=3 * y=3 * z=3) 'vel' with data at 0x6790c0 containing 864B (27 SIMD FP element(s) of 32 byte(s) each) - # 1-D grid (r=9) 'coeff' with data at 0x679600 containing 36B (9 FP element(s) of 4 byte(s) each) + # 'A' 4-D var (t=2 * x=8 * y=48 * z=49) with storage at 0x2aba63016000 ... my $ngrids = 1; - if (/^\d-?D grid.*x=.*y=.*z=/) { + if (/\d-?D .*x=.*y=.*z=/) { for my $w (split ' ',$line) { if ($w =~ /(\w+)=(\d+)/) { my ($dim, $sz) = ($1, $2); - if ($dim !~ /^[xyz]/) { + if ($dim eq 't') { $ngrids *= $sz; } } @@ -1015,17 +1021,14 @@ ($$$$$$) } # return loop-ctrl vars. -sub makeLoopVars($$$$$) { +sub makeLoopVars($$$$) { my $h = shift; my $makePrefix = shift; # e.g., 'BLOCK'. my $tunerPrefix = shift; # e.g., 'block'. - my $reqdMods = shift; # e.g., ''. my $lastDim = shift; # e.g., 2 or 3. my $order = readHash($h, $tunerPrefix."Order", 1); my $orderStr = $loopOrders[$order]; # e.g., '231'. - my $path = readHash($h, $tunerPrefix."Path", 1); - my $pathStr = @pathNames[$path]; # e.g., 'grouped'. # dimension vars. my @dims = split '',$orderStr; # e.g., ('2', '3', '1). @@ -1033,9 +1036,17 @@ ($$$$$) # vars to create. my $order = join(',', @dims); # e.g., '2, 1'. - my $outerMods = "$pathStr $reqdMods"; + my $outerMods = $reqdMods; my $innerMods = ''; + # path gene? 
+ my $pathKey = $tunerPrefix."Path"; + if (exists $h->{$pathKey}) { + my $path = readHash($h, $pathKey, 1); + my $pathStr = @pathNames[$path]; # e.g., 'grouped'. + $outerMods = "$pathStr $outerMods"; + } + my $loopVars = " ".$makePrefix."_LOOP_ORDER='$order'"; $loopVars .= " ".$makePrefix."_LOOP_OUTER_MODS='$outerMods'"; $loopVars .= " ".$makePrefix."_LOOP_INNER_MODS='$innerMods'"; @@ -1306,10 +1317,11 @@ sub fitness { $mvars .= " fold=x=$fs[0],y=$fs[1],z=$fs[2]"; # gen-loops vars. - $mvars .= makeLoopVars($h, 'REGION', 'region', '', 3); - $mvars .= makeLoopVars($h, 'BLOCK', 'block', '', 3); - $mvars .= makeLoopVars($h, 'MINI_BLOCK', 'miniBlock', '', 3); - $mvars .= makeLoopVars($h, 'SUB_BLOCK', 'subBlock', '', 2); + $mvars .= makeLoopVars($h, 'RANK', 'rank', 3); + $mvars .= makeLoopVars($h, 'REGION', 'region', 3); + $mvars .= makeLoopVars($h, 'BLOCK', 'block', 3); + $mvars .= makeLoopVars($h, 'MINI_BLOCK', 'miniBlock', 3); + $mvars .= makeLoopVars($h, 'SUB_BLOCK', 'subBlock', 2); # other vars. $mvars .= " omp_region_schedule=$regionScheduleStr omp_block_schedule=$blockScheduleStr";