Skip to content

Commit

Permalink
Merge pull request #197 from intel/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
chuckyount authored Feb 6, 2019
2 parents 0c7482d + 5e7174f commit e4043ba
Show file tree
Hide file tree
Showing 14 changed files with 192 additions and 95 deletions.
2 changes: 0 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,6 @@
# real_bytes: FP precision: 4=float, 8=double.
#
# fold: How to fold vectors (x*y*z).
# Vectorization in dimensions perpendicular to the inner loop
# (defined by SUB_BLOCK_LOOP_INNER_VARS below) often works well.
# fold_4byte: How to fold vectors when real_bytes=4.
# fold_8byte: How to fold vectors when real_bytes=8.
#
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ YASK contains a domain-specific compiler to convert scalar stencil code to SIMD-
* Awk.
* Gnu make.
* Bash shell.
* Numactl.
* Optional utilities and their purposes:
* The `indent` or `gindent` utility, used automatically during the build process
to make the generated code easier for humans to read.
Expand All @@ -57,6 +58,9 @@ YASK contains a domain-specific compiler to convert scalar stencil code to SIMD-
for functional testing if you don't have native support for any given instruction set.

### Backward-compatibility notices:
* Version 2.17.00 added automatic detection of the host architecture in `make` and `bin/yask.sh` and of the number of MPI ranks in `bin/yask.sh`.
This changed the old behavior, in which `make` defaulted to the `snb` architecture and `bin/yask.sh` required `-arch` and `-ranks`.
Those options are still available to override the host-based defaults.
* Version 2.16.03 moved the position of the log-file name to the last column in the CSV output of `utils/bin/yask_log_to_csv.pl`.
* Version 2.15.04 required a call to `yc_grid::set_dynamic_step_alloc(true)` to allow changing the
allocation in the step (time) dimension for grid variables created at YASK compile-time.
Expand Down
1 change: 1 addition & 0 deletions src/common/common.mk
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ YC_SRC_DIR := $(SRC_DIR)/compiler
SWIG := swig
PERL := perl
MKDIR := mkdir -p -v
BASH := bash

# Find include path needed for python interface.
# NB: constructing string inside print() to work for python 2 or 3.
Expand Down
2 changes: 1 addition & 1 deletion src/common/common_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ namespace yask {
// for numbers above 9 (at least up to 99).

// Format: "major.minor.patch".
const string version = "2.16.05";
const string version = "2.17.00";

string yask_get_version_string() {
return version;
Expand Down
7 changes: 5 additions & 2 deletions src/kernel/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,17 @@

# Initial defaults.
stencil ?= iso3dfd
arch ?= snb
mpi ?= 1
numa ?= 1
pmem ?= 0
real_bytes ?= 4
radius ?= 2
allow_new_grid_types ?= 1

# Determine default architecture by running kernel script w/special knob.
# Do not assume it has been installed in $(BIN_OUT_DIR) yet.
arch ?= $(shell $(BASH) ./yask.sh -show_arch)

# Defaults based on stencil type (and arch for some stencils).
ifeq ($(stencil),)
$(error Stencil not specified)
Expand Down Expand Up @@ -214,7 +217,7 @@ else ifeq ($(arch),intel64)

else

$(error Architecture not recognized; use arch=knl, knc, skl, hsw, bdw, ivb, snb, or intel64 (no explicit vectorization))
$(error Architecture not recognized; use arch=knl, knc, clx, skx, hsw, bdw, ivb, snb, or intel64 (no explicit vectorization))

endif # arch-specific.

Expand Down
5 changes: 3 additions & 2 deletions src/kernel/lib/alloc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ namespace yask {
constexpr int _pmem_key = 2000; // leave space after this for pmem devices.

// Alloc 'nbytes' for each requested mem type.
// Pointers are returned in '_data_buf'.
// Pointers are returned in 'data_buf'.
// 'ngrids' and 'type' are only used for debug msg.
void StencilContext::_alloc_data(const map <int, size_t>& nbytes,
const map <int, size_t>& ngrids,
Expand Down Expand Up @@ -810,7 +810,8 @@ namespace yask {
int rthreads = set_region_threads();

// Delete any existing scratch grids.
// Create new scratch grids.
// Create new scratch grids, but without any
// data allocated.
makeScratchGrids(rthreads);

// Find the max mini-block size across all packs.
Expand Down
98 changes: 56 additions & 42 deletions src/kernel/lib/auto_tuner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ namespace yask {
assert(settings);
if (name.length())
_name += "(" + name + ")";
clear(settings->_do_auto_tune);
}

// Eval auto-tuner for given number of steps.
Expand Down Expand Up @@ -145,15 +146,24 @@ namespace yask {

// Print the best settings.
void AutoTuner::print_settings(ostream& os) const {
os << _name << ": best-block-size: " <<
_settings->_block_sizes.makeDimValStr(" * ") << endl <<
_name << ": mini-block-size: " <<
_settings->_mini_block_sizes.makeDimValStr(" * ") << endl <<
_name << ": sub-block-size: " <<
if (tune_mini_blks())
os << _name << ": best-mini-block-size: " <<
target_sizes().makeDimValStr(" * ") << endl;
else
os << _name << ": best-block-size: " <<
target_sizes().makeDimValStr(" * ") << endl <<
_name << ": mini-block-size: " <<
_settings->_mini_block_sizes.makeDimValStr(" * ") << endl;
os << _name << ": sub-block-size: " <<
_settings->_sub_block_sizes.makeDimValStr(" * ") << endl <<
flush;
}

// Report whether the tuner is configured to tune mini-block sizes;
// the flag is read from the context's settings object.
bool AutoTuner::tune_mini_blks() const {
    const auto& ctx_settings = _context->get_settings();
    return ctx_settings->_tune_mini_blks;
}

// Reset the auto-tuner.
void AutoTuner::clear(bool mark_done, bool verbose) {
STATE_VARS(this);
Expand All @@ -166,18 +176,18 @@ namespace yask {

// Apply the best known settings from existing data, if any.
if (best_rate > 0.) {
_settings->_block_sizes = best_block;
target_sizes() = best_sizes;
apply();
os << _name << ": applying block-size " <<
best_block.makeDimValStr(" * ") << endl;
os << _name << ": applying size " <<
best_sizes.makeDimValStr(" * ") << endl;
}

// Reset all vars.
results.clear();
n2big = n2small = n2far = 0;
best_block = _settings->_block_sizes;
best_sizes = target_sizes();
best_rate = 0.;
center_block = best_block;
center_sizes = best_sizes;
radius = max_radius;
done = mark_done;
neigh_idx = 0;
Expand All @@ -192,22 +202,22 @@ namespace yask {
min_blks = set_region_threads();

// Adjust starting block if needed.
for (auto dim : center_block.getDims()) {
for (auto dim : center_sizes.getDims()) {
auto& dname = dim.getName();
auto& dval = dim.getVal();

if (dname == step_dim) {
block_steps = opts->_block_sizes[dname];
center_block[dname] = block_steps;
target_steps = target_sizes()[dname]; // save value.
center_sizes[dname] = target_steps;
} else {
auto dmax = max(idx_t(1), opts->_region_sizes[dname] / 2);
auto dmax = max(idx_t(1), outer_sizes()[dname] / 2);
if (dval > dmax || dval < 1)
center_block[dname] = dmax;
center_sizes[dname] = dmax;
}
}
if (!done) {
TRACE_MSG(_name << ": starting block-size: " <<
center_block.makeDimValStr(" * "));
TRACE_MSG(_name << ": starting size: " <<
center_sizes.makeDimValStr(" * "));
TRACE_MSG(_name << ": starting search radius: " << radius);
}
} // clear.
Expand Down Expand Up @@ -242,7 +252,9 @@ namespace yask {
return;

// Done.
os << _name << ": in warmup for " << ctime << " secs" << endl;
os << _name << ": finished warmup for " << ctime << " secs\n" <<
_name << ": tuning " << (tune_mini_blks() ? "mini-" : "") <<
"block sizes...\n";
in_warmup = false;

// Measure this step only.
Expand All @@ -259,16 +271,16 @@ namespace yask {
os << _name << ": radius=" << radius << ": " <<
csteps << " steps(s) in " << ctime <<
" secs (" << rate <<
" steps/sec) with block-size " <<
_settings->_block_sizes.makeDimValStr(" * ") << endl;
" steps/sec) with size " <<
target_sizes().makeDimValStr(" * ") << endl;
csteps = 0;
ctime = 0.;

// Save result.
results[_settings->_block_sizes] = rate;
results[target_sizes()] = rate;
bool is_better = rate > best_rate;
if (is_better) {
best_block = _settings->_block_sizes;
best_sizes = target_sizes();
best_rate = rate;
better_neigh_found = true;
}
Expand All @@ -289,8 +301,8 @@ namespace yask {
// Next neighbor of center point.
neigh_idx++;

// Determine new block size.
IdxTuple bsize(center_block);
// Determine new size.
IdxTuple bsize(center_sizes);
bool ok = true;
int mdist = 0; // manhattan dist from center.
for (auto odim : ofs.getDims()) {
Expand All @@ -299,14 +311,14 @@ namespace yask {

// Min and max sizes of this dim.
auto dmin = dims->_cluster_pts[dname];
auto dmax = opts->_region_sizes[dname];
auto dmax = outer_sizes()[dname];

// Determine distance of GD neighbors.
auto dist = dmin; // step by cluster size.
dist = max(dist, min_dist);
dist *= radius;

auto sz = center_block[dname];
auto sz = center_sizes[dname];
switch (dofs) {
case 0: // reduce size in 'odim'.
sz -= dist;
Expand Down Expand Up @@ -344,7 +356,7 @@ namespace yask {
bsize[dname] = sz;

} // domain dims.
TRACE_MSG(_name << ": checking block-size " <<
TRACE_MSG(_name << ": checking size " <<
bsize.makeDimValStr(" * "));

// Too small?
Expand All @@ -355,7 +367,7 @@ namespace yask {

// Too few?
else if (ok) {
idx_t nblks = get_num_domain_points(opts->_region_sizes) /
idx_t nblks = get_num_domain_points(outer_sizes()) /
get_num_domain_points(bsize);
if (nblks < min_blks) {
ok = false;
Expand All @@ -367,7 +379,7 @@ namespace yask {
if (ok && !results.count(bsize)) {

// Run next step with this size.
_settings->_block_sizes = bsize;
target_sizes() = bsize;
break; // out of block-search loop.
}

Expand All @@ -379,8 +391,8 @@ namespace yask {
// Should GD continue?
bool stop_gd = !better_neigh_found;

// Make new center at best block so far.
center_block = best_block;
// Make new center at best size so far.
center_sizes = best_sizes;

// Reset search vars.
neigh_idx = 0;
Expand All @@ -404,35 +416,37 @@ namespace yask {
TRACE_MSG(_name << ": new search radius=" << radius);
}
else {
TRACE_MSG(_name << ": continuing search from block " <<
center_block.makeDimValStr(" * "));
TRACE_MSG(_name << ": continuing search from " <<
center_sizes.makeDimValStr(" * "));
}
} // beyond next neighbor of center.
} // search for new setting to try.

// Fix settings for next step.
// Assumption is that block size in one pack doesn't affect
// Assumption is that sizes in one pack don't affect
// perf in another pack.
apply();
TRACE_MSG(_name << ": next block-size " <<
_settings->_block_sizes.makeDimValStr(" * "));
TRACE_MSG(_name << ": next size " <<
target_sizes().makeDimValStr(" * "));
} // eval.

// Apply auto-tuner settings to prepare for a run.
// Does *not* set the settings being tuned.
void AutoTuner::apply() {
STATE_VARS(this);

// Restore step-dim value for block.
_settings->_block_sizes[step_posn] = block_steps;
target_sizes()[step_posn] = target_steps;

// Change block-based sizes to 0 so adjustSettings()
// Change derived sizes to 0 so adjustSettings()
// will set them to the default.
// TODO: tune mini- and sub-block sizes also.
if (!tune_mini_blks()) {
_settings->_block_group_sizes.setValsSame(0);
_settings->_mini_block_sizes.setValsSame(0);
}
_settings->_mini_block_group_sizes.setValsSame(0);
_settings->_sub_block_sizes.setValsSame(0);
_settings->_sub_block_group_sizes.setValsSame(0);
_settings->_mini_block_sizes.setValsSame(0);
_settings->_mini_block_group_sizes.setValsSame(0);
_settings->_block_group_sizes.setValsSame(0);

// Save debug output and set to null.
auto saved_op = get_debug_output();
Expand Down
27 changes: 23 additions & 4 deletions src/kernel/lib/auto_tuner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,12 @@ namespace yask {
int n2big = 0, n2small = 0, n2far = 0;

// Best so far.
IdxTuple best_block;
IdxTuple best_sizes;
double best_rate = 0.;

// Current point in search.
IdxTuple center_block;
idx_t block_steps = 0;
IdxTuple center_sizes;
idx_t target_steps = 0;
idx_t radius = 0;
bool done = false;
idx_t neigh_idx = 0;
Expand All @@ -92,7 +92,26 @@ namespace yask {
// Increment this to track steps.
idx_t steps_done = 0;

// Change settings pointer.
// Access settings.
bool tune_mini_blks() const;
// Sizes currently being tuned: the mini-block sizes when
// tune_mini_blks() is true, otherwise the block sizes.
IdxTuple& target_sizes() {
    if (tune_mini_blks())
        return _settings->_mini_block_sizes;
    return _settings->_block_sizes;
}
// Sizes of the level paired with the tuned one: the block sizes when
// tune_mini_blks() is true, otherwise the region sizes.
IdxTuple& outer_sizes() {
    if (tune_mini_blks())
        return _settings->_block_sizes;
    return _settings->_region_sizes;
}
// Const-callable overload: same selection as the non-const version
// (mini-block sizes when tune_mini_blks() is true, else block sizes).
// Still yields a mutable reference, since the tuple lives behind '_settings'.
IdxTuple& target_sizes() const {
    if (tune_mini_blks())
        return _settings->_mini_block_sizes;
    return _settings->_block_sizes;
}
// Const-callable overload: same selection as the non-const version
// (block sizes when tune_mini_blks() is true, else region sizes).
// Still yields a mutable reference, since the tuple lives behind '_settings'.
IdxTuple& outer_sizes() const {
    if (tune_mini_blks())
        return _settings->_block_sizes;
    return _settings->_region_sizes;
}

// Change the settings pointer: replace '_settings' with 'p'.
// The target_sizes()/outer_sizes() accessors read through '_settings',
// so they refer to the new object after this call.
void set_settings(KernelSettings* p) {
_settings = p;
}
Expand Down
10 changes: 10 additions & 0 deletions src/kernel/lib/settings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,16 @@ namespace yask {
"Maximum GiB to allocate on preferred NUMA node before allocating on pmem device.",
_numa_pref_max));
#endif
parser.add_option(new CommandLineParser::BoolOption
("auto_tune",
"Adjust block sizes *during* normal operation to tune for performance. "
"May cause varying performance between steps.",
_do_auto_tune));
parser.add_option(new CommandLineParser::BoolOption
("auto_tune_mini_blocks",
"Apply the auto-tuner to mini-block sizes instead of block sizes. "
"Particularly useful when using temporal block tiling.",
_tune_mini_blks));
}

// Print usage message.
Expand Down
Loading

0 comments on commit e4043ba

Please sign in to comment.