Skip to content

Commit

Permalink
Merge pull request #238 from intel/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
chuckyount authored Oct 21, 2019
2 parents f68d2ac + 85e9cf1 commit 9fdc2e3
Show file tree
Hide file tree
Showing 33 changed files with 1,562 additions and 1,473 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,13 @@ YASK contains a domain-specific compiler to convert stencil-equation specificati
for multi-socket and multi-node operation or
Intel(R) Parallel Studio XE Composer Edition for C++ Linux
for single-socket only
(2018 or later; 2019.3 or later recommended).
* There was an issue in Intel(R) MPI versions 2019.1 and 2019.2 that
(2018 or later; 2019 update 5 (2019.0.5 or 2019u5) or later recommended).
* There was an issue in Intel(R) MPI versions 2019u1 and 2019u2 that
caused the application to crash when allocating very
large shared-memory (shm) regions, so those
versions are not recommended when using the `-use_shm` feature.
This issue was resolved in MPI version 2019.3.
* If you are using g++ version 8.x or later, Intel(R) C++ version 2019.x or later
This issue was resolved in MPI version 2019u3.
* If you are using g++ version 8.x or later, Intel(R) C++ version 2019
is required.
* Building a YASK kernel with the GNU C++ compiler is possible.
Limited testing with g++ 8.2.0 shows the "iso3dfd" kernel
Expand Down
2 changes: 1 addition & 1 deletion src/common/common_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ namespace yask {
// for numbers above 9 (at least up to 99).

// Format: "major.minor.patch".
const string version = "3.02.01";
const string version = "3.03.00";

string yask_get_version_string() {
return version;
Expand Down
1 change: 1 addition & 0 deletions src/compiler/compiler_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ void usage(const string& cmd,
" avx YASK stencil classes for CORE AVX ISA (256-bit HW SIMD vectors).\n"
" avx2 YASK stencil classes for CORE AVX2 ISA (256-bit HW SIMD vectors).\n"
" avx512 YASK stencil classes for CORE AVX-512 ISA (512-bit HW SIMD vectors).\n"
" avx512lo YASK stencil classes for CORE AVX-512 ISA (256-bit HW SIMD vectors).\n"
" knc YASK stencil classes for Knights-Corner ISA (512-bit HW SIMD vectors).\n"
" knl YASK stencil classes for Knights-Landing (MIC) AVX-512 ISA (512-bit HW SIMD vectors).\n"
" intel64 YASK stencil classes for generic C++ (no explicit HW SIMD vectors).\n"
Expand Down
11 changes: 8 additions & 3 deletions src/compiler/lib/CppIntrin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,7 @@ namespace yask {
// Print 512-bit AVX intrinsic code.
class YASKAvx512Printer : public YASKCppPrinter {
protected:
bool _is_lo;
virtual CppVecPrintHelper* newCppVecPrintHelper(VecInfoVisitor& vv,
CounterVisitor& cv) {
return new CppAvx512PrintHelper(vv, _settings, _dims, &cv,
Expand All @@ -254,10 +255,14 @@ namespace yask {
YASKAvx512Printer(StencilSolution& stencil,
EqBundles& eqBundles,
EqBundlePacks& eqBundlePacks,
EqBundles& clusterEqBundles) :
YASKCppPrinter(stencil, eqBundles, eqBundlePacks, clusterEqBundles) { }
EqBundles& clusterEqBundles,
bool is_lo = false) :
YASKCppPrinter(stencil, eqBundles, eqBundlePacks, clusterEqBundles),
_is_lo(is_lo) { }

virtual int num_vec_elems() const { return 64 / _settings._elem_bytes; }
// Number of elements in one SIMD vector for this printer.
// The vector width is taken as 32 bytes when _is_lo is set and
// 64 bytes otherwise; that width is divided by the element size
// in bytes from the settings (integer division).
virtual int num_vec_elems() const {
return (_is_lo ? 32 : 64) / _settings._elem_bytes;
}

// Whether multi-dim folding is efficient.
// This printer unconditionally reports true.
virtual bool is_folding_efficient() const { return true; }
Expand Down
2 changes: 2 additions & 0 deletions src/compiler/lib/Solution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,8 @@ namespace yask {
_printer = new YASKAvx256Printer(*this, *_eqBundles, *_eqBundlePacks, *_clusterEqBundles);
else if (target == "avx512" || target == "knl")
_printer = new YASKAvx512Printer(*this, *_eqBundles, *_eqBundlePacks, *_clusterEqBundles);
else if (target == "avx512lo")
_printer = new YASKAvx512Printer(*this, *_eqBundles, *_eqBundlePacks, *_clusterEqBundles, true);
else if (target == "dot")
_printer = new DOTPrinter(*this, *_clusterEqBundles, false);
else if (target == "dot-lite")
Expand Down
12 changes: 10 additions & 2 deletions src/kernel/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,9 @@ ifneq ($(filter $(arch),avx snb ivb),)
TARGET := avx
else ifneq ($(filter $(arch),avx2 hsw bdw),)
TARGET := avx2
else ifneq ($(filter $(arch),avx512 avx512f skx skl clx),)
else ifneq ($(filter $(arch),avx512lo),)
TARGET := avx512lo
else ifneq ($(filter $(arch),avx512 avx512hi avx512f skx skl clx),)
TARGET := avx512
else ifneq ($(filter $(arch),knl),)
TARGET := knl
Expand All @@ -73,7 +75,7 @@ else ifneq ($(filter $(arch),knc),)
else ifneq ($(filter $(arch),intel64 cpp),)
TARGET := intel64
else
$(error Target not recognized; use arch=avx512, avx2, avx, knl, knc, or intel64)
$(error Target not recognized; use arch=avx512, avx512lo, avx2, avx, knl, knc, or intel64)
endif

# Specify YK_STENCIL and/or YK_ARCH on 'make' cmd-line to name binaries differently.
Expand Down Expand Up @@ -108,6 +110,12 @@ else ifeq ($(TARGET),avx512)
CXX_ISA ?= -march=skylake-avx512
MACROS += USE_INTRIN512

else ifeq ($(TARGET),avx512lo)

ICC_ISA ?= -xCORE-AVX512
CXX_ISA ?= -march=skylake-avx512
MACROS += USE_INTRIN512LO

else ifeq ($(TARGET),avx2)

ICC_ISA ?= -xCORE-AVX2
Expand Down
22 changes: 11 additions & 11 deletions src/kernel/lib/alloc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ namespace yask {
os << endl;
}
#endif

// Base ptrs for all default-alloc'd data.
// These pointers will be shared by the ones in the var
// objects, which will take over ownership when these go
Expand All @@ -156,7 +156,7 @@ namespace yask {
#ifdef USE_PMEM
auto preferredNUMASize = opts->_numa_pref_max * 1024*1024*(size_t)1024;
#endif

// Pass 0: assign PMEM node when preferred NUMA node is not enough.
// Pass 1: count required size for each NUMA node, allocate chunk of memory at end.
// Pass 2: distribute parts of already-allocated memory chunk.
Expand Down Expand Up @@ -281,7 +281,7 @@ namespace yask {

// Get calculated max dist needed for this var.
int maxdist = gp->get_halo_exchange_l1_norm();

// Always use max dist with WF. Do this because edge
// and/or corner values may be needed in WF extensions
// even if not needed w/o WFs.
Expand All @@ -300,7 +300,7 @@ namespace yask {
"' (max L1-norm = " << maxdist << ")");
continue; // to next var.
}

// Lookup first & last domain indices and calc exchange sizes
// for this var.
bool found_delta = false;
Expand Down Expand Up @@ -640,7 +640,7 @@ namespace yask {

// At this point, we have all the buffers configured.
// Now we need to allocate space for them.

// Base ptrs for all alloc'd data.
// These pointers will be shared by the ones in the var
// objects, which will take over ownership when these go
Expand All @@ -656,7 +656,7 @@ namespace yask {

// Make sure pad is big enough for shm locks.
assert(_data_buf_pad >= sizeof(SimpleLock));

// Allocate MPI buffers.
// Pass 0: count required size, allocate chunk of memory at end.
// Pass 1: distribute parts of already-allocated memory chunk.
Expand Down Expand Up @@ -685,7 +685,7 @@ namespace yask {
for (auto& gtab : sb_ofs[gname])
gtab.resize(env->num_shm_ranks, 0);
}

// Visit buffers for each neighbor for this var.
var_mpi_data.visitNeighbors
([&](const IdxTuple& roffsets,
Expand All @@ -704,7 +704,7 @@ namespace yask {
numa_pref = _shmem_key;
assert(nshm_rank < env->num_shm_ranks);
}

// Send and recv.
for (int bd = 0; bd < MPIBufs::nBufDirs; bd++) {
auto& buf = var_mpi_data.getBuf(MPIBufs::BufDir(bd), roffsets);
Expand Down Expand Up @@ -758,7 +758,7 @@ namespace yask {
nbufs[numa_pref]++;
if (pass == 0)
TRACE_MSG(" MPI buf '" << buf.name << "' needs " <<
makeByteStr(sbytes) <<
makeByteStr(sbytes) <<
" (mem-key = " << numa_pref << ")");
}
} // snd/rcv.
Expand Down Expand Up @@ -828,7 +828,7 @@ namespace yask {
}
TRACE_MSG("allocScratchData: max mini-block size across pack(s) is " <<
mblksize.makeDimValStr(" * "));

// Pass 0: count required size, allocate chunk of memory at end.
// Pass 1: distribute parts of already-allocated memory chunk.
for (int pass = 0; pass < 2; pass++) {
Expand Down Expand Up @@ -900,5 +900,5 @@ namespace yask {

} // scratch-var passes.
}

} // namespace yask.
18 changes: 9 additions & 9 deletions src/kernel/lib/auto_tuner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ namespace yask {
_name += "(" + name + ")";
clear(settings->_do_auto_tune);
}

// Eval auto-tuner for given number of steps.
void StencilContext::eval_auto_tuner(idx_t num_steps) {
STATE_VARS(this);
Expand All @@ -57,7 +57,7 @@ namespace yask {
else
_at.eval();
}

// Reset auto-tuners.
void StencilContext::reset_auto_tuner(bool enable, bool verbose) {
for (auto& sp : stPacks)
Expand All @@ -77,7 +77,7 @@ namespace yask {
done = _at.is_done();
return !done;
}

// Apply auto-tuning immediately, i.e., not as part of normal processing.
// Will alter data in vars.
void StencilContext::run_auto_tuner_now(bool verbose) {
Expand Down Expand Up @@ -159,7 +159,7 @@ namespace yask {
DEBUG_MSG(_name << ": sub-block-size: " <<
_settings->_sub_block_sizes.removeDim(step_posn).makeDimValStr(" * "));
}

// Access settings.
bool AutoTuner::tune_mini_blks() const {
return _context->get_settings()->_tune_mini_blks;
Expand Down Expand Up @@ -210,7 +210,7 @@ namespace yask {
// Check whether sizes within search limits.
bool AutoTuner::checkSizes(const IdxTuple& bsize) {
bool ok = true;

// Too small?
if (ok && get_num_domain_points(bsize) < min_pts) {
n2small++;
Expand All @@ -228,7 +228,7 @@ namespace yask {
}
return ok;
}

// Evaluate the previous run and take next auto-tuner step.
void AutoTuner::eval() {
STATE_VARS(this);
Expand Down Expand Up @@ -257,7 +257,7 @@ namespace yask {
ctime << " secs (" << rate <<
" steps/sec) cumulative; best-rate = " << best_rate <<
"; min-secs = " << min_secs);

// Still in warmup?
if (in_warmup) {

Expand Down Expand Up @@ -413,7 +413,7 @@ namespace yask {
// Check sizes.
if (ok && !checkSizes(bsize))
ok = false;


// Valid size and not already checked?
if (ok && results.count(bsize) == 0) {
Expand Down Expand Up @@ -475,7 +475,7 @@ namespace yask {

// Restore step-dim value for block.
target_sizes()[step_posn] = target_steps;

// Change derived sizes to 0 so adjustSettings()
// will set them to the default.
if (!tune_mini_blks()) {
Expand Down
6 changes: 3 additions & 3 deletions src/kernel/lib/auto_tuner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ namespace yask {
// Null stream to throw away debug info.
yask_output_factory yof;
yask_output_ptr nullop = yof.new_null_output();

// Whether to print progress.
bool verbose = false;

Expand Down Expand Up @@ -112,12 +112,12 @@ namespace yask {
return tune_mini_blks() ?
_settings->_block_sizes : _settings->_region_sizes;
}

// Change settings pointer.
// Stores 'p' as-is in _settings; no null check or copy is made here.
void set_settings(KernelSettings* p) {
_settings = p;
}

// Reset all state to beginning.
void clear(bool mark_done, bool verbose = false);

Expand Down
Loading

0 comments on commit 9fdc2e3

Please sign in to comment.