Skip to content

Commit

Permalink
Merge pull request #211 from intel/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
chuckyount authored Mar 29, 2019
2 parents 5b925cb + 3e2350e commit 0034352
Show file tree
Hide file tree
Showing 15 changed files with 109 additions and 76 deletions.
3 changes: 2 additions & 1 deletion include/yask_kernel_api.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,14 +82,15 @@ namespace yask {
/// Bootstrap factory to create a stencil solution.
class yk_factory {
public:
yk_factory();
virtual ~yk_factory() {}

/// Version information.
/**
@returns String describing the current version.
*/
virtual std::string
get_version_string();
get_version_string();

/// Create an object to hold environment information.
/**
Expand Down
2 changes: 1 addition & 1 deletion src/common/common_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ namespace yask {
// for numbers above 9 (at least up to 99).

// Format: "major.minor.patch".
const string version = "2.19.02";
const string version = "2.19.03";

string yask_get_version_string() {
return version;
Expand Down
1 change: 1 addition & 0 deletions src/common/common_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ inline void omp_set_num_threads(int n) { }
// Stand-in definitions for a subset of the OpenMP runtime API.
// Presumably compiled only when the real OpenMP runtime is not available;
// confirm against the enclosing preprocessor conditional, which is not
// visible in this hunk. Each stub ignores its arguments.

// No-op: the requested nesting setting is ignored.
inline void omp_set_nested(int n) { }
// Always reports a maximum of one active level.
inline int omp_get_max_active_levels() { return 1; }
// No-op: the requested maximum is ignored.
inline void omp_set_max_active_levels(int n) { }
// Always reports level 1.
// NOTE(review): the OpenMP specification defines omp_get_level() as 0
// outside of any parallel region; confirm that 1 is the intended value here.
inline int omp_get_level() { return 1; }
// No-op: the lock object is not touched.
inline void omp_init_lock(omp_lock_t* p) { }
// Always reports success without touching the lock object.
inline bool omp_set_lock(omp_lock_t* p) { return true; }
// No-op: the lock object is not touched.
inline void omp_unset_lock(omp_lock_t* p) { }
Expand Down
83 changes: 39 additions & 44 deletions src/kernel/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -71,56 +71,48 @@ else ifeq ($(YC_STENCIL),cube)
MACROS += MAX_EXCH_DIST=3

else ifneq ($(findstring iso3dfd,$(YC_STENCIL)),)
MACROS += MAX_EXCH_DIST=1
def_pad_args := -ep 1
MACROS += MAX_EXCH_DIST=1
def_pad_args := -ep 1
ifeq ($(arch),knl)
fold_4byte := x=2,y=8
cluster := x=2
def_block_args := -bx 160 -by 256 -bz 96
pfd_l1 := 1
pfd_l2 := 0
fold_4byte := x=2,y=8
cluster := x=2
def_block_args := -bx 160 -by 256 -bz 96
pfd_l1 := 1
pfd_l2 := 0
else ifneq ($(filter $(arch),hsw bdw),)
def_thread_divisor := 2
def_block_threads := 1
def_block_args := -bx 296 -by 5 -bz 290
cluster := z=2
pfd_l1 := 0
pfd_l2 := 0
def_block_args := -bx 48 -by 60 -bz 112
pfd_l1 := 0
pfd_l2 := 0
else ifneq ($(filter $(arch),skx skl clx),)
def_thread_divisor := 1
def_block_threads := 2
def_block_args := -bx 108 -by 28 -bz 132
cluster := x=1
pfd_l1 := 0
pfd_l2 := 2
def_block_args := -bx 108 -by 28 -bz 132
pfd_l1 := 0
pfd_l2 := 2
endif

else ifneq ($(findstring awp,$(YC_STENCIL)),)
def_block_args := -b 32
YC_FLAGS += -min-es 1
def_pad_args := -ep 1
YC_FLAGS += -min-es 1
def_pad_args := -ep 1
ifeq ($(arch),knl)
def_thread_divisor := 2
def_block_threads := 4
def_block_args := -b 48 -bx 112
pfd_l1 := 1
pfd_l2 := 0
def_thread_divisor := 2
def_block_threads := 4
def_block_args := -bx 48 -by 48 -bx 112
pfd_l1 := 1
pfd_l2 := 0
else ifneq ($(filter $(arch),hsw bdw),)
cluster := y=2
def_block_args := -bx 128 -by 16 -bz 32
more_def_args += -sbx 32 -sby 2 -sbz 32
pfd_l1 := 1
pfd_l2 := 2
def_block_args := -bx 64 -by 8 -bz 64
pfd_l1 := 1
pfd_l2 := 2
else ifneq ($(filter $(arch),skx skl clx),)
def_block_args := -bx 44 -by 8 -bz 108
pfd_l1 := 1
pfd_l2 := 0
def_block_args := -bx 64 -by 8 -bz 108
pfd_l1 := 1
pfd_l2 := 0
endif

else ifneq ($(findstring ssg,$(YC_STENCIL)),)
ifneq ($(filter $(arch),skx skl clx),)
ifneq ($(filter $(arch),hsw bdw),)
def_block_args := -bx 64 -by 16 -bz 96
else ifneq ($(filter $(arch),skx skl clx),)
def_block_args := -bx 96 -by 16 -bz 80
def_block_threads := 2
endif

else ifneq ($(findstring fsg,$(YC_STENCIL)),)
Expand All @@ -131,16 +123,16 @@ else ifneq ($(findstring fsg,$(YC_STENCIL)),)
def_block_threads := 2
pfd_l1 := 0
pfd_l2 := 2
else ifneq ($(filter $(arch),hsw bdw),)
def_block_args := -bx 48 -by 4 -bz 128
else ifneq ($(filter $(arch),skx skl clx),)
def_block_args := -bx 188 -by 12 -bz 24
def_block_threads := 1
endif

else ifeq ($(YC_STENCIL),tti)
MACROS += MAX_EXCH_DIST=3
ifneq ($(filter $(arch),skx skl clx),)
def_block_args := -bx 80 -by 16 -bz 40
def_block_threads := 1
endif

else ifeq ($(YC_STENCIL),stream)
Expand Down Expand Up @@ -170,7 +162,7 @@ else ifeq ($(arch),knl)
GCXX_ISA ?= -march=knl
MACROS += USE_INTRIN512 USE_RCP28 NUMA_PREF=1
YC_TARGET ?= avx512
def_block_args ?= -b 96
def_block_args ?= -b 64
def_block_threads ?= 8
pfd_l1 ?= 1
pfd_l2 ?= 0
Expand Down Expand Up @@ -227,7 +219,7 @@ omp_block_schedule ?= static,1
omp_misc_schedule ?= guided
def_thread_divisor ?= 1
def_block_threads ?= 2
def_block_args ?= -b 64
def_block_args ?= -b 32
cluster ?= x=1
pfd_l1 ?= 0
pfd_l2 ?= 2
Expand Down Expand Up @@ -391,7 +383,7 @@ endif
# Add options for PMEM.
ifeq ($(pmem),1)
YK_CXXFLAGS += -I/opt/intel/memkind/include
YK_LIBS += -L/opt/intel/memkind/lib -lmemkind
YK_LIBS += -L/opt/intel/memkind/lib -lmemkind
MACROS += USE_PMEM
endif

Expand Down Expand Up @@ -440,10 +432,12 @@ endif
# VTUNE settings.
ifeq ($(vtune),1)
MACROS += USE_VTUNE
ifneq ($(VTUNE_AMPLIFIER_2019_DIR),)
VTUNE_DIR := $(VTUNE_AMPLIFIER_2019_DIR)
ifneq ($(VTUNE_AMPLIFIER_2020_DIR),)
VTUNE_DIR := $(VTUNE_AMPLIFIER_2020_DIR)
else ifneq ($(VTUNE_AMPLIFIER_2019_DIR),)
VTUNE_DIR := $(VTUNE_AMPLIFIER_2019_DIR)
else ifneq ($(VTUNE_AMPLIFIER_2018_DIR),)
VTUNE_DIR := $(VTUNE_AMPLIFIER_2018_DIR)
else ifneq ($(VTUNE_AMPLIFIER_XE_2017_DIR),)
VTUNE_DIR := $(VTUNE_AMPLIFIER_XE_2017_DIR)
else ifneq ($(VTUNE_AMPLIFIER_XE_2016_DIR),)
Expand Down Expand Up @@ -850,6 +844,7 @@ stencil-tests:
$(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_misc_2d fold=x=2,y=2 EXTRA_YC_FLAGS=-interleave-misc
$(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_step_cond_1d fold=x=4
$(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_subdomain_1d fold=x=4
$(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_subdomain_2d fold=x=2,y=2
$(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_subdomain_3d fold=x=2,y=2
$(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_scratch_1d fold=x=4
$(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_scratch_3d fold=x=2,z=2
Expand Down
8 changes: 8 additions & 0 deletions src/kernel/lib/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,9 @@ namespace yask {
STATE_VARS(this);
run_time.start();

// Start vtune collection.
VTUNE_RESUME;

// Determine step dir from order of first/last.
idx_t step_dir = (last_step_index >= first_step_index) ? 1 : -1;

Expand Down Expand Up @@ -553,7 +556,12 @@ namespace yask {
cache_model.disable();
}
#endif

// Stop vtune collection.
VTUNE_PAUSE;

run_time.stop();

} // run_solution().

// Calculate results within a region. Each region is typically computed
Expand Down
4 changes: 2 additions & 2 deletions src/kernel/lib/generic_grids.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ namespace yask {
return _layout_base->get_sizes();
}
void set_dim_sizes(const Indices& sizes) {
for (int i = 0; i < _grid_dims.size(); i++)
for (int i = 0; size_t(i) < _grid_dims.size(); i++)
_grid_dims.setVal(i, sizes[i]);
_sync_layout_with_dims();
}
Expand Down Expand Up @@ -305,7 +305,7 @@ namespace yask {
virtual idx_t get_index(const Indices& idxs, bool check=true) const final {
#ifdef CHECK
if (check) {
for (int i = 0; i < this->_grid_dims.size(); i++) {
for (int i = 0; size_t(i) < this->_grid_dims.size(); i++) {
idx_t j = idxs[i];
assert(j >= 0);
assert(j < this->_grid_dims.getVal(i));
Expand Down
4 changes: 2 additions & 2 deletions src/kernel/lib/indices.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,9 @@ namespace yask {
// Write to an IdxTuple.
// The 'tgt' must have the same number of dims.
void setTupleVals(IdxTuple& tgt) const {
assert(tgt.size() == _ndims);
assert(tgt.size() == size_t(_ndims));
for (int i = 0; i < _ndims; i++)
if (i < tgt.size())
if (size_t(i) < tgt.size())
tgt.setVal(i, _idxs[i]);
}

Expand Down
2 changes: 1 addition & 1 deletion src/kernel/lib/realv_grids.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -995,7 +995,7 @@ namespace yask {
int line) const {
STATE_VARS_CONST(this);
TRACE_MEM_MSG("prefetchVecNorm<" << level << ">(" <<
makeIndexString(vec_idxs.multElements(_vec_lens)) << ")");
makeIndexString(vec_idxs.mulElements(_vec_lens)) << ")");

auto p = getVecPtrNorm(vec_idxs, alloc_step_idx, false);
prefetch<level>(p);
Expand Down
2 changes: 2 additions & 0 deletions src/kernel/lib/settings.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,8 @@ namespace yask {
DimsPtr _dims;

// Sizes in elements (points).
// All these tuples contain stencil dims, even the ones that
// don't strictly need them.
IdxTuple _global_sizes; // Overall problem domain sizes.
IdxTuple _rank_sizes; // This rank's domain sizes.
IdxTuple _region_sizes; // region size (used for wave-front tiling).
Expand Down
6 changes: 6 additions & 0 deletions src/kernel/lib/setup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,12 @@ using namespace std;

namespace yask {

// yk_factory ctor.
// Stop collecting VTune data as soon as a factory is constructed, so that
// work done before the solution is run is not profiled.
// Even better to use the -start-paused option (presumably an option of the
// VTune collector; confirm), which avoids collecting anything before this
// ctor runs.
yk_factory::yk_factory() {
// VTUNE_PAUSE is a macro defined outside this hunk.
VTUNE_PAUSE;
}

// ScanIndices ctor.
ScanIndices::ScanIndices(const Dims& dims, bool use_vec_align, IdxTuple* ofs) :
ndims(NUM_STENCIL_DIMS),
Expand Down
4 changes: 2 additions & 2 deletions src/kernel/lib/soln_apis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -257,8 +257,8 @@ namespace yask {
#endif
os <<
" vector-len: " << VLEN << endl <<
" extra-padding: " << opts->_extra_pad_sizes.makeDimValStr() << endl <<
" minimum-padding: " << opts->_min_pad_sizes.makeDimValStr() << endl <<
" extra-padding: " << opts->_extra_pad_sizes.removeDim(step_posn).makeDimValStr() << endl <<
" minimum-padding: " << opts->_min_pad_sizes.removeDim(step_posn).makeDimValStr() << endl <<
" L1-prefetch-distance: " << PFD_L1 << endl <<
" L2-prefetch-distance: " << PFD_L2 << endl <<
" max-halos: " << max_halos.makeDimValStr() << endl;
Expand Down
4 changes: 2 additions & 2 deletions src/kernel/lib/yask.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,11 +145,11 @@ typedef std::uint64_t uidx_t;

// macro for debug message.
#ifdef TRACE
#define TRACE_MSG0(os, msg) if (opts->_trace) { \
#define TRACE_MSG0(os, msg) do { if (opts->_trace) { \
KernelEnv::set_debug_lock(); \
(os) << "YASK: " << msg << std::endl << std::flush; \
KernelEnv::unset_debug_lock(); \
} else (void)0
} } while(0)
#else
#define TRACE_MSG0(os, msg) ((void)0)
#endif
Expand Down
2 changes: 1 addition & 1 deletion src/kernel/lib/yask_stencil.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ IN THE SOFTWARE.
// First/last index macros.
// These are relative to global problem, not rank.
#define FIRST_INDEX(dim) (0)
#define LAST_INDEX(dim) (_context->get_settings().get()->_global_sizes[DOMAIN_DIM_IDX_ ## dim] - 1)
#define LAST_INDEX(dim) (_context->get_settings().get()->_global_sizes[STENCIL_DIM_IDX_ ## dim] - 1)

// Macros for 1D<->nD transforms.
#include "yask_layout_macros.hpp"
Expand Down
32 changes: 12 additions & 20 deletions src/kernel/yask_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,12 +191,13 @@ struct AppSettings : public KernelSettings {
// Exit with help message if requested.
void splash(ostream& os, int argc, char** argv)
{
// See https://en.wikipedia.org/wiki/Box-drawing_character.
os <<
"┌────────────────────────────────────────────┐\n"
"Y.A.S.K. ── Yet Another Stencil Kernel \n"
" https://github.com/intel/yask \n"
"Copyright (c) 2014-2019, Intel Corporation \n"
"└────────────────────────────────────────────┘\n"
" \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n"
" \u2502 Y.A.S.K. \u2500\u2500 Yet Another Stencil Kernel \u2502\n"
" \u2502 https://github.com/intel/yask \u2502\n"
" \u2502 Copyright (c) 2014-2019, Intel Corporation \u2502\n"
" \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n"
"\n"
"Version: " << yask_get_version_string() << endl <<
"Stencil name: " YASK_STENCIL_NAME << endl;
Expand Down Expand Up @@ -240,14 +241,10 @@ int main(int argc, char** argv)
// just a line.
string divLine;
for (int i = 0; i < 70; i++)
divLine += "";
divLine += "\u2500";
divLine += "\n";

try {
// Stop collecting VTune data.
// Even better to use -start-paused option.
VTUNE_PAUSE;

// Bootstrap factories from kernel API.
yk_factory kfac;
yask_output_factory yof;
Expand Down Expand Up @@ -415,17 +412,11 @@ int main(int argc, char** argv)
}
kenv->global_barrier();

// Start vtune collection.
VTUNE_RESUME;

// Actual work.
context->clear_timers();
ksoln->run_solution(first_t, last_t);
kenv->global_barrier();

// Stop vtune collection.
VTUNE_PAUSE;

// Calc and report perf.
auto tstats = context->get_stats();
auto stats = dynamic_pointer_cast<Stats>(tstats);
Expand Down Expand Up @@ -466,15 +457,16 @@ int main(int argc, char** argv)
" mid-throughput (num-points/sec): " << makeNumStr(mid_trial->pts_ps) << endl <<
divLine <<
"Notes:\n"
" The 50th-percentile trial is the same as the median trial\n"
" when there is an odd number of trials. When there is an even\n"
" number of trials, the nearest-rank method is used. An odd\n"
" number of trials is recommended.\n"
" Num-reads/sec, num-writes/sec, and FLOPS are metrics based on\n"
" stencil specifications and can vary due to differences in\n"
" implementations and optimizations.\n"
" Num-points/sec is based on overall problem size and is\n"
" a more reliable performance metric, esp. when comparing\n"
" across implementations.\n"
" The 50th-percentile trial is the same as the median trial\n"
" when there is an odd number of trials. When there is an even\n"
" number of trials, the nearest-rank method is used.\n";
" across implementations.\n";
context->print_warnings();
}

Expand Down
Loading

0 comments on commit 0034352

Please sign in to comment.