diff --git a/README.md b/README.md index 0504391b..6d4c477f 100644 --- a/README.md +++ b/README.md @@ -48,11 +48,14 @@ and Intel(R) graphics processors. to make the generated code easier for humans to read. You'll get a warning when running `make` if one of these doesn't exist. Everything will still work, but the generated code will be difficult to read. - Reading the generated code is only necessary for debug or curiosity. + Reading the generated code is only necessary for debug, performance analysis, etc. * SWIG (4.0.0 or later): http://www.swig.org, for creating the Python interface. * Python 3 (3.6.1 or later): https://www.python.org/downloads, for creating and using the Python interface. + Included with Intel(R) oneAPI HPC Toolkit. + * Python `numpy` package for running Python interface tests. + Included with Intel(R) oneAPI HPC Toolkit. * Doxygen (1.9.0 or later): https://www.doxygen.nl, for creating updated API documentation. If you're not changing the API documentation, you can view the existing documentation diff --git a/docs/YASK-tutorial.pdf b/docs/YASK-tutorial.pdf index d4b45851..dd4bcdbb 100644 Binary files a/docs/YASK-tutorial.pdf and b/docs/YASK-tutorial.pdf differ diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp index caa77582..94dab876 100644 --- a/src/common/common_utils.cpp +++ b/src/common/common_utils.cpp @@ -44,7 +44,7 @@ namespace yask { // for numbers above 9 (at least up to 99). // Format: "major.minor.patch[-alpha|-beta]". - const string version = "4.04.04"; + const string version = "4.04.05"; string yask_get_version_string() { return version; @@ -57,10 +57,12 @@ namespace yask { // Return num with SI multiplier and "iB" suffix, // e.g., 412KiB. + // Use only for storage bytes, e.g., not for + // rates like bytes/sec. string make_byte_str(size_t nbytes) { if (!is_suffix_print_enabled) - return to_string(nbytes); + return to_string(nbytes) + " Bytes"; ostringstream os; double num = double(nbytes); @@ -89,7 +91,8 @@ namespace yask { } // Return num with SI multiplier, e.g. "3.14M". - // Use this one for rates, etc. + // Use this one for printing any number that is + // not number of storage bytes. string make_num_str(idx_t num) { if (!is_suffix_print_enabled || (num > -1000 && num < 1000)) return to_string(num); diff --git a/src/common/tuple.cpp b/src/common/tuple.cpp index 9427ee1b..e964b353 100644 --- a/src/common/tuple.cpp +++ b/src/common/tuple.cpp @@ -202,8 +202,8 @@ namespace yask { size_t prev_size = 1; // Loop thru dims. - int start_dim = _first_inner ? 0 : size()-1; - int end_dim = _first_inner ? size() : -1; + int start_dim = _first_inner ? 0 : get_num_dims()-1; + int end_dim = _first_inner ? get_num_dims() : -1; int step_dim = _first_inner ? 1 : -1; for (int di = start_dim; di != end_dim; di += step_dim) { auto& i = _q.at(di); @@ -242,8 +242,8 @@ namespace yask { size_t prev_size = 1; // Loop thru dims. - int start_dim = _first_inner ? 0 : size()-1; - int stop_dim = _first_inner ? size() : -1; + int start_dim = _first_inner ? 0 : get_num_dims()-1; + int stop_dim = _first_inner ? get_num_dims() : -1; int step_dim = _first_inner ? 1 : -1; for (int di = start_dim; di != stop_dim; di += step_dim) { auto& i = _q.at(di); diff --git a/src/compiler/Makefile b/src/compiler/Makefile index f4346609..bebe05bc 100644 --- a/src/compiler/Makefile +++ b/src/compiler/Makefile @@ -62,8 +62,8 @@ YC_CXX := $(CXX) YC_CXXOPT ?= -O2 YC_CXXDBG ?= -g YC_CXXFLAGS := -YC_CXXFLAGS_API := -std=c++17 -YC_CXXFLAGS_API += -Wall -Wno-unknown-pragmas -Wno-unused-variable +YC_CXXWARN := -Wall -Wno-unknown-pragmas -Wno-unused-variable +YC_CXXFLAGS_API := -std=c++17 $(YC_CXXWARN) YC_CXX_INCFLAGS := $(addprefix -I,$(YC_INC_DIRS)) YC_CXX_INCFLAGS_API := $(addprefix -I,$(INC_DIR)) diff --git a/src/kernel/Makefile b/src/kernel/Makefile index 570a8ecd..165d8387 100644 --- a/src/kernel/Makefile +++ b/src/kernel/Makefile @@ -31,13 +31,8 @@ YASK_BASE ?= $(abspath ../..) include $(YASK_BASE)/src/common/common.mk -# Initial default settings for the YASK kernel library. +# Default settings for invoking the YASK compiler. # These can be overridden on the 'make' command-line. -# See src/common/common.mk for more setting vars. -numa ?= 1 -allow_new_var_types ?= 1 -streaming_stores ?= 0 -use_rcp ?= 0 use_ptrs ?= 1 use_safe_ptrs ?= 0 outer_domain_layout ?= 0 @@ -45,6 +40,14 @@ inner_misc_layout ?= 1 first_inner ?= 1 early_loads ?= 0 min_buffer_len ?= 1 + +# Default settings for the YASK kernel library. +# These can be overridden on the 'make' command-line. +# See src/common/common.mk for more setting vars. +numa ?= 1 +allow_new_var_types ?= 1 +streaming_stores ?= 0 +use_rcp ?= 0 trace ?= 0 trace_mem ?= 0 check ?= 0 @@ -56,9 +59,7 @@ check ?= 0 # YASK compiler settings for offload. ifeq ($(offload),1) - pfd_l1 := 0 - pfd_l2 := 0 - inner_loop_dim := 1 + inner_loop_dim := 1 outer_domain_layout := 1 endif @@ -87,7 +88,7 @@ ifeq ($(TARGET),knl) VEC_MACROS += USE_RCP28 endif MACROS += NUMA_PREF=1 - pfd_l1 ?= 1 + pfd_l1 := 1 else ifeq ($(TARGET),avx512) @@ -156,18 +157,18 @@ endif ifneq ($(step_dim),) YC_FLAGS += -step-dim $(step_dim) endif -ifneq ($(pfd_l1),) - YC_FLAGS += -l1-prefetch-dist $(pfd_l1) -endif -ifneq ($(pfd_l2),) - YC_FLAGS += -l2-prefetch-dist $(pfd_l2) -endif ifneq ($(inner_loop_dim),) YC_FLAGS += -inner-loop-dim $(inner_loop_dim) endif ifneq ($(min_buffer_len),) YC_FLAGS += -min-buffer-len $(min_buffer_len) endif +ifneq ($(pfd_l1),) + YC_FLAGS += -l1-prefetch-dist $(pfd_l1) +endif +ifneq ($(pfd_l2),) + YC_FLAGS += -l2-prefetch-dist $(pfd_l2) +endif # Stencil compiler flags that are boolean. ifeq ($(use_ptrs),1) @@ -351,7 +352,7 @@ endif # Compiler-specific settings. # Create a compiler invocation to test for macro settings. -YK_CXX_TEST := $(YK_CXX) +YK_CXX_TEST := $(YK_CXXCMD) cxx_is_llvm_intel := $(call MACRO_DEF,$(YK_CXX_TEST),__INTEL_LLVM_COMPILER) cxx_is_clang := $(call MACRO_DEF,$(YK_CXX_TEST),__clang__) cxx_is_intel := $(call MACRO_DEF,$(YK_CXX_TEST),__INTEL_COMPILER) @@ -512,7 +513,8 @@ NANO_BLOCK_LOOP_ORDER ?= DOMAIN_LOOP_DIMS ifeq ($(offload),1) NANO_BLOCK_LOOP_OMP ?= omp target teams distribute thread_limit(thread_limit) device(KernelEnv::_omp_devn) NANO_BLOCK_LOOP_FLAGS += -omp '$(NANO_BLOCK_LOOP_OMP)' -NANO_BLOCK_LOOP_CODE := $(NANO_BLOCK_LOOP_MODS) omp loop($(NANO_BLOCK_LOOP_ORDER)) { } +NANO_BLOCK_LOOP_MODS += omp +NANO_BLOCK_LOOP_CODE := $(NANO_BLOCK_LOOP_MODS) loop($(NANO_BLOCK_LOOP_ORDER)) { } else NANO_BLOCK_LOOP_CODE := $(NANO_BLOCK_LOOP_MODS) loop($(NANO_BLOCK_LOOP_ORDER)) { } endif diff --git a/src/kernel/lib/alloc.cpp b/src/kernel/lib/alloc.cpp index ec09062b..136f5313 100644 --- a/src/kernel/lib/alloc.cpp +++ b/src/kernel/lib/alloc.cpp @@ -103,10 +103,14 @@ namespace yask { else if (numa_available() != -1) { numa_set_bind_policy(0); if (numa_pref >= 0 && numa_pref <= numa_max_node()) - numa_alloc_onnode(nbytes, numa_pref); + p = numa_alloc_onnode(nbytes, numa_pref); else - numa_alloc_local(nbytes); + p = numa_alloc_local(nbytes); // Interleaved not available. + + if (!p) + THROW_YASK_EXCEPTION("cannot allocate " + make_byte_str(nbytes) + + " using numa-node (or policy) " + to_string(numa_pref)); } else THROW_YASK_EXCEPTION("explicit NUMA policy allocation is not available"); @@ -158,15 +162,6 @@ namespace yask { #endif // not USE_NUMA_POLICY_LIB. - #else - THROW_YASK_EXCEPTION("NUMA allocation is not enabled; build with numa=1"); - #endif // USE_NUMA. - - // Should not get here w/null p; throw exception. - if (!p) - THROW_YASK_EXCEPTION("cannot allocate " + make_byte_str(nbytes) + - " using numa-node (or policy) " + to_string(numa_pref)); - // Check alignment. if ((size_t(p) & (CACHELINE_BYTES - 1)) != 0) FORMAT_AND_THROW_YASK_EXCEPTION("NUMA-allocated " << p << " is not " << @@ -177,6 +172,10 @@ namespace yask { // Return as a char* as required for shared_ptr ctor. return static_cast(p); + + #else + THROW_YASK_EXCEPTION("NUMA allocation is not enabled; build with numa=1"); + #endif // USE_NUMA. } // Reverse numa_alloc(). @@ -217,38 +216,36 @@ namespace yask { void *p = 0; + #ifdef USE_OFFLOAD + THROW_YASK_EXCEPTION("mapping offload device memory to shm not yet supported; " + "use '-no-use_shm' option"); + // Allocate using MPI shm. - #ifdef USE_MPI + #elif defined(USE_MPI) assert(shm_comm); assert(shm_win); MPI_Info win_info; MPI_Info_create(&win_info); MPI_Info_set(win_info, "alloc_shared_noncontig", "true"); MPI_Win_allocate_shared(nbytes, 1, win_info, *shm_comm, &p, shm_win); - MPI_Info_free(&win_info); - MPI_Win_lock_all(0, *shm_win); - #else - THROW_YASK_EXCEPTION("MPI shm allocation is not enabled; build with mpi=1"); - #endif - if (!p) THROW_YASK_EXCEPTION("cannot allocate " + make_byte_str(nbytes) + " using MPI shm"); + MPI_Info_free(&win_info); + MPI_Win_lock_all(0, *shm_win); // Check alignment. if ((size_t(p) & (CACHELINE_BYTES - 1)) != 0) FORMAT_AND_THROW_YASK_EXCEPTION("MPI shm-allocated " << p << " is not " << CACHELINE_BYTES << "-byte aligned"); - #ifdef USE_OFFLOAD - THROW_YASK_EXCEPTION("mapping offload device memory to shm not yet supported; " - "use '-no-use_shm'"); - #endif - // Cannot typically use huge pages for shm, so not calling set_huge(). // Return as a char* as required for shared_ptr ctor. return static_cast(p); + #else + THROW_YASK_EXCEPTION("MPI shm allocation is not enabled; build with mpi=1"); + #endif } // Reverse shm_alloc(). diff --git a/src/kernel/lib/auto_tuner.cpp b/src/kernel/lib/auto_tuner.cpp index 27beadeb..7bbaf6fc 100644 --- a/src/kernel/lib/auto_tuner.cpp +++ b/src/kernel/lib/auto_tuner.cpp @@ -431,8 +431,6 @@ namespace yask { } } // beyond next neighbor of center. } // while(true) search for new setting to try. - - THROW_YASK_EXCEPTION("(internal fault) exited from infinite loop"); } // eval. // Apply best settings if avail, and adjust other settings. diff --git a/src/kernel/lib/indices.hpp b/src/kernel/lib/indices.hpp index 2cde897a..9ff4e0c2 100644 --- a/src/kernel/lib/indices.hpp +++ b/src/kernel/lib/indices.hpp @@ -544,6 +544,7 @@ namespace yask { static_assert(std::is_trivially_copyable::value, "Needed for OpenMP offload"); + #if 0 // Define OMP reductions on Indices. #pragma omp declare reduction(min_idxs : Indices : \ omp_out = omp_out.min_elements(omp_in) ) \ @@ -551,6 +552,7 @@ namespace yask { #pragma omp declare reduction(max_idxs : Indices : \ omp_out = omp_out.max_elements(omp_in) ) \ initializer (omp_priv = omp_orig) + #endif // Layout base class. // This class hierarchy is NOT virtual. diff --git a/src/kernel/yask.sh b/src/kernel/yask.sh index 24b3f3a8..fab7dac0 100755 --- a/src/kernel/yask.sh +++ b/src/kernel/yask.sh @@ -181,6 +181,7 @@ while true; do echo " -v" echo " Shortcut for the following options:" echo " $val" + echo " Adds '/tests' to path of log_dir." echo " If you want to override any of these values, place them after '-v'." echo " -show_arch" echo " Print the default architecture string and exit." @@ -357,6 +358,9 @@ dump="head -v -n -0" # Init log file. : ${logfile:=yask.$stencil.$arch.$exe_host.n$nnodes.r$nranks.`date +%Y-%m-%d_%H-%M-%S`_p$$.log} if [[ -n "$logdir" ]]; then + if [[ $doval == 1 ]]; then + logdir="$logdir/tests" + fi logfile="$logdir/$logfile" fi echo "Writing log to '$logfile'." diff --git a/utils/bin/gen_loops.pl b/utils/bin/gen_loops.pl index 6b70227f..30075878 100755 --- a/utils/bin/gen_loops.pl +++ b/utils/bin/gen_loops.pl @@ -961,8 +961,14 @@ ($) # use OpenMP on next loop. elsif (lc $tok eq 'omp') { - $features |= $bOmpPar; - print "info: using OpenMP on following loop(s).\n"; + if ($OPT{omp} !~ /\w/) { + warn "info: ignoring OpenMP loop modifier because '-omp' argument is empty.\n"; + } + + else { + $features |= $bOmpPar; + print "info: using OpenMP on following loop(s).\n"; + } } # generate manual-scheduling optimizations in next loop. @@ -1238,7 +1244,7 @@ () " tiled: generate tiled scan within a >1D loop.\n", " serpentine: generate reverse scan when enclosing loop index is odd.*\n", " square_wave: generate 2D square-wave scan for two innermost dims of >1D loop.*\n", - " * Do not use these modifiers for YASK rank or block loops because they must\n", + " * Do not use these modifiers for YASK block or Mega-block loops because they must\n", " execute with strictly-increasing indices when using temporal tiling.\n", " Also, do not combile these modifiers with 'tiled' or 'manual'.\n", "A 'ScanIndices' type must be defined in C++ code prior to including the generated code.\n",