diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp index cc732b77..ff19867b 100644 --- a/src/common/common_utils.cpp +++ b/src/common/common_utils.cpp @@ -47,7 +47,7 @@ namespace yask { // https://semver.org/. // Format: "major.minor.patch[-alpha|-beta]". - const string version = "4.04.06"; + const string version = "4.04.07"; string yask_get_version_string() { return version; diff --git a/src/kernel/Makefile b/src/kernel/Makefile index c3f6578c..7ca55097 100644 --- a/src/kernel/Makefile +++ b/src/kernel/Makefile @@ -70,9 +70,19 @@ endif # YASK compiler settings for offload. ifeq ($(offload),1) - inner_loop_dim := 1 - outer_domain_layout := 1 + + # BKMs for Intel GPUs. + ifeq ($(cxx_is_llvm_intel),1) + outer_domain_layout := 1 + early_loads := 0 + min_buffer_len := 1 + inner_loop_dim := 1 + endif + + # BKMs for Nvidia GPUs. ifeq ($(cxx_is_nv),1) + outer_domain_layout := 0 + early_loads := 0 min_buffer_len := 99 endif endif @@ -482,7 +492,11 @@ RANK_LOOP_CODE := $(RANK_LOOP_MODS) loop($(RANK_LOOP_ORDER)) { } # 'omp' modifier creates an outer OpenMP loop so that each block is assigned # to a top-level OpenMP thread. MEGA_BLOCK_LOOP_MODS := -MEGA_BLOCK_LOOP_OMP := omp parallel for schedule(dynamic,1) proc_bind(spread) +ifeq ($(cxx_is_llvm_intel),1) + MEGA_BLOCK_LOOP_OMP := omp parallel for schedule(dynamic,1) proc_bind(spread) +else + MEGA_BLOCK_LOOP_OMP := omp parallel for schedule(dynamic,1) +endif MEGA_BLOCK_LOOP_FLAGS := -prefix mega_block_ -omp '$(MEGA_BLOCK_LOOP_OMP)' MEGA_BLOCK_LOOP_ORDER := DOMAIN_LOOP_DIMS MEGA_BLOCK_LOOP_CODE := $(MEGA_BLOCK_LOOP_MODS) omp loop($(MEGA_BLOCK_LOOP_ORDER)) { } @@ -501,7 +515,11 @@ BLOCK_LOOP_CODE := $(BLOCK_LOOP_MODS) loop($(BLOCK_LOOP_ORDER)) { } # nested OpenMP thread. The OpenMP construct is not used when running with # '-bind_inner_threads' because another parallel section is created. MICRO_BLOCK_LOOP_MODS := -MICRO_BLOCK_LOOP_OMP := omp parallel for schedule(static,1) proc_bind(spread) +ifeq ($(cxx_is_llvm_intel),1) + MICRO_BLOCK_LOOP_OMP := omp parallel for schedule(static,1) proc_bind(spread) +else + MICRO_BLOCK_LOOP_OMP := omp parallel for schedule(static,1) +endif MICRO_BLOCK_LOOP_FLAGS := -prefix micro_block_ -omp '$(MICRO_BLOCK_LOOP_OMP)' MICRO_BLOCK_LOOP_ORDER := DOMAIN_LOOP_DIMS MICRO_BLOCK_LOOP_CODE := $(MICRO_BLOCK_LOOP_MODS) omp loop($(MICRO_BLOCK_LOOP_ORDER)) { } diff --git a/src/kernel/lib/stencil_calc.cpp b/src/kernel/lib/stencil_calc.cpp index bdba42fc..83c9b11c 100644 --- a/src/kernel/lib/stencil_calc.cpp +++ b/src/kernel/lib/stencil_calc.cpp @@ -205,7 +205,7 @@ namespace yask { // Start threads within a block. Each of these threads // will eventually work on a separate nano-block. This // is nested within an OMP outer thread. - _Pragma("omp parallel proc_bind(spread)") { + _Pragma("omp parallel") { assert(omp_get_level() == 2); assert(omp_get_num_threads() == nbt); int inner_thread_idx = omp_get_thread_num(); diff --git a/src/kernel/yask.sh b/src/kernel/yask.sh index fab7dac0..3d7a2408 100755 --- a/src/kernel/yask.sh +++ b/src/kernel/yask.sh @@ -448,7 +448,7 @@ if [[ $doval == 1 ]]; then fi # Commands to capture some important system status and config info for benchmark documentation. -config_cmds="sleep 1; uptime; lscpu; cpuinfo -A; sed '/^$/q' /proc/cpuinfo; cpupower frequency-info; uname -a; $dump /etc/system-release; $dump /proc/cmdline; $dump /proc/meminfo; free -gt; numactl -H; ulimit -a; ipcs -l; env | awk '/YASK/ { print \"env:\", \$1 }'" +config_cmds="sleep 1; uptime; lscpu; cpuinfo -A; sed '/^$/q' /proc/cpuinfo; cpupower frequency-info; uname -a; $dump /etc/system-release; $dump /proc/cmdline; $dump /proc/meminfo; free -gt; numactl -H; ulimit -a; ipcs -l; module list; env | awk '/YASK/ { print \"env:\", \$1 }'" # Add settings for offload kernel. if [[ $is_offload == 1 ]]; then