diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp
index cc732b77..ff19867b 100644
--- a/src/common/common_utils.cpp
+++ b/src/common/common_utils.cpp
@@ -47,7 +47,7 @@ namespace yask {
     // https://semver.org/.
 
     // Format: "major.minor.patch[-alpha|-beta]".
-    const string version = "4.04.06";
+    const string version = "4.04.07";
 
     string yask_get_version_string() {
         return version;
diff --git a/src/kernel/Makefile b/src/kernel/Makefile
index c3f6578c..7ca55097 100644
--- a/src/kernel/Makefile
+++ b/src/kernel/Makefile
@@ -70,9 +70,19 @@ endif
 
 # YASK compiler settings for offload.
 ifeq ($(offload),1)
- inner_loop_dim 	:=	1
- outer_domain_layout	:=	1
+
+ # Best-known methods (BKMs) for Intel GPUs.
+ ifeq ($(cxx_is_llvm_intel),1)
+  outer_domain_layout	:=	1
+  early_loads		:=	0
+  min_buffer_len	:=	1
+  inner_loop_dim 	:=	1
+ endif
+
+ # Best-known methods (BKMs) for NVIDIA GPUs.
  ifeq ($(cxx_is_nv),1)
+  outer_domain_layout	:=	0
+  early_loads		:=	0
   min_buffer_len	:=	99
  endif
 endif
@@ -482,7 +492,11 @@ RANK_LOOP_CODE		:=	$(RANK_LOOP_MODS) loop($(RANK_LOOP_ORDER)) { }
 # 'omp' modifier creates an outer OpenMP loop so that each block is assigned
 # to a top-level OpenMP thread.
 MEGA_BLOCK_LOOP_MODS	:=
-MEGA_BLOCK_LOOP_OMP	:=	omp parallel for schedule(dynamic,1) proc_bind(spread)
+ifeq ($(cxx_is_llvm_intel),1)
+ MEGA_BLOCK_LOOP_OMP	:=	omp parallel for schedule(dynamic,1) proc_bind(spread)
+else
+ MEGA_BLOCK_LOOP_OMP	:=	omp parallel for schedule(dynamic,1)
+endif
 MEGA_BLOCK_LOOP_FLAGS	:=	-prefix mega_block_ -omp '$(MEGA_BLOCK_LOOP_OMP)'
 MEGA_BLOCK_LOOP_ORDER	:=	DOMAIN_LOOP_DIMS
 MEGA_BLOCK_LOOP_CODE	:=	$(MEGA_BLOCK_LOOP_MODS) omp loop($(MEGA_BLOCK_LOOP_ORDER)) { } 
@@ -501,7 +515,11 @@ BLOCK_LOOP_CODE		:=	$(BLOCK_LOOP_MODS) loop($(BLOCK_LOOP_ORDER)) { }
 # nested OpenMP thread. The OpenMP construct is not used when running with
 # '-bind_inner_threads' because another parallel section is created.
 MICRO_BLOCK_LOOP_MODS	:=
-MICRO_BLOCK_LOOP_OMP	:=	omp parallel for schedule(static,1) proc_bind(spread)
+ifeq ($(cxx_is_llvm_intel),1)
+ MICRO_BLOCK_LOOP_OMP	:=	omp parallel for schedule(static,1) proc_bind(spread)
+else
+ MICRO_BLOCK_LOOP_OMP	:=	omp parallel for schedule(static,1)
+endif
 MICRO_BLOCK_LOOP_FLAGS	:=	-prefix micro_block_ -omp '$(MICRO_BLOCK_LOOP_OMP)'
 MICRO_BLOCK_LOOP_ORDER	:=	DOMAIN_LOOP_DIMS
 MICRO_BLOCK_LOOP_CODE	:=	$(MICRO_BLOCK_LOOP_MODS) omp loop($(MICRO_BLOCK_LOOP_ORDER)) { }
diff --git a/src/kernel/lib/stencil_calc.cpp b/src/kernel/lib/stencil_calc.cpp
index bdba42fc..83c9b11c 100644
--- a/src/kernel/lib/stencil_calc.cpp
+++ b/src/kernel/lib/stencil_calc.cpp
@@ -205,7 +205,7 @@ namespace yask {
                     // Start threads within a block.  Each of these threads
                     // will eventually work on a separate nano-block.  This
                     // is nested within an OMP outer thread.
-                    _Pragma("omp parallel proc_bind(spread)") {
+                    _Pragma("omp parallel") {
                         assert(omp_get_level() == 2);
                         assert(omp_get_num_threads() == nbt);
                         int inner_thread_idx = omp_get_thread_num();
diff --git a/src/kernel/yask.sh b/src/kernel/yask.sh
index fab7dac0..3d7a2408 100755
--- a/src/kernel/yask.sh
+++ b/src/kernel/yask.sh
@@ -448,7 +448,7 @@ if [[ $doval == 1 ]]; then
 fi
 
 # Commands to capture some important system status and config info for benchmark documentation.
-config_cmds="sleep 1; uptime; lscpu; cpuinfo -A; sed '/^$/q' /proc/cpuinfo; cpupower frequency-info; uname -a; $dump /etc/system-release; $dump /proc/cmdline; $dump /proc/meminfo; free -gt; numactl -H; ulimit -a; ipcs -l; env | awk '/YASK/ { print \"env:\", \$1 }'"
+config_cmds="sleep 1; uptime; lscpu; cpuinfo -A; sed '/^$/q' /proc/cpuinfo; cpupower frequency-info; uname -a; $dump /etc/system-release; $dump /proc/cmdline; $dump /proc/meminfo; free -gt; numactl -H; ulimit -a; ipcs -l; module list; env | awk '/YASK/ { print \"env:\", \$1 }'"
 
 # Add settings for offload kernel.
 if [[ $is_offload == 1 ]]; then