diff --git a/include/yask_common_api.hpp b/include/yask_common_api.hpp
index 1891fda6..fdcb662a 100644
--- a/include/yask_common_api.hpp
+++ b/include/yask_common_api.hpp
@@ -33,6 +33,7 @@ IN THE SOFTWARE.
 #define YASK_COMMON_API
 
 #include <string>
+#include <vector>
 #include <iostream>
 #include <ostream>
 #include <memory>
@@ -204,6 +205,55 @@ namespace yask {
         virtual ~yask_null_output() {}
     };
 
+    /// Create finite-difference (FD) coefficients for the standard center form.
+    /**
+       Find FD coefficients with `radius` sample points to both the left and right
+       of the center sample and evaluation point on a uniformly-spaced grid.
+       The FD has `radius * 2`-order accuracy. An exception is thrown if `radius` is less than `1`.
+       @returns `radius * 2 + 1` FD coefficients, ordered from the leftmost sample to the rightmost.
+    */
+    std::vector<double>
+    get_center_fd_coefficients(int derivative_order
+                               /**< [in] `1` for 1st derivative, `2` for 2nd, etc. */,
+                               int radius
+                               /**< [in] Number of points to either side of the center point. */ );
+
+    /// Create finite-difference (FD) coefficients for the standard forward form.
+    /**
+       Find FD coefficients with `accuracy_order` sample points to the right
+       of the center sample and evaluation point on a uniformly-spaced grid. An exception is thrown if `accuracy_order` is less than `1`.
+       @returns `accuracy_order + 1` FD coefficients, ordered from the center sample rightward.
+    */
+    std::vector<double>
+    get_forward_fd_coefficients(int derivative_order
+                                /**< [in] `1` for 1st derivative, `2` for 2nd, etc. */,
+                                int accuracy_order
+                                /**< [in] Number of points to the right of the center point. */ );
+    
+    /// Create finite-difference (FD) coefficients for the standard backward form.
+    /**
+       Find FD coefficients with `accuracy_order` sample points to the left
+       of the center sample and evaluation point on a uniformly-spaced grid. An exception is thrown if `accuracy_order` is less than `1`.
+       @returns `accuracy_order + 1` FD coefficients, ordered from the leftmost sample to the center sample.
+    */
+    std::vector<double>
+    get_backward_fd_coefficients(int derivative_order
+                                 /**< [in] `1` for 1st derivative, `2` for 2nd, etc. */,
+                                 int accuracy_order
+                                 /**< [in] Number of points to the left of the center point. */ );
+    
+    /// Create finite-difference (FD) coefficients at arbitrary evaluation and sample points.
+    /**
+       @returns `sample_points.size()` FD coefficients; coefficient `i` is the weight for the sample at `sample_points[i]`.
+    */
+    std::vector<double>
+    get_arbitrary_fd_coefficients(int derivative_order
+                                  /**< [in] `1` for 1st derivative, `2` for 2nd, etc. Must be at least `1`. */,
+                                  double eval_point
+                                  /**< [in] Location of evaluation point. */,
+                                  const std::vector<double> sample_points
+                                  /**< [in] Locations of sampled points. Must have at least 2. */ );
+    
     /** @}*/
 
 } // namespace yask.
diff --git a/src/common/common.mk b/src/common/common.mk
index 7a7d18d3..2b9faac9 100644
--- a/src/common/common.mk
+++ b/src/common/common.mk
@@ -55,7 +55,8 @@ endif
 
 # Common source.
 COMM_DIR	:=	$(SRC_DIR)/common
-COMM_SRC_NAMES	:=	output common_utils tuple combo
+COMM_SRC_NAMES	:=	output common_utils tuple combo fd_coeff fd_coeff2
+COEFF_DIR	:=	$(SRC_DIR)/contrib/coefficients
 
 # YASK stencil compiler.
 # This is here because both the compiler and kernel
diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp
index 512781c9..aaa42ab5 100644
--- a/src/common/common_utils.cpp
+++ b/src/common/common_utils.cpp
@@ -43,7 +43,7 @@ namespace yask {
     // for numbers above 9 (at least up to 99).
 
     // Format: "major.minor.patch".
-    const string version = "2.20.00";
+    const string version = "2.21.00";
 
     string yask_get_version_string() {
         return version;
diff --git a/src/common/fd_coeff2.cpp b/src/common/fd_coeff2.cpp
new file mode 100644
index 00000000..6d0334d0
--- /dev/null
+++ b/src/common/fd_coeff2.cpp
@@ -0,0 +1,75 @@
+/*****************************************************************************
+
+YASK: Yet Another Stencil Kernel
+Copyright (c) 2014-2019, Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to
+deal in the Software without restriction, including without limitation the
+rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+sell copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+* The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+IN THE SOFTWARE.
+
+*****************************************************************************/
+
+///////// FD coefficients implementation.
+
+#include "yask_common_api.hpp"
+#include "fd_coeff.hpp"
+#include "common_utils.hpp"
+
+using namespace std;
+
+namespace yask {
+
+    // C++-style interface: checks the arguments, then gets one weight per sample point from fd_coeff().
+    vector<double> get_arbitrary_fd_coefficients(int derivative_order, double eval_point, const vector<double> sample_points) {
+        if (derivative_order < 1)
+            THROW_YASK_EXCEPTION("Error: get_arbitrary_fd_coefficients() called with derivative-order less than 1");
+        if (sample_points.size() < 2)
+            THROW_YASK_EXCEPTION("Error: get_arbitrary_fd_coefficients() called with fewer than 2 sample points");
+        vector<double> coeffs(sample_points.size()); // all 0.0: already fd_coeff()'s documented result when points <= derivative order.
+        if (coeffs.size() > size_t(derivative_order))
+            fd_coeff(coeffs.data(), eval_point, derivative_order, sample_points.data(), int(coeffs.size()));
+        return coeffs;
+    }
+
+    // Common FD forms for uniform grid spacing.
+    // Centered form: samples at unit-spaced offsets -radius..+radius, evaluated at offset 0.
+    vector<double> get_center_fd_coefficients(int derivative_order, int radius) {
+        if (radius < 1)
+            THROW_YASK_EXCEPTION("get_center_fd_coefficients() called with less than radius 1");
+        vector<double> pts;
+        for (int i = -radius; i <= radius; i++) pts.push_back(i);
+        assert(pts.size() == size_t(radius * 2 + 1)); // element count; sizeof(pts) would be the vector object's byte size.
+        return get_arbitrary_fd_coefficients(derivative_order, 0, pts);
+    }
+    // Forward form: samples at unit-spaced offsets 0..accuracy_order, evaluated at offset 0.
+    vector<double> get_forward_fd_coefficients(int derivative_order, int accuracy_order) {
+        if (accuracy_order < 1)
+            THROW_YASK_EXCEPTION("get_forward_fd_coefficients() called with less than order-of-accuracy 1");
+        vector<double> offsets(size_t(accuracy_order) + 1);
+        for (size_t k = 0; k < offsets.size(); k++) offsets[k] = double(k);
+        return get_arbitrary_fd_coefficients(derivative_order, 0, offsets);
+    }
+    // Backward form: samples at unit-spaced offsets -accuracy_order..0, evaluated at offset 0.
+    vector<double> get_backward_fd_coefficients(int derivative_order, int accuracy_order) {
+        if (accuracy_order < 1)
+            THROW_YASK_EXCEPTION("get_backward_fd_coefficients() called with less than order-of-accuracy 1");
+        vector<double> offsets;
+        for (int k = accuracy_order; k >= 0; k--) offsets.push_back(-k);
+        return get_arbitrary_fd_coefficients(derivative_order, 0, offsets);
+    }
+
+} // yask namespace.
diff --git a/src/compiler/Makefile b/src/compiler/Makefile
index 99b1051c..a2b78b45 100644
--- a/src/compiler/Makefile
+++ b/src/compiler/Makefile
@@ -50,7 +50,7 @@ YC_SRC_NAMES	:=	Expr ExprUtils Grid Eqs Print Vec Cpp CppIntrin YaskKernel Soln
 YC_STENCIL_NAMES:=	$(notdir $(patsubst %.cpp,%,$(wildcard $(YC_STENCIL_DIR)/*.cpp)))
 YC_OBJS		:=	$(addprefix $(YC_OBJ_DIR)/,$(addsuffix .o,$(YC_SRC_NAMES) $(COMM_SRC_NAMES)))
 YC_STENCIL_OBJS	:=	$(addprefix $(YC_OBJ_DIR)/,$(addsuffix .o,$(YC_STENCIL_NAMES)))
-YC_INC_DIRS	:=	$(INC_DIR) $(YC_LIB_SRC_DIR) $(COMM_DIR)
+YC_INC_DIRS	:=	$(INC_DIR) $(YC_LIB_SRC_DIR) $(COMM_DIR) $(COEFF_DIR)
 YC_INC_GLOB	:=	$(wildcard $(addsuffix /*.hpp,$(YC_INC_DIRS)))
 YC_STENCIL_INC_GLOB :=	$(wildcard $(YC_STENCIL_DIR)/*.hpp $(YC_STENCIL_DIR)/*/*.hpp)
 
@@ -87,6 +87,11 @@ $(YC_OBJ_DIR)/%.o: $(COMM_DIR)/%.cpp $(YC_INC_GLOB)
 	$(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) -x c++ -fPIC -c -o $@ $<
 	@ls -l $@
 
+$(YC_OBJ_DIR)/%.o: $(COEFF_DIR)/%.cpp $(YC_INC_GLOB)
+	$(MKDIR) $(YC_OBJ_DIR)
+	$(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) -x c++ -fPIC -c -o $@ $<
+	@ls -l $@
+
 ######## Primary targets.
 
 default: compiler
@@ -120,7 +125,7 @@ $(YC_SWIG_OUT_DIR)/yask_compiler_api_wrap.cpp: $(YC_SWIG_DIR)/yask*.i $(INC_DIR)
 	$(SWIG) -version
 	$(MKDIR) $(YC_SWIG_OUT_DIR) $(PY_OUT_DIR)
 	$(SWIG) -v -DYC_MODULE=$(YC_MODULE) -cppext cpp \
-	  -I$(INC_DIR) -I$(COMM_DIR) -I$(COMM_DIR)/swig \
+	  -I$(INC_DIR) -I$(COMM_DIR) -I$(COMM_DIR)/swig -I$(COEFF_DIR) \
 	  -c++ -python -o $@ -outdir $(PY_OUT_DIR) -builtin $<
 
 # Turn off asserts to work around known SWIG issue:
diff --git a/src/contrib/coefficients/fd_coeff.cpp b/src/contrib/coefficients/fd_coeff.cpp
index c23b61d2..d969b5c6 100644
--- a/src/contrib/coefficients/fd_coeff.cpp
+++ b/src/contrib/coefficients/fd_coeff.cpp
@@ -28,80 +28,60 @@ IN THE SOFTWARE.
 
 #include <iostream>
 #include <cstring>
+#include <vector>
 #include "fd_coeff.hpp"
-#define MIN(x, y) (((x) < (y)) ? (x): (y))
-#define MAX(x, y) (((x) > (y)) ? (x): (y))
 
 using namespace std;
-/*input:coeff=empty coefficient array (or one that can be overwritten)
-	eval_point=point at which the derivative is approximated
-	order=order of the derivative to approximate (e.g. f'' corresponds to order = 2)
-	points=array of points from which to construct the approximation of the derivative. usually an equi-spaced array with points [-radius*h, -(radius-1)*h,...0, ... radius*h]
-	num_points=number of elements in points[], e.g. the number of points used to approximate the derivative.
-	Note: if num_points < order+1, then the coefficients will all be 0
-
-
-  output:void, fills the coefficient array such that
-f^(m)[eval_point] ~~ sum of coeff[i]*f[point[i]] from i = 0 to num_points-1
-*/
-
-void fd_coeff(float *coeff, const float eval_point, const int order, float *points, const int num_points)
-{
-    float c1, c2, c3;
-    float x_0=eval_point;
-    float center=0;
-
-
-
-
-//  float* d = (float*) malloc((order+1)*num_points*num_points*sizeof(float));
-    float d[(order+1)*num_points*num_points];
-    int m_idx = (order+1)*num_points;
-    int n_idx = num_points;
-
-    //array initializer 1
-    /*
-    memset(d, 0.f, sizeof(d));
-    */
-
-    //array initializer 2
-    int sizeofd = (order+1)*(num_points)*(num_points)*sizeof(float);
-    memset(d, 0.f, sizeofd);
-
-
-    //array initializer 3
-    /*
-    for(int m=0; m <= order; ++m){
-	for(int n=0; n< num_points; ++n){
-	    for(int v=0; v<num_points;++v){
-	    d[m*m_idx+n*n_idx+v]=0.f;
-	    }}}
+namespace yask {
+
+    // C-style interface with arbitrary point locations.
+    /* input:
+       coeff=empty coefficient array (or one that can be overwritten).
+       eval_point=point at which the derivative is approximated.
+       order=order of the derivative to approximate (e.g. f'' corresponds to order = 2).
+       points=array of points from which to construct the approximation of the derivative;
+         usually an equi-spaced array with points [-radius*h, -(radius-1)*h, ... 0, ... radius*h].
+       num_points=number of elements in points[], i.e., the number of points used to
+         approximate the derivative.
+       Note: if num_points < order+1, then the coefficients will all be 0.
+
+       output:void, fills the coefficient array such that
+       f^(order)[eval_point] ~~ sum of coeff[i]*f[points[i]] from i = 0 to num_points-1
     */
-
-
-    d[0]=1.f;
-    c1 = 1.f;
-
-    for(int n=1; n<=num_points-1;++n){
-        c2=1.f;
-	for(int v=0; v<=n-1; ++v){
-            c3 = points[n] - points[v];
-            c2 = c2*c3;
-            for(int m=0; m<=MIN(n, order); ++m){
-		d[m*m_idx+n*n_idx + v] = (points[n]-x_0)*d[m*m_idx + (n-1)*n_idx + v] - m*d[(m-1)*m_idx + (n-1)*n_idx + v];
-		d[m*m_idx + n*n_idx + v] *= 1.f/c3;
+    void fd_coeff(double *coeff, const double eval_point, const int order, const double *points, const int num_points)
+    {
+        if (num_points < 1 || order < 0)
+            return; // nothing to fill in, and no valid size for the work table.
+        const double x_0 = eval_point;
+
+        // Zero-filled work table d[m][n][v], flattened: weight of points[v] for derivative m using points[0..n].
+        // One stride per axis keeps every (m,n,v) in bounds and distinct, even when num_points < order+1.
+        const size_t n_idx = size_t(num_points);
+        const size_t m_idx = n_idx * n_idx;
+        vector<double> d((size_t(order) + 1) * m_idx, 0.0);
+        d[0] = 1.0;
+        double c1 = 1.0;
+        double c2, c3;
+
+        for(int n=1; n<=num_points-1;++n){
+            c2=1.0;
+            for(int v=0; v<=n-1; ++v){
+                c3 = points[n] - points[v]; // zero if two points coincide; points are expected to be distinct.
+                c2 = c2*c3;
+                for(int m=0; m<=min(n, order); ++m){
+                    const double lower = m ? m*d[(m-1)*m_idx + (n-1)*n_idx + v] : 0.0; // no m-1 entry when m == 0.
+                    d[m*m_idx+n*n_idx + v] = ((points[n]-x_0)*d[m*m_idx + (n-1)*n_idx + v] - lower) * (1.0/c3);
+                }
             }
-	}
-	for(int m=0; m<= MIN(n, order); ++m){
-            d[m*m_idx+n*n_idx+n] = m*d[(m-1)*m_idx+(n-1)*n_idx+(n-1)] - (points[n-1]-x_0)*d[m*m_idx+(n-1)*n_idx+n-1];
-            d[m*m_idx+n*n_idx+n] *= c1/c2;
-	}
-        c1=c2;
-    }
+            for(int m=0; m<= min(n, order); ++m){
+                const double lower = m ? m*d[(m-1)*m_idx+(n-1)*n_idx+(n-1)] : 0.0; // no m-1 entry when m == 0.
+                d[m*m_idx+n*n_idx+n] = (lower - (points[n-1]-x_0)*d[m*m_idx+(n-1)*n_idx+n-1]) * (c1/c2);
+            }
+            c1=c2; // product of point differences for the previous n, reused by the next iteration.
+        }
 
-    for(int i=0; i<num_points; ++i){
-	coeff[i] = d[order*m_idx+(num_points-1)*n_idx + i];
+        for(int i=0; i<num_points; ++i){
+            coeff[i] = d[order*m_idx+(num_points-1)*n_idx + i]; // weights for the requested derivative using all points.
+        }
     }
-
-//    free(d);
 }
diff --git a/src/contrib/coefficients/fd_coeff.hpp b/src/contrib/coefficients/fd_coeff.hpp
index 112aaa19..47cedf98 100644
--- a/src/contrib/coefficients/fd_coeff.hpp
+++ b/src/contrib/coefficients/fd_coeff.hpp
@@ -26,7 +26,9 @@ IN THE SOFTWARE.
 // Finite-differences coefficients code.
 // Contributed by Jeremy Tillay.
 
-#ifndef HEADER_COEFF
-#define HEADER_COEFF
-void fd_coeff(float *coeff, const float eval_point, const int order, float *points, const int num_points);
-#endif
+#pragma once
+// Declares the C-style FD-coefficient routine defined in fd_coeff.cpp; it writes num_points weights into coeff.
+namespace yask {
+    void fd_coeff(double *coeff, const double eval_point, const int order, const double *points, const int num_points);
+}
+
diff --git a/src/contrib/coefficients/fd_coeff_test.cpp b/src/contrib/coefficients/fd_coeff_test.cpp
index 1bd0c1f7..66ecee4c 100644
--- a/src/contrib/coefficients/fd_coeff_test.cpp
+++ b/src/contrib/coefficients/fd_coeff_test.cpp
@@ -29,8 +29,6 @@ IN THE SOFTWARE.
 #include <iostream>
 #include <cstring>
 #include "fd_coeff.hpp"
-#define MIN(x, y) (((x) < (y)) ? (x): (y))
-#define MAX(x, y) (((x) > (y)) ? (x): (y))
 
 using namespace std;
 
@@ -41,52 +39,45 @@ int main()
     const int order = 2;
 
     //set the evaluation point e.g. we want to approximate some derivative f^(m)[eval_point]
-    //for most application, this is 0
-    float eval_point = 0;
+    //for most applications, this is 0
+    double eval_point = 0.0;
 
     const int radius = 2;
     const int num_points = 2*radius+1;
-    float coeff[num_points];
-    memset(coeff, 0.0, sizeof(coeff));
-
-    //float* coeff = (float*) malloc(num_points*sizeof(float));
-    //memset(coeff, 0.0, num_points*sizeof(float));
+    double coeff[num_points];
 
     //Construct a set of points (-h*radius, -h*(radius-1), .. 0, h,..., h*radius)
     //Could pass any arbitrary array grid_points = {x_0, x_1, ... x_n}
 
-    //float* grid_points = (float*) malloc(num_points*sizeof(float));
-    float grid_points[num_points];
+    double grid_points[num_points];
     cout << "Approximating derivative from grid points: " ;
     for(int i=0; i<num_points; i++){
         grid_points[i] = (-radius + i);
         cout << grid_points[i]<< ", ";
     }
-
     cout << endl;
 
-    fd_coeff(coeff, eval_point, order, grid_points, num_points);
+    yask::fd_coeff(coeff, eval_point, order, grid_points, num_points);
+
+    cout << "The coefficients are: ";
+    for(int i=0; i<num_points; i++) {
+        if (i)
+            cout << ", ";
+    	cout << coeff[i];
+    }
+    cout << endl;
 
     string suffix = (order == 1) ? "st" : (order == 2) ? "nd" : (order == 3) ? "rd" : "th";
     cout << "The " << order << suffix << " derivative of f("<< eval_point <<
         ") is approximated by this " << num_points << "-point FD formula:" << endl;
     cout << "f^(" << order << ")(" << eval_point << ") ~= ";
 
-
     for(int i=0; i<num_points; i++) {
         if (i)
             cout << " + ";
-            cout << coeff[i] << "*f[" << grid_points[i] << "]";
-    }
-    cout << endl;
-
-    cout << "Therefore, the coefficients are: ";
-    for(int i=0; i<num_points; i++) {
-    	cout << coeff[i] << ", ";
+        cout << coeff[i] << "*f[" << grid_points[i] << "]";
     }
     cout << endl;
 
-//free(grid_points);
-//free(coeff);
-return 0;
+    return 0;
 }
diff --git a/src/kernel/Makefile b/src/kernel/Makefile
index fbdc71d9..a346b6c8 100644
--- a/src/kernel/Makefile
+++ b/src/kernel/Makefile
@@ -180,7 +180,7 @@ else ifneq ($(filter $(arch),hsw bdw),)
 
  ISA		?=	-xCORE-AVX2
  GCXX_ISA	?=	-march=haswell
- MACROS		+=	USE_INTRIN256
+ MACROS		+=	USE_INTRIN256 USE_AVX2
  YC_TARGET	?=	avx2
 
 else ifeq ($(arch),ivb)
@@ -324,7 +324,7 @@ YK_GEN_HEADERS	:=	$(addprefix $(YK_GEN_DIR)/, \
 				yask_layouts.hpp \
 				yask_grid_code.hpp) \
 				$(YK_CODE_FILE)
-YK_INC_DIRS	:=	$(INC_DIR) $(YK_LIB_SRC_DIR) $(COMM_DIR)
+YK_INC_DIRS	:=	$(INC_DIR) $(YK_LIB_SRC_DIR) $(COMM_DIR) $(COEFF_DIR)
 YK_INC_GLOB	:=	$(addsuffix /*.hpp,$(YK_INC_DIRS))
 YK_API_TEST_EXEC :=	$(BIN_OUT_DIR)/$(YK_BASE)_api_test.exe
 YK_GRID_TEST_EXEC :=	$(BIN_OUT_DIR)/$(YK_BASE)_grid_test.exe
@@ -569,6 +569,11 @@ $(YK_OBJ_DIR)/%.o: $(COMM_DIR)/%.cpp $(YK_INC_GLOB)
 	$(CXX_PREFIX) $(YK_CXX) $(YK_CXXFLAGS) -x c++ -fPIC -c -o $@ $<
 	@ls -l $@
 
+$(YK_OBJ_DIR)/%.o: $(COEFF_DIR)/%.cpp $(YK_INC_GLOB)
+	$(MKDIR) $(dir $@)
+	$(CXX_PREFIX) $(YK_CXX) $(YK_CXXFLAGS) -x c++ -fPIC -c -o $@ $<
+	@ls -l $@
+
 $(YK_OBJ_DIR)/%.o: $(YK_LIB_SRC_DIR)/%.cpp $(YK_INC_GLOB)
 	$(MKDIR) $(dir $@)
 	$(CXX_PREFIX) $(YK_CXX) $(YK_CXXFLAGS) -x c++ -fPIC -c -o $@ $<
@@ -681,7 +686,7 @@ $(YK_SWIG_OUT_DIR)/yask_kernel_api_wrap.cpp: $(YK_SWIG_DIR)/yask*.i $(INC_DIR)/*
 	$(MKDIR) $(YK_SWIG_OUT_DIR) $(PY_OUT_DIR)
 	$(SWIG) -v -DYK_MODULE=$(YK_PY_MOD_BASE) -cppext cpp \
 	  -DUSE_MPI -DMPI_VERSION=3 -DMPI_Comm=int \
-	  -I$(INC_DIR) -I$(COMM_DIR) -I$(COMM_DIR)/swig \
+	  -I$(INC_DIR) -I$(COMM_DIR) -I$(COMM_DIR)/swig -I$(COEFF_DIR) \
 	  -c++ -python -o $@ -outdir $(PY_OUT_DIR) -builtin $<
 
 $(YK_SWIG_OUT_DIR)/yask_kernel_api_wrap.o: $(YK_SWIG_OUT_DIR)/yask_kernel_api_wrap.cpp
@@ -873,8 +878,8 @@ all:
 clean:
 	rm -fr $(YK_OUT_DIR)
 	rm -fv *.s
-	- find . $(COMM_DIR) -name '*.*.o' -print -delete
-	- find . $(COMM_DIR) -name '*.optrpt' -print -delete
+	- find . $(COMM_DIR) $(COEFF_DIR) -name '*.*.o' -print -delete
+	- find . $(COMM_DIR) $(COEFF_DIR) -name '*.optrpt' -print -delete
 
 # Remove executables, libs, etc.
 # Also remove logs from kernel dir, which are most likely from testing.
diff --git a/src/kernel/lib/realv.hpp b/src/kernel/lib/realv.hpp
index f7566e63..47abe53a 100644
--- a/src/kernel/lib/realv.hpp
+++ b/src/kernel/lib/realv.hpp
@@ -41,7 +41,6 @@ namespace yask {
     const ctrl_t ctrl_idx_mask = 0xf;
     const ctrl_t ctrl_sel_bit = 0x10;
 #ifdef USE_INTRIN256
-    const idx_t vec_elems = 8;
     typedef __m256 simd_t;
     typedef __m256i isimd_t;
     typedef float imem_t;
@@ -49,7 +48,6 @@ namespace yask {
 #define INAME(op) _mm256_ ## op ## _ps
 #define INAMEI(op) _mm256_ ## op ## _epi32
 #elif defined(USE_INTRIN512)
-    const idx_t vec_elems = 16;
     typedef __m512 simd_t;
     typedef __m512i isimd_t;
     typedef void imem_t;
@@ -66,7 +64,6 @@ namespace yask {
     const ctrl_t ctrl_idx_mask = 0x7;
     const ctrl_t ctrl_sel_bit = 0x8;
 #ifdef USE_INTRIN256
-    const idx_t vec_elems = 4;
     typedef __m256d simd_t;
     typedef __m256i isimd_t;
     typedef double imem_t;
@@ -74,7 +71,6 @@ namespace yask {
 #define INAME(op) _mm256_ ## op ## _pd
 #define INAMEI(op) _mm256_ ## op ## _epi64
 #elif defined(USE_INTRIN512)
-    const idx_t vec_elems = 8;
     typedef __m512d simd_t;
     typedef __m512i isimd_t;
     typedef void imem_t;
@@ -592,7 +588,7 @@ namespace yask {
     SVML_1ARG(yask_sin, sin, sin, sinf) // sine.
     SVML_1ARG(yask_cos, cos, cos, cosf) // cosine.
     SVML_1ARG(yask_atan, atan, atan, atanf) // inv (arc) tangent.
-    SVML_2ARG(yask_pow, pow, pow, powf) // inv (arc) tangent.
+    SVML_2ARG(yask_pow, pow, pow, powf) // power.
 #undef SVML_1ARG_SCALAR
 #undef SVML_1ARG
 #undef SVML_2ARG_SCALAR
@@ -637,34 +633,69 @@ namespace yask {
         std::cout << " b: ";
         b.print_reals(std::cout);
 #endif
-
+        assert(count >= 0);
+        assert(count <= VLEN);
+        if (count == 0)
+            res.u = b.u;
+        else if (count == VLEN)
+            res.u = a.u;
+        else {
+        
 #if defined(NO_INTRINSICS)
-        // must make temp copies in case &res == &a or &b.
-        real_vec_t tmpa = a, tmpb = b;
-        for (int i = 0; i < VLEN-count; i++)
-            res.u.r[i] = tmpb.u.r[i + count];
-        for (int i = VLEN-count; i < VLEN; i++)
-            res.u.r[i] = tmpa.u.r[i + count - VLEN];
+            // must make temp copies in case &res == &a or &b.
+            real_vec_t tmpa = a, tmpb = b;
+            for (int i = 0; i < VLEN-count; i++)
+                res.u.r[i] = tmpb.u.r[i + count];
+            for (int i = VLEN-count; i < VLEN; i++)
+                res.u.r[i] = tmpa.u.r[i + count - VLEN];
 
+            // For AVX2, use 8-bit op per 128-bit lane w/count*REAL_BYTES.
+#elif defined(USE_AVX2)
+            // See https://software.intel.com/en-us/blogs/2015/01/13/programming-using-avx2-permutations.
+            // Each nybble of ctrl is
+            // 0: lo part of A.
+            // 1: hi part of A.
+            // 2: lo part of B.
+            // 3: hi part of B.
+            auto tmp = _mm256_permute2x128_si256(b.u.mi, a.u.mi, 0x21);
+#ifdef TRACE_INTRINSICS
+            std::cout << " tmp: ";
+            real_vec_t tmpa;
+            tmpa.u.mi = tmp;
+            tmpa.print_reals(std::cout);
+#endif
+            // count must be 1..VLEN-1.
+            if (count == VLEN/2)
+                res.u.mi = tmp;
+            else if (count < VLEN/2)
+                res.u.mi = _mm256_alignr_epi8(tmp, b.u.mi, count*REAL_BYTES);
+            else
+                res.u.mi = _mm256_alignr_epi8(a.u.mi, tmp, (count-(VLEN/2))*REAL_BYTES);
+
+            // For AVX but not AVX2.
 #elif defined(USE_INTRIN256)
-        // Not really an intrinsic, but not element-wise, either.
-        // Put the 2 parts in a local array, then extract the desired part
-        // using an unaligned load.
-        typedef real_t R2[VLEN * 2] CACHE_ALIGNED;
-        R2 r2;
-        *((real_vec_t*)(&r2[0])) = b;
-        *((real_vec_t*)(&r2[VLEN])) = a;
-        real_vec_t* p = (real_vec_t*)(&r2[count]); // not usually aligned.
-        res.u.mr = INAME(loadu)((imem_t const*)p);
-
+            // Not really an intrinsic, but not element-wise, either.
+            // Put the 2 parts in a local array, then extract the desired part
+            // using an unaligned load.
+            // The Intel compiler converts this into an efficient sequence
+            // using vmovup*, vshufp*, vinsertf128, and/or vunpcklp* instrs.
+            typedef real_t R2[VLEN * 2] CACHE_ALIGNED;
+            R2 r2;
+            *((real_vec_t*)(&r2[0])) = b;
+            *((real_vec_t*)(&r2[VLEN])) = a;
+            real_vec_t* p = (real_vec_t*)(&r2[count]); // not usually aligned.
+            res.u.mr = INAME(loadu)((imem_t const*)p);
+
+            // For DP on KNC, use 32-bit op w/2x count.
 #elif REAL_BYTES == 8 && defined(ARCH_KNC) && defined(USE_INTRIN512)
-        // For KNC, for 64-bit align, use the 32-bit op w/2x count.
-        res.u.mi = _mm512_alignr_epi32(a.u.mi, b.u.mi, count*2);
+            res.u.mi = _mm512_alignr_epi32(a.u.mi, b.u.mi, count*2);
 
+            // Everything else.
 #else
-        res.u.mi = INAMEI(alignr)(a.u.mi, b.u.mi, count);
+            res.u.mi = INAMEI(alignr)(a.u.mi, b.u.mi, count);
 #endif
-
+        }
+        
 #ifdef TRACE_INTRINSICS
         std::cout << " res: ";
         res.print_reals(std::cout);
diff --git a/src/kernel/lib/soln_apis.cpp b/src/kernel/lib/soln_apis.cpp
index 4ca599a3..28a4b815 100644
--- a/src/kernel/lib/soln_apis.cpp
+++ b/src/kernel/lib/soln_apis.cpp
@@ -239,7 +239,7 @@ namespace yask {
 #ifdef SHOW_GROUPS
         os << 
             " sub-block-group-size:  " << opts->_sub_block_group_sizes.makeDimValStr(" * ") << endl <<
-            " block-group-size:      " << opts->_block_group_sizes.makeDimValStr(" * ") << endl <<
+            " block-group-size:      " << opts->_block_group_sizes.makeDimValStr(" * ") << endl;
 #endif
         os << "\nOther settings:\n"
             " yask-version:          " << yask_get_version_string() << endl <<
diff --git a/src/stencils/Iso3dfdStencil.cpp b/src/stencils/Iso3dfdStencil.cpp
index eeb6892a..38ef0746 100644
--- a/src/stencils/Iso3dfdStencil.cpp
+++ b/src/stencils/Iso3dfdStencil.cpp
@@ -38,20 +38,16 @@ class Iso3dfdStencil : public StencilRadiusBase {
     MAKE_DOMAIN_INDEX(x);         // spatial dim.
     MAKE_DOMAIN_INDEX(y);         // spatial dim.
     MAKE_DOMAIN_INDEX(z);         // spatial dim.
-    MAKE_MISC_INDEX(r);           // to index the coefficients.
 
     // Grids.
     MAKE_GRID(pressure, t, x, y, z); // time-varying 3D pressure grid.
     MAKE_GRID(vel, x, y, z);         // constant 3D vel grid (c(x,y,z)^2 * delta_t^2).
-    MAKE_ARRAY(coeff, r);            // FD coefficients.
 
 public:
 
     // For this stencil, the 'radius' is the number of FD coefficients on
     // either side of center in each spatial dimension.  For example,
-    // radius=8 implements a 16th-order accurate FD stencil.  To obtain the
-    // correct result, the 'coeff' array should be initialized with the
-    // corresponding central FD coefficients, adjusted for grid spacing.
+    // radius=8 implements a 16th-order accurate FD stencil.  
     // The accuracy in time is fixed at 2nd order.
     Iso3dfdStencil(StencilList& stencils, string suffix="", int radius=8) :
         StencilRadiusBase("iso3dfd" + suffix, stencils, radius) { }
@@ -60,15 +56,36 @@ class Iso3dfdStencil : public StencilRadiusBase {
     // Define RHS expression for pressure at t+1 based on values from vel and pressure at t.
     virtual GridValue get_next_p() {
 
+        // Grid spacing.
+        // In this implementation, it's a constant.
+        // Could make this a YASK variable to allow setting at run-time.
+        double delta_xyz = 50.0;
+        double d2 = delta_xyz * delta_xyz;
+        
+        // Spatial FD coefficients for 2nd derivative.
+        auto coeff = get_center_fd_coefficients(2, _radius);
+        size_t c0i = _radius;      // index of center sample.
+
+        for (size_t i = 0; i < coeff.size(); i++) {
+
+            // Need 3 copies of center sample for x, y, and z FDs.
+            if (i == c0i)
+                coeff[i] *= 3.0;
+
+            // Divide each by delta_xyz^2.
+            coeff[i] /= d2;
+        }
+
+        // Calculate FDx + FDy + FDz.
         // Start with center value multiplied by coeff 0.
-        GridValue next_p = pressure(t, x, y, z) * coeff(0);
+        GridValue fd_sum = pressure(t, x, y, z) * coeff[c0i];
 
         // Add values from x, y, and z axes multiplied by the
         // coeff for the given radius.
         for (int r = 1; r <= _radius; r++) {
 
             // Add values from axes at radius r.
-            next_p += (
+            fd_sum += (
                        // x-axis.
                        pressure(t, x-r, y, z) +
                        pressure(t, x+r, y, z) +
@@ -81,13 +98,32 @@ class Iso3dfdStencil : public StencilRadiusBase {
                        pressure(t, x, y, z-r) +
                        pressure(t, x, y, z+r)
 
-                       ) * coeff(r);
+                       ) * coeff[c0i + r]; // R & L coeffs are identical.
         }
 
-        // Finish equation, including t-1 and velocity components.
-        next_p = (2.0 * pressure(t, x, y, z))
-            - pressure(t-1, x, y, z) // subtract pressure from t-1.
-            + (next_p * vel(x, y, z));       // add next_p * velocity.
+        // Temporal FD coefficients.
+        // For this implementation, just check the known values to
+        // simplify the solution.
+        // But we could parameterize by accuracy-order in time as well.
+        int torder = 2;
+        auto tcoeff = get_forward_fd_coefficients(2, torder);
+        assert(tcoeff[0] == 1.0);  // pressure(t+1).
+        assert(tcoeff[1] == -2.0); // -2 * pressure(t).
+        assert(tcoeff[2] == 1.0);  // pressure(t-1).
+
+        // Wave equation is:
+        // 2nd time derivative(p) = c^2 * laplacian(p).
+        // See https://en.wikipedia.org/wiki/Wave_equation.
+        
+        // So, wave equation with FD approximations is:
+        // (p(t+1) - 2 * p(t) + p(t-1)) / delta_t^2 = c^2 * fd_sum.
+
+        // Solve wave equation for p(t+1):
+        // p(t+1) = 2 * p(t) - p(t-1) + c^2 * fd_sum * delta_t^2.
+
+        // Let vel = c^2 * delta_t^2 for each grid point.
+        GridValue next_p = (2.0 * pressure(t, x, y, z)) -
+            pressure(t-1, x, y, z) + (fd_sum * vel(x, y, z));
 
         return next_p;
     }
diff --git a/utils/bin/gen_loops.pl b/utils/bin/gen_loops.pl
index ad519071..527681a9 100755
--- a/utils/bin/gen_loops.pl
+++ b/utils/bin/gen_loops.pl
@@ -738,7 +738,7 @@ ($)
         }
         
         # use serpentine path in next loop if possible.
-        elsif (lc $tok eq 'broken_serpentine') {
+        elsif (lc $tok eq 'serpentine') {
             $features |= $bSerp;
         }
         
@@ -967,17 +967,19 @@ ()
     if (!$command_line || $OPT{help} || @ARGV < 1) {
         print "Outputs C++ code to scan N-D grids.\n",
             "Usage: $script [options] <code-string>\n",
-            "The <code-string> contains optionally-nested scans across the given",
+            "The <code-string> contains optionally-nested scans across the given\n",
             "  indices between 0 and N-1 indicated by 'loop(<indices>)'\n",
             "Indices may be specified as a comma-separated list or <first..last> range,\n",
             "  using the variable 'N' as needed.\n",
             "Inner loops should contain call statements that generate calls to calculation functions.\n",
             "A loop statement with more than one argument will generate a single collapsed loop.\n",
             "Optional loop modifiers:\n",
-            "  omp:             generate an OpenMP for loop (distribute work across SW threads).\n",
+            "  omp:             generate an OpenMP for loop (distribute work across SW threads).*\n",
             "  grouped:         generate grouped scan within a collapsed loop.\n",
-            ## broken: "  serpentine:      generate reverse scan when enclosing loop dimension is odd.\n",
-            "  square_wave:     generate 2D square-wave scan for two innermost dimensions of a collapsed loop.\n",
+            "  serpentine:      generate reverse scan when enclosing loop dimension is odd.*\n",
+            "  square_wave:     generate 2D square-wave scan for two innermost dimensions of a collapsed loop.*\n",
+            "      * Do not use these modifiers for YASK rank or block loops because they must\n",
+            "        execute with strictly-increasing indices when using temporal tiling.\n",
             "A 'ScanIndices' var must be defined in C++ code prior to including the generated code.\n",
             "  This struct contains the following 'Indices' elements:\n",
             "  'begin':       [in] first index to scan in each dim.\n",
@@ -1001,7 +1003,7 @@ ()
             "  $script -ndims 3 'omp loop(0,1) { loop(2) { call(f); } }'\n",
             "  $script -ndims 3 'omp loop(0) { loop(1,2) { call(f); } }'\n",
             "  $script -ndims 3 'grouped omp loop(0..N-1) { call(f); }'\n",
-            "  $script -ndims 3 'omp loop(0) { square loop(1..N-1) { call(f); } }'\n",
+            "  $script -ndims 3 'omp loop(0) { square_wave loop(1..N-1) { call(f); } }'\n",
             "  $script -ndims 4 'omp loop(0..N+1) { loop(N+2,N-1) { call(f); } }'\n";
         exit 1;
     }
diff --git a/utils/bin/get_loop_stats.pl b/utils/bin/get_loop_stats.pl
index 676a6f63..7781b673 100755
--- a/utils/bin/get_loop_stats.pl
+++ b/utils/bin/get_loop_stats.pl
@@ -27,6 +27,8 @@
 # report some stats on them.
 
 use strict;
+use File::Basename;
+
 my $minInstrs = 2;
 my $printAsm = 0;
 my $targetLabel = "";
@@ -65,6 +67,8 @@ sub usage {
 
   my $fname = $arg;
   my %files;                    # map from file index to source file-name.
+  my %dirs;                     # map from file index to source dir-name.
+  my %dirIndices;               # map from dir-name to dir index.
   my %loopLabels;
   my %astats;                   # arg stats.
   my %istats;                   # instr stats.
@@ -79,8 +83,21 @@ sub usage {
     my ($locInfo, $srcFile, $curFn); # strings describing current location.
     my @lines;                  # lines to print.
 
+    # Header.
+    if (!$pass) {
+      print "\n'$fname'...\n";
+    } else {
+      my %id;
+      for my $dir (keys %dirIndices) {
+        $id{$dirIndices{$dir}} = $dir;
+      }
+      print "\nDirectory key:\n";
+      for my $di (sort { $a <=> $b } keys %id) {
+        print "  <dir$di> = $id{$di}\n";
+      }
+    }
+
     open F, "<$fname" or usage("error: cannot open '$fname'");
-    print "\n'$fname'...\n" if !$pass;
     while (<F>) {
       chomp;
 
@@ -88,7 +105,12 @@ sub usage {
       #  .file   40 "src/stencil_block_loops.hpp"
       if (/^\s*\.file\s+(\d+)\s+"(.*)"/) {
         my ($fi, $fn) = ($1, $2);
-        $files{$fi} = $fn;
+        $files{$fi} = basename($fn);  # keep only the file name for display.
+        my $dir = dirname($fn);
+        $dirs{$fi} = $dir;            # reuse $dir rather than calling dirname() again.
+        if ($dir && !exists($dirIndices{$dir})) {
+          $dirIndices{$dir} = scalar keys %dirIndices;  # new index the first time a dir is seen.
+        }
       }
 
       # location, e.g.,
@@ -98,6 +120,10 @@ sub usage {
         if (exists $files{$fi}) {
           $srcFile = $files{$fi};
           $locInfo = "$srcFile:$info";
+          my $srcDir = $dirs{$fi};
+          if ($srcDir && exists($dirIndices{$srcDir})) {
+            $locInfo = "<dir$dirIndices{$srcDir}>/$locInfo";
+          }
         } else {
           $srcFile = "";
           $locInfo = "";
diff --git a/utils/bin/yask_tuner.pl b/utils/bin/yask_tuner.pl
index 0a3da276..a5969cf4 100755
--- a/utils/bin/yask_tuner.pl
+++ b/utils/bin/yask_tuner.pl
@@ -371,9 +371,10 @@ sub usage {
   ('123', '132', '213', '231', '312', '321');
 
 # Possible space-filling curve modifiers.
+my @pathNamesIncreasing =
+  ('', 'grouped');
 my @pathNames =
-  ('', 'square_wave', 'grouped');
-##  ('', 'serpentine', 'square_wave serpentine', 'grouped');
+  (@pathNamesIncreasing, 'serpentine', 'square_wave serpentine', 'square_wave');
 
 # List of folds.
 if ( !@folds ) {
@@ -471,14 +472,17 @@ sub usage {
 
      # Loops, from the list above.
      # Each loop consists of index order and path mods.
+     # Block and rank paths require increasing indices.
      [ 0, $#loopOrders, 1, 'subBlockOrder' ],
      [ 0, $#pathNames, 1, 'subBlockPath' ],
      [ 0, $#loopOrders, 1, 'miniBlockOrder' ],
      [ 0, $#pathNames, 1, 'miniBlockPath' ],
      [ 0, $#loopOrders, 1, 'blockOrder' ],
-     [ 0, $#pathNames, 1, 'blockPath' ],
+     [ 0, $#pathNamesIncreasing, 1, 'blockPath' ],
      [ 0, $#loopOrders, 1, 'regionOrder' ],
      [ 0, $#pathNames, 1, 'regionPath' ],
+     [ 0, $#loopOrders, 1, 'rankOrder' ],
+     [ 0, $#pathNamesIncreasing, 1, 'rankPath' ],
 
      # how to shape vectors, from the list above.
      [ 0, $#folds, 1, 'fold' ],
@@ -495,7 +499,7 @@ sub usage {
 
      # other build options.
      [ 0, $#schedules, 1, 'ompRegionSchedule' ], # OMP schedule for region loop.
-     [ 0, $#schedules, 1, 'ompBlockSchedule' ], # OMP schedule for block loop.
+     [ 0, $#schedules, 1, 'ompBlockSchedule' ], # OMP schedule for mini-block loop.
 
     );
 }
@@ -660,7 +664,7 @@ ($$$)
   $val = $fixedVals{$key};
   return $val if defined $val;
 
-  # return default value for build var if disabled.
+  # return dummy value for build var if disabled.
   return 1 if (!$doBuild && $isBuildVar);
 
   die "internal error: value for gene '$key' not provided.\n";
@@ -731,11 +735,16 @@ ($$)
   my $makeCmd = "echo 'build disabled'";
 
   if ($doBuild) {
-    $tag .= "_".md5_hex($macros, $margs, $makeArgs, $realBytes, $radius);
+    my $tagPrefix = $tag."_p".$$."_";
+    $tag = $tagPrefix.md5_hex($macros, $margs, $makeArgs, $realBytes, $radius);
+
+    # Remove binaries that haven't been used in a while.
+    system "find bin lib -name '*$tagPrefix*' -amin +15 | xargs --no-run-if-empty rm";
 
     # Already exists?
-    if (-x "bin/yask_kernel.$tag.$arch.exe") {
-      $makeCmd = "echo 'binary exists'";
+    if (-x "bin/yask_kernel.$tag.$arch.exe" &&
+        -x "lib/libyask_kernel.$tag.$arch.so") {
+      $makeCmd = "echo 'binary & library exist'";
     }
     else {
       $makeCmd =
@@ -797,16 +806,13 @@ ($$$)
         push @cmdOut, $line;
 
         # E.g.,
-        # 5-D grid (t=2 * tidx=2 * x=12 * y=12 * z=42) 't_grids' with data at 0x7fa476600000 containing 1.47656MiB (24.192K SIMD FP element(s) of 64 byte(s) each)
-        # 4-D grid (t=2 * x=5 * y=19 * z=19) 'pressure' with data at 0x65cbc0 containing 112.812KiB (3.61K SIMD FP element(s) of 32 byte(s) each)
-        # 3-D grid (x=3 * y=3 * z=3) 'vel' with data at 0x6790c0 containing 864B (27 SIMD FP element(s) of 32 byte(s) each)
-        # 1-D grid (r=9) 'coeff' with data at 0x679600 containing 36B (9 FP element(s) of 4 byte(s) each)
+        # 'A' 4-D var (t=2 * x=8 * y=48 * z=49) with storage at 0x2aba63016000 ...
         my $ngrids = 1;
-        if (/^\d-?D grid.*x=.*y=.*z=/) {
+        if (/\d-?D .*x=.*y=.*z=/) {
           for my $w (split ' ',$line) {
             if ($w =~ /(\w+)=(\d+)/) {
               my ($dim, $sz) = ($1, $2);
-              if ($dim !~ /^[xyz]/) {
+              if ($dim eq 't') {
                 $ngrids *= $sz;
               }
             }
@@ -1015,17 +1021,14 @@ ($$$$$$)
 }
 
 # return loop-ctrl vars.
-sub makeLoopVars($$$$$) {
+sub makeLoopVars($$$$) {
   my $h = shift;
   my $makePrefix = shift;       # e.g., 'BLOCK'.
   my $tunerPrefix = shift;      # e.g., 'block'.
-  my $reqdMods = shift;         # e.g., ''.
   my $lastDim = shift;          # e.g., 2 or 3.
 
   my $order = readHash($h, $tunerPrefix."Order", 1);
   my $orderStr = $loopOrders[$order];           # e.g., '231'.
-  my $path = readHash($h, $tunerPrefix."Path", 1);
-  my $pathStr = @pathNames[$path];                # e.g., 'grouped'.
 
   # dimension vars.
   my @dims = split '',$orderStr;      # e.g., ('2', '3', '1).
@@ -1033,9 +1036,17 @@ ($$$$$)
 
   # vars to create.
   my $order = join(',', @dims);  # e.g., '2, 1'.
-  my $outerMods = "$pathStr $reqdMods";
+  my $outerMods = '';           # path modifier, if any, is prepended below.
   my $innerMods = '';
 
+  # Path gene is optional; add its modifier only when the gene is present in $h.
+  my $pathKey = $tunerPrefix."Path";
+  if (exists $h->{$pathKey}) {
+    my $path = readHash($h, $pathKey, 1);
+    my $pathStr = $pathNames[$path];                # e.g., 'grouped'.
+    $outerMods = "$pathStr $outerMods";
+  }
+
   my $loopVars = " ".$makePrefix."_LOOP_ORDER='$order'";
   $loopVars .= " ".$makePrefix."_LOOP_OUTER_MODS='$outerMods'";
   $loopVars .= " ".$makePrefix."_LOOP_INNER_MODS='$innerMods'";
@@ -1306,10 +1317,11 @@ sub fitness {
   $mvars .= " fold=x=$fs[0],y=$fs[1],z=$fs[2]";
 
   # gen-loops vars.
-  $mvars .= makeLoopVars($h, 'REGION', 'region', '', 3);
-  $mvars .= makeLoopVars($h, 'BLOCK', 'block', '', 3);
-  $mvars .= makeLoopVars($h, 'MINI_BLOCK', 'miniBlock', '', 3);
-  $mvars .= makeLoopVars($h, 'SUB_BLOCK', 'subBlock', '', 2);
+  $mvars .= makeLoopVars($h, 'RANK', 'rank', 3);
+  $mvars .= makeLoopVars($h, 'REGION', 'region', 3);
+  $mvars .= makeLoopVars($h, 'BLOCK', 'block', 3);
+  $mvars .= makeLoopVars($h, 'MINI_BLOCK', 'miniBlock', 3);
+  $mvars .= makeLoopVars($h, 'SUB_BLOCK', 'subBlock', 2);
 
   # other vars.
   $mvars .= " omp_region_schedule=$regionScheduleStr omp_block_schedule=$blockScheduleStr";