diff --git a/Src/Base/AMReX_Array.H b/Src/Base/AMReX_Array.H index 15ddde4d1e..d9f5fd1af5 100644 --- a/Src/Base/AMReX_Array.H +++ b/Src/Base/AMReX_Array.H @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -148,10 +149,6 @@ namespace amrex { * order (last index moving fastest). If not specified, Fortran order is * assumed. */ - namespace Order { - struct C {}; - struct F {}; - } /** * A GPU-compatible one-dimensional array. @@ -280,7 +277,7 @@ namespace amrex { * default if not given) */ template + Order ORDER = Order::F> struct Array2D { /** @@ -370,8 +367,7 @@ namespace amrex { * If the order is not specified, Fortran column-major order is assumed * (the index \c i moves the fastest) */ - template ,int> = 0> + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE const T& operator() (int i, int j) const noexcept { AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); @@ -384,8 +380,7 @@ namespace amrex { * If the order is not specified, Fortran column-major order is assumed * (the index \c i moves the fastest) */ - template ,int> = 0> + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T& operator() (int i, int j) noexcept { AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); @@ -398,8 +393,7 @@ namespace amrex { * When the order is manually specified as Order::C, row-major order * is used (the index \c j moves the fastest). */ - template ,int> = 0> + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE const T& operator() (int i, int j) const noexcept { AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); @@ -412,8 +406,7 @@ namespace amrex { * When the order is manually specified as Order::C, row-major order * is used (the index \c j moves the fastest). */ - template ,int> = 0> + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T& operator() (int i, int j) noexcept { AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); @@ -551,7 +544,7 @@ namespace amrex { * default if not given) */ template + Order ORDER=Order::F> struct Array3D { /** @@ -662,8 +655,7 @@ namespace amrex { * If the order is not specified, Fortran column-major order is assumed * (the index \c i moves the fastest) */ - template ,int> = 0> + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE const T& operator() (int i, int j, int k) const noexcept { return arr[i+j*(XHI-XLO+1)+k*((XHI-XLO+1)*(YHI-YLO+1)) @@ -676,8 +668,7 @@ namespace amrex { * If the order is not specified, Fortran column-major order is assumed * (the index \c i moves the fastest) */ - template ,int> = 0> + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T& operator() (int i, int j, int k) noexcept { return arr[i+j*(XHI-XLO+1)+k*((XHI-XLO+1)*(YHI-YLO+1)) @@ -690,8 +681,7 @@ namespace amrex { * When the order is manually specified as Order::C, row-major order * is used (the index \c k moves the fastest). */ - template ,int> = 0> + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE const T& operator() (int i, int j, int k) const noexcept { return arr[k+j*(ZHI-ZLO+1)+i*((ZHI-ZLO+1)*(YHI-YLO+1)) @@ -704,8 +694,7 @@ namespace amrex { * When the order is manually specified as Order::C, row-major order * is used (the index \c k moves the fastest). 
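Reviewer note (not part of the patch): with the Order::C / Order::F tag structs removed, Array2D and Array3D now take the scoped Order enum as a non-type template parameter instead of a type. A minimal sketch of the new spelling, assuming the enum is visible through the include this patch adds at the top of the header; the function name and values are illustrative:

    #include <AMReX_Array.H>

    void array_order_sketch ()
    {
        // Default Order::F: i moves fastest, element (i,j) lives at arr[i + j*3].
        amrex::Array2D<double, 0, 2, 0, 3> a{};
        // Explicit Order::C: j moves fastest, element (i,j) lives at arr[j + i*4].
        amrex::Array2D<double, 0, 2, 0, 3, amrex::Order::C> b{};
        a(1,2) = 1.0;   // stored at arr index 1 + 2*3 = 7
        b(1,2) = 1.0;   // stored at arr index 2 + 1*4 = 6
    }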
*/ - template ,int> = 0> + template = 0> [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T& operator() (int i, int j, int k) noexcept { return arr[k+j*(ZHI-ZLO+1)+i*((ZHI-ZLO+1)*(YHI-YLO+1)) diff --git a/Src/Base/AMReX_ConstexprFor.H b/Src/Base/AMReX_ConstexprFor.H new file mode 100644 index 0000000000..972dd1ac30 --- /dev/null +++ b/Src/Base/AMReX_ConstexprFor.H @@ -0,0 +1,38 @@ +#ifndef AMREX_CONSTEXPR_FOR_H_ +#define AMREX_CONSTEXPR_FOR_H_ +#include + +#include +#include + +#include + +namespace amrex { + +// Implementation of "constexpr for" based on +// https://artificial-mind.net/blog/2020/10/31/constexpr-for +// +// Approximates what one would get from a compile-time +// unrolling of the loop +// for (int i = 0; i < N; ++i) { +// f(i); +// } +// +// The mechanism is recursive: we evaluate f(i) at the current +// i and then call the for loop at i+1. f() is a lambda function +// that provides the body of the loop and takes only an integer +// i as its argument. + +template +AMREX_GPU_HOST_DEVICE AMREX_INLINE +constexpr void constexpr_for (F const& f) +{ + if constexpr (I < N) { + f(std::integral_constant()); + constexpr_for(f); + } +} + +} + +#endif diff --git a/Src/Base/AMReX_Loop.H b/Src/Base/AMReX_Loop.H index fe76b8c988..fe216bac45 100644 --- a/Src/Base/AMReX_Loop.H +++ b/Src/Base/AMReX_Loop.H @@ -3,6 +3,7 @@ #include #include +#include #include namespace amrex { @@ -567,30 +568,6 @@ void LoopConcurrentOnCpu (BoxND const& bx, int ncomp, F const& f) noexcept } } -// Implementation of "constexpr for" based on -// https://artificial-mind.net/blog/2020/10/31/constexpr-for -// -// Approximates what one would get from a compile-time -// unrolling of the loop -// for (int i = 0; i < N; ++i) { -// f(i); -// } -// -// The mechanism is recursive: we evaluate f(i) at the current -// i and then call the for loop at i+1. f() is a lambda function -// that provides the body of the loop and takes only an integer -// i as its argument. - -template -AMREX_GPU_HOST_DEVICE AMREX_INLINE -constexpr void constexpr_for (F const& f) -{ - if constexpr (I < N) { - f(std::integral_constant()); - constexpr_for(f); - } -} - #include } diff --git a/Src/Base/AMReX_SmallMatrix.H b/Src/Base/AMReX_SmallMatrix.H new file mode 100644 index 0000000000..7122e9f52c --- /dev/null +++ b/Src/Base/AMReX_SmallMatrix.H @@ -0,0 +1,419 @@ +#ifndef AMREX_SMALL_MATRIX_H_ +#define AMREX_SMALL_MATRIX_H_ +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace amrex { + + enum struct Order { C, F, RowMajor=C, ColumnMajor=F }; + + /** + * \brief Matrix class with compile-time size + * + * The starting index for both rows and columns is always zero. Also + * note that column vectors and row vectors are special cases of a + * Matrix. + * + * \tparam T Matrix element data type. + * \tparam NRows Number of rows. + * \tparam NCols Number of columns. + * \tparam ORDER Memory layout order. Order::F (i.e., column-major) by default. + */ + template + struct SmallMatrix + { + using value_type = T; + using reference_type = T&; + static constexpr int row_size = NRows; + static constexpr int column_size = NCols; + static constexpr Order ordering = ORDER; + + /** + * \brief Default constructor + * + * The data are uninitialized by default. If you want to initialize + * to zero, you can do `SmallMatrix M{};`. 
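Reviewer note (not part of the patch): AMReX_ConstexprFor.H relocates the existing constexpr_for helper out of AMReX_Loop.H so that AMReX_SmallMatrix.H can use it without pulling in the loop machinery. A minimal usage sketch; diag_sum is illustrative. The lambda receives a std::integral_constant, so the index can also be used where a constant expression is required:

    #include <AMReX_ConstexprFor.H>
    #include <AMReX_Array.H>

    template <class T>
    T diag_sum (amrex::Array2D<T,0,2,0,2> const& m)
    {
        T s = 0;
        // Unrolled at compile time: s += m(0,0); s += m(1,1); s += m(2,2);
        amrex::constexpr_for<0,3>([&] (auto i) { s += m(i,i); });
        return s;
    }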
+ */ + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + constexpr SmallMatrix () = default; + + /** + * \brief Constructs column- or row-vector + * + * The data are initialized with the given variadic arguments. If + * the number of argument is less than the size of the vector, the + * rest of the vector is initialized to zero. + */ + template = 0> + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + constexpr explicit SmallMatrix (Ts... vs) + : m_mat{vs...} + { + static_assert(sizeof...(vs) <= std::max(NRows,NCols)); + } + + /** + * \brief Constructs SmallMatrix with nested std::initializer_list + * + * The initializer list is assumed to be in row-major order, even when + * the ordering for the SmallMatrix object is colum-major. Below is + * an example of constructing a matrix with 2 rows and 3 columns. + \verbatim + SmallMatrix M{{11., 12., 13.}, + {21., 22., 23.}}; + \endverbatim + */ + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + explicit SmallMatrix (std::initializer_list> const& init) + { + AMREX_ASSERT(NRows == init.size()); + int i = 0; + for (auto const& row : init) { + AMREX_ASSERT(NCols == row.size()); + int j = 0; + for (auto const& x : row) { + (*this)(i,j) = x; + ++j; + } + ++i; + } + } + + //! Returns a const reference to the element at row i and column j. + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + const T& operator() (int i, int j) const noexcept { + AMREX_ASSERT(i < NRows && j < NCols); + if constexpr (ORDER == Order::F) { + return m_mat[i+j*NRows]; + } else { + return m_mat[j+i*NCols]; + } + } + + //! Returns a reference to the element at row i and column j. + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T& operator() (int i, int j) noexcept { + AMREX_ASSERT(i < NRows && j < NCols); + if constexpr (ORDER == Order::F) { + return m_mat[i+j*NRows]; + } else { + return m_mat[j+i*NCols]; + } + } + + //! Returns a const reference to element i of a vector + template = 0> + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + const T& operator() (int i) const noexcept { + AMREX_ASSERT(i < NRows*NCols); + return m_mat[i]; + } + + //! Returns a reference to element i of a vector + template = 0> + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T& operator() (int i) noexcept { + AMREX_ASSERT(i < NRows*NCols); + return m_mat[i]; + } + + //! Returns a const reference to element i of a vector + template = 0> + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + const T& operator[] (int i) const noexcept { + AMREX_ASSERT(i < NRows*NCols); + return m_mat[i]; + } + + //! Returns a reference to element i of a vector + template = 0> + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T& operator[] (int i) noexcept { + AMREX_ASSERT(i < NRows*NCols); + return m_mat[i]; + } + + /** + * Returns a \c const pointer address to the first element of the + * SmallMatrix object, as if the object is treated as one-dimensional. + */ + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + const T* begin () const noexcept { return m_mat; } + + /** + * Returns a \c const pointer address right after the last element of the + * SmallMatrix object, as if the object is treated as one-dimensional. + */ + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + const T* end () const noexcept { return m_mat + NRows*NCols; } + + /** + * Returns a pointer address to the first element of the + * SmallMatrix object, as if the object is treated as one-dimensional. 
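Reviewer note (not part of the patch): a construction sketch for the two constructors and the element accessors above. The variadic constructor only applies to row/column vectors and zero-fills unspecified trailing entries; the nested initializer list is always read in row-major order, independent of the storage order:

    #include <AMReX_SmallMatrix.H>

    bool small_matrix_ctor_sketch ()
    {
        // Variadic ctor (vectors only): unspecified trailing entries are zero.
        amrex::SmallMatrix<double,4,1> v(1.0, 2.0);        // v = (1, 2, 0, 0)^T
        // Nested initializer list is read in row-major order, even though the
        // storage is column-major (Order::F) by default.
        amrex::SmallMatrix<double,2,3> m{{11., 12., 13.},
                                         {21., 22., 23.}};
        return m(1,2) == 23. && v(3) == 0. && v[0] == 1.;
    }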
+ */ + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T* begin () noexcept { return m_mat; } + + /** + * Returns a pointer address right after the last element of the + * SmallMatrix object, as if the object is treated as one-dimensional. + */ + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T* end () noexcept { return m_mat + NRows*NCols; } + + //! Set all elements in the matrix to the given value + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + SmallMatrix& + setVal (T val) + { + for (auto& x : m_mat) { x = val; } + return *this; + } + + //! Returns an identity matrix + template = 0> + static constexpr + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + SmallMatrix + Identity () noexcept { + SmallMatrix I{}; + constexpr_for<0,NRows>([&] (int i) { I(i,i) = T(1); }); + return I; + } + + //! Returns a matrix initialized with zeros + static constexpr + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + SmallMatrix + Zero () noexcept { + SmallMatrix Z{}; + return Z; + } + + //! Returns transposed matrix + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + SmallMatrix + transpose () const + { + SmallMatrix r; + for (int j = 0; j < NRows; ++j) { + for (int i = 0; i < NCols; ++i) { + r(i,j) = (*this)(j,i); + } + } + return r; + } + + //! Transposes a square matrix in-place. + template = 0> + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + SmallMatrix& + transposeInPlace () + { + for (int j = 1; j < NCols; ++j) { + for (int i = 0; i < j; ++i) { + amrex::Swap((*this)(i,j), (*this)(j,i)); + } + } + return *this; + } + + //! Returns the product of all elements in the matrix + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T product () const + { + T p = 1; + for (auto const& x : m_mat) { + p *= x; + } + return p; + } + + //! Returns the sum of all elements in the matrix + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T sum () const + { + T s = 0; + for (auto const& x : m_mat) { + s += x; + } + return s; + } + + //! Returns the trace of a square matrix + template = 0> + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T trace () const + { + T t = 0; + constexpr_for<0,MM>([&] (int i) { t += (*this)(i,i); }); + return t; + } + + //! Operator += performing matrix addition as in (*this) += rhs + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + SmallMatrix& + operator += (SmallMatrix const& rhs) + { + for (int n = 0; n < NRows*NCols; ++n) { + m_mat[n] += rhs.m_mat[n]; + } + return *this; + } + + //! Binary operator + returning the result of maxtrix addition, lhs+rhs + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + friend SmallMatrix + operator+ (SmallMatrix lhs, + SmallMatrix const& rhs) + { + lhs += rhs; + return lhs; + } + + //! Operator -= performing matrix subtraction as in (*this) -= rhs + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + SmallMatrix& + operator -= (SmallMatrix const& rhs) + { + for (int n = 0; n < NRows*NCols; ++n) { + m_mat[n] -= rhs.m_mat[n]; + } + return *this; + } + + //! Binary operator - returning the result of maxtrix subtraction, lhs-rhs + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + friend SmallMatrix + operator- (SmallMatrix lhs, + SmallMatrix const& rhs) + { + lhs -= rhs; + return lhs; + } + + //! Unary minus operator + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + SmallMatrix + operator- () const + { + return (*this) * T(-1); + } + + //! Operator *= that scales this matrix in place by a scalar. + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + SmallMatrix& + operator *= (T a) + { + for (auto& x : m_mat) { + x *= a; + } + return *this; + } + + //! 
Returns the product of a matrix and a scalar + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + friend SmallMatrix + operator* (SmallMatrix m, T a) + { + m *= a; + return m; + } + + //! Returns the product of a scalar and a matrix + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + friend SmallMatrix + operator* (T a, SmallMatrix m) + { + m *= a; + return m; + } + + //! Returns matrix product of two matrices + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + friend SmallMatrix + operator* (SmallMatrix const& lhs, + SmallMatrix const& rhs); + + //! Returns the dot product of two vectors + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T dot (SmallMatrix const& rhs) const + { + T r = 0; + for (int n = 0; n < NRows*NCols; ++n) { + r += m_mat[n] * rhs.m_mat[n]; + } + return r; + } + + private: + T m_mat[NRows*NCols]; + }; + + template + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + SmallMatrix + operator* (SmallMatrix const& lhs, + SmallMatrix const& rhs) + { + SmallMatrix r; + if constexpr (Ord == Order::F) { + for (int j = 0; j < N3; ++j) { + constexpr_for<0,N1>([&] (int i) { r(i,j) = U(0); }); + for (int k = 0; k < N2; ++k) { + auto b = rhs(k,j); + constexpr_for<0,N1>([&] (int i) + { + r(i,j) += lhs(i,k) * b; + }); + } + } + } else { + for (int i = 0; i < N1; ++i) { + constexpr_for<0,N3>([&] (int j) { r(i,j) = U(0); }); + for (int k = 0; k < N2; ++k) { + auto a = lhs(i,k); + constexpr_for<0,N3>([&] (int j) + { + r(i,j) += a * rhs(k,j); + }); + } + } + } + return r; + } + + template + std::ostream& operator<< (std::ostream& os, + SmallMatrix const& mat) + { + for (int i = 0; i < NRows; ++i) { + os << mat(i,0); + for (int j = 1; j < NCols; ++j) { + os << " " << mat(i,j); + } + os << "\n"; + } + return os; + } + + template + using SmallVector = SmallMatrix; + + template + using SmallRowVector = SmallMatrix; +} + +#endif diff --git a/Src/Base/AMReX_TableData.H b/Src/Base/AMReX_TableData.H index 9c2a0d06ca..ab19c7efff 100644 --- a/Src/Base/AMReX_TableData.H +++ b/Src/Base/AMReX_TableData.H @@ -72,7 +72,7 @@ struct Table1D #endif }; -template +template struct Table2D { T* AMREX_RESTRICT p = nullptr; @@ -110,9 +110,7 @@ struct Table2D #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) index_assert(i,j); #endif - static_assert(std::is_same_v || - std::is_same_v); - if constexpr (std::is_same_v) { + if constexpr (ORDER == Order::F) { return p[(i-begin[0])+(j-begin[1])*stride1]; } else { return p[(i-begin[0])*stride1+(j-begin[1])]; @@ -146,7 +144,7 @@ private: static constexpr int len0 (GpuArray const& a_begin, GpuArray const& a_end) noexcept { - if constexpr (std::is_same_v) { + if constexpr (ORDER == Order::F) { return a_end[0] - a_begin[0]; } else { return a_end[1] - a_begin[1]; @@ -154,7 +152,7 @@ private: } }; -template +template struct Table3D { T* AMREX_RESTRICT p = nullptr; @@ -195,9 +193,7 @@ struct Table3D #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) index_assert(i,j,k); #endif - static_assert(std::is_same_v || - std::is_same_v); - if constexpr (std::is_same_v) { + if constexpr (ORDER == Order::F) { return p[(i-begin[0])+(j-begin[1])*stride1+(k-begin[2])*stride2]; } else { return p[(i-begin[0])*stride2+(j-begin[1])*stride1+(k-begin[2])]; @@ -234,7 +230,7 @@ private: static constexpr int len0 (GpuArray const& a_begin, GpuArray const& a_end) noexcept { - if constexpr (std::is_same_v) { + if constexpr (ORDER == Order::F) { return a_end[0] - a_begin[0]; } else { return a_end[2] - a_begin[2]; @@ -248,7 +244,7 @@ private: } }; -template +template struct Table4D { T* 
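Reviewer note (not part of the patch): a short sketch of the algebra above. operator* multiplies conforming matrices, SmallVector<T,N> (defined further down in this header) is the N x 1 special case, and dot() is the element-wise inner product of two objects of the same shape:

    #include <AMReX_SmallMatrix.H>

    double small_matrix_algebra_sketch ()
    {
        auto A = amrex::SmallMatrix<double,3,3>::Identity() * 2.0;  // A = 2*I
        amrex::SmallVector<double,3> x(1.0, 2.0, 3.0);
        auto y = A * x;                    // matrix-vector product, y = 2*x
        return y.dot(x);                   // 2*(1 + 4 + 9) = 28
    }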
AMREX_RESTRICT p = nullptr; @@ -292,9 +288,7 @@ struct Table4D #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) index_assert(i,j,k,n); #endif - static_assert(std::is_same_v || - std::is_same_v); - if constexpr (std::is_same_v) { + if constexpr (ORDER == Order::F) { return p[(i-begin[0])+(j-begin[1])*stride1+(k-begin[2])*stride2+(n-begin[3])*stride3]; } else { return p[(i-begin[0])*stride3+(j-begin[1])*stride2+(k-begin[2])*stride1+(n-begin[3])]; @@ -333,7 +327,7 @@ private: static constexpr int len0 (GpuArray const& a_begin, GpuArray const& a_end) noexcept { - if constexpr (std::is_same_v) { + if constexpr (ORDER == Order::F) { return a_end[0] - a_begin[0]; } else { return a_end[3] - a_begin[3]; @@ -343,7 +337,7 @@ private: static constexpr int len1 (GpuArray const& a_begin, GpuArray const& a_end) noexcept { - if constexpr (std::is_same_v) { + if constexpr (ORDER == Order::F) { return a_end[1] - a_begin[1]; } else { return a_end[2] - a_begin[2]; @@ -353,7 +347,7 @@ private: static constexpr int len2 (GpuArray const& a_begin, GpuArray const& a_end) noexcept { - if constexpr (std::is_same_v) { + if constexpr (ORDER == Order::F) { return a_end[2] - a_begin[2]; } else { return a_end[1] - a_begin[1]; @@ -399,13 +393,13 @@ private: * // We can now use table in device lambda. * \endcode */ -template +template class TableData : public DataAllocator { public: - template friend class TableData; + template friend class TableData; using value_type = T; using table_type = std::conditional_t, std::conditional_t, @@ -459,7 +453,7 @@ private: bool m_ptr_owner = false; }; -template +template TableData::TableData (Array const& lo, Array const& hi, Arena* ar) : DataAllocator{ar}, m_lo(lo), m_hi(hi) { @@ -467,7 +461,7 @@ TableData::TableData (Array const& lo, Array const& hi, } -template +template TableData::TableData (TableData&& rhs) noexcept : DataAllocator{rhs.arena()}, m_dptr(rhs.m_dptr), @@ -480,7 +474,7 @@ TableData::TableData (TableData&& rhs) noexcept rhs.m_ptr_owner = false; } -template +template TableData& TableData::operator= (TableData && rhs) noexcept { @@ -498,20 +492,17 @@ TableData::operator= (TableData && rhs) noexcept return *this; } -template +template TableData::~TableData () noexcept { static_assert(std::is_trivially_copyable() && std::is_trivially_destructible(), "TableData: T must be trivially copyable and trivially destructible"); static_assert(N>=1 && N <=4, "TableData: N must be in the range of [1,4]"); - static_assert(std::is_same_v || - std::is_same_v, - "TableDat: ORDER must be either Order::F or Order::C"); clear(); } -template +template void TableData::resize (Array const& lo, Array const& hi, Arena* ar) { @@ -535,7 +526,7 @@ TableData::resize (Array const& lo, Array const& hi, Ar } } -template +template Long TableData::size () const noexcept { @@ -546,7 +537,7 @@ TableData::size () const noexcept return r; } -template +template void TableData::clear () noexcept { @@ -559,7 +550,7 @@ TableData::clear () noexcept } } -template +template void TableData::define () { @@ -578,42 +569,42 @@ namespace detail { Table1D make_table (T* p, Array const& lo, Array const& hi) { return Table1D(p, lo[0], hi[0]+1); } - template + template Table2D make_table (T* p, Array const& lo, Array const& hi) { return Table2D(p, {lo[0],lo[1]}, {hi[0]+1,hi[1]+1}); } - template + template Table3D make_table (T* p, Array const& lo, Array const& hi) { return Table3D(p, {lo[0],lo[1],lo[2]}, {hi[0]+1,hi[1]+1,hi[2]+1}); } - template + template Table4D make_table (T* p, Array const& lo, Array const& hi) { 
return Table4D(p, {lo[0],lo[1],lo[2],lo[3]}, {hi[0]+1,hi[1]+1,hi[2]+1,hi[3]+1}); } } -template +template typename TableData::table_type TableData::table () noexcept { return detail::make_table(m_dptr, m_lo, m_hi); } -template +template typename TableData::const_table_type TableData::table () const noexcept { return detail::make_table(m_dptr, m_lo, m_hi); } -template +template typename TableData::const_table_type TableData::const_table () const noexcept { return detail::make_table(m_dptr, m_lo, m_hi); } -template +template void TableData::copy (TableData const& rhs) noexcept { diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt index 882f401228..2b6387ece1 100644 --- a/Src/Base/CMakeLists.txt +++ b/Src/Base/CMakeLists.txt @@ -14,6 +14,8 @@ foreach(D IN LISTS AMReX_SPACEDIM) AMReX_BlockMutex.cpp AMReX_Enum.H AMReX_GpuComplex.H + AMReX_SmallMatrix.H + AMReX_ConstexprFor.H AMReX_Vector.H AMReX_TableData.H AMReX_Tuple.H diff --git a/Src/Base/Make.package b/Src/Base/Make.package index c64fa50f11..264de0581f 100644 --- a/Src/Base/Make.package +++ b/Src/Base/Make.package @@ -4,6 +4,7 @@ AMREX_BASE=EXE C$(AMREX_BASE)_headers += AMReX_ccse-mpi.H AMReX_Algorithm.H AMReX_Any.H AMReX_Array.H C$(AMREX_BASE)_headers += AMReX_Enum.H C$(AMREX_BASE)_headers += AMReX_Vector.H AMReX_TableData.H AMReX_Tuple.H AMReX_Math.H +C$(AMREX_BASE)_headers += AMReX_SmallMatrix.H AMReX_ConstexprFor.H C$(AMREX_BASE)_headers += AMReX_TypeList.H diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt index a5ec9e1cf4..90730c24fb 100644 --- a/Tests/CMakeLists.txt +++ b/Tests/CMakeLists.txt @@ -123,7 +123,7 @@ else() # set( AMREX_TESTS_SUBDIRS Amr AsyncOut CLZ CTOParFor DeviceGlobal Enum MultiBlock MultiPeriod ParmParse Parser Parser2 Reinit - RoundoffDomain) + RoundoffDomain SmallMatrix) if (AMReX_PARTICLES) list(APPEND AMREX_TESTS_SUBDIRS Particles) diff --git a/Tests/SmallMatrix/CMakeLists.txt b/Tests/SmallMatrix/CMakeLists.txt new file mode 100644 index 0000000000..224c4563c8 --- /dev/null +++ b/Tests/SmallMatrix/CMakeLists.txt @@ -0,0 +1,9 @@ +foreach(D IN LISTS AMReX_SPACEDIM) + set(_sources main.cpp) + set(_input_files) + + setup_test(${D} _sources _input_files) + + unset(_sources) + unset(_input_files) +endforeach() diff --git a/Tests/SmallMatrix/GNUmakefile b/Tests/SmallMatrix/GNUmakefile new file mode 100644 index 0000000000..d0d895ff52 --- /dev/null +++ b/Tests/SmallMatrix/GNUmakefile @@ -0,0 +1,24 @@ +AMREX_HOME := ../.. 
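Reviewer note (not part of the patch): the TableData changes above are purely a re-spelling of the layout tag, from the old tag struct types to the Order enum values. A sketch of what a row-major 2D table looks like after this patch; it assumes amrex::Initialize has been called, and uses the pinned arena so the table can be filled on the host:

    #include <AMReX_TableData.H>

    void table_order_sketch ()
    {
        amrex::Array<int,2> lo{0, 0};
        amrex::Array<int,2> hi{7, 15};
        // The layout tag is now the Order enum value, not a tag struct type.
        amrex::TableData<amrex::Real, 2, amrex::Order::C>
            tdata(lo, hi, amrex::The_Pinned_Arena());
        auto tab = tdata.table();          // Table2D<amrex::Real, Order::C>
        for     (int i = lo[0]; i <= hi[0]; ++i) {
            for (int j = lo[1]; j <= hi[1]; ++j) {
                tab(i,j) = amrex::Real(i*100 + j);   // j is the contiguous index here
            }
        }
    }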
+ +DEBUG = FALSE + +DIM = 3 + +COMP = gcc + +USE_MPI = FALSE +USE_OMP = FALSE +USE_CUDA = FALSE +USE_HIP = FALSE +USE_SYCL = FALSE + +BL_NO_FORT = TRUE + +TINY_PROFILE = FALSE + +include $(AMREX_HOME)/Tools/GNUMake/Make.defs + +include ./Make.package +include $(AMREX_HOME)/Src/Base/Make.package + +include $(AMREX_HOME)/Tools/GNUMake/Make.rules diff --git a/Tests/SmallMatrix/Make.package b/Tests/SmallMatrix/Make.package new file mode 100644 index 0000000000..6b4b865e8f --- /dev/null +++ b/Tests/SmallMatrix/Make.package @@ -0,0 +1 @@ +CEXE_sources += main.cpp diff --git a/Tests/SmallMatrix/main.cpp b/Tests/SmallMatrix/main.cpp new file mode 100644 index 0000000000..2dc502b044 --- /dev/null +++ b/Tests/SmallMatrix/main.cpp @@ -0,0 +1,144 @@ +#include +#include +#include +#include +#include + +using namespace amrex; + +int main (int argc, char* argv[]) +{ + static_assert(Order::C == Order::RowMajor && + Order::F == Order::ColumnMajor); + + amrex::Initialize(argc, argv); + { + SmallMatrix m34{}; + for (int j = 0; j < 4; ++j) { + for (int i = 0; i < 3; ++i) { + AMREX_ALWAYS_ASSERT(m34(i,j) == 0.0_rt); + } + } + } + { + SmallVector cv{}; + SmallRowVector rv{}; + SmallVector cv2{1,2,3}; + SmallRowVector rv2{0,10,20}; + SmallVector cv3{0,1,2}; + for (int j = 0; j < 3; ++j) { + AMREX_ALWAYS_ASSERT(cv(j) == 0.0_rt && + rv(j) == 0.0_rt && + cv2(j) == j+1 && + rv2(j) == j*10 && + cv3(j) == j); + } + AMREX_ALWAYS_ASSERT(cv3(3) == 0 && cv3(4) == 0); + } + { + SmallMatrix m34{{0,3,6,9}, + {1,4,7,10}, + {2,5,8,11}}; + int v = 0; + for (int j = 0; j < 4; ++j) { + for (int i = 0; i < 3; ++i) { + AMREX_ALWAYS_ASSERT(m34(i,j) == v++); + } + } + std::cout << m34; + } + { + SmallMatrix m34{{0,1,2,3}, + {4,5,6,7}, + {8,9,10,11}}; + int v = 0; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 4; ++j) { + AMREX_ALWAYS_ASSERT(m34(i,j) == v++); + } + } + } + { + auto v3 = SmallVector::Zero(); + v3[0] = 1.; + v3(1) = 2.; + v3[2] = 3.; + auto m33 = SmallMatrix::Identity(); + auto r = m33*v3; + AMREX_ALWAYS_ASSERT(almostEqual(r[0],v3[0]) && + almostEqual(r[1],v3[1]) && + almostEqual(r[2],v3[2])); + } + { + SmallMatrix A{{1, 0, 1}, + {2, 1, 1}, + {0, 1, 1}, + {1, 1, 2}}; + SmallMatrix B{{1, 2, 1}, + {2, 3, 1}, + {4, 2, 2}}; + SmallMatrix C{10, 8, 6}; + auto ABC = A*B*C; + AMREX_ALWAYS_ASSERT(ABC(0,0) == 100 && + ABC(1,0) == 182 && + ABC(2,0) == 118 && + ABC(3,0) == 218); + } + { + SmallMatrix A{{1, 2, 0, 1}, + {0, 1, 1, 1}, + {1, 1, 1, 2}}; + SmallMatrix B{{1, 2, 4}, + {2, 3, 2}, + {1, 1, 2}}; + SmallMatrix C{10, 8, 6}; + auto ABC = A.transpose()*B.transposeInPlace()*C.transpose(); + AMREX_ALWAYS_ASSERT(ABC(0,0) == 100 && + ABC(1,0) == 182 && + ABC(2,0) == 118 && + ABC(3,0) == 218); + } + { + SmallMatrix m; + m.setVal(2); + using M = decltype(m); + AMREX_ALWAYS_ASSERT(m.product() == Math::powi(2)); + AMREX_ALWAYS_ASSERT(m.sum() == 2*m.row_size*m.column_size); + } + { + SmallMatrix m{{1.0, 3.4, 4.5, 5.6, 6.7}, + {1.3, 2.0, 4.5, 5.6, 6.7}, + {1.3, 1.0, 3.0, 5.6, 6.7}, + {1.3, 1.4, 4.5, 4.0, 6.7}, + {1.3, 1.0, 4.5, 5.6, 5.0}}; + AMREX_ALWAYS_ASSERT(m.trace() == double(1+2+3+4+5)); + } + { + SmallMatrix a{{+1, +2, +3}, + {+7, +8, +9}}; + SmallMatrix b{{-1, -2, -3}, + {-7, -8, -9}}; + auto c = a*2 + 2*b; + for (auto const& x : c) { + AMREX_ALWAYS_ASSERT(x == 0); + } + } + { + SmallMatrix a{{+1, +2, +3}, + {+7, +8, +9}}; + SmallMatrix b{{-1, -2, -3}, + {-7, -8, -9}}; + auto c = -a - b; + for (auto const& x : c) { + AMREX_ALWAYS_ASSERT(x == 0); + } + } + { + SmallMatrix a{{+1, +2, +3}, + {+7, +8, +9}}; + SmallMatrix b; + 
b.setVal(-1);
+        AMREX_ALWAYS_ASSERT(a.dot(b) == -30);
+    }
+    amrex::Finalize();
+}
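Reviewer note (not part of the patch): a sketch of the kind of kernel code the new class enables. Because SmallMatrix is trivially copyable with a compile-time size, it can be captured by value into, and created inside, device lambdas. apply_linear_map, the factor of 2, and the assumption that mf has at least 3 valid components are all illustrative:

    #include <AMReX_MultiFab.H>
    #include <AMReX_GpuLaunch.H>
    #include <AMReX_SmallMatrix.H>

    void apply_linear_map (amrex::MultiFab& mf)
    {
        // Constant 3x3 map, built on the host and captured by value below.
        auto A = amrex::SmallMatrix<amrex::Real,3,3>::Identity() * amrex::Real(2.0);
        for (amrex::MFIter mfi(mf); mfi.isValid(); ++mfi) {
            auto const& a = mf.array(mfi);
            amrex::ParallelFor(mfi.validbox(),
            [=] AMREX_GPU_DEVICE (int i, int j, int k)
            {
                amrex::SmallVector<amrex::Real,3> v(a(i,j,k,0), a(i,j,k,1), a(i,j,k,2));
                auto w = A * v;                  // apply the 3x3 map
                a(i,j,k,0) = w(0);
                a(i,j,k,1) = w(1);
                a(i,j,k,2) = w(2);
            });
        }
    }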