diff --git a/doc/operators.rst b/doc/operators.rst
index c58bc02b0..9e519c4ae 100644
--- a/doc/operators.rst
+++ b/doc/operators.rst
@@ -3,6 +3,7 @@ Discontinuous Galerkin operators
 
 .. automodule:: grudge.op
 .. automodule:: grudge.trace_pair
+.. automodule:: grudge.flux_differencing
 
 
 Transferring data between discretizations
diff --git a/examples/euler/acoustic_pulse.py b/examples/euler/acoustic_pulse.py
index c44a6322e..48b7641c7 100644
--- a/examples/euler/acoustic_pulse.py
+++ b/examples/euler/acoustic_pulse.py
@@ -29,7 +29,22 @@
 
 import pyopencl as cl
 import pyopencl.tools as cl_tools
+
+from grudge.array_context import (
+    PyOpenCLArrayContext,
+    PytatoPyOpenCLArrayContext
+)
+from grudge.models.euler import (
+    ConservedEulerField,
+    EulerOperator,
+    EntropyStableEulerOperator,
+    InviscidWallBC
+)
+from meshmode.mesh import TensorProductElementGroup
+from grudge.shortcuts import rk4_step
+
 from arraycontext import ArrayContext
+
 from meshmode.mesh import BTAG_ALL
 from pytools.obj_array import make_obj_array
 
@@ -105,8 +120,24 @@ def run_acoustic_pulse(actx,
                        order=3,
                        final_time=1,
                        resolution=16,
+                       esdg=False,
                        overintegration=False,
-                       visualize=False):
+                       visualize=False,
+                       tpe=False):
+
+    logger.info(
+        """
+        Acoustic pulse parameters:\n
+        order: %s\n
+        final_time: %s\n
+        resolution: %s\n
+        entropy stable: %s\n
+        overintegration: %s\n
+        visualize: %s
+        """,
+        order, final_time, resolution, esdg,
+        overintegration, visualize
+    )
 
     # eos-related parameters
     gamma = 1.4
@@ -118,10 +149,12 @@ def run_acoustic_pulse(actx,
     dim = 2
     box_ll = -0.5
     box_ur = 0.5
+    group_cls = TensorProductElementGroup if tpe else None
     mesh = generate_regular_rect_mesh(
         a=(box_ll,)*dim,
         b=(box_ur,)*dim,
-        nelements_per_axis=(resolution,)*dim)
+        nelements_per_axis=(resolution,)*dim,
+        group_cls=group_cls)
 
     from meshmode.discretization.poly_element import (
         QuadratureSimplexGroupFactory,
@@ -130,8 +163,19 @@ def run_acoustic_pulse(actx,
 
     from grudge.discretization import make_discretization_collection
     from grudge.dof_desc import DISCR_TAG_BASE, DISCR_TAG_QUAD
+    from meshmode.discretization.poly_element import (
+        InterpolatoryEdgeClusteredGroupFactory,
+        QuadratureGroupFactory)
+
+    if esdg:
+        case = "esdg-pulse"
+        operator_cls = EntropyStableEulerOperator
+    else:
+        case = "pulse"
+        operator_cls = EulerOperator
+
+    exp_name = f"fld-{case}-N{order}-K{resolution}"
 
-    exp_name = f"fld-acoustic-pulse-N{order}-K{resolution}"
     if overintegration:
         exp_name += "-overintegrated"
         quad_tag = DISCR_TAG_QUAD
@@ -141,9 +185,8 @@ def run_acoustic_pulse(actx,
     dcoll = make_discretization_collection(
         actx, mesh,
         discr_tag_to_group_factory={
-            DISCR_TAG_BASE: default_simplex_group_factory(
-                base_dim=mesh.dim, order=order),
-            DISCR_TAG_QUAD: QuadratureSimplexGroupFactory(2*order)
+            DISCR_TAG_BASE: InterpolatoryEdgeClusteredGroupFactory(order),
+            DISCR_TAG_QUAD: QuadratureGroupFactory(2*order)
         }
     )
 
@@ -151,7 +194,7 @@ def run_acoustic_pulse(actx,
 
     # {{{ Euler operator
 
-    euler_operator = EulerOperator(
+    euler_operator = operator_cls(
         dcoll,
         bdry_conditions={BTAG_ALL: InviscidWallBC()},
         flux_type="lf",
@@ -208,7 +251,8 @@ def rhs(t, q):
 
 
 def main(ctx_factory, order=3, final_time=1, resolution=16,
-         overintegration=False, visualize=False, lazy=False):
+         overintegration=False, visualize=False, lazy=False,
+         esdg=False, tpe=False):
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
 
@@ -224,13 +268,20 @@ def main(ctx_factory, order=3, final_time=1, resolution=16,
             force_device_scalars=True,
         )
 
+    if not actx.supports_nonscalar_broadcasting and esdg is True:
+        raise RuntimeError(
+            "Cannot use ESDG with an array context that cannot perform "
+            "nonscalar broadcasting. Run with --lazy instead."
+        )
+
     run_acoustic_pulse(
         actx,
         order=order,
         resolution=resolution,
+        esdg=esdg,
         overintegration=overintegration,
         final_time=final_time,
-        visualize=visualize
+        visualize=visualize, tpe=tpe
     )
 
 
@@ -241,12 +292,16 @@ def main(ctx_factory, order=3, final_time=1, resolution=16,
     parser.add_argument("--order", default=3, type=int)
     parser.add_argument("--tfinal", default=0.1, type=float)
     parser.add_argument("--resolution", default=16, type=int)
+    parser.add_argument("--esdg", action="store_true",
+                        help="use entropy stable dg")
     parser.add_argument("--oi", action="store_true",
                         help="use overintegration")
     parser.add_argument("--visualize", action="store_true",
                         help="write out vtk output")
     parser.add_argument("--lazy", action="store_true",
                         help="switch to a lazy computation mode")
+    parser.add_argument("--tpe", action="store_true",
+                        help="use tensor product elements")
     args = parser.parse_args()
 
     logging.basicConfig(level=logging.INFO)
@@ -254,6 +309,7 @@ def main(ctx_factory, order=3, final_time=1, resolution=16,
          order=args.order,
          final_time=args.tfinal,
          resolution=args.resolution,
+         esdg=args.esdg,
          overintegration=args.oi,
          visualize=args.visualize,
-         lazy=args.lazy)
+         lazy=args.lazy, tpe=args.tpe)
diff --git a/examples/euler/sod.py b/examples/euler/sod.py
new file mode 100644
index 000000000..abf138f41
--- /dev/null
+++ b/examples/euler/sod.py
@@ -0,0 +1,227 @@
+__copyright__ = """
+Copyright (C) 2021 University of Illinois Board of Trustees
+"""
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+from grudge.dof_desc import BoundaryDomainTag
+import pyopencl as cl
+import pyopencl.tools as cl_tools
+
+from arraycontext import thaw, freeze
+from grudge.array_context import PytatoPyOpenCLArrayContext
+from grudge.models.euler import (
+    EntropyStableEulerOperator,
+    ConservedEulerField,
+    PrescribedBC,
+    conservative_to_primitive_vars,
+)
+from grudge.shortcuts import rk4_step
+
+from pytools.obj_array import make_obj_array
+
+import grudge.op as op
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+def sod_shock_initial_condition(nodes, t=0):
+    gamma = 1.4
+    dim = len(nodes)
+    gmn1 = 1.0 / (gamma - 1.0)
+    x = nodes[0]
+    actx = x.array_context
+    zeros = 0*x
+
+    _x0 = 0.5
+    _rhoin = 1.0
+    _rhoout = 0.125
+    _pin = 1.0
+    _pout = 0.1
+    rhoin = zeros + _rhoin
+    rhoout = zeros + _rhoout
+    energyin = zeros + gmn1 * _pin
+    energyout = zeros + gmn1 * _pout
+
+    x0 = zeros + _x0
+    sigma = 1e-13
+    weight = 0.5 * (1.0 - actx.np.tanh(1.0/sigma * (x - x0)))
+
+    mass = rhoout + (rhoin - rhoout)*weight
+    energy = energyout + (energyin - energyout)*weight
+    momentum = make_obj_array([zeros for _ in range(dim)])
+
+    return ConservedEulerField(mass=mass, energy=energy, momentum=momentum)
+
+
+def run_sod_shock_tube(
+        actx, order=4, resolution=32, final_time=0.2, visualize=False):
+
+    logger.info(
+        """
+        Sod 1-D parameters:\n
+        order: %s\n
+        final_time: %s\n
+        resolution: %s\n
+        visualize: %s
+        """,
+        order, final_time, resolution, visualize
+    )
+
+    # eos-related parameters
+    gamma = 1.4
+
+    # {{{ discretization
+
+    from meshmode.mesh.generation import generate_regular_rect_mesh
+
+    dim = 1
+    box_ll = 0.0
+    box_ur = 1.0
+    mesh = generate_regular_rect_mesh(
+        a=(box_ll,)*dim,
+        b=(box_ur,)*dim,
+        nelements_per_axis=(resolution,)*dim,
+        boundary_tag_to_face={
+            "prescribed": ["+x", "-x"],
+        }
+    )
+
+    from grudge import DiscretizationCollection
+    from grudge.dof_desc import \
+        DISCR_TAG_BASE, DISCR_TAG_QUAD
+    from meshmode.discretization.poly_element import \
+        (default_simplex_group_factory,
+         QuadratureSimplexGroupFactory)
+
+    exp_name = f"fld-sod-1d-N{order}-K{resolution}"
+    quad_tag = DISCR_TAG_QUAD
+
+    dcoll = DiscretizationCollection(
+        actx, mesh,
+        discr_tag_to_group_factory={
+            DISCR_TAG_BASE: default_simplex_group_factory(dim, order),
+            DISCR_TAG_QUAD: QuadratureSimplexGroupFactory(order + 2)
+        }
+    )
+
+    # }}}
+
+    # {{{ Euler operator
+
+    dd_prescribed = BoundaryDomainTag("prescribed")
+    bcs = {
+        dd_prescribed: PrescribedBC(prescribed_state=sod_shock_initial_condition)
+    }
+
+    euler_operator = EntropyStableEulerOperator(
+        dcoll,
+        bdry_conditions=bcs,
+        flux_type="lf",
+        gamma=gamma,
+        quadrature_tag=quad_tag
+    )
+
+    def rhs(t, q):
+        return euler_operator.operator(t, q)
+
+    compiled_rhs = actx.compile(rhs)
+
+    fields = sod_shock_initial_condition(thaw(dcoll.nodes(), actx))
+
+    from grudge.dt_utils import h_min_from_volume
+
+    cfl = 0.01
+    cn = 0.5*(order + 1)**2
+    dt = cfl * actx.to_numpy(h_min_from_volume(dcoll)) / cn
+
+    logger.info("Timestep size: %g", dt)
+
+    # }}}
+
+    from grudge.shortcuts import make_visualizer
+
+    vis = make_visualizer(dcoll)
+
+    # {{{ time stepping
+
+    step = 0
+    t = 0.0
+    while t < final_time:
+        if step % 10 == 0:
+            norm_q = actx.to_numpy(op.norm(dcoll, fields, 2))
+            logger.info("[%04d] t = %.5f |q| = %.5e", step, t, norm_q)
+            if visualize:
+                rho, velocity, pressure = \
+                    conservative_to_primitive_vars(fields, gamma=gamma)
+                vis.write_vtk_file(
+                    f"{exp_name}-{step:04d}.vtu",
+                    [
+                        ("rho", rho),
+                        ("energy", fields.energy),
+                        ("momentum", fields.momentum),
+                        ("velocity", velocity),
+                        ("pressure", pressure)
+                    ]
+                )
+            assert norm_q < 10000
+
+        fields = thaw(freeze(fields, actx), actx)
+        fields = rk4_step(fields, t, dt, compiled_rhs)
+        t += dt
+        step += 1
+
+    # }}}
+
+
+def main(ctx_factory, order=4, final_time=0.2, resolution=32, visualize=False):
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(cl_ctx)
+    actx = PytatoPyOpenCLArrayContext(
+        queue,
+        allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)),
+    )
+
+    run_sod_shock_tube(
+        actx, order=order,
+        resolution=resolution,
+        final_time=final_time,
+        visualize=visualize)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--order", default=4, type=int)
+    parser.add_argument("--tfinal", default=0.2, type=float)
+    parser.add_argument("--resolution", default=32, type=int)
+    parser.add_argument("--visualize", action="store_true")
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.INFO)
+    main(cl.create_some_context,
+         order=args.order,
+         final_time=args.tfinal,
+         resolution=args.resolution,
+         visualize=args.visualize)
diff --git a/examples/euler/vortex.py b/examples/euler/vortex.py
index 974981efb..9e1af1d1b 100644
--- a/examples/euler/vortex.py
+++ b/examples/euler/vortex.py
@@ -28,9 +28,14 @@
 import pyopencl as cl
 import pyopencl.tools as cl_tools
 
+from grudge.array_context import PytatoPyOpenCLArrayContext, PyOpenCLArrayContext
+from grudge.models.euler import (
+    vortex_initial_condition,
+    EulerOperator,
+    EntropyStableEulerOperator
+)
 import grudge.op as op
 from grudge.array_context import PyOpenCLArrayContext, PytatoPyOpenCLArrayContext
-from grudge.models.euler import EulerOperator, vortex_initial_condition
 from grudge.shortcuts import rk4_step
 
 
@@ -38,6 +43,7 @@
 
 
 def run_vortex(actx, order=3, resolution=8, final_time=5,
+               esdg=False,
                overintegration=False,
                flux_type="central",
                visualize=False):
@@ -48,11 +54,12 @@ def run_vortex(actx, order=3, resolution=8, final_time=5,
         order: %s\n
         final_time: %s\n
         resolution: %s\n
+        entropy stable: %s\n
         overintegration: %s\n
         flux_type: %s\n
         visualize: %s
         """,
-        order, final_time, resolution,
+        order, final_time, resolution, esdg,
         overintegration, flux_type, visualize
     )
 
@@ -77,7 +84,14 @@ def run_vortex(actx, order=3, resolution=8, final_time=5,
     from grudge.discretization import make_discretization_collection
     from grudge.dof_desc import DISCR_TAG_BASE, DISCR_TAG_QUAD
 
-    exp_name = f"fld-vortex-N{order}-K{resolution}-{flux_type}"
+    if esdg:
+        case = "esdg-vortex"
+        operator_cls = EntropyStableEulerOperator
+    else:
+        case = "vortex"
+        operator_cls = EulerOperator
+
+    exp_name = f"fld-{case}-N{order}-K{resolution}-{flux_type}"
 
     if overintegration:
         exp_name += "-overintegrated"
@@ -98,7 +112,7 @@ def run_vortex(actx, order=3, resolution=8, final_time=5,
 
     # {{{ Euler operator
 
-    euler_operator = EulerOperator(
+    euler_operator = operator_cls(
         dcoll,
         flux_type=flux_type,
         gamma=gamma,
@@ -155,6 +169,7 @@ def rhs(t, q):
 
 
 def main(ctx_factory, order=3, final_time=5, resolution=8,
+         esdg=False,
          overintegration=False,
          lf_stabilization=False,
          visualize=False,
@@ -174,6 +189,12 @@ def main(ctx_factory, order=3, final_time=5, resolution=8,
             force_device_scalars=True,
         )
 
+    if not actx.supports_nonscalar_broadcasting and esdg is True:
+        raise RuntimeError(
+            "Cannot use ESDG with an array context that cannot perform "
+            "nonscalar broadcasting. Run with --lazy instead."
+        )
+
     if lf_stabilization:
         flux_type = "lf"
     else:
@@ -184,6 +205,7 @@ def main(ctx_factory, order=3, final_time=5, resolution=8,
         order=order,
         resolution=resolution,
         final_time=final_time,
+        esdg=esdg,
         overintegration=overintegration,
         flux_type=flux_type,
         visualize=visualize
@@ -197,6 +219,8 @@ def main(ctx_factory, order=3, final_time=5, resolution=8,
     parser.add_argument("--order", default=3, type=int)
     parser.add_argument("--tfinal", default=0.015, type=float)
     parser.add_argument("--resolution", default=8, type=int)
+    parser.add_argument("--esdg", action="store_true",
+                        help="use entropy stable dg")
     parser.add_argument("--oi", action="store_true",
                         help="use overintegration")
     parser.add_argument("--lf", action="store_true",
@@ -212,6 +236,7 @@ def main(ctx_factory, order=3, final_time=5, resolution=8,
          order=args.order,
          final_time=args.tfinal,
          resolution=args.resolution,
+         esdg=args.esdg,
          overintegration=args.oi,
          lf_stabilization=args.lf,
          visualize=args.visualize,
diff --git a/examples/tensor-product-examples/acoustic_pulse.py b/examples/tensor-product-examples/acoustic_pulse.py
new file mode 100644
index 000000000..577d0da4e
--- /dev/null
+++ b/examples/tensor-product-examples/acoustic_pulse.py
@@ -0,0 +1,262 @@
+__copyright__ = """
+Copyright (C) 2021 University of Illinois Board of Trustees
+"""
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+from meshmode.mesh import TensorProductElementGroup
+import numpy as np
+
+import pyopencl as cl
+import pyopencl.tools as cl_tools
+
+from grudge.array_context import (
+    PyOpenCLArrayContext,
+    PytatoPyOpenCLArrayContext
+)
+from grudge.models.euler import (
+    ConservedEulerField,
+    EulerOperator,
+    InviscidWallBC
+)
+from grudge.shortcuts import rk4_step
+
+from meshmode.mesh import BTAG_ALL
+
+from pytools.obj_array import make_obj_array
+
+import grudge.op as op
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+def gaussian_profile(
+        x_vec, t=0, rho0=1.0, rhoamp=1.0, p0=1.0, gamma=1.4,
+        center=None, velocity=None):
+
+    dim = len(x_vec)
+    if center is None:
+        center = np.zeros(shape=(dim,))
+    if velocity is None:
+        velocity = np.zeros(shape=(dim,))
+
+    lump_loc = center + t * velocity
+
+    # coordinates relative to lump center
+    rel_center = make_obj_array(
+        [x_vec[i] - lump_loc[i] for i in range(dim)]
+    )
+    actx = x_vec[0].array_context
+    r = actx.np.sqrt(np.dot(rel_center, rel_center))
+    expterm = rhoamp * actx.np.exp(1 - r ** 2)
+
+    mass = expterm + rho0
+    mom = velocity * mass
+    energy = (p0 / (gamma - 1.0)) + np.dot(mom, mom) / (2.0 * mass)
+
+    return ConservedEulerField(mass=mass, energy=energy, momentum=mom)
+
+
+def make_pulse(amplitude, r0, w, r):
+    dim = len(r)
+    r_0 = np.zeros(dim)
+    r_0 = r_0 + r0
+    rel_center = make_obj_array(
+        [r[i] - r_0[i] for i in range(dim)]
+    )
+    actx = r[0].array_context
+    rms2 = w * w
+    r2 = np.dot(rel_center, rel_center) / rms2
+    return amplitude * actx.np.exp(-.5 * r2)
+
+
+def acoustic_pulse_condition(x_vec, t=0):
+    dim = len(x_vec)
+    vel = np.zeros(shape=(dim,))
+    orig = np.zeros(shape=(dim,))
+    uniform_gaussian = gaussian_profile(
+        x_vec, t=t, center=orig, velocity=vel, rhoamp=0.0)
+
+    amplitude = 1.0
+    width = 0.1
+    pulse = make_pulse(amplitude, orig, width, x_vec)
+
+    return ConservedEulerField(
+        mass=uniform_gaussian.mass,
+        energy=uniform_gaussian.energy + pulse,
+        momentum=uniform_gaussian.momentum
+    )
+
+
+def run_acoustic_pulse(actx,
+                       order=3,
+                       final_time=1,
+                       resolution=4,
+                       overintegration=False,
+                       visualize=False):
+
+    # eos-related parameters
+    gamma = 1.4
+
+    # {{{ discretization
+
+    from meshmode.mesh.generation import generate_regular_rect_mesh
+
+    dim = 3
+    box_ll = -0.5
+    box_ur = 0.5
+    mesh = generate_regular_rect_mesh(
+        a=(box_ll,)*dim,
+        b=(box_ur,)*dim,
+        nelements_per_axis=(resolution,)*dim,
+        group_cls=TensorProductElementGroup)
+
+    from grudge import DiscretizationCollection
+    from grudge.dof_desc import DISCR_TAG_BASE, DISCR_TAG_QUAD
+    from meshmode.discretization.poly_element import \
+        LegendreGaussLobattoTensorProductGroupFactory as LGL
+
+    exp_name = f"fld-acoustic-pulse-N{order}-K{resolution}"
+    if overintegration:
+        exp_name += "-overintegrated"
+        quad_tag = DISCR_TAG_QUAD
+    else:
+        quad_tag = None
+
+    dcoll = DiscretizationCollection(
+        actx, mesh,
+        discr_tag_to_group_factory={
+            DISCR_TAG_BASE: LGL(order)
+        }
+    )
+
+    # }}}
+
+    # {{{ Euler operator
+
+    euler_operator = EulerOperator(
+        dcoll,
+        bdry_conditions={BTAG_ALL: InviscidWallBC()},
+        flux_type="lf",
+        gamma=gamma,
+        quadrature_tag=quad_tag
+    )
+
+    def rhs(t, q):
+        return euler_operator.operator(t, q)
+
+    compiled_rhs = actx.compile(rhs)
+
+    from grudge.dt_utils import h_min_from_volume
+
+    cfl = 0.125
+    cn = 0.5*(order + 1)**2
+    dt = cfl * actx.to_numpy(h_min_from_volume(dcoll)) / cn
+
+    fields = acoustic_pulse_condition(actx.thaw(dcoll.nodes()))
+
+    logger.info("Timestep size: %g", dt)
+
+    # }}}
+
+    from grudge.shortcuts import make_visualizer
+
+    vis = make_visualizer(dcoll)
+
+    # {{{ time stepping
+
+    step = 0
+    t = 0.0
+    while t < final_time:
+        if step % 10 == 0:
+            norm_q = actx.to_numpy(op.norm(dcoll, fields, 2))
+            logger.info("[%04d] t = %.5f |q| = %.5e", step, t, norm_q)
+            if visualize:
+                vis.write_vtk_file(
+                    f"{exp_name}-{step:04d}.vtu",
+                    [
+                        ("rho", fields.mass),
+                        ("energy", fields.energy),
+                        ("momentum", fields.momentum)
+                    ]
+                )
+            assert norm_q < 5
+
+        fields = actx.thaw(actx.freeze(fields))
+        fields = rk4_step(fields, t, dt, compiled_rhs)
+        t += dt
+        step += 1
+
+    # }}}
+
+
+def main(ctx_factory, order=3, final_time=1, resolution=16,
+         overintegration=False, visualize=False, lazy=False):
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(cl_ctx)
+
+    if lazy:
+        actx = PytatoPyOpenCLArrayContext(
+            queue,
+            allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)),
+        )
+    else:
+        actx = PyOpenCLArrayContext(
+            queue,
+            allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)),
+            force_device_scalars=True,
+        )
+
+    run_acoustic_pulse(
+        actx,
+        order=order,
+        resolution=resolution,
+        overintegration=overintegration,
+        final_time=final_time,
+        visualize=visualize
+    )
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--order", default=3, type=int)
+    parser.add_argument("--tfinal", default=0.1, type=float)
+    parser.add_argument("--resolution", default=16, type=int)
+    parser.add_argument("--oi", action="store_true",
+                        help="use overintegration")
+    parser.add_argument("--visualize", action="store_true",
+                        help="write out vtk output")
+    parser.add_argument("--lazy", action="store_true",
+                        help="switch to a lazy computation mode")
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.INFO)
+    main(cl.create_some_context,
+         order=args.order,
+         final_time=args.tfinal,
+         resolution=args.resolution,
+         overintegration=args.oi,
+         visualize=args.visualize,
+         lazy=args.lazy)
diff --git a/examples/tp-transform-cartoon.py b/examples/tp-transform-cartoon.py
new file mode 100644
index 000000000..f26a58737
--- /dev/null
+++ b/examples/tp-transform-cartoon.py
@@ -0,0 +1,102 @@
+import loopy as lp
+
+import meshmode.mesh.generation as mgen
+
+import numpy as np
+import pyopencl as cl
+import pytato as pt
+
+from grudge import op
+from grudge.array_context import OutputIsTensorProductDOFArrayOrdered
+from grudge.discretization import make_discretization_collection
+
+from meshmode.array_context import PytatoPyOpenCLArrayContext
+
+
+class PytatoTensorProductArrayContext(PytatoPyOpenCLArrayContext):
+    def transform_dag(self, dag):
+        if "dag_dots" not in dir(self):
+            self.dag_dots = []
+
+        self.dag_dots.append(pt.get_dot_graph(dag))
+
+        return super().transform_dag(dag)
+
+    def transform_loopy_program(self, t_unit):
+        knl = t_unit.default_entrypoint
+
+        # {{{ adjust strides according to tensor product structure
+        if knl.tags_of_type(OutputIsTensorProductDOFArrayOrdered):
+            new_args = []
+            for arg in knl.args:
+                if arg.is_output:
+                    arg = arg.copy(dim_tags=(
+                        f"N{len(arg.shape)-1},"
+                        + ",".join(f"N{i}"
+                                   for i in range(len(arg.shape)-1))
+                        ))
+
+                new_args.append(arg)
+
+            knl = knl.copy(args=new_args)
+        # }}}
+
+        # {{{ prefetch
+        # }}}
+
+        # {{{ tile
+        # }}}
+
+        # FIXME: remove this (eventually)
+        knl = lp.set_options(knl, insert_gbarriers=True)
+        t_unit = t_unit.with_kernel(knl)
+        self.dev_code = lp.generate_code_v2(t_unit).device_code()
+
+        return super().transform_loopy_program(t_unit)
+
+
+def main():
+    order = 1
+
+    ctx = cl.create_some_context()
+    queue = cl.CommandQueue(ctx)
+    actx = PytatoTensorProductArrayContext(queue)
+
+    dim = 2
+    res = 2
+
+    from meshmode.mesh import TensorProductElementGroup
+    from meshmode.discretization.poly_element import \
+            LegendreGaussLobattoTensorProductGroupFactory as LGL
+
+    mesh = mgen.generate_regular_rect_mesh(
+            a=(-1,)*dim, b=(1,)*dim,
+            nelements_per_axis=(res,)*dim,
+            group_cls=TensorProductElementGroup)
+
+    import grudge.dof_desc as dd
+    dcoll = make_discretization_collection(
+            actx,
+            mesh,
+            discr_tag_to_group_factory={
+                dd.DISCR_TAG_BASE: LGL(order)})
+
+    def f(x):
+        result = dcoll.zeros(actx) + 1
+        for i in range(dim-1):
+            result = result * actx.np.sin(np.pi*x[i])
+        result = result * actx.np.cos(np.pi/2*x[dim-1])
+        return result
+
+
+    x = actx.thaw(dcoll.nodes())
+
+    u = f(x)
+
+    grad_u = op.local_grad(dcoll, u)
+    grad_u = actx.np.stack(grad_u)[0]
+    pt.show_dot_graph(grad_u)
+
+if __name__ == "__main__":
+    main()
+
diff --git a/grudge/array_context.py b/grudge/array_context.py
index aca1edc08..305063755 100644
--- a/grudge/array_context.py
+++ b/grudge/array_context.py
@@ -116,8 +116,7 @@
 
 class PyOpenCLArrayContext(_PyOpenCLArrayContextBase):
     """Inherits from :class:`meshmode.array_context.PyOpenCLArrayContext`. Extends it
-    to understand :mod:`grudge`-specific transform metadata. (Of which there isn't
-    any, for now.)
+    to understand :mod:`grudge`-specific transform metadata.
     """
     def __init__(self, queue: "pyopencl.CommandQueue",
             allocator: Optional["pyopencl.tools.AllocatorBase"] = None,
@@ -132,6 +131,30 @@ def __init__(self, queue: "pyopencl.CommandQueue",
         super().__init__(queue, allocator,
                          wait_event_queue_length, force_device_scalars)
 
+    def transform_loopy_program(self, t_unit):
+        knl = t_unit.default_entrypoint
+
+        # {{{ process tensor product specific metadata
+
+        if knl.tags_of_type(OutputIsTensorProductDOFArrayOrdered):
+            new_args = []
+            for arg in knl.args:
+                if arg.is_output:
+                    arg = arg.copy(dim_tags=(
+                        f"N{len(arg.shape)-1},"
+                        + ",".join(f"N{i}"
+                                   for i in range(len(arg.shape)-1))
+                        ))
+
+                new_args.append(arg)
+
+            knl = knl.copy(args=new_args)
+            t_unit = t_unit.with_kernel(knl)
+
+        # }}}
+
+        return super().transform_loopy_program(t_unit)
+
 # }}}
 
 
@@ -139,8 +162,7 @@ def __init__(self, queue: "pyopencl.CommandQueue",
 
 class PytatoPyOpenCLArrayContext(_PytatoPyOpenCLArrayContextBase):
     """Inherits from :class:`meshmode.array_context.PytatoPyOpenCLArrayContext`.
-    Extends it to understand :mod:`grudge`-specific transform metadata. (Of
-    which there isn't any, for now.)
+    Extends it to understand :mod:`grudge`-specific transform metadata.
     """
     def __init__(self, queue, allocator=None,
             *,
@@ -162,6 +184,29 @@ def __init__(self, queue, allocator=None,
         super().__init__(queue, allocator,
                 compile_trace_callback=compile_trace_callback)
 
+    def transform_loopy_program(self, t_unit):
+        knl = t_unit.default_entrypoint
+
+        # {{{ process tensor product specific metadata
+
+        if knl.tags_of_type(OutputIsTensorProductDOFArrayOrdered):
+            new_args = []
+            for arg in knl.args:
+                if arg.is_output:
+                    arg = arg.copy(dim_tags=(
+                        f"N{len(arg.shape)-1},"
+                        + ",".join(f"N{i}"
+                                   for i in range(len(arg.shape)-1))
+                        ))
+
+                new_args.append(arg)
+
+            knl = knl.copy(args=new_args)
+
+        # }}}
+
+        return super().transform_loopy_program(t_unit)
+
 # }}}
 
 
@@ -236,19 +281,36 @@ def _dag_to_compiled_func(self, dict_of_named_arrays,
         self.actx._compile_trace_callback(self.f, "pre_find_distributed_partition",
                 dict_of_named_arrays)
 
-        distributed_partition = pt.find_distributed_partition(
-            # pylint-ignore-reason:
-            # '_BasePytatoArrayContext' has no
-            # 'mpi_communicator' member
-            # pylint: disable=no-member
-            self.actx.mpi_communicator, dict_of_named_arrays)
-
-        if __debug__:
-            # pylint-ignore-reason:
-            # '_BasePytatoArrayContext' has no 'mpi_communicator' member
-            pt.verify_distributed_partition(
-                self.actx.mpi_communicator,  # pylint: disable=no-member
-                distributed_partition)
+#        # https://github.com/inducer/pytato/pull/393 changes the function signature
+#        try:
+#            # pylint: disable=too-many-function-args
+#            distributed_partition = pt.find_distributed_partition(
+#                # pylint-ignore-reason:
+#                # '_BasePytatoArrayContext' has no
+#                # 'mpi_communicator' member
+#                # pylint: disable=no-member
+#                self.actx.mpi_communicator, dict_of_named_arrays)
+#        except TypeError as e:
+#            if "find_distributed_partition() takes 1 positional" in str(e):
+#                distributed_partition = pt.find_distributed_partition(
+#                    dict_of_named_arrays)
+#            else:
+#                raise
+
+        with ProcessLogger(logger, "pt.find_distributed_partition"):
+            distributed_partition = pt.find_distributed_partition(
+                # pylint-ignore-reason:
+                # '_BasePytatoArrayContext' has no
+                # 'mpi_communicator' member
+                # pylint: disable=no-member
+                self.actx.mpi_communicator, dict_of_named_arrays)
+
+            if __debug__:
+                # pylint-ignore-reason:
+                # '_BasePytatoArrayContext' has no 'mpi_communicator' member
+                pt.verify_distributed_partition(
+                    self.actx.mpi_communicator,  # pylint: disable=no-member
+                    distributed_partition)
 
         self.actx._compile_trace_callback(self.f, "post_find_distributed_partition",
                 distributed_partition)
@@ -389,9 +451,11 @@ def to_output_template(keys, _):
 
 class MPIPytatoArrayContextBase(MPIBasedArrayContext):
     def __init__(
-            self, mpi_communicator, queue, *, mpi_base_tag, allocator=None,
-            compile_trace_callback: Callable[[Any, str, Any], None] | None = None,
-            ) -> None:
+            self, mpi_communicator, queue, *,
+            mpi_base_tag, allocator=None,
+            compile_trace_callback: Optional[Callable[[Any, str, Any], None]] = None,
+            use_axis_tag_inference_fallback: bool = False,
+            use_einsum_inference_fallback: bool = False) -> None:
         """
         :arg compile_trace_callback: A function of three arguments
             *(what, stage, ir)*, where *what* identifies the object
@@ -406,7 +470,9 @@ def __init__(
                  "to reduce device allocations)", stacklevel=2)
 
         super().__init__(queue, allocator,
-                compile_trace_callback=compile_trace_callback)
+                compile_trace_callback=compile_trace_callback,
+                use_axis_tag_inference_fallback=use_axis_tag_inference_fallback,
+                use_einsum_inference_fallback=use_einsum_inference_fallback)
 
         self.mpi_communicator = mpi_communicator
         self.mpi_base_tag = mpi_base_tag
@@ -421,7 +487,9 @@ def clone(self):
         # pylint: disable=no-member
         return type(self)(self.mpi_communicator, self.queue,
                 mpi_base_tag=self.mpi_base_tag,
-                allocator=self.allocator)
+                allocator=self.allocator,
+                use_axis_tag_inference_fallback=self.use_axis_tag_inference_fallback,
+                use_einsum_inference_fallback=self.use_einsum_inference_fallback)
 
 # }}}
 
@@ -646,5 +714,55 @@ def get_reasonable_array_context_class(
 
 # }}}
 
+# {{{ distributed + numpy
+
+try:
+    from arraycontext import NumpyArrayContext
+
+    class MPINumpyArrayContext(NumpyArrayContext, MPIBasedArrayContext):
+        """An array context for using distributed computation with :mod:`numpy`
+        eager evaluation.
+        .. autofunction:: __init__
+        """
+
+        def __init__(self, mpi_communicator) -> None:
+            super().__init__()
+            self.mpi_communicator = mpi_communicator
+
+        def clone(self):
+            return type(self)(self.mpi_communicator)
+
+except ImportError:
+    print("Failed to import numpy array context.")
+    pass
+
+# }}}
+
+
+# {{{ tensor product-specific machinery
+
+class OutputIsTensorProductDOFArrayOrdered(Tag):
+    """Signify that the strides will not be of order "C" or "F".
+
+    The strides for the arrays containing tensor product element data are of the
+    form (slow, fastest, faster, fast). These strides are not "C" or "F" order.
+    Hence, this specialized array context takes care of specifying the
+    particular strides required.
+    """
+    pass
+
+
+class MassMatrix1d(Tag):
+    """Used in DAG transformation to realize algebraic simplification of 1D
+    inverse mass operator times mass operator.
+    """
+    pass
+
+class InverseMassMatrix1d(Tag):
+    """See MassMatrix1d.
+    """
+
+# }}}
+
 
 # vim: foldmethod=marker
diff --git a/grudge/discretization.py b/grudge/discretization.py
index 4c5e36f7a..fa60caa8a 100644
--- a/grudge/discretization.py
+++ b/grudge/discretization.py
@@ -6,6 +6,8 @@
 .. autofunction:: make_discretization_collection
 
 .. currentmodule:: grudge.discretization
+.. autoclass:: PartID
+.. autofunction:: filter_part_boundaries
 """
 
 __copyright__ = """
@@ -34,7 +36,11 @@
 """
 
 from collections.abc import Mapping
-from typing import TYPE_CHECKING, Any, Optional
+from typing import (
+    TYPE_CHECKING, Any, List, Mapping, Optional, Sequence, Tuple, Union
+)
+from meshmode.discretization.poly_element import (
+    InterpolatoryEdgeClusteredGroupFactory, ModalGroupFactory)
 from warnings import warn
 
 import numpy as np
@@ -47,14 +53,12 @@
     DiscretizationConnection,
     make_face_restriction,
 )
-from meshmode.discretization.poly_element import (
-    InterpolatoryEdgeClusteredGroupFactory,
-    ModalGroupFactory,
-)
 from meshmode.dof_array import DOFArray
 from meshmode.mesh import BTAG_PARTITION, Mesh
 from pytools import memoize_method, single_valued
 
+from dataclasses import dataclass, replace
+
 from grudge.dof_desc import (
     DD_VOLUME_ALL,
     DISCR_TAG_BASE,
@@ -75,6 +79,74 @@
     import mpi4py.MPI
 
 
+@dataclass(frozen=True)
+class PartID:
+    """Unique identifier for a piece of a partitioned mesh.
+
+    .. attribute:: volume_tag
+
+        The volume of the part.
+
+    .. attribute:: rank
+
+        The (optional) MPI rank of the part.
+
+    """
+    volume_tag: VolumeTag
+    rank: Optional[int] = None
+
+
+# {{{ part ID normalization
+
+def _normalize_mesh_part_ids(
+        mesh: Mesh,
+        self_volume_tag: VolumeTag,
+        all_volume_tags: Sequence[VolumeTag],
+        mpi_communicator: Optional["mpi4py.MPI.Intracomm"] = None):
+    """Convert a mesh's configuration-dependent "part ID" into a fixed type."""
+    from numbers import Integral
+    if mpi_communicator is not None:
+        # Accept PartID or rank (assume intra-volume for the latter)
+        def as_part_id(mesh_part_id):
+            if isinstance(mesh_part_id, PartID):
+                return mesh_part_id
+            elif isinstance(mesh_part_id, Integral):
+                return PartID(self_volume_tag, int(mesh_part_id))
+            else:
+                raise TypeError(f"Unable to convert {mesh_part_id} to PartID.")
+    else:
+        # Accept PartID or volume tag
+        def as_part_id(mesh_part_id):
+            if isinstance(mesh_part_id, PartID):
+                return mesh_part_id
+            elif mesh_part_id in all_volume_tags:
+                return PartID(mesh_part_id)
+            else:
+                raise TypeError(f"Unable to convert {mesh_part_id} to PartID.")
+
+    facial_adjacency_groups = mesh.facial_adjacency_groups
+
+    new_facial_adjacency_groups = []
+
+    from meshmode.mesh import InterPartAdjacencyGroup
+    for grp_list in facial_adjacency_groups:
+        new_grp_list = []
+        for fagrp in grp_list:
+            if isinstance(fagrp, InterPartAdjacencyGroup):
+                part_id = as_part_id(fagrp.part_id)
+                new_fagrp = replace(
+                    fagrp,
+                    boundary_tag=BTAG_PARTITION(part_id),
+                    part_id=part_id)
+            else:
+                new_fagrp = fagrp
+            new_grp_list.append(new_fagrp)
+        new_facial_adjacency_groups.append(new_grp_list)
+
+    return mesh.copy(facial_adjacency_groups=new_facial_adjacency_groups)
+
+# }}}
+
 MeshOrDiscr = Mesh | Discretization
 TagToElementGroupFactory = Mapping[DiscretizationTag, ElementGroupFactory]
 
@@ -155,6 +227,9 @@ def __init__(self, array_context: ArrayContext,
             order: int | None = None,
             discr_tag_to_group_factory: TagToElementGroupFactory | None = None,
             mpi_communicator: Optional["mpi4py.MPI.Intracomm"] = None,
+            inter_part_connections: Optional[
+                Mapping[Tuple[PartID, PartID],
+                    DiscretizationConnection]] = None,
             ) -> None:
         """
         :arg discr_tag_to_group_factory: A mapping from discretization tags
@@ -205,6 +280,9 @@ def __init__(self, array_context: ArrayContext,
 
             mesh = volume_discrs
 
+            mesh = _normalize_mesh_part_ids(
+                mesh, VTAG_ALL, [VTAG_ALL], mpi_communicator=mpi_communicator)
+
             discr_tag_to_group_factory = _normalize_discr_tag_to_group_factory(
                     dim=mesh.dim,
                     discr_tag_to_group_factory=discr_tag_to_group_factory,
@@ -218,17 +296,32 @@ def __init__(self, array_context: ArrayContext,
 
             del mesh
 
+            if inter_part_connections is not None:
+                raise TypeError("may not pass inter_part_connections when "
+                        "DiscretizationCollection constructor is called in "
+                        "legacy mode")
+
+            self._inter_part_connections = \
+                    _set_up_inter_part_connections(
+                            array_context=self._setup_actx,
+                            mpi_communicator=mpi_communicator,
+                            volume_discrs=volume_discrs,
+                            base_group_factory=(
+                                discr_tag_to_group_factory[DISCR_TAG_BASE]))
+
             # }}}
         else:
             assert discr_tag_to_group_factory is not None
             self._discr_tag_to_group_factory = discr_tag_to_group_factory
 
-        self._volume_discrs = volume_discrs
+            if inter_part_connections is None:
+                raise TypeError("inter_part_connections must be passed when "
+                        "DiscretizationCollection constructor is called in "
+                        "'modern' mode")
 
-        self._dist_boundary_connections = {
-            vtag: self._set_up_distributed_communication(
-                        vtag, mpi_communicator, array_context)
-            for vtag in self._volume_discrs.keys()}
+            self._inter_part_connections = inter_part_connections
+
+        self._volume_discrs = volume_discrs
 
     # }}}
 
@@ -784,6 +877,128 @@ def normal(self, dd):
     # }}}
 
 
+# {{{ distributed/multi-volume setup
+
+def _set_up_inter_part_connections(
+        array_context: ArrayContext,
+        mpi_communicator: Optional["mpi4py.MPI.Intracomm"],
+        volume_discrs: Mapping[VolumeTag, Discretization],
+        base_group_factory: ElementGroupFactory,
+        ) -> Mapping[
+                Tuple[PartID, PartID],
+                DiscretizationConnection]:
+
+    from meshmode.distributed import (get_connected_parts,
+            make_remote_group_infos, InterRankBoundaryInfo,
+            MPIBoundaryCommSetupHelper)
+
+    rank = mpi_communicator.Get_rank() if mpi_communicator is not None else None
+
+    # Save boundary restrictions as they're created to avoid potentially creating
+    # them twice in the loop below
+    cached_part_bdry_restrictions: Mapping[
+        Tuple[PartID, PartID],
+        DiscretizationConnection] = {}
+
+    def get_part_bdry_restriction(self_part_id, other_part_id):
+        cached_result = cached_part_bdry_restrictions.get(
+            (self_part_id, other_part_id), None)
+        if cached_result is not None:
+            return cached_result
+        return cached_part_bdry_restrictions.setdefault(
+            (self_part_id, other_part_id),
+            make_face_restriction(
+                array_context, volume_discrs[self_part_id.volume_tag],
+                base_group_factory,
+                boundary_tag=BTAG_PARTITION(other_part_id)))
+
+    inter_part_conns: Mapping[
+            Tuple[PartID, PartID],
+            DiscretizationConnection] = {}
+
+    irbis = []
+
+    for vtag, volume_discr in volume_discrs.items():
+        part_id = PartID(vtag, rank)
+        connected_part_ids = get_connected_parts(volume_discr.mesh)
+        for connected_part_id in connected_part_ids:
+            bdry_restr = get_part_bdry_restriction(
+                self_part_id=part_id, other_part_id=connected_part_id)
+
+            if connected_part_id.rank == rank:
+                # {{{ rank-local interface between multiple volumes
+
+                connected_bdry_restr = get_part_bdry_restriction(
+                    self_part_id=connected_part_id, other_part_id=part_id)
+
+                from meshmode.discretization.connection import \
+                        make_partition_connection
+                inter_part_conns[connected_part_id, part_id] = \
+                    make_partition_connection(
+                        array_context,
+                        local_bdry_conn=bdry_restr,
+                        remote_bdry_discr=connected_bdry_restr.to_discr,
+                        remote_group_infos=make_remote_group_infos(
+                            array_context, part_id, connected_bdry_restr))
+
+                # }}}
+            else:
+                # {{{ cross-rank interface
+
+                if mpi_communicator is None:
+                    raise RuntimeError("must supply an MPI communicator "
+                        "when using a distributed mesh")
+
+                irbis.append(
+                        InterRankBoundaryInfo(
+                            local_part_id=part_id,
+                            remote_part_id=connected_part_id,
+                            remote_rank=connected_part_id.rank,
+                            local_boundary_connection=bdry_restr))
+
+                # }}}
+
+    if irbis:
+        assert mpi_communicator is not None
+
+        with MPIBoundaryCommSetupHelper(mpi_communicator, array_context,
+                irbis, base_group_factory) as bdry_setup_helper:
+            while True:
+                conns = bdry_setup_helper.complete_some()
+                if not conns:
+                    # We're done.
+                    break
+
+                inter_part_conns.update(conns)
+
+    return inter_part_conns
+
+# }}}
+
+
+# {{{ modal group factory
+
+def _generate_modal_group_factory(nodal_group_factory):
+    from meshmode.discretization.poly_element import (
+        ModalSimplexGroupFactory,
+        ModalTensorProductGroupFactory
+    )
+    from meshmode.mesh import SimplexElementGroup, TensorProductElementGroup
+
+    order = nodal_group_factory.order
+    mesh_group_cls = nodal_group_factory.mesh_group_class
+
+    if mesh_group_cls is SimplexElementGroup:
+        return ModalSimplexGroupFactory(order=order)
+    elif mesh_group_cls is TensorProductElementGroup:
+        return ModalTensorProductGroupFactory(order=order)
+    else:
+        raise ValueError(
+            f"Unknown mesh element group: {mesh_group_cls}"
+        )
+
+# }}}
+
 # {{{ make_discretization_collection
 
 
@@ -843,6 +1058,8 @@ def make_discretization_collection(
 
     del order
 
+    mpi_communicator = getattr(array_context, "mpi_communicator", None)
+
     if any(
             isinstance(mesh_or_discr, Discretization)
             for mesh_or_discr in volumes.values()):
@@ -851,14 +1068,60 @@ def make_discretization_collection(
     volume_discrs = {
         vtag: Discretization(
             array_context,
-            mesh,
+            _normalize_mesh_part_ids(
+                mesh, vtag, volumes.keys(), mpi_communicator=mpi_communicator),
             discr_tag_to_group_factory[DISCR_TAG_BASE])
         for vtag, mesh in volumes.items()}
 
     return DiscretizationCollection(
             array_context=array_context,
             volume_discrs=volume_discrs,
-            discr_tag_to_group_factory=discr_tag_to_group_factory)
+            discr_tag_to_group_factory=discr_tag_to_group_factory,
+            inter_part_connections=_set_up_inter_part_connections(
+                array_context=array_context,
+                mpi_communicator=mpi_communicator,
+                volume_discrs=volume_discrs,
+                base_group_factory=discr_tag_to_group_factory[DISCR_TAG_BASE]))
+
+# }}}
+
+
+# {{{ filter_part_boundaries
+
+def filter_part_boundaries(
+        dcoll: DiscretizationCollection,
+        *,
+        volume_dd: DOFDesc = DD_VOLUME_ALL,
+        neighbor_volume_dd: Optional[DOFDesc] = None,
+        neighbor_rank: Optional[int] = None) -> List[DOFDesc]:
+    """
+    Retrieve tags of part boundaries that match *neighbor_volume_dd* and/or
+    *neighbor_rank*.
+    """
+    vol_mesh = dcoll.discr_from_dd(volume_dd).mesh
+
+    from meshmode.mesh import InterPartAdjacencyGroup
+    filtered_part_bdry_dds = [
+        volume_dd.trace(fagrp.boundary_tag)
+        for fagrp_list in vol_mesh.facial_adjacency_groups
+        for fagrp in fagrp_list
+        if isinstance(fagrp, InterPartAdjacencyGroup)]
+
+    if neighbor_volume_dd is not None:
+        filtered_part_bdry_dds = [
+            bdry_dd
+            for bdry_dd in filtered_part_bdry_dds
+            if (
+                bdry_dd.domain_tag.tag.part_id.volume_tag
+                == neighbor_volume_dd.domain_tag.tag)]
+
+    if neighbor_rank is not None:
+        filtered_part_bdry_dds = [
+            bdry_dd
+            for bdry_dd in filtered_part_bdry_dds
+            if bdry_dd.domain_tag.tag.part_id.rank == neighbor_rank]
+
+    return filtered_part_bdry_dds
 
 # }}}
 
diff --git a/grudge/dt_utils.py b/grudge/dt_utils.py
index 36aac52bf..ace1232f3 100644
--- a/grudge/dt_utils.py
+++ b/grudge/dt_utils.py
@@ -236,20 +236,35 @@ def h_min_from_volume(
 def dt_geometric_factors(
         dcoll: DiscretizationCollection, dd: DOFDesc | None = None) -> DOFArray:
     r"""Computes a geometric scaling factor for each cell following
-    [Hesthaven_2008]_, section 6.4, defined as the inradius (radius of an
-    inscribed circle/sphere).
+    [Hesthaven_2008]_, section 6.4, For simplicial elemenents, this factor is
+    defined as the inradius (radius of an inscribed circle/sphere). For
+    non-simplicial elements, a mean length measure is returned.
 
-    Specifically, the inradius for each element is computed using the following
-    formula from [Shewchuk_2002]_, Table 1, for simplicial cells
-    (triangles/tetrahedra):
+    Specifically, the inradius for each simplicial element is computed using the
+    following formula from [Shewchuk_2002]_, Table 1 (triangles, tetrahedra):
 
     .. math::
 
-        r_D = \frac{d V}{\sum_{i=1}^{N_{faces}} F_i},
+        r_D = \frac{d~V}{\sum_{i=1}^{N_{faces}} F_i},
 
     where :math:`d` is the topological dimension, :math:`V` is the cell volume,
     and :math:`F_i` are the areas of each face of the cell.
 
+    For non-simplicial elements, we use the following formula for a mean
+    cell size measure:
+
+    .. math::
+
+        r_D = \frac{2~d~V}{\sum_{i=1}^{N_{faces}} F_i},
+
+    where :math:`d` is the topological dimension, :math:`V` is the cell volume,
+    and :math:`F_i` are the areas of each face of the cell. Other valid choices
+    here include the shortest, longest, average of the cell diagonals, or edges.
+    The value returned by this routine (i.e. the cell volume divided by the
+    average cell face area) is bounded by the extrema of the cell edge lengths,
+    is straightforward to calculate regardless of element shape, and jibes well
+    with the foregoing calculation for simplicial elements.
+
     :arg dd: a :class:`~grudge.dof_desc.DOFDesc`, or a value convertible to one.
         Defaults to the base volume discretization if not provided.
     :returns: a frozen :class:`~meshmode.dof_array.DOFArray` containing the
@@ -263,11 +278,11 @@ def dt_geometric_factors(
     actx = dcoll._setup_actx
     volm_discr = dcoll.discr_from_dd(dd)
 
-    if any(not isinstance(grp, SimplexElementGroupBase)
-           for grp in volm_discr.groups):
-        raise NotImplementedError(
-            "Geometric factors are only implemented for simplex element groups"
-        )
+    # assumes !simplex = tpe
+    tpe = any(not isinstance(grp, SimplexElementGroupBase)
+              for grp in volm_discr.groups)
+
+    r_fac = 0.5 if tpe else dcoll.dim
 
     if volm_discr.dim != volm_discr.ambient_dim:
         from warnings import warn
@@ -296,50 +311,78 @@ def dt_geometric_factors(
         )
     )
 
-    if actx.supports_nonscalar_broadcasting:
-        # Compute total surface area of an element by summing over the
-        # individual face areas
-        surface_areas = DOFArray(
-            actx,
-            data=tuple(
-                actx.einsum(
-                    "fej->e",
-                    tag_axes(actx, {
-                        0: DiscretizationFaceAxisTag(),
-                        1: DiscretizationElementAxisTag(),
-                        2: DiscretizationDOFAxisTag()
+    if tpe:
+        if actx.supports_nonscalar_broadcasting:
+            surface_areas = DOFArray(
+                actx,
+                data=tuple(
+                    actx.np.max(
+                        tag_axes(actx, {
+                            0: DiscretizationFaceAxisTag(),
+                            1: DiscretizationElementAxisTag(),
+                        },
+                                 face_ae_i.reshape(
+                                     vgrp.mesh_el_group.nfaces,
+                                     vgrp.nelements)),
+                        axis=0)
+                    for vgrp, face_ae_i in zip(volm_discr.groups, face_areas)))
+        else:
+            el_data_per_group = []
+            for igrp, group in enumerate(volm_discr.mesh.groups):
+                nelements = group.nelements
+                nfaces = group.nfaces
+                el_face_data = face_areas[igrp].reshape(nfaces, nelements,
+                                                       face_areas[igrp].shape[1])
+                el_data_np = np.ascontiguousarray(
+                    np.max(actx.to_numpy(el_face_data), axis=0)[:, 0:1])
+                el_data = actx.from_numpy(el_data_np)
+                el_data = el_data.reshape(nelements)
+                el_data_per_group.append(el_data)
+            surface_areas = DOFArray(actx, tuple(el_data_per_group))
+    else:
+        if actx.supports_nonscalar_broadcasting:
+            # Compute total surface area of an element by summing over the
+            # individual face areas
+            surface_areas = DOFArray(
+                actx,
+                data=tuple(
+                    actx.einsum(
+                        "fej->e",
+                        tag_axes(actx, {
+                            0: DiscretizationFaceAxisTag(),
+                            1: DiscretizationElementAxisTag(),
+                            2: DiscretizationDOFAxisTag()
                         },
+                                face_ae_i.reshape(
+                                    vgrp.mesh_el_group.nfaces,
+                                    vgrp.nelements,
+                                    face_ae_i.shape[-1])),
+                        tagged=(FirstAxisIsElementsTag(),))
+                    for vgrp, face_ae_i in zip(volm_discr.groups, face_areas)))
+        else:
+            surface_areas = DOFArray(
+                actx,
+                data=tuple(
+                    # NOTE: Whenever the array context can't perform nonscalar
+                    # broadcasting, elementwise reductions
+                    # (like `elementwise_integral`) repeat the *same* scalar value of
+                    # the reduction at each degree of freedom. To get a single
+                    # value for the face area (per face),
+                    # we simply average over the nodes, which gives the desired result.
+                    actx.einsum(
+                    "fej->e",
                         face_ae_i.reshape(
                             vgrp.mesh_el_group.nfaces,
                             vgrp.nelements,
-                            face_ae_i.shape[-1])),
-                    tagged=(FirstAxisIsElementsTag(),))
-
-                for vgrp, face_ae_i in zip(volm_discr.groups, face_areas, strict=True)))
-    else:
-        surface_areas = DOFArray(
-            actx,
-            data=tuple(
-                # NOTE: Whenever the array context can't perform nonscalar
-                # broadcasting, elementwise reductions
-                # (like `elementwise_integral`) repeat the *same* scalar value of
-                # the reduction at each degree of freedom. To get a single
-                # value for the face area (per face),
-                # we simply average over the nodes, which gives the desired result.
-                actx.einsum(
-                    "fej->e",
-                    face_ae_i.reshape(
-                        vgrp.mesh_el_group.nfaces,
-                        vgrp.nelements,
-                        face_ae_i.shape[-1]
-                    ) / afgrp.nunit_dofs,
+                            face_ae_i.shape[-1]
+                        ) / afgrp.nunit_dofs,
                     tagged=(FirstAxisIsElementsTag(),))
 
-                for vgrp, afgrp, face_ae_i in zip(volm_discr.groups,
-                                                  face_discr.groups,
-                                                  face_areas, strict=True)
+                    for vgrp, afgrp, face_ae_i in zip(volm_discr.groups,
+                                                      face_discr.groups,
+                                                      face_areas)
+                )
             )
-        )
 
     return actx.freeze(
             actx.tag(NameHint(f"dt_geometric_{dd.as_identifier()}"),
@@ -349,9 +392,8 @@ def dt_geometric_factors(
                             "e,ei->ei",
                             1/sae_i,
                             actx.tag_axis(1, DiscretizationDOFAxisTag(), cv_i),
-                            tagged=(FirstAxisIsElementsTag(),)) * dcoll.dim
-                        for cv_i, sae_i in zip(cell_vols, surface_areas,
-                                               strict=True)))))
+                            tagged=(FirstAxisIsElementsTag(),)) * r_fac
+                        for cv_i, sae_i in zip(cell_vols, surface_areas)))))
 
 # }}}
 
diff --git a/grudge/eager.py b/grudge/eager.py
index 626e15592..08cf08f2a 100644
--- a/grudge/eager.py
+++ b/grudge/eager.py
@@ -87,7 +87,8 @@ def nodal_max(self, dd, vec):
         return op.nodal_max(self, dd, vec)
 
 
-connected_ranks = op.connected_ranks
+# FIXME: Deprecate connected_ranks instead of removing
+connected_parts = op.connected_parts
 interior_trace_pair = op.interior_trace_pair
 cross_rank_trace_pairs = op.cross_rank_trace_pairs
 
diff --git a/grudge/flux_differencing.py b/grudge/flux_differencing.py
new file mode 100644
index 000000000..68f983fd5
--- /dev/null
+++ b/grudge/flux_differencing.py
@@ -0,0 +1,271 @@
+"""Grudge module for flux-differencing in entropy-stable DG methods
+
+Flux-differencing
+-----------------
+
+.. autofunction:: volume_flux_differencing
+"""
+
+__copyright__ = """
+Copyright (C) 2021 University of Illinois Board of Trustees
+"""
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+from arraycontext import (
+    ArrayContext,
+    map_array_container,
+    freeze
+)
+from arraycontext import ArrayOrContainer
+
+from functools import partial
+
+from meshmode.transform_metadata import FirstAxisIsElementsTag
+from meshmode.dof_array import DOFArray
+
+from grudge.discretization import DiscretizationCollection
+from grudge.dof_desc import DOFDesc
+import modepy as mp
+from pytools import memoize_in, keyed_memoize_in
+
+import numpy as np
+
+
+def _reference_skew_symmetric_hybridized_sbp_operators(
+        actx: ArrayContext,
+        base_element_group,
+        vol_quad_element_group,
+        face_quad_element_group, dtype):
+    @keyed_memoize_in(
+        actx, _reference_skew_symmetric_hybridized_sbp_operators,
+        lambda base_grp, quad_vol_grp, face_quad_grp: (
+            base_grp.discretization_key(),
+            quad_vol_grp.discretization_key(),
+            face_quad_grp.discretization_key()))
+    def get_reference_skew_symetric_hybridized_diff_mats(
+            base_grp, quad_vol_grp, face_quad_grp):
+        from modepy import faces_for_shape, face_normal
+        from grudge.interpolation import (
+            volume_quadrature_interpolation_matrix,
+            surface_quadrature_interpolation_matrix
+        )
+        from grudge.op import reference_inverse_mass_matrix
+
+        # {{{ Volume operators
+
+        weights = quad_vol_grp.quadrature_rule().weights
+        vdm_q = actx.to_numpy(
+            volume_quadrature_interpolation_matrix(actx, base_grp, quad_vol_grp))
+        inv_mass_mat = actx.to_numpy(
+            reference_inverse_mass_matrix(actx, base_grp))
+        p_mat = inv_mass_mat @ (vdm_q.T * weights)
+
+        # }}}
+
+        # {{{ Surface operators
+
+        faces = faces_for_shape(base_grp.shape)
+        nfaces = len(faces)
+        # NOTE: assumes same quadrature rule on all faces
+        face_weights = np.tile(face_quad_grp.quadrature_rule().weights, nfaces)
+        face_normals = [face_normal(face) for face in faces]
+        nnods_per_face = face_quad_grp.nunit_dofs
+        e = np.ones(shape=(nnods_per_face,))
+        nrstj = [
+            # nsrtJ = nhat * Jhatf, where nhat is the reference normal
+            # and Jhatf is the Jacobian det. of the transformation from
+            # the face of the reference element to the reference face.
+            np.concatenate([np.sign(nhat[idx])*e for nhat in face_normals])
+            for idx in range(base_grp.dim)
+        ]
+        b_mats = [np.diag(face_weights*nrstj[d]) for d in range(base_grp.dim)]
+        vf_mat = actx.to_numpy(
+            surface_quadrature_interpolation_matrix(
+                actx,
+                base_element_group=base_grp,
+                face_quad_element_group=face_quad_grp))
+        zero_mat = np.zeros((nfaces*nnods_per_face, nfaces*nnods_per_face),
+                            dtype=dtype)
+
+        # }}}
+
+        # {{{ Hybridized (volume + surface) operators
+
+        q_mats = [p_mat.T @ (weights * vdm_q.T @ vdm_q) @ diff_mat @ p_mat
+                  for diff_mat in mp.diff_matrices(base_grp.basis_obj(),
+                                                   base_grp.unit_nodes)]
+        e_mat = vf_mat @ p_mat
+        q_skew_hybridized = np.asarray(
+            [
+                np.block(
+                    [[q_mats[d] - q_mats[d].T, e_mat.T @ b_mats[d]],
+                    [-b_mats[d] @ e_mat, zero_mat]]
+                ) for d in range(base_grp.dim)
+            ],
+            order="C"
+        )
+
+        # }}}
+
+        return actx.freeze(actx.from_numpy(q_skew_hybridized))
+
+    return get_reference_skew_symetric_hybridized_diff_mats(
+        base_element_group,
+        vol_quad_element_group,
+        face_quad_element_group
+    )
+
+
+def _single_axis_hybridized_derivative_kernel(
+        dcoll, dd_quad, dd_face_quad, xyz_axis, flux_matrix):
+    if not dcoll._has_affine_groups(dd_quad.domain_tag):
+        raise NotImplementedError("Not implemented for non-affine elements yet.")
+
+    if not isinstance(flux_matrix, DOFArray):
+        return map_array_container(
+            partial(_single_axis_hybridized_derivative_kernel,
+                    dcoll, dd_quad, dd_face_quad, xyz_axis),
+            flux_matrix
+        )
+
+    from grudge.dof_desc import DISCR_TAG_BASE
+    dd_vol = dd_quad.with_discr_tag(DISCR_TAG_BASE)
+
+    from grudge.geometry import \
+        area_element, inverse_surface_metric_derivative
+    from grudge.interpolation import (
+        volume_and_surface_interpolation_matrix,
+        volume_and_surface_quadrature_interpolation
+    )
+
+    actx = flux_matrix.array_context
+
+    # FIXME: This is kinda meh
+    def inverse_jac_matrix():
+        @memoize_in(
+            dcoll,
+            (_single_axis_hybridized_derivative_kernel, dd_quad, dd_face_quad))
+        def _inv_surf_metric_deriv():
+            return freeze(
+                actx.np.stack(
+                    [
+                        actx.np.stack(
+                            [
+                                volume_and_surface_quadrature_interpolation(
+                                    dcoll, dd_quad, dd_face_quad,
+                                    area_element(actx, dcoll, dd=dd_vol)
+                                    * inverse_surface_metric_derivative(
+                                        actx, dcoll,
+                                        rst_ax, xyz_axis, dd=dd_vol
+                                    )
+                                ) for rst_ax in range(dcoll.dim)
+                            ]
+                        ) for xyz_axis in range(dcoll.ambient_dim)
+                    ]
+                ),
+                actx
+            )
+        return _inv_surf_metric_deriv()
+
+    return DOFArray(
+        actx,
+        data=tuple(
+            # r for rst axis
+            actx.einsum("ik,rej,rij,eij->ek",
+                        volume_and_surface_interpolation_matrix(
+                            actx,
+                            base_element_group=bgrp,
+                            vol_quad_element_group=qvgrp,
+                            face_quad_element_group=qafgrp
+                        ),
+                        ijm_i[xyz_axis],
+                        _reference_skew_symmetric_hybridized_sbp_operators(
+                            actx,
+                            bgrp,
+                            qvgrp,
+                            qafgrp,
+                            fmat_i.dtype
+                        ),
+                        fmat_i,
+                        arg_names=("Vh_mat_t", "inv_jac_t", "Q_mat", "F_mat"),
+                        tagged=(FirstAxisIsElementsTag(),))
+
+            for bgrp, qvgrp, qafgrp, fmat_i, ijm_i in zip(
+                    dcoll.discr_from_dd(dd_vol).groups,
+                    dcoll.discr_from_dd(dd_quad).groups,
+                    dcoll.discr_from_dd(dd_face_quad).groups,
+                    flux_matrix,
+                    inverse_jac_matrix()
+            )
+        )
+    )
+
+
+def volume_flux_differencing(
+        dcoll: DiscretizationCollection,
+        dd_quad: DOFDesc,
+        dd_face_quad: DOFDesc,
+        flux_matrices: ArrayOrContainer) -> ArrayOrContainer:
+    r"""Computes the volume contribution of the DG divergence operator using
+    flux-differencing:
+
+    .. math::
+
+       \mathrm{VOL} = \sum_{i=1}^{d}
+        \begin{bmatrix}
+            \mathbf{V}_q \\ \mathbf{V}_f
+        \end{bmatrix}^T
+        \left(
+            \left( \mathbf{Q}_{i} - \mathbf{Q}^T_{i} \right)
+            \circ \mathbf{F}_{i}
+        \right)\mathbf{1}
+
+    where :math:`\circ` denotes the
+    `Hadamard product <https://en.wikipedia.org/wiki/Hadamard_product_(matrices)>`__,
+    :math:`\mathbf{F}_{i}` are matrices whose entries are computed
+    as the evaluation of an entropy-conserving two-point flux function
+    (e.g. :func:`grudge.models.euler.divergence_flux_chandrashekar`)
+    and :math:`\mathbf{Q}_{i} - \mathbf{Q}^T_{i}` are the skew-symmetric
+    hybridized differentiation operators defined in (15) of
+    `this paper <https://arxiv.org/pdf/1902.01828.pdf>`__.
+
+    :arg flux_matrices: a :class:`~meshmode.dof_array.DOFArray` or an
+        :class:`~arraycontext.container.ArrayContainer` of them containing
+        evaluations of two-point flux.
+    :returns: a :class:`~meshmode.dof_array.DOFArray` or an
+        :class:`~arraycontext.container.ArrayContainer` of them.
+    """
+
+    def _hybridized_div(fmats):
+        return sum(_single_axis_hybridized_derivative_kernel(
+            dcoll, dd_quad, dd_face_quad, i, fmat_i)
+            for i, fmat_i in enumerate(fmats))
+
+    from grudge.tools import rec_map_subarrays
+    return rec_map_subarrays(
+        _hybridized_div,
+        (dcoll.ambient_dim,), (),
+        flux_matrices, scalar_cls=DOFArray)
+
+
+# vim: foldmethod=marker
diff --git a/grudge/geometry/metrics.py b/grudge/geometry/metrics.py
index b36e3ccd1..7c322a502 100644
--- a/grudge/geometry/metrics.py
+++ b/grudge/geometry/metrics.py
@@ -66,6 +66,8 @@
 from meshmode.transform_metadata import (
     DiscretizationAmbientDimAxisTag,
     DiscretizationTopologicalDimAxisTag,
+    DiscretizationDOFAxisTag,
+    DiscretizationElementAxisTag,
 )
 from pymbolic.geometric_algebra import MultiVector
 from pytools import memoize_in
@@ -79,6 +81,11 @@
 register_multivector_as_array_container()
 
 
+def _has_geoderiv_connection(grp):
+    from modepy.shapes import Simplex
+    return grp.is_affine and issubclass(grp._modepy_shape_cls, Simplex)
+
+
 def _geometry_to_quad_if_requested(
         dcoll, inner_dd, dd, vec, _use_geoderiv_connection):
 
@@ -98,7 +105,7 @@ def to_quad(vec):
     return DOFArray(
             vec.array_context,
             tuple(
-                geoderiv_vec_i if megrp.is_affine else all_quad_vec_i
+                geoderiv_vec_i if _has_geoderiv_connection(megrp) else all_quad_vec_i
                 for megrp, geoderiv_vec_i, all_quad_vec_i in zip(
                     dcoll.discr_from_dd(inner_dd).mesh.groups,
                     dcoll._base_to_geoderiv_connection(inner_dd)(vec),
@@ -653,11 +660,11 @@ def area_element(
 
     @memoize_in(dcoll, (area_element, dd, _use_geoderiv_connection))
     def _area_elements():
-        result = actx.np.sqrt(
-            pseudoscalar(
-                actx, dcoll, dd=dd,
-                _use_geoderiv_connection=_use_geoderiv_connection).norm_squared())
-
+        res = pseudoscalar(
+            actx, dcoll, dd=dd, _use_geoderiv_connection=_use_geoderiv_connection
+        ).norm_squared()
+        result = actx.np.sqrt(tag_axes(actx, {0: DiscretizationElementAxisTag(),
+                                              1: DiscretizationDOFAxisTag()}, res))
         return actx.freeze(
                 actx.tag(NameHint(f"area_el_{dd.as_identifier()}"), result))
 
diff --git a/grudge/interpolation.py b/grudge/interpolation.py
index 61bdf1a13..a2b9d1f77 100644
--- a/grudge/interpolation.py
+++ b/grudge/interpolation.py
@@ -32,7 +32,24 @@
 """
 
 
+import numpy as np
+
+from arraycontext import (
+    ArrayContext,
+    map_array_container
+)
+from arraycontext import ArrayOrContainerT
+
+from functools import partial
+
+from meshmode.transform_metadata import FirstAxisIsElementsTag
+
 from grudge.discretization import DiscretizationCollection
+from grudge.dof_desc import DOFDesc
+
+from meshmode.dof_array import DOFArray
+
+from pytools import keyed_memoize_in
 
 
 # FIXME: Should revamp interp and make clear distinctions
@@ -46,3 +63,129 @@ def interp(dcoll: DiscretizationCollection, src, tgt, vec):
     from grudge.projection import project
 
     return project(dcoll, src, tgt, vec)
+
+
+# {{{ Interpolation matrices
+
+def volume_quadrature_interpolation_matrix(
+        actx: ArrayContext, base_element_group, vol_quad_element_group):
+    @keyed_memoize_in(
+        actx, volume_quadrature_interpolation_matrix,
+        lambda base_grp, vol_quad_grp: (base_grp.discretization_key(),
+                                        vol_quad_grp.discretization_key()))
+    def get_volume_vand(base_grp, vol_quad_grp):
+        from modepy import vandermonde
+
+        basis = base_grp.basis_obj()
+        vdm_inv = np.linalg.inv(vandermonde(basis.functions,
+                                            base_grp.unit_nodes))
+        vdm_q = vandermonde(basis.functions, vol_quad_grp.unit_nodes) @ vdm_inv
+        return actx.freeze(actx.from_numpy(vdm_q))
+
+    return get_volume_vand(base_element_group, vol_quad_element_group)
+
+
+def surface_quadrature_interpolation_matrix(
+        actx: ArrayContext, base_element_group, face_quad_element_group):
+    @keyed_memoize_in(
+        actx, surface_quadrature_interpolation_matrix,
+        lambda base_grp, face_quad_grp: (base_grp.discretization_key(),
+                                         face_quad_grp.discretization_key()))
+    def get_surface_vand(base_grp, face_quad_grp):
+        nfaces = base_grp.mesh_el_group.nfaces
+        assert face_quad_grp.nelements == nfaces * base_grp.nelements
+
+        from modepy import vandermonde, faces_for_shape
+
+        basis = base_grp.basis_obj()
+        vdm_inv = np.linalg.inv(vandermonde(basis.functions,
+                                            base_grp.unit_nodes))
+        faces = faces_for_shape(base_grp.shape)
+        # NOTE: Assumes same quadrature rule on each face
+        face_quadrature = face_quad_grp.quadrature_rule()
+
+        surface_nodes = faces[0].map_to_volume(face_quadrature.nodes)
+        for fidx in range(1, nfaces):
+            surface_nodes = np.append(
+                surface_nodes,
+                faces[fidx].map_to_volume(face_quadrature.nodes),
+                axis=1
+            )
+        vdm_f = vandermonde(basis.functions, surface_nodes) @ vdm_inv
+        return actx.freeze(actx.from_numpy(vdm_f))
+
+    return get_surface_vand(base_element_group, face_quad_element_group)
+
+
+def volume_and_surface_interpolation_matrix(
+        actx: ArrayContext,
+        base_element_group, vol_quad_element_group, face_quad_element_group):
+    @keyed_memoize_in(
+        actx, volume_and_surface_interpolation_matrix,
+        lambda base_grp, vol_quad_grp, face_quad_grp: (
+            base_grp.discretization_key(),
+            vol_quad_grp.discretization_key(),
+            face_quad_grp.discretization_key()))
+    def get_vol_surf_interpolation_matrix(base_grp, vol_quad_grp, face_quad_grp):
+        vq_mat = actx.to_numpy(
+            volume_quadrature_interpolation_matrix(
+                actx,
+                base_element_group=base_grp,
+                vol_quad_element_group=vol_quad_grp))
+        vf_mat = actx.to_numpy(
+            surface_quadrature_interpolation_matrix(
+                actx,
+                base_element_group=base_grp,
+                face_quad_element_group=face_quad_grp))
+        return actx.freeze(actx.from_numpy(np.block([[vq_mat], [vf_mat]])))
+
+    return get_vol_surf_interpolation_matrix(
+        base_element_group, vol_quad_element_group, face_quad_element_group
+    )
+
+# }}}
+
+
+def volume_and_surface_quadrature_interpolation(
+        dcoll: DiscretizationCollection,
+        dd_quad: DOFDesc,
+        dd_face_quad: DOFDesc,
+        vec: ArrayOrContainerT) -> ArrayOrContainerT:
+    """todo.
+    """
+    if not isinstance(vec, DOFArray):
+        return map_array_container(
+            partial(volume_and_surface_quadrature_interpolation,
+                    dcoll, dd_quad, dd_face_quad), vec
+        )
+
+    from grudge.dof_desc import DISCR_TAG_BASE
+    dd_vol = dd_quad.with_discr_tag(DISCR_TAG_BASE)
+    actx = vec.array_context
+    discr = dcoll.discr_from_dd(dd_vol)
+    quad_volm_discr = dcoll.discr_from_dd(dd_quad)
+    quad_face_discr = dcoll.discr_from_dd(dd_face_quad)
+
+    return DOFArray(
+        actx,
+        data=tuple(
+            actx.einsum("ij,ej->ei",
+                        volume_and_surface_interpolation_matrix(
+                            actx,
+                            base_element_group=bgrp,
+                            vol_quad_element_group=qvgrp,
+                            face_quad_element_group=qfgrp
+                        ),
+                        vec_i,
+                        arg_names=("Vh_mat", "vec"),
+                        tagged=(FirstAxisIsElementsTag(),))
+
+            for bgrp, qvgrp, qfgrp, vec_i in zip(
+                discr.groups,
+                quad_volm_discr.groups,
+                quad_face_discr.groups, vec)
+        )
+    )
+
+
+# vim: foldmethod=marker
diff --git a/grudge/models/euler.py b/grudge/models/euler.py
index ef55ad63e..d4e6e7129 100644
--- a/grudge/models/euler.py
+++ b/grudge/models/euler.py
@@ -4,6 +4,7 @@
 -----------------
 
 .. autoclass:: EulerOperator
+.. autoclass:: EntropyStableEulerOperator
 
 Predefined initial conditions
 -----------------------------
@@ -20,6 +21,9 @@
 
 .. autofunction:: euler_volume_flux
 .. autofunction:: euler_numerical_flux
+
+.. autofunction:: divergence_flux_chandrashekar
+.. autofunction:: entropy_stable_numerical_flux_chandrashekar
 """
 
 __copyright__ = """
@@ -53,6 +57,8 @@
     ArrayContext,
     dataclass_array_container,
     with_container_arithmetic,
+    map_array_container, thaw,
+    outer
 )
 from meshmode.dof_array import DOFArray
 from pytools.obj_array import make_obj_array
@@ -120,14 +126,13 @@ def vortex_initial_condition(
 
 # {{{ Variable transformation and helper routines
 
-def conservative_to_primitive_vars(cv_state: ConservedEulerField, gamma=1.4):
+def conservative_to_primitive_vars(cv_state: ConservedEulerField, gamma: float):
     """Converts from conserved variables (density, momentum, total energy)
     into primitive variables (density, velocity, pressure).
 
     :arg cv_state: A :class:`ConservedEulerField` containing the conserved
         variables.
-    :arg gamma: The isentropic expansion factor for a single-species gas
-        (default set to 1.4).
+    :arg gamma: The isentropic expansion factor.
     :returns: A :class:`Tuple` containing the primitive variables:
         (density, velocity, pressure).
     """
@@ -137,18 +142,75 @@ def conservative_to_primitive_vars(cv_state: ConservedEulerField, gamma=1.4):
     u = rho_u / rho
     p = (gamma - 1) * (rho_e - 0.5 * sum(rho_u * u))
 
-    return rho, u, p
+    return (rho, u, p)
 
 
-def compute_wavespeed(actx: ArrayContext, cv_state: ConservedEulerField, gamma=1.4):
-    """Computes the total translational wavespeed.
+def conservative_to_entropy_vars(cv_state: ConservedEulerField, gamma: float):
+    """Converts from conserved variables (density, momentum, total energy)
+    into entropy variables.
 
     :arg cv_state: A :class:`ConservedEulerField` containing the conserved
         variables.
     :arg gamma: The isentropic expansion factor for a single-species gas
         (default set to 1.4).
+    :returns: A :class:`ConservedEulerField` containing the entropy variables.
+    """
+    actx = cv_state.array_context
+    rho, u, p = conservative_to_primitive_vars(cv_state, gamma)
+
+    u_square = sum(v ** 2 for v in u)
+    s = actx.np.log(p) - gamma*actx.np.log(rho)
+    rho_p = rho / p
+
+    return ConservedEulerField(
+        mass=((gamma - s)/(gamma - 1)) - 0.5 * rho_p * u_square,
+        energy=-rho_p,
+        momentum=rho_p * u
+    )
+
+
+def entropy_to_conservative_vars(ev_state: ConservedEulerField, gamma: float):
+    """Converts from entropy variables into conserved variables
+    (density, momentum, total energy).
+
+    :arg ev_state: A :class:`ConservedEulerField` containing the entropy
+        variables.
+    :arg gamma: The isentropic expansion factor.
+    :returns: A :class:`ConservedEulerField` containing the conserved variables.
+    """
+    actx = ev_state.array_context
+    # See Hughes, Franca, Mallet (1986) A new finite element
+    # formulation for CFD: (DOI: 10.1016/0045-7825(86)90127-1)
+    inv_gamma_minus_one = 1/(gamma - 1)
+
+    # Convert to entropy `-rho * s` used by Hughes, France, Mallet (1986)
+    ev_state = ev_state * (gamma - 1)
+    v1 = ev_state.mass
+    v2t4 = ev_state.momentum
+    v5 = ev_state.energy
+
+    v_square = sum(v**2 for v in v2t4)
+    s = gamma - v1 + v_square/(2*v5)
+    rho_iota = (
+        ((gamma - 1) / (-v5)**gamma)**(inv_gamma_minus_one)
+    ) * actx.np.exp(-s * inv_gamma_minus_one)
+
+    return ConservedEulerField(
+        mass=-rho_iota * v5,
+        energy=rho_iota * (1 - v_square/(2*v5)),
+        momentum=rho_iota * v2t4
+    )
+
+
+def compute_wavespeed(cv_state: ConservedEulerField, gamma: float):
+    """Computes the total translational wavespeed.
+
+    :arg cv_state: A :class:`ConservedEulerField` containing the conserved
+        variables.
+    :arg gamma: The isentropic expansion factor.
     :returns: A :class:`~meshmode.dof_array.DOFArray` containing local wavespeeds.
     """
+    actx = cv_state.array_context
     rho, u, p = conservative_to_primitive_vars(cv_state, gamma=gamma)
 
     return actx.np.sqrt(np.dot(u, u)) + actx.np.sqrt(gamma * (p / rho))
@@ -169,7 +231,7 @@ def boundary_tpair(
             actx: ArrayContext,
             dcoll: DiscretizationCollection,
             dd_bc: DOFDesc,
-            state: ConservedEulerField, t=0):
+            restricted_state: ConservedEulerField, t=0):
         pass
 
 
@@ -180,14 +242,14 @@ def boundary_tpair(
             actx: ArrayContext,
             dcoll: DiscretizationCollection,
             dd_bc: DOFDesc,
-            state: ConservedEulerField, t=0):
-        actx = state.array_context
+            restricted_state: ConservedEulerField, t=0):
+        actx = restricted_state.array_context
         dd_base = as_dofdesc("vol", DISCR_TAG_BASE)
 
         return TracePair(
             dd_bc,
-            interior=op.project(dcoll, dd_base, dd_bc, state),
-            exterior=self.prescribed_state(actx.thaw(dcoll.nodes(dd_bc)), t=t)
+            interior=restricted_state,
+            exterior=self.prescribed_state(thaw(dcoll.nodes(dd_bc), actx), t=t)
         )
 
 
@@ -221,19 +283,17 @@ def boundary_tpair(
 # {{{ Euler operator
 
 def euler_volume_flux(
-        dcoll: DiscretizationCollection, cv_state: ConservedEulerField, gamma=1.4):
+        dcoll: DiscretizationCollection,
+        cv_state: ConservedEulerField, gamma: float):
     """Computes the (non-linear) volume flux for the
     Euler operator.
 
     :arg cv_state: A :class:`ConservedEulerField` containing the conserved
         variables.
-    :arg gamma: The isentropic expansion factor for a single-species gas
-        (default set to 1.4).
+    :arg gamma: The isentropic expansion factor.
     :returns: A :class:`ConservedEulerField` containing the volume fluxes.
     """
-    from arraycontext import outer
-
-    rho, u, p = conservative_to_primitive_vars(cv_state, gamma=gamma)
+    rho, u, p = conservative_to_primitive_vars(cv_state, gamma)
 
     return ConservedEulerField(
         mass=cv_state.momentum,
@@ -245,14 +305,13 @@ def euler_volume_flux(
 def euler_numerical_flux(
         actx: ArrayContext,
         dcoll: DiscretizationCollection, tpair: TracePair,
-        gamma=1.4, lf_stabilization=False):
+        gamma: float, dissipation=False):
     """Computes the interface numerical flux for the Euler operator.
 
     :arg tpair: A :class:`grudge.trace_pair.TracePair` containing the conserved
         variables on the interior and exterior sides of element facets.
-    :arg gamma: The isentropic expansion factor for a single-species gas
-        (default set to 1.4).
-    :arg lf_stabilization: A boolean denoting whether to apply Lax-Friedrichs
+    :arg gamma: The isentropic expansion factor.
+    :arg dissipation: A boolean denoting whether to apply Lax-Friedrichs
         dissipation.
     :returns: A :class:`ConservedEulerField` containing the interface fluxes.
     """
@@ -262,26 +321,26 @@ def euler_numerical_flux(
     dd_allfaces = dd_intfaces.with_domain_tag(
         BoundaryDomainTag(FACE_RESTR_ALL, VTAG_ALL)
         )
+
     q_ll = tpair.int
     q_rr = tpair.ext
 
     flux_tpair = TracePair(
         tpair.dd,
-        interior=euler_volume_flux(dcoll, q_ll, gamma=gamma),
-        exterior=euler_volume_flux(dcoll, q_rr, gamma=gamma)
+        interior=euler_volume_flux(dcoll, q_ll, gamma),
+        exterior=euler_volume_flux(dcoll, q_rr, gamma)
     )
     num_flux = flux_tpair.avg
-    normal = geo.normal(actx, dcoll, dd_intfaces)
-
-    if lf_stabilization:
-        from arraycontext import outer
+    normal = geo.normal(actx, dcoll, tpair.dd)
 
+    if dissipation:
         # Compute jump penalization parameter
-        lam = actx.np.maximum(compute_wavespeed(actx, q_ll, gamma=gamma),
-                              compute_wavespeed(actx, q_rr, gamma=gamma))
+
+        lam = actx.np.maximum(compute_wavespeed(q_ll, gamma),
+                              compute_wavespeed(q_rr, gamma))
         num_flux -= lam*outer(tpair.diff, normal)/2
 
-    return op.project(dcoll, dd_intfaces, dd_allfaces, num_flux @ normal)
+    return num_flux @ normal
 
 
 class EulerOperator(HyperbolicOperator):
@@ -310,7 +369,7 @@ def __init__(self, dcoll: DiscretizationCollection,
 
     def max_characteristic_velocity(self, actx: ArrayContext, **kwargs):
         state = kwargs["state"]
-        return compute_wavespeed(actx, state, gamma=self.gamma)
+        return compute_wavespeed(state, self.gamma)
 
     def operator(self, actx: ArrayContext, t, q):
         dcoll = self.dcoll
@@ -319,50 +378,294 @@ def operator(self, actx: ArrayContext, t, q):
         dq = as_dofdesc("vol", qtag)
         df = as_dofdesc("all_faces", qtag)
 
-        def interp_to_quad(u):
-            return op.project(dcoll, "vol", dq, u)
+        dissipation = self.lf_stabilization
 
-        # Compute volume fluxes
-        volume_fluxes = op.weak_local_div(
-            dcoll, dq,
-            interp_to_quad(euler_volume_flux(dcoll, q, gamma=gamma))
-        )
+        dd_base = as_dofdesc("vol", DISCR_TAG_BASE)
+        dd_vol_quad = as_dofdesc("vol", qtag)
+        dd_face_quad = as_dofdesc("all_faces", qtag)
+
+        def interp_to_quad_surf(tpair):
+            dd = tpair.dd
+            dd_quad = dd.with_discr_tag(qtag)
+            return TracePair(
+                dd_quad,
+                interior=op.project(dcoll, dd, dd_quad, tpair.int),
+                exterior=op.project(dcoll, dd, dd_quad, tpair.ext)
+            )
+
+        interior_trace_pairs = [
+            interp_to_quad_surf(tpair)
+            for tpair in op.interior_trace_pairs(dcoll, q)
+        ]
 
         # Compute interior interface fluxes
         interface_fluxes = (
             sum(
-                euler_numerical_flux(
-                    actx, dcoll,
-                    op.tracepair_with_discr_tag(dcoll, qtag, tpair),
-                    gamma=gamma,
-                    lf_stabilization=self.lf_stabilization
-                ) for tpair in op.interior_trace_pairs(dcoll, q)
+                op.project(dcoll, qtpair.dd, dd_face_quad,
+                           euler_numerical_flux(
+                               actx, dcoll,
+                               op.tracepair_with_discr_tag(dcoll, qtag, tpair),
+                               gamma=gamma,
+                               lf_stabilization=self.lf_stabilization
+                           ) for tpair in op.interior_trace_pairs(dcoll, q))
             )
         )
 
         # Compute boundary fluxes
         if self.bdry_conditions is not None:
-            bc_fluxes = sum(
-                euler_numerical_flux(
-                    actx, dcoll,
-                    self.bdry_conditions[btag].boundary_tpair(
+            for btag in self.bdry_conditions:
+                boundary_condition = self.bdry_conditions[btag]
+                dd_bc = as_dofdesc(btag).with_discr_tag(qtag)
+                bc_flux = op.project(
+                    dcoll,
+                    dd_bc,
+                    dd_face_quad,
+                    euler_numerical_flux(
                         actx, dcoll,
-                        as_dofdesc(btag, qtag),
-                        q,
-                        t=t
-                    ),
-                    gamma=gamma,
-                    lf_stabilization=self.lf_stabilization
-                ) for btag in self.bdry_conditions
-            )
-            interface_fluxes = interface_fluxes + bc_fluxes
+                        self.bdry_conditions[btag].boundary_tpair(
+                            actx, dcoll,
+                            as_dofdesc(btag, qtag),
+                            q,
+                            t=t
+                        ),
+                        gamma=gamma,
+                        lf_stabilization=self.lf_stabilization
+                    )
+                )
+                interface_fluxes = interface_fluxes + bc_flux
+
+        # Compute volume derivatives
+        volume_derivs = op.weak_local_div(
+            dcoll, dd_vol_quad,
+            euler_volume_flux(
+                dcoll, op.project(dcoll, dd_base, dd_vol_quad, q), gamma)
+        )
 
         return op.inverse_mass(
             dcoll,
-            volume_fluxes - op.face_mass(dcoll, df, interface_fluxes)
+            volume_derivs - op.face_mass(dcoll, dd_face_quad, interface_fluxes)
         )
 
 # }}}
 
 
+# {{{ Entropy stable Euler operator
+
+def divergence_flux_chandrashekar(
+        dcoll: DiscretizationCollection,
+        q_left: ConservedEulerField,
+        q_right: ConservedEulerField, gamma: float):
+    """Two-point volume flux based on the entropy conserving
+    and kinetic energy preserving two-point flux in:
+
+    Chandrashekar (2013) Kinetic Energy Preserving and Entropy Stable Finite
+    Volume Schemes for Compressible Euler and Navier-Stokes Equations:
+    `DOI <https://doi.org/10.4208/cicp.170712.010313a>`__.
+
+    :args q_left: A :class:`ConservedEulerField` containing the "left" state.
+    :args q_right: A :class:`ConservedEulerField` containing the "right" state.
+    :arg gamma: The isentropic expansion factor.
+    """
+    dim = dcoll.dim
+    actx = q_left.array_context
+
+    def ln_mean(x: DOFArray, y: DOFArray, epsilon=1e-4):
+        f2 = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y)
+        return actx.np.where(
+            actx.np.less(f2, epsilon),
+            (x + y) / (2 + f2*2/3 + f2*f2*2/5 + f2*f2*f2*2/7),
+            (y - x) / actx.np.log(y / x)
+        )
+
+    rho_left, u_left, p_left = conservative_to_primitive_vars(q_left, gamma)
+    rho_right, u_right, p_right = conservative_to_primitive_vars(q_right, gamma)
+
+    beta_left = 0.5 * rho_left / p_left
+    beta_right = 0.5 * rho_right / p_right
+    specific_kin_left = 0.5 * sum(v**2 for v in u_left)
+    specific_kin_right = 0.5 * sum(v**2 for v in u_right)
+
+    rho_avg = 0.5 * (rho_left + rho_right)
+    rho_mean = ln_mean(rho_left,  rho_right)
+    beta_mean = ln_mean(beta_left, beta_right)
+    beta_avg = 0.5 * (beta_left + beta_right)
+    u_avg = 0.5 * (u_left + u_right)
+    p_mean = 0.5 * rho_avg / beta_avg
+
+    velocity_square_avg = specific_kin_left + specific_kin_right
+
+    mass_flux = rho_mean * u_avg
+    momentum_flux = outer(mass_flux, u_avg) + np.eye(dim) * p_mean
+    energy_flux = (
+        mass_flux * 0.5 * (1/(gamma - 1)/beta_mean - velocity_square_avg)
+        + np.dot(momentum_flux, u_avg)
+    )
+
+    return ConservedEulerField(mass=mass_flux,
+                               energy=energy_flux,
+                               momentum=momentum_flux)
+
+
+def entropy_stable_numerical_flux_chandrashekar(
+        dcoll: DiscretizationCollection, tpair: TracePair,
+        gamma: float, dissipation=False):
+    """Entropy stable numerical flux based on the entropy conserving
+    and kinetic energy preserving two-point flux in:
+
+    Chandrashekar (2013) Kinetic Energy Preserving and Entropy Stable Finite
+    Volume Schemes for Compressible Euler and Navier-Stokes Equations
+    `DOI <https://doi.org/10.4208/cicp.170712.010313a>`__.
+
+    :arg tpair: A :class:`grudge.trace_pair.TracePair` containing the conserved
+        variables on the interior and exterior sides of element facets.
+    :arg gamma: The isentropic expansion factor.
+    :arg dissipation: A boolean denoting whether to apply Lax-Friedrichs
+        dissipation.
+    :returns: A :class:`ConservedEulerField` containing the interface fluxes.
+    """
+    q_int = tpair.int
+    q_ext = tpair.ext
+    actx = q_int.array_context
+
+    num_flux = divergence_flux_chandrashekar(
+        dcoll, q_left=q_int, q_right=q_ext, gamma=gamma)
+    normal = thaw(dcoll.normal(tpair.dd), actx)
+
+    if dissipation:
+        # Compute jump penalization parameter
+        lam = actx.np.maximum(compute_wavespeed(q_int, gamma),
+                              compute_wavespeed(q_ext, gamma))
+        num_flux -= lam*outer(tpair.diff, normal)/2
+
+    return num_flux @ normal
+
+
+class EntropyStableEulerOperator(EulerOperator):
+    """Discretizes the Euler equations using an entropy-stable
+    discontinuous Galerkin discretization as outlined in (15)
+    of `this paper <https://arxiv.org/pdf/1902.01828.pdf>`__.
+    """
+
+    def operator(self, t, q):
+        from grudge.projection import volume_quadrature_project
+        from grudge.interpolation import \
+            volume_and_surface_quadrature_interpolation
+
+        dcoll = self.dcoll
+        gamma = self.gamma
+        qtag = self.qtag
+        dissipation = self.lf_stabilization
+
+        dd_base = DOFDesc("vol", DISCR_TAG_BASE)
+        dd_vol_quad = DOFDesc("vol", qtag)
+        dd_face_quad = DOFDesc("all_faces", qtag)
+
+        # Convert to projected entropy variables: v_q = P_q v(u_q)
+        proj_entropy_vars = \
+            volume_quadrature_project(
+                dcoll, dd_vol_quad,
+                conservative_to_entropy_vars(
+                    # Interpolate state to vol quad grid: u_q = V_q u
+                    op.project(dcoll, dd_base, dd_vol_quad, q), gamma))
+
+        def modified_conserved_vars_tpair(tpair):
+            dd = tpair.dd
+            dd_quad = dd.with_discr_tag(qtag)
+            # Interpolate entropy variables to the surface quadrature grid
+            ev_tpair = op.project(dcoll, dd, dd_quad, tpair)
+            return TracePair(
+                dd_quad,
+                # Convert interior and exterior states to conserved variables
+                interior=entropy_to_conservative_vars(ev_tpair.int, gamma),
+                exterior=entropy_to_conservative_vars(ev_tpair.ext, gamma)
+            )
+
+        # Compute interior trace pairs containing the modified conserved
+        # variables (in terms of projected entropy variables)
+        interior_trace_pairs = [
+            modified_conserved_vars_tpair(tpair)
+            for tpair in op.interior_trace_pairs(dcoll, proj_entropy_vars)
+        ]
+
+        from functools import partial
+        from grudge.flux_differencing import volume_flux_differencing
+
+        def _reshape(shape, ary):
+            if not isinstance(ary, DOFArray):
+                return map_array_container(partial(_reshape, shape), ary)
+
+            return DOFArray(ary.array_context, data=tuple(
+                subary.reshape(grp.nelements, *shape)
+                # Just need group for determining the number of elements
+                for grp, subary in zip(dcoll.discr_from_dd(dd_base).groups, ary)))
+
+        # Compute the (modified) conserved state in terms of the projected
+        # entropy variables on both the volume and surface nodes
+        qtilde_vol_and_surf = \
+            entropy_to_conservative_vars(
+                # Interpolate projected entropy variables to
+                # volume + surface quadrature grids
+                volume_and_surface_quadrature_interpolation(
+                    dcoll, dd_vol_quad, dd_face_quad, proj_entropy_vars), gamma)
+
+        # FIXME: These matrices are actually symmetric. Could make use
+        # of that to avoid redundant computation.
+        flux_matrices = divergence_flux_chandrashekar(
+            dcoll,
+            _reshape((1, -1), qtilde_vol_and_surf),
+            _reshape((-1, 1), qtilde_vol_and_surf),
+            gamma
+        )
+
+        # Compute volume derivatives using flux differencing
+        volume_derivs = -volume_flux_differencing(
+            dcoll, dd_vol_quad, dd_face_quad, flux_matrices)
+
+        # Computing interface numerical fluxes
+        interface_fluxes = (
+            sum(
+                op.project(dcoll, qtpair.dd, dd_face_quad,
+                           entropy_stable_numerical_flux_chandrashekar(
+                               dcoll, qtpair, gamma, dissipation=dissipation))
+                for qtpair in interior_trace_pairs
+            )
+        )
+
+        # Compute boundary fluxes
+        if self.bdry_conditions is not None:
+            for btag in self.bdry_conditions:
+                boundary_condition = self.bdry_conditions[btag]
+                dd_bc = as_dofdesc(btag).with_discr_tag(qtag)
+                bc_flux = op.project(
+                    dcoll,
+                    dd_bc,
+                    dd_face_quad,
+                    entropy_stable_numerical_flux_chandrashekar(
+                        dcoll,
+                        boundary_condition.boundary_tpair(
+                            dcoll=dcoll,
+                            dd_bc=dd_bc,
+                            # Pass modified conserved state to be used as
+                            # the "interior" state for computing the boundary
+                            # trace pair
+                            restricted_state=entropy_to_conservative_vars(
+                                op.project(
+                                    dcoll, dd_base, dd_bc, proj_entropy_vars),
+                                gamma
+                            ),
+                            t=t
+                        ),
+                        gamma,
+                        dissipation=dissipation
+                    )
+                )
+                interface_fluxes = interface_fluxes + bc_flux
+
+        return op.inverse_mass(
+            dcoll,
+            volume_derivs - op.face_mass(dcoll, dd_face_quad, interface_fluxes)
+        )
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/grudge/models/wave.py b/grudge/models/wave.py
index 09623d622..66732e92d 100644
--- a/grudge/models/wave.py
+++ b/grudge/models/wave.py
@@ -187,7 +187,13 @@ def max_characteristic_velocity(self, actx, t=None, fields=None):
 
     def estimate_rk4_timestep(self, actx, dcoll, **kwargs):
         # FIXME: Sketchy, empirically determined fudge factor
-        return 0.38 * super().estimate_rk4_timestep(actx,  dcoll, **kwargs)
+        from meshmode.discretization.poly_element import SimplexElementGroupBase
+        from grudge.dof_desc import DD_VOLUME_ALL
+        volm_discr = dcoll.discr_from_dd(DD_VOLUME_ALL)
+        tpe = any(not isinstance(grp, SimplexElementGroupBase)
+                  for grp in volm_discr.groups)
+        fudge_fac = 0.38 if not tpe else 0.23
+        return fudge_fac * super().estimate_rk4_timestep(actx,  dcoll, **kwargs)
 
 # }}}
 
diff --git a/grudge/op.py b/grudge/op.py
index 1937f35d4..0ef01ae5c 100644
--- a/grudge/op.py
+++ b/grudge/op.py
@@ -73,33 +73,39 @@
 
 
 from functools import partial
-
 import numpy as np
 
 import modepy as mp
 from arraycontext import ArrayContext, ArrayOrContainer, map_array_container, tag_axes
-from meshmode.dof_array import DOFArray
+from meshmode.dof_array import DOFArray, warn
 from meshmode.transform_metadata import (
     DiscretizationDOFAxisTag,
     DiscretizationElementAxisTag,
     DiscretizationFaceAxisTag,
     FirstAxisIsElementsTag,
 )
+from modepy.tools import (
+    reshape_array_for_tensor_product_space as fold,
+    unreshape_array_for_tensor_product_space as unfold
+)
 from pytools import keyed_memoize_in
 from pytools.obj_array import make_obj_array
 
+# from grudge.array_context import OutputIsTensorProductDOFArrayOrdered
 import grudge.dof_desc as dof_desc
 from grudge.discretization import DiscretizationCollection
 from grudge.dof_desc import (
     DD_VOLUME_ALL,
     DISCR_TAG_BASE,
+    DISCR_TAG_QUAD,
     FACE_RESTR_ALL,
     DOFDesc,
     VolumeDomainTag,
     as_dofdesc,
 )
 from grudge.interpolation import interp
-from grudge.projection import project
+from grudge.projection import project, volume_quadrature_project
+
 from grudge.reductions import (
     elementwise_integral,
     elementwise_max,
@@ -117,10 +123,13 @@
 from grudge.trace_pair import (
     bdry_trace_pair,
     bv_trace_pair,
-    connected_ranks,
+    # connected_ranks,
     cross_rank_trace_pairs,
+    cross_rank_inter_volume_trace_pairs,
+    inter_volume_trace_pairs,
     interior_trace_pair,
     interior_trace_pairs,
+    local_inter_volume_trace_pairs,
     local_interior_trace_pair,
     project_tracepair,
     tracepair_with_discr_tag,
@@ -130,7 +139,8 @@
 __all__ = (
     "bdry_trace_pair",
     "bv_trace_pair",
-    "connected_ranks",
+    "connected_parts",
+    "cross_rank_inter_volume_trace_pairs",
     "cross_rank_trace_pairs",
     "elementwise_integral",
     "elementwise_max",
@@ -138,6 +148,7 @@
     "elementwise_sum",
     "face_mass",
     "integral",
+    "inter_volume_trace_pairs",
     "interior_trace_pair",
     "interior_trace_pairs",
     "interp",
@@ -145,6 +156,7 @@
     "local_d_dx",
     "local_div",
     "local_grad",
+    "local_inter_volume_trace_pairs",
     "local_interior_trace_pair",
     "mass",
     "nodal_max",
@@ -157,6 +169,7 @@
     "project",
     "project_tracepair",
     "tracepair_with_discr_tag",
+    "volume_quadrature_project",
     "weak_local_d_dx",
     "weak_local_div",
     "weak_local_grad",
@@ -176,20 +189,32 @@ def _single_axis_derivative_kernel(
     # - whether the chain rule terms ("inv_jac_mat") sit outside (strong)
     #   or inside (weak) the matrix-vector product that carries out the
     #   derivative, cf. "metric_in_matvec".
+
+    # {{{ simplicial single axis derivative
+
+    def compute_simplicial_derivative(actx, in_grp, out_grp,
+                                      get_diff_mat, vec, ijm,
+                                      xyz_axis, metric_in_matvec):
+        # r for rst axis
+        return actx.einsum(
+            "rej,rij,ej->ei" if metric_in_matvec else "rei,rij,ej->ei",
+            ijm[xyz_axis],
+            get_diff_mat(
+                actx,
+                out_element_group=out_grp,
+                in_element_group=in_grp),
+            vec,
+            arg_names=("inv_jac_t", "ref_stiffT_mat", "vec", ),
+            tagged=(FirstAxisIsElementsTag(),))
+
+    # }}}
+
     return DOFArray(
         actx,
         data=tuple(
-            # r for rst axis
-            actx.einsum("rej,rij,ej->ei" if metric_in_matvec else "rei,rij,ej->ei",
-                        ijm_i[xyz_axis],
-                        get_diff_mat(
-                            actx,
-                            out_element_group=out_grp,
-                            in_element_group=in_grp),
-                        vec_i,
-                        arg_names=("inv_jac_t", "ref_stiffT_mat", "vec", ),
-                        tagged=(FirstAxisIsElementsTag(),))
-
+            compute_simplicial_derivative(actx, in_grp, out_grp,
+                                               get_diff_mat, vec_i, ijm_i,
+                                               xyz_axis, metric_in_matvec)
             for out_grp, in_grp, vec_i, ijm_i in zip(
                 out_discr.groups,
                 in_discr.groups,
@@ -201,22 +226,33 @@ def _gradient_kernel(actx, out_discr, in_discr, get_diff_mat, inv_jac_mat, vec,
         *, metric_in_matvec):
     # See _single_axis_derivative_kernel for comments on the usage scenarios
     # (both strong and weak derivative) and their differences.
+
+    # {{{ simplicial grad
+
+    def compute_simplicial_grad(actx, in_grp, out_grp, get_diff_mat, vec_i,
+                                ijm_i, metric_in_matvec):
+        return actx.einsum(
+            "xrej,rij,ej->xei" if metric_in_matvec else "xrei,rij,ej->xei",
+            ijm_i,
+            get_diff_mat(
+                actx,
+                out_element_group=out_grp,
+                in_element_group=in_grp
+            ),
+            vec_i,
+            arg_names=("inv_jac_t", "ref_stiffT_mat", "vec"),
+            tagged=(FirstAxisIsElementsTag(),))
+
+    # }}}
+
     per_group_grads = [
-        # r for rst axis
-        # x for xyz axis
-        actx.einsum("xrej,rij,ej->xei" if metric_in_matvec else "xrei,rij,ej->xei",
-                    ijm_i,
-                    get_diff_mat(
-                        actx,
-                        out_element_group=out_grp,
-                        in_element_group=in_grp
-                    ),
-                    vec_i,
-                    arg_names=("inv_jac_t", "ref_stiffT_mat", "vec"),
-                    tagged=(FirstAxisIsElementsTag(),))
+        compute_simplicial_grad(actx, in_grp, out_grp, get_diff_mat, vec_i,
+                                     ijm_i, metric_in_matvec)
+
         for out_grp, in_grp, vec_i, ijm_i in zip(
             out_discr.groups, in_discr.groups, vec,
-            inv_jac_mat, strict=True)]
+                inv_jac_mat, strict=True)
+    ]
 
     return make_obj_array([
             DOFArray(actx, data=tuple([  # noqa: C409
@@ -229,24 +265,33 @@ def _divergence_kernel(actx, out_discr, in_discr, get_diff_mat, inv_jac_mat, vec
         *, metric_in_matvec):
     # See _single_axis_derivative_kernel for comments on the usage scenarios
     # (both strong and weak derivative) and their differences.
+
+    # {{{ simplicial div
+
+    def compute_simplicial_div(actx, in_grp, out_grp, get_diff_mat, vec_i,
+                               ijm_i, metric_in_matvec):
+        return actx.einsum(
+            "xrej,rij,xej->ei" if metric_in_matvec else "xrei,rij,xej->ei",
+            ijm_i,
+            get_diff_mat(
+                actx,
+                out_element_group=out_grp,
+                in_element_group=in_grp
+            ),
+            vec_i,
+            arg_names=("inv_jac_t", "ref_stiffT_mat", "vec"),
+            tagged=(FirstAxisIsElementsTag(),))
+
+    # }}}
+
     per_group_divs = [
-        # r for rst axis
-        # x for xyz axis
-        actx.einsum("xrej,rij,xej->ei" if metric_in_matvec else "xrei,rij,xej->ei",
-                    ijm_i,
-                    get_diff_mat(
-                        actx,
-                        out_element_group=out_grp,
-                        in_element_group=in_grp
-                    ),
-                    vec_i,
-                    arg_names=("inv_jac_t", "ref_stiffT_mat", "vec"),
-                    tagged=(FirstAxisIsElementsTag(),))
+        compute_simplicial_div(actx, in_grp, out_grp, get_diff_mat, vec_i,
+                                    ijm_i, metric_in_matvec)
+
         for out_grp, in_grp, vec_i, ijm_i in zip(
-            out_discr.groups,
-            in_discr.groups,
-            vec,
-            inv_jac_mat, strict=True)]
+            out_discr.groups, in_discr.groups, vec,
+                inv_jac_mat, strict=True)
+    ]
 
     return DOFArray(actx, data=tuple(per_group_divs))
 
@@ -446,6 +491,7 @@ def get_ref_stiffness_transpose_mat(out_grp, in_grp):
         if in_grp == out_grp:
             mmat = mp.mass_matrix(out_grp.basis_obj(), out_grp.unit_nodes)
             diff_matrices = mp.diff_matrices(out_grp.basis_obj(), out_grp.unit_nodes)
+
             return actx.freeze(
                 actx.tag_axis(1, DiscretizationDOFAxisTag(),
                     actx.from_numpy(
@@ -474,6 +520,7 @@ def get_ref_stiffness_transpose_mat(out_grp, in_grp):
                 ).copy()  # contigify the array
             )
         )
+
     return get_ref_stiffness_transpose_mat(out_element_group,
                                            in_element_group)
 
@@ -780,6 +827,7 @@ def reference_inverse_mass_matrix(actx: ArrayContext, element_group):
         lambda grp: grp.discretization_key())
     def get_ref_inv_mass_mat(grp):
         from modepy import inverse_mass_matrix
+
         basis = grp.basis_obj()
 
         return actx.freeze(
@@ -812,20 +860,231 @@ def _apply_inverse_mass_operator(
     discr = dcoll.discr_from_dd(dd_in)
     inv_area_elements = 1./area_element(actx, dcoll, dd=dd_in,
             _use_geoderiv_connection=actx.supports_nonscalar_broadcasting)
+
+    def apply_to_simplicial_elements(jac_inv, vec, ref_inv_mass):
+        # Based on https://arxiv.org/pdf/1608.03836.pdf
+        # true_Minv ~ ref_Minv * ref_M * (1/jac_det) * ref_Minv
+        return actx.einsum(
+            "ei,ij,ej->ei",
+            jac_inv,
+            ref_inv_mass,
+            vec,
+            tagged=(FirstAxisIsElementsTag(),))
+
     group_data = [
-            # Based on https://arxiv.org/pdf/1608.03836.pdf
-            # true_Minv ~ ref_Minv * ref_M * (1/jac_det) * ref_Minv
-            actx.einsum("ei,ij,ej->ei",
-                        jac_inv,
-                        reference_inverse_mass_matrix(actx, element_group=grp),
-                        vec_i,
-                        tagged=(FirstAxisIsElementsTag(),))
-            for grp, jac_inv, vec_i in zip(
-                discr.groups, inv_area_elements, vec, strict=True)]
+        apply_to_simplicial_elements(jac_inv, vec_i,
+            reference_inverse_mass_matrix(actx, element_group=grp))
+        for grp, jac_inv, vec_i in zip(discr.groups, inv_area_elements, vec)
+    ]
 
     return DOFArray(actx, data=tuple(group_data))
 
 
+def _apply_inverse_mass_operator_quad(
+        dcoll: DiscretizationCollection, dd, vec):
+    if not isinstance(vec, DOFArray):
+        return map_array_container(
+            partial(_apply_inverse_mass_operator_quad, dcoll, dd), vec
+        )
+
+    from grudge.geometry import area_element
+
+    actx = vec.array_context
+    dd_quad = dd.with_discr_tag(DISCR_TAG_QUAD)
+    dd_base = dd.with_discr_tag(DISCR_TAG_BASE)
+    discr_quad = dcoll.discr_from_dd(dd_quad)
+    discr_base = dcoll.discr_from_dd(dd_base)
+
+    # Based on https://arxiv.org/pdf/1608.03836.pdf
+    # true_Minv ~ ref_Minv * ref_M * (1/jac_det) * ref_Minv
+    # Overintegration version of action on *vec*:
+    # true_Minv ~ ref_Minv * (ref_M)_qtb * (1/Jac)_quad * P(Minv*vec)
+    # P => projection to quadrature, qti => quad-to-base
+
+    # Compute 1/Jac on quadrature discr
+    inv_area_elements = 1/area_element(
+            actx, dcoll, dd=dd_quad,
+            _use_geoderiv_connection=actx.supports_nonscalar_broadcasting)
+
+    def apply_minv_to_vec(vec, ref_inv_mass):
+        return actx.einsum(
+            "ij,ej->ei",
+            ref_inv_mass,
+            vec,
+            tagged=(FirstAxisIsElementsTag(),))
+
+    # The rest of wadg
+    def apply_rest_of_wadg(mm_inv, mm, vec):
+        return actx.einsum(
+            "ni,ij,ej->en",
+            mm_inv,
+            mm,
+            vec,
+            tagged=(FirstAxisIsElementsTag(),))
+
+    stage1_group_data = [
+        apply_minv_to_vec(
+            vec_i, reference_inverse_mass_matrix(actx, element_group=grp))
+        for grp, vec_i in zip(discr_base.groups, vec)
+    ]
+    stage2 = inv_area_elements * project(
+        dcoll, dd_base, dd_quad,
+        DOFArray(actx, data=tuple(stage1_group_data)))
+
+    wadg_group_data = [
+        apply_rest_of_wadg(
+            reference_inverse_mass_matrix(actx, out_grp),
+            reference_mass_matrix(actx, out_grp, in_grp), vec_i)
+        for in_grp, out_grp, vec_i in zip(
+                discr_quad.groups, discr_base.groups, stage2)
+    ]
+
+    return DOFArray(actx, data=tuple(wadg_group_data))
+
+
+"""
+def _apply_inverse_mass_operator_quad(
+        dcoll: DiscretizationCollection, dd_out, dd_in, vec):
+    if not isinstance(vec, DOFArray):
+        return map_array_container(
+            partial(_apply_inverse_mass_operator_quad, dcoll, dd_out, dd_in), vec
+        )
+
+    from grudge.geometry import area_element
+
+    if dd_out != dd_in:
+        raise ValueError(
+            "Cannot compute inverse of a mass matrix mapping "
+            "between different element groups; inverse is not "
+            "guaranteed to be well-defined"
+        )
+
+    actx = vec.array_context
+    dd_quad = dd_in
+    dd_base = dd_quad.with_discr_tag(DISCR_TAG_BASE)
+    discr_quad = dcoll.discr_from_dd(dd_quad)
+    discr_base = dcoll.discr_from_dd(dd_base)
+
+    # ae = \
+    #    project(dcoll, dd_base, dd_quad,
+    #            area_element(
+    #                actx, dcoll, dd=dd_base,
+    #                _use_geoderiv_connection=actx.supports_nonscalar_broadcasting))
+
+    ae = area_element(
+        actx, dcoll, dd=dd_quad,
+        _use_geoderiv_connection=actx.supports_nonscalar_broadcasting)
+
+    inv_area_elements = 1./ae
+
+    def apply_to_tensor_product_elements(grp, jac_inv, vec, ref_inv_mass):
+
+        vec = fold(grp.space, vec)
+
+        for xyz_axis in range(grp.dim):
+            vec = single_axis_operator_application(
+                actx, grp.dim, ref_inv_mass, xyz_axis, vec,
+                tags=(FirstAxisIsElementsTag(),
+                      OutputIsTensorProductDOFArrayOrdered(),),
+                arg_names=("ref_inv_mass_1d", "vec"))
+
+        vec = unfold(grp.space, vec)
+
+        return actx.einsum(
+            "ei,ei->ei",
+            jac_inv,
+            vec,
+            tagged=(FirstAxisIsElementsTag(),)
+        )
+
+    def apply_to_simplicial_elements_stage1(vec, ref_inv_mass):
+        # Based on https://arxiv.org/pdf/1608.03836.pdf
+        # true_Minv ~ ref_Minv * ref_M * (1/jac_det) * ref_Minv
+        return actx.einsum(
+            "ij,ej->ei",
+            ref_inv_mass,
+            vec,
+            tagged=(FirstAxisIsElementsTag(),))
+
+    def apply_to_simplicial_elements_staged(mm_inv, mm, vec):
+        return actx.einsum(
+            "ni,ij,ej->en",
+            mm_inv,
+            mm,
+            vec,
+            tagged=(FirstAxisIsElementsTag(),))
+
+    def apply_to_simplicial_elements_stage2(jac_inv, vec):
+        # Based on https://arxiv.org/pdf/1608.03836.pdf
+        # true_Minv ~ ref_Minv * ref_M * (1/jac_det) * ref_Minv
+        return actx.einsum(
+            "ei,ej->ei",
+            jac_inv,
+            vec,
+            tagged=(FirstAxisIsElementsTag(),))
+
+    def apply_to_simplicial_elements_stage3(mm, vec):
+        # Based on https://arxiv.org/pdf/1608.03836.pdf
+        # true_Minv ~ ref_Minv * ref_M * (1/jac_det) * ref_Minv
+        return actx.einsum(
+            "ij,ej->ei",
+            mm,
+            vec,
+            tagged=(FirstAxisIsElementsTag(),))
+
+    def apply_to_simplicial_elements_stage4(mm_inv, vec):
+        # Based on https://arxiv.org/pdf/1608.03836.pdf
+        # true_Minv ~ ref_Minv * ref_M * (1/jac_det) * ref_Minv
+        return actx.einsum(
+            "ij,ej->ei",
+            mm_inv,
+            vec,
+            tagged=(FirstAxisIsElementsTag(),))
+
+    stage1_group_data = [
+        apply_to_simplicial_elements_stage1(vec_i,
+            reference_inverse_mass_matrix(actx, element_group=grp))
+        for grp, vec_i in zip(discr_base.groups, vec)
+    ]
+
+    stage1 = DOFArray(actx, data=tuple(stage1_group_data))
+    stage1 = project(dcoll, dd_base, dd_quad, stage1)
+
+
+    stage2_group_data = [
+        apply_to_simplicial_elements_stage2(jac_inv, vec_i)
+        for jac_inv, vec_i in zip(inv_area_elements, stage1)
+    ]
+
+    stage2 = DOFArray(actx, data=tuple(stage2_group_data))
+
+    staged_group_data = [
+        apply_to_simplicial_elements_staged(
+            reference_inverse_mass_matrix(actx, out_grp),
+            reference_mass_matrix(actx, out_grp, in_grp), vec_i)
+        for in_grp, out_grp, vec_i in zip(
+                discr_quad.groups, discr_base.groups, stage2)
+    ]
+
+    stage3_group_data = [
+        apply_to_simplicial_elements_stage3(
+            reference_mass_matrix(actx, out_grp, in_grp), vec_i)
+        for out_grp, in_grp, vec_i in zip(discr_base.groups, discr_quad.groups,
+                                          stage2)
+    ]
+    stage3 = DOFArray(actx, data=tuple(stage3_group_data))
+
+    group_data = [
+        apply_to_simplicial_elements_stage4(
+            reference_inverse_mass_matrix(actx, element_group=grp), vec_i)
+        for grp, vec_i in zip(discr_base.groups, stage3)
+    ]
+
+    # return DOFArray(actx, data=tuple(group_data))
+    return DOFArray(actx, data=tuple(staged_group_data))
+"""
+
+
 def inverse_mass(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer:
     r"""Return the action of the DG mass matrix inverse on a vector
     (or vectors) of :class:`~meshmode.dof_array.DOFArray`\ s, *vec*.
@@ -873,6 +1132,11 @@ def inverse_mass(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer:
     else:
         raise TypeError("invalid number of arguments")
 
+    if dd.uses_quadrature():
+        # if not dcoll._has_affine_groups(dd.domain_tag):
+        return _apply_inverse_mass_operator_quad(dcoll, dd, vec)
+    # dd = dd.with_discr_tag(DISCR_TAG_BASE)
+
     return _apply_inverse_mass_operator(dcoll, dd, dd, vec)
 
 # }}}
@@ -1068,4 +1332,30 @@ def face_mass(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer:
 # }}}
 
 
+# {{{ general single axis operator application
+
+def single_axis_operator_application(actx, dim, operator, axis, data,
+                                     arg_names=None, tags=None):
+    """
+    Used for applying 1D operators to a single axis of a tensor of DOF data.
+    """
+
+    if not isinstance(arg_names, tuple):
+        raise TypeError("arg_names must be a tuple.")
+    if not isinstance(tags, tuple):
+        raise TypeError("arg_names must be a tuple.")
+
+    operator_spec = "ij"
+    data_spec = f'e{"abcdefghklm"[:axis]}j{"nopqrstuvwxyz"[:dim-axis-1]}'
+    out_spec = f'e{"abcdefghklm"[:axis]}i{"nopqrstuvwxyz"[:dim-axis-1]}'
+
+    spec = operator_spec + "," + data_spec + "->" + out_spec
+
+    return actx.einsum(spec, operator, data,
+                       arg_names=arg_names,
+                       tagged=tags)
+
+# }}}
+
+
 # vim: foldmethod=marker
diff --git a/grudge/projection.py b/grudge/projection.py
index d5ebdd689..88f29ea7c 100644
--- a/grudge/projection.py
+++ b/grudge/projection.py
@@ -5,6 +5,7 @@
 -----------
 
 .. autofunction:: project
+.. autofunction:: volume_quadrature_project
 """
 
 from __future__ import annotations
@@ -34,8 +35,12 @@
 THE SOFTWARE.
 """
 
+from functools import partial
+from numbers import Number
 
-from arraycontext import ArrayOrContainer
+import numpy as np
+
+from arraycontext import ArrayOrContainer, map_array_container
 
 from grudge.discretization import DiscretizationCollection
 from grudge.dof_desc import (
@@ -45,6 +50,11 @@
     as_dofdesc,
 )
 
+from meshmode.dof_array import DOFArray
+from meshmode.transform_metadata import FirstAxisIsElementsTag
+
+from pytools import keyed_memoize_in
+
 
 def project(
         dcoll: DiscretizationCollection,
@@ -82,3 +92,65 @@ def project(
         return vec
 
     return dcoll.connection_from_dds(src_dofdesc, tgt_dofdesc)(vec)
+
+
+def volume_quadrature_project(
+        dcoll: DiscretizationCollection, dd_q, vec) -> ArrayOrContainer:
+    """Projects a field on the quadrature discreization, described by *dd_q*,
+    into the polynomial space described by the volume discretization.
+
+    :arg dd_q: a :class:`~grudge.dof_desc.DOFDesc`, or a value convertible to one.
+    :arg vec: a :class:`~meshmode.dof_array.DOFArray` or an
+        :class:`~arraycontext.container.ArrayContainer` of them.
+    :returns: a :class:`~meshmode.dof_array.DOFArray` or an
+        :class:`~arraycontext.container.ArrayContainer` like *vec*.
+    """
+    if not isinstance(vec, DOFArray):
+        return map_array_container(
+            partial(volume_quadrature_project, dcoll, dd_q), vec
+        )
+
+    from grudge.geometry import area_element
+    from grudge.interpolation import volume_quadrature_interpolation_matrix
+    from grudge.op import inverse_mass
+
+    from grudge.dof_desc import DISCR_TAG_BASE
+    dd_vol = dd_q.with_discr_tag(DISCR_TAG_BASE)
+
+    actx = vec.array_context
+    discr = dcoll.discr_from_dd(dd_vol)
+    quad_discr = dcoll.discr_from_dd(dd_q)
+    jacobians = area_element(
+        actx, dcoll, dd=dd_q,
+        _use_geoderiv_connection=actx.supports_nonscalar_broadcasting)
+
+    @keyed_memoize_in(
+        actx, volume_quadrature_project,
+        lambda base_grp, vol_quad_grp: (base_grp.discretization_key(),
+                                        vol_quad_grp.discretization_key()))
+    def get_mat(base_grp, vol_quad_grp):
+        vdm_q = actx.to_numpy(
+            volume_quadrature_interpolation_matrix(
+                actx, base_grp, vol_quad_grp
+            )
+        )
+        weights = np.diag(vol_quad_grp.quadrature_rule().weights)
+        return actx.freeze(actx.from_numpy(vdm_q.T @ weights))
+
+    return inverse_mass(
+        dcoll,
+        dd_vol,
+        DOFArray(
+            actx,
+            data=tuple(
+                actx.einsum("ij,ej,ej->ei",
+                            get_mat(bgrp, qgrp),
+                            jac_i,
+                            vec_i,
+                            arg_names=("vqw_t", "jac", "vec"),
+                            tagged=(FirstAxisIsElementsTag(),))
+                for bgrp, qgrp, vec_i, jac_i in zip(
+                    discr.groups, quad_discr.groups, vec, jacobians)
+            )
+        )
+    )
diff --git a/grudge/reductions.py b/grudge/reductions.py
index 6dcde1316..6f6108462 100644
--- a/grudge/reductions.py
+++ b/grudge/reductions.py
@@ -295,14 +295,23 @@ def integral(dcoll: DiscretizationCollection, dd, vec) -> Scalar:
         :class:`~arraycontext.ArrayContainer` of them.
     :returns: a device scalar denoting the evaluated integral.
     """
-    from grudge.op import _apply_mass_operator
-
     dd = dof_desc.as_dofdesc(dd)
+    discr = dcoll.discr_from_dd(dd)
 
-    ones = dcoll.discr_from_dd(dd).zeros(vec.array_context) + 1.0
-    return nodal_sum(
-        dcoll, dd, vec * _apply_mass_operator(dcoll, dd, dd, ones)
+    from grudge.geometry import area_element
+    actx = vec.array_context
+    area_elements = area_element(
+        actx, dcoll, dd=dd,
+        _use_geoderiv_connection=actx.supports_nonscalar_broadcasting)
+    qwts = DOFArray(
+        actx,
+        data=tuple(
+            actx.from_numpy(
+                np.tile(grp.quadrature_rule().weights,
+                        (ae_i.shape[0], 1)))*ae_i
+            for grp, ae_i in zip(discr.groups, area_elements))
     )
+    return nodal_sum(dcoll, dd, vec*qwts)
 
 # }}}
 
@@ -509,13 +518,22 @@ def elementwise_integral(
         raise TypeError("invalid number of arguments")
 
     dd = dof_desc.as_dofdesc(dd)
+    discr = dcoll.discr_from_dd(dd)
 
-    from grudge.op import _apply_mass_operator
-
-    ones = dcoll.discr_from_dd(dd).zeros(vec.array_context) + 1.0
-    return elementwise_sum(
-        dcoll, dd, vec * _apply_mass_operator(dcoll, dd, dd, ones)
+    from grudge.geometry import area_element
+    actx = vec.array_context
+    area_elements = area_element(
+        actx, dcoll, dd=dd,
+        _use_geoderiv_connection=actx.supports_nonscalar_broadcasting)
+    qwts = DOFArray(
+        actx,
+        data=tuple(
+            actx.from_numpy(
+                np.tile(grp.quadrature_rule().weights,
+                        (ae_i.shape[0], 1)))*ae_i
+            for grp, ae_i in zip(discr.groups, area_elements))
     )
+    return elementwise_sum(dcoll, dd, vec*qwts)
 
 # }}}
 
diff --git a/grudge/trace_pair.py b/grudge/trace_pair.py
index 1903a6a2f..599150f23 100644
--- a/grudge/trace_pair.py
+++ b/grudge/trace_pair.py
@@ -18,12 +18,15 @@
 .. autofunction:: bdry_trace_pair
 .. autofunction:: bv_trace_pair
 
-Interior and cross-rank trace functions
----------------------------------------
+Interior, cross-rank, and inter-volume traces
+---------------------------------------------
 
 .. autofunction:: interior_trace_pairs
 .. autofunction:: local_interior_trace_pair
+.. autofunction:: inter_volume_trace_pairs
+.. autofunction:: local_inter_volume_trace_pairs
 .. autofunction:: cross_rank_trace_pairs
+.. autofunction:: cross_rank_inter_volume_trace_pairs
 """
 
 __copyright__ = """
@@ -53,18 +56,29 @@
 from collections.abc import Hashable
 from dataclasses import dataclass
 from numbers import Number
-from typing import Any
+from typing import (
+    Any,
+    Hashable,
+    List,
+    Mapping,
+    Optional,
+    Tuple,
+    Type,
+    Sequence
+)
 from warnings import warn
 
 import numpy as np
 
 from arraycontext import (
     ArrayContainer,
+    ArrayContext,
     ArrayOrContainer,
     dataclass_array_container,
     flatten,
     from_numpy,
     get_container_context_recursively,
+    get_container_context_recursively_opt,
     to_numpy,
     unflatten,
     with_container_arithmetic,
@@ -74,13 +88,15 @@
 from pytools.persistent_dict import KeyBuilder
 
 import grudge.dof_desc as dof_desc
-from grudge.discretization import DiscretizationCollection
+from grudge.discretization import DiscretizationCollection, PartID
 from grudge.dof_desc import (
     DD_VOLUME_ALL,
     DISCR_TAG_BASE,
     FACE_RESTR_INTERIOR,
+    BoundaryDomainTag,
     ConvertibleToDOFDesc,
     DOFDesc,
+    VolumeTag,
     VolumeDomainTag,
 )
 from grudge.projection import project
@@ -347,6 +363,124 @@ def interior_trace_pairs(
 # }}}
 
 
+# {{{ inter-volume trace pairs
+
+def local_inter_volume_trace_pairs(
+        dcoll: DiscretizationCollection,
+        pairwise_volume_data: Mapping[
+            Tuple[DOFDesc, DOFDesc],
+            Tuple[ArrayOrContainer, ArrayOrContainer]]
+        ) -> Mapping[Tuple[DOFDesc, DOFDesc], TracePair]:
+    for vol_dd_pair in pairwise_volume_data.keys():
+        for vol_dd in vol_dd_pair:
+            if not isinstance(vol_dd.domain_tag, VolumeDomainTag):
+                raise ValueError(
+                    "pairwise_volume_data keys must describe volumes, "
+                    f"got '{vol_dd}'")
+            if vol_dd.discretization_tag != DISCR_TAG_BASE:
+                raise ValueError(
+                    "expected base-discretized DOFDesc in pairwise_volume_data, "
+                    f"got '{vol_dd}'")
+
+    rank = (
+        dcoll.mpi_communicator.Get_rank()
+        if dcoll.mpi_communicator is not None
+        else None)
+
+    result: Mapping[Tuple[DOFDesc, DOFDesc], TracePair] = {}
+
+    for vol_dd_pair, vol_data_pair in pairwise_volume_data.items():
+        from meshmode.mesh import mesh_has_boundary
+        if not mesh_has_boundary(
+                dcoll.discr_from_dd(vol_dd_pair[0]).mesh,
+                BTAG_PARTITION(PartID(vol_dd_pair[1].domain_tag.tag, rank))):
+            continue
+
+        directional_vol_dd_pairs = [
+            (vol_dd_pair[1], vol_dd_pair[0]),
+            (vol_dd_pair[0], vol_dd_pair[1])]
+
+        trace_dd_pair = tuple(
+            self_vol_dd.trace(
+                BTAG_PARTITION(
+                    PartID(other_vol_dd.domain_tag.tag, rank)))
+            for other_vol_dd, self_vol_dd in directional_vol_dd_pairs)
+
+        # Pre-compute the projections out here to avoid doing it twice inside
+        # the loop below
+        trace_data = {
+            trace_dd: project(dcoll, vol_dd, trace_dd, vol_data)
+            for vol_dd, trace_dd, vol_data in zip(
+                vol_dd_pair, trace_dd_pair, vol_data_pair)}
+
+        for other_vol_dd, self_vol_dd in directional_vol_dd_pairs:
+            self_part_id = PartID(self_vol_dd.domain_tag.tag, rank)
+            other_part_id = PartID(other_vol_dd.domain_tag.tag, rank)
+
+            self_trace_dd = self_vol_dd.trace(BTAG_PARTITION(other_part_id))
+            other_trace_dd = other_vol_dd.trace(BTAG_PARTITION(self_part_id))
+
+            self_trace_data = trace_data[self_trace_dd]
+            unswapped_other_trace_data = trace_data[other_trace_dd]
+
+            other_to_self = dcoll._inter_part_connections[
+                other_part_id, self_part_id]
+
+            def get_opposite_trace(ary):
+                if isinstance(ary, Number):
+                    return ary
+                else:
+                    return other_to_self(ary)  # noqa: B023
+
+            from arraycontext import rec_map_array_container
+            from meshmode.dof_array import DOFArray
+            other_trace_data = rec_map_array_container(
+                get_opposite_trace,
+                unswapped_other_trace_data,
+                leaf_class=DOFArray)
+
+            result[other_vol_dd, self_vol_dd] = TracePair(
+                self_trace_dd,
+                interior=self_trace_data,
+                exterior=other_trace_data)
+
+    return result
+
+
+def inter_volume_trace_pairs(dcoll: DiscretizationCollection,
+        pairwise_volume_data: Mapping[
+            Tuple[DOFDesc, DOFDesc],
+            Tuple[ArrayOrContainer, ArrayOrContainer]],
+        comm_tag: Hashable = None) -> Mapping[
+            Tuple[DOFDesc, DOFDesc],
+            List[TracePair]]:
+    """
+    Note that :func:`local_inter_volume_trace_pairs` provides the rank-local
+    contributions if those are needed in isolation. Similarly,
+    :func:`cross_rank_inter_volume_trace_pairs` provides only the trace pairs
+    defined on cross-rank boundaries.
+    """
+    # TODO documentation
+
+    result: Mapping[
+        Tuple[DOFDesc, DOFDesc],
+        List[TracePair]] = {}
+
+    local_tpairs = local_inter_volume_trace_pairs(dcoll, pairwise_volume_data)
+    cross_rank_tpairs = cross_rank_inter_volume_trace_pairs(
+        dcoll, pairwise_volume_data, comm_tag=comm_tag)
+
+    for directional_vol_dd_pair, tpair in local_tpairs.items():
+        result[directional_vol_dd_pair] = [tpair]
+
+    for directional_vol_dd_pair, tpairs in cross_rank_tpairs.items():
+        result.setdefault(directional_vol_dd_pair, []).extend(tpairs)
+
+    return result
+
+# }}}
+
+
 # {{{ distributed: helper functions
 
 class _TagKeyBuilder(KeyBuilder):
@@ -354,16 +488,25 @@ def update_for_type(self, key_hash, key: type[Any]):
         self.rec(key_hash, (key.__module__, key.__name__, key.__name__,))
 
 
+# FIXME: Deprecate connected_ranks instead of removing
 @memoize_on_first_arg
-def connected_ranks(
+def connected_parts(
         dcoll: DiscretizationCollection,
-        volume_dd: DOFDesc | None = None):
-    if volume_dd is None:
-        volume_dd = DD_VOLUME_ALL
-
-    from meshmode.distributed import get_connected_parts
-    return get_connected_parts(
-        dcoll._volume_discrs[volume_dd.domain_tag.tag].mesh)
+        self_volume_tag: VolumeTag,
+        other_volume_tag: VolumeTag
+        ) -> Sequence[PartID]:
+    result: List[PartID] = [
+        connected_part_id
+        for connected_part_id, part_id in dcoll._inter_part_connections.keys()
+        if (
+            part_id.volume_tag == self_volume_tag
+            and connected_part_id.volume_tag == other_volume_tag)]
+
+    # This commented bit might be newer way - check with Matt
+    # from meshmode.distributed import get_connected_parts
+    # return get_connected_parts(
+    #    dcoll._volume_discrs[volume_dd.domain_tag.tag].mesh)
+    return result
 
 
 def _sym_tag_to_num_tag(comm_tag: Hashable | None) -> int | None:
@@ -385,6 +528,7 @@ def _sym_tag_to_num_tag(comm_tag: Hashable | None) -> int | None:
 
     num_tag = sum(ord(ch) << i for i, ch in enumerate(digest)) % tag_ub
 
+    # FIXME: This prints the wrong numerical tag because of base_comm_tag below
     warn("Encountered unknown symbolic tag "
             f"'{comm_tag}', assigning a value of '{num_tag}'. "
             "This is a temporary workaround, please ensure that "
@@ -402,29 +546,49 @@ class _RankBoundaryCommunicationEager:
     base_comm_tag = 1273
 
     def __init__(self,
-                 dcoll: DiscretizationCollection,
-                 array_container: ArrayOrContainer,
-                 remote_rank, comm_tag: int | None = None,
-                 volume_dd=DD_VOLUME_ALL):
-        actx = get_container_context_recursively(array_container)
-        bdry_dd = volume_dd.trace(BTAG_PARTITION(remote_rank))
-
-        local_bdry_data = project(dcoll, volume_dd, bdry_dd, array_container)
+            actx: ArrayContext,
+            dcoll: DiscretizationCollection,
+            *,
+            local_part_id: PartID,
+            remote_part_id: PartID,
+            local_bdry_data: ArrayOrContainer,
+            remote_bdry_data_template: ArrayOrContainer,
+            comm_tag: Optional[Hashable] = None):
+
+        # inducer/grudge@main has this 
+        # local_bdry_data = project(dcoll, volume_dd, bdry_dd, array_container)
+
         comm = actx.mpi_communicator
+
         assert comm is not None
 
+        remote_rank = remote_part_id.rank
+        assert remote_rank is not None
+
         self.dcoll = dcoll
         self.array_context = actx
-        self.remote_bdry_dd = bdry_dd
-        self.bdry_discr = dcoll.discr_from_dd(bdry_dd)
+        self.local_part_id = local_part_id
+        self.remote_part_id = remote_part_id
+        self.local_bdry_dd = DOFDesc(
+            BoundaryDomainTag(
+                BTAG_PARTITION(remote_part_id),
+                volume_tag=local_part_id.volume_tag),
+            DISCR_TAG_BASE)
+        self.bdry_discr = dcoll.discr_from_dd(self.local_bdry_dd)
         self.local_bdry_data = local_bdry_data
-        self.local_bdry_data_np = \
-            to_numpy(flatten(self.local_bdry_data, actx), actx)
-
-        self.comm_tag = self.base_comm_tag
-        comm_tag = _sym_tag_to_num_tag(comm_tag)
-        if comm_tag is not None:
-            self.comm_tag += comm_tag
+        self.remote_bdry_data_template = remote_bdry_data_template
+
+        def _generate_num_comm_tag(sym_comm_tag):
+            result = self.base_comm_tag
+            num_comm_tag = _sym_tag_to_num_tag(sym_comm_tag)
+            if num_comm_tag is not None:
+                result += num_comm_tag
+            return result
+
+        send_sym_comm_tag = (remote_part_id.volume_tag, comm_tag)
+        recv_sym_comm_tag = (local_part_id.volume_tag, comm_tag)
+        self.send_comm_tag = _generate_num_comm_tag(send_sym_comm_tag)
+        self.recv_comm_tag = _generate_num_comm_tag(recv_sym_comm_tag)
         del comm_tag
 
         # Here, we initialize both send and receive operations through
@@ -446,36 +610,76 @@ def __init__(self,
         # requests is complete, however it is not clear that this is documented
         # behavior. We hold on to the buffer (via the instance attribute)
         # as well, just in case.
-        self.send_req = comm.Isend(self.local_bdry_data_np,
-                                   remote_rank,
-                                   tag=self.comm_tag)
-        self.remote_data_host_numpy = np.empty_like(self.local_bdry_data_np)
-        self.recv_req = comm.Irecv(self.remote_data_host_numpy,
-                                   remote_rank,
-                                   tag=self.comm_tag)
+        self.send_reqs = []
+        self.send_data = []
+
+        def send_single_array(key, local_subary):
+            if not isinstance(local_subary, Number):
+                local_subary_np = to_numpy(local_subary, actx)
+                self.send_reqs.append(
+                    comm.Isend(local_subary_np, remote_rank, tag=self.send_comm_tag))
+                self.send_data.append(local_subary_np)
+            return local_subary
+
+        self.recv_reqs = []
+        self.recv_data = {}
+
+        def recv_single_array(key, remote_subary_template):
+            if not isinstance(remote_subary_template, Number):
+                remote_subary_np = np.empty(
+                    remote_subary_template.shape,
+                    remote_subary_template.dtype)
+                self.recv_reqs.append(
+                    comm.Irecv(remote_subary_np, remote_rank,
+                        tag=self.recv_comm_tag))
+                self.recv_data[key] = remote_subary_np
+            return remote_subary_template
+
+        from arraycontext.container.traversal import rec_keyed_map_array_container
+        rec_keyed_map_array_container(send_single_array, local_bdry_data)
+        rec_keyed_map_array_container(recv_single_array, remote_bdry_data_template)
 
     def finish(self):
-        # Wait for the nonblocking receive request to complete before
+        from mpi4py import MPI
+
+        # Wait for the nonblocking receive requests to complete before
         # accessing the data
-        self.recv_req.Wait()
-
-        # Nonblocking receive is complete, we can now access the data and apply
-        # the boundary-swap connection
-        actx = self.array_context
-        remote_bdry_data_flat = from_numpy(self.remote_data_host_numpy, actx)
-        remote_bdry_data = unflatten(self.local_bdry_data,
-                                     remote_bdry_data_flat, actx)
-        bdry_conn = self.dcoll.distributed_boundary_swap_connection(
-            self.remote_bdry_dd)
-        swapped_remote_bdry_data = bdry_conn(remote_bdry_data)
-
-        # Complete the nonblocking send request associated with communicating
-        # `self.local_bdry_data_np`
-        self.send_req.Wait()
-
-        return TracePair(self.remote_bdry_dd,
-                         interior=self.local_bdry_data,
-                         exterior=swapped_remote_bdry_data)
+        MPI.Request.waitall(self.recv_reqs)
+
+        def finish_single_array(key, remote_subary_template):
+            if isinstance(remote_subary_template, Number):
+                # NOTE: Assumes that the same number is passed on every rank
+                return remote_subary_template
+            else:
+                return from_numpy(self.recv_data[key], self.array_context)
+
+        from arraycontext.container.traversal import rec_keyed_map_array_container
+        unswapped_remote_bdry_data = rec_keyed_map_array_container(
+            finish_single_array, self.remote_bdry_data_template)
+
+        remote_to_local = self.dcoll._inter_part_connections[
+            self.remote_part_id, self.local_part_id]
+
+        def get_opposite_trace(ary):
+            if isinstance(ary, Number):
+                return ary
+            else:
+                return remote_to_local(ary)
+
+        from arraycontext import rec_map_array_container
+        from meshmode.dof_array import DOFArray
+        remote_bdry_data = rec_map_array_container(
+            get_opposite_trace,
+            unswapped_remote_bdry_data,
+            leaf_class=DOFArray)
+
+        # Complete the nonblocking send requests
+        MPI.Request.waitall(self.send_reqs)
+
+        return TracePair(
+                self.local_bdry_dd,
+                interior=self.local_bdry_data,
+                exterior=remote_bdry_data)
 
 # }}}
 
@@ -484,51 +688,112 @@ def finish(self):
 
 class _RankBoundaryCommunicationLazy:
     def __init__(self,
-                 dcoll: DiscretizationCollection,
-                 array_container: ArrayOrContainer,
-                 remote_rank: int, comm_tag: Hashable,
-                 volume_dd=DD_VOLUME_ALL):
+            actx: ArrayContext,
+            dcoll: DiscretizationCollection,
+            *,
+            local_part_id: PartID,
+            remote_part_id: PartID,
+            local_bdry_data: ArrayOrContainer,
+            remote_bdry_data_template: ArrayOrContainer,
+            comm_tag: Optional[Hashable] = None) -> None:
+
         if comm_tag is None:
             raise ValueError("lazy communication requires 'comm_tag' to be supplied")
 
-        bdry_dd = volume_dd.trace(BTAG_PARTITION(remote_rank))
+        remote_rank = remote_part_id.rank
+        assert remote_rank is not None
 
         self.dcoll = dcoll
-        self.array_context = get_container_context_recursively(array_container)
-        self.remote_bdry_dd = bdry_dd
-        self.bdry_discr = dcoll.discr_from_dd(self.remote_bdry_dd)
-
-        self.local_bdry_data = project(
-            dcoll, volume_dd, bdry_dd, array_container)
-
-        from pytato import make_distributed_recv, staple_distributed_send
-
-        def communicate_single_array(key, local_bdry_ary):
-            ary_tag = (comm_tag, key)
-            return staple_distributed_send(
-                    local_bdry_ary, dest_rank=remote_rank, comm_tag=ary_tag,
-                    stapled_to=make_distributed_recv(
+        self.array_context = actx
+        self.local_bdry_dd = DOFDesc(
+            BoundaryDomainTag(
+                BTAG_PARTITION(remote_part_id),
+                volume_tag=local_part_id.volume_tag),
+            DISCR_TAG_BASE)
+        self.bdry_discr = dcoll.discr_from_dd(self.local_bdry_dd)
+        self.local_part_id = local_part_id
+        self.remote_part_id = remote_part_id
+
+        from pytato import (
+            make_distributed_recv,
+            make_distributed_send,
+            DistributedSendRefHolder)
+
+        # TODO: This currently assumes that local_bdry_data and
+        # remote_bdry_data_template have the same structure. This is not true
+        # in general. Find a way to staple the sends appropriately when the number
+        # of recvs is not equal to the number of sends
+        # FIXME: Overly restrictive (just needs to be the same structure)
+        assert type(local_bdry_data) == type(remote_bdry_data_template)
+
+        sends = {}
+
+        def send_single_array(key, local_subary):
+            if isinstance(local_subary, Number):
+                return
+            else:
+                ary_tag = (remote_part_id.volume_tag, comm_tag, key)
+                sends[key] = make_distributed_send(
+                    local_subary, dest_rank=remote_rank, comm_tag=ary_tag)
+
+        def recv_single_array(key, remote_subary_template):
+            if isinstance(remote_subary_template, Number):
+                # NOTE: Assumes that the same number is passed on every rank
+                return remote_subary_template
+            else:
+                ary_tag = (local_part_id.volume_tag, comm_tag, key)
+                return DistributedSendRefHolder(
+                    sends[key],
+                    make_distributed_recv(
                         src_rank=remote_rank, comm_tag=ary_tag,
-                        shape=local_bdry_ary.shape, dtype=local_bdry_ary.dtype,
-                        axes=local_bdry_ary.axes))
+                        shape=remote_subary_template.shape,
+                        dtype=remote_subary_template.dtype,
+                        axes=remote_subary_template.axes))
 
         from arraycontext.container.traversal import rec_keyed_map_array_container
-        self.remote_data = rec_keyed_map_array_container(
-                communicate_single_array, self.local_bdry_data)
 
-    def finish(self):
-        bdry_conn = self.dcoll.distributed_boundary_swap_connection(
-            self.remote_bdry_dd)
+        rec_keyed_map_array_container(send_single_array, local_bdry_data)
+        self.local_bdry_data = local_bdry_data
+
+        self.unswapped_remote_bdry_data = rec_keyed_map_array_container(
+            recv_single_array, remote_bdry_data_template)
 
-        return TracePair(self.remote_bdry_dd,
-                         interior=self.local_bdry_data,
-                         exterior=bdry_conn(self.remote_data))
+    def finish(self):
+        remote_to_local = self.dcoll._inter_part_connections[
+            self.remote_part_id, self.local_part_id]
+
+        def get_opposite_trace(ary):
+            if isinstance(ary, Number):
+                return ary
+            else:
+                return remote_to_local(ary)
+
+        from arraycontext import rec_map_array_container
+        from meshmode.dof_array import DOFArray
+        remote_bdry_data = rec_map_array_container(
+            get_opposite_trace,
+            self.unswapped_remote_bdry_data,
+            leaf_class=DOFArray)
+
+        return TracePair(
+                self.local_bdry_dd,
+                interior=self.local_bdry_data,
+                exterior=remote_bdry_data)
 
 # }}}
 
 
 # {{{ cross_rank_trace_pairs
 
+def _replace_dof_arrays(array_container, dof_array):
+    from arraycontext import rec_map_array_container
+    from meshmode.dof_array import DOFArray
+    return rec_map_array_container(
+        lambda x: dof_array if isinstance(x, DOFArray) else x,
+        array_container,
+        leaf_class=DOFArray)
+
+
 def cross_rank_trace_pairs(
         dcoll: DiscretizationCollection, ary: ArrayOrContainer,
         tag: Hashable = None,
@@ -537,9 +802,9 @@ def cross_rank_trace_pairs(
     r"""Get a :class:`list` of *ary* trace pairs for each partition boundary.
 
     For each partition boundary, the field data values in *ary* are
-    communicated to/from the neighboring partition. Presumably, this
-    communication is MPI (but strictly speaking, may not be, and this
-    routine is agnostic to the underlying communication).
+    communicated to/from the neighboring part. Presumably, this communication
+    is MPI (but strictly speaking, may not be, and this routine is agnostic to
+    the underlying communication).
 
     For each face on each partition boundary, a
     :class:`TracePair` is created with the locally, and
@@ -584,14 +849,36 @@ def cross_rank_trace_pairs(
 
     # }}}
 
-    if isinstance(ary, Number):
-        # NOTE: Assumed that the same number is passed on every rank
-        return [TracePair(
-                volume_dd.trace(BTAG_PARTITION(remote_rank)),
-                interior=ary, exterior=ary)
-            for remote_rank in connected_ranks(dcoll, volume_dd=volume_dd)]
+    if dcoll.mpi_communicator is None:
+        return []
+
+    rank = dcoll.mpi_communicator.Get_rank()
+
+    local_part_id = PartID(volume_dd.domain_tag.tag, rank)
 
-    actx = get_container_context_recursively(ary)
+    connected_part_ids = connected_parts(
+            dcoll, self_volume_tag=volume_dd.domain_tag.tag,
+            other_volume_tag=volume_dd.domain_tag.tag)
+
+    remote_part_ids = [
+        part_id
+        for part_id in connected_part_ids
+        if part_id.rank != rank]
+
+    # This asserts that there is only one data exchange per rank, so that
+    # there is no risk of mismatched data reaching the wrong recipient.
+    # (Since we have only a single tag.)
+    assert len(remote_part_ids) == len({part_id.rank for part_id in remote_part_ids})
+
+    actx = get_container_context_recursively_opt(ary)
+
+    if actx is None:
+        # NOTE: Assumes that the same number is passed on every rank
+        return [
+            TracePair(
+                volume_dd.trace(BTAG_PARTITION(remote_part_id)),
+                interior=ary, exterior=ary)
+            for remote_part_id in remote_part_ids]
 
     from grudge.array_context import MPIPytatoArrayContextBase
 
@@ -600,14 +887,170 @@ def cross_rank_trace_pairs(
     else:
         rbc_class = _RankBoundaryCommunicationEager
 
-    # Initialize and post all sends/receives
-    rank_bdry_communicators = [
-        rbc_class(dcoll, ary, remote_rank, comm_tag=comm_tag, volume_dd=volume_dd)
-        for remote_rank in connected_ranks(dcoll, volume_dd=volume_dd)
-    ]
+    rank_bdry_communicators = []
+
+    for remote_part_id in remote_part_ids:
+        bdry_dd = volume_dd.trace(BTAG_PARTITION(remote_part_id))
+
+        local_bdry_data = project(dcoll, volume_dd, bdry_dd, ary)
+
+        from arraycontext import tag_axes
+        from meshmode.transform_metadata import (
+            DiscretizationElementAxisTag,
+            DiscretizationDOFAxisTag)
+        remote_bdry_zeros = tag_axes(
+            actx, {
+                0: DiscretizationElementAxisTag(),
+                1: DiscretizationDOFAxisTag()},
+            dcoll._inter_part_connections[
+                remote_part_id, local_part_id].from_discr.zeros(actx))
+
+        remote_bdry_data_template = _replace_dof_arrays(
+            local_bdry_data, remote_bdry_zeros)
+
+        rank_bdry_communicators.append(
+            rbc_class(actx, dcoll,
+                local_part_id=local_part_id,
+                remote_part_id=remote_part_id,
+                local_bdry_data=local_bdry_data,
+                remote_bdry_data_template=remote_bdry_data_template,
+                comm_tag=comm_tag))
+
+    return [rbc.finish() for rbc in rank_bdry_communicators]
+
+# }}}
+
+
+# {{{ cross_rank_inter_volume_trace_pairs
+
+def cross_rank_inter_volume_trace_pairs(
+        dcoll: DiscretizationCollection,
+        pairwise_volume_data: Mapping[
+            Tuple[DOFDesc, DOFDesc],
+            Tuple[ArrayOrContainer, ArrayOrContainer]],
+        *, comm_tag: Hashable = None,
+        ) -> Mapping[
+            Tuple[DOFDesc, DOFDesc],
+            List[TracePair]]:
+    # FIXME: Should this interface take in boundary data instead?
+    # TODO: Docs
+    r"""Get a :class:`list` of *ary* trace pairs for each partition boundary.
+
+    :arg comm_tag: a hashable object used to match sent and received data
+        across ranks. Communication will only match if both endpoints specify
+        objects that compare equal. A generalization of MPI communication
+        tags to arbitary, potentially composite objects.
+
+    :returns: a :class:`list` of :class:`TracePair` objects.
+    """
+    # {{{ process arguments
+
+    for vol_dd_pair, vol_data_pair in pairwise_volume_data.items():
+        for vol_dd in vol_dd_pair:
+            if not isinstance(vol_dd.domain_tag, VolumeDomainTag):
+                raise ValueError(
+                    "pairwise_volume_data keys must describe volumes, "
+                    f"got '{vol_dd}'")
+            if vol_dd.discretization_tag != DISCR_TAG_BASE:
+                raise ValueError(
+                    "expected base-discretized DOFDesc in pairwise_volume_data, "
+                    f"got '{vol_dd}'")
+        # FIXME: This check could probably be made more robust
+        if type(vol_data_pair[0]) != type(vol_data_pair[1]):  # noqa: E721
+            raise ValueError("heterogeneous inter-volume data not supported.")
+
+    # }}}
+
+    if dcoll.mpi_communicator is None:
+        return {}
+
+    rank = dcoll.mpi_communicator.Get_rank()
+
+    for vol_data_pair in pairwise_volume_data.values():
+        for vol_data in vol_data_pair:
+            actx = get_container_context_recursively_opt(vol_data)
+            if actx is not None:
+                break
+        if actx is not None:
+            break
+
+    def get_remote_connected_parts(local_vol_dd, remote_vol_dd):
+        connected_part_ids = connected_parts(
+            dcoll, self_volume_tag=local_vol_dd.domain_tag.tag,
+            other_volume_tag=remote_vol_dd.domain_tag.tag)
+        return [
+            part_id
+            for part_id in connected_part_ids
+            if part_id.rank != rank]
+
+    if actx is None:
+        # NOTE: Assumes that the same number is passed on every rank for a
+        # given volume
+        return {
+            (remote_vol_dd, local_vol_dd): [
+                TracePair(
+                    local_vol_dd.trace(BTAG_PARTITION(remote_part_id)),
+                    interior=local_vol_ary, exterior=remote_vol_ary)
+                for remote_part_id in get_remote_connected_parts(
+                    local_vol_dd, remote_vol_dd)]
+            for (remote_vol_dd, local_vol_dd), (remote_vol_ary, local_vol_ary)
+            in pairwise_volume_data.items()}
+
+    from grudge.array_context import MPIPytatoArrayContextBase
+
+    if isinstance(actx, MPIPytatoArrayContextBase):
+        rbc_class = _RankBoundaryCommunicationLazy
+    else:
+        rbc_class = _RankBoundaryCommunicationEager
 
-    # Complete send/receives and return communicated data
-    return [rc.finish() for rc in rank_bdry_communicators]
+    rank_bdry_communicators = {}
+
+    for vol_dd_pair, vol_data_pair in pairwise_volume_data.items():
+        directional_volume_data = {
+            (vol_dd_pair[0], vol_dd_pair[1]): (vol_data_pair[0], vol_data_pair[1]),
+            (vol_dd_pair[1], vol_dd_pair[0]): (vol_data_pair[1], vol_data_pair[0])}
+
+        for dd_pair, data_pair in directional_volume_data.items():
+            other_vol_dd, self_vol_dd = dd_pair
+            other_vol_data, self_vol_data = data_pair
+
+            self_part_id = PartID(self_vol_dd.domain_tag.tag, rank)
+            other_part_ids = get_remote_connected_parts(self_vol_dd, other_vol_dd)
+
+            rbcs = []
+
+            for other_part_id in other_part_ids:
+                self_bdry_dd = self_vol_dd.trace(BTAG_PARTITION(other_part_id))
+                self_bdry_data = project(
+                    dcoll, self_vol_dd, self_bdry_dd, self_vol_data)
+
+                from arraycontext import tag_axes
+                from meshmode.transform_metadata import (
+                    DiscretizationElementAxisTag,
+                    DiscretizationDOFAxisTag)
+                other_bdry_zeros = tag_axes(
+                    actx, {
+                        0: DiscretizationElementAxisTag(),
+                        1: DiscretizationDOFAxisTag()},
+                    dcoll._inter_part_connections[
+                        other_part_id, self_part_id].from_discr.zeros(actx))
+
+                other_bdry_data_template = _replace_dof_arrays(
+                    other_vol_data, other_bdry_zeros)
+
+                rbcs.append(
+                    rbc_class(actx, dcoll,
+                        local_part_id=self_part_id,
+                        remote_part_id=other_part_id,
+                        local_bdry_data=self_bdry_data,
+                        remote_bdry_data_template=other_bdry_data_template,
+                        comm_tag=comm_tag))
+
+            rank_bdry_communicators[other_vol_dd, self_vol_dd] = rbcs
+
+    return {
+        directional_vol_dd_pair: [rbc.finish() for rbc in rbcs]
+        for directional_vol_dd_pair, rbcs in rank_bdry_communicators.items()}
 
 # }}}
 
diff --git a/test/mesh_data.py b/test/mesh_data.py
index 6203a800a..3e4454c7a 100644
--- a/test/mesh_data.py
+++ b/test/mesh_data.py
@@ -7,6 +7,7 @@
 import meshmode.mesh.generation as mgen
 from meshmode.mesh import Mesh
 from meshmode.mesh.io import read_gmsh
+from meshmode.mesh import TensorProductElementGroup
 
 
 class MeshBuilder(ABC):
@@ -114,18 +115,28 @@ def get_mesh(self, resolution, mesh_order=4):
 class _BoxMeshBuilderBase(MeshBuilder):
     resolutions: ClassVar[Sequence[Hashable]] = [4, 8, 16]
     mesh_order = 1
-
+    group_cls = None
     a = (-0.5, -0.5, -0.5)
     b = (+0.5, +0.5, +0.5)
+    tpe: bool = False
 
-    def get_mesh(self, resolution, mesh_order=4):
-        if not isinstance(resolution, list | tuple):
-            resolution = (resolution,) * self.ambient_dim
+    def __init__(self, tpe=False, a=(-0.5, -0.5, -0.5),
+                 b=(0.5, 0.5, 0.5)):
+        self.tpe = tpe
+        self.a = a
+        self.b = b
 
+    def get_mesh(self, resolution, mesh_order=None):
+        if mesh_order is None:
+            mesh_order = self.mesh_order
+        if not isinstance(resolution, (list, tuple)):
+            resolution = (resolution,) * self.ambient_dim
+        from meshmode.mesh import TensorProductElementGroup
+        group_cls = TensorProductElementGroup if self.tpe else None
         return mgen.generate_regular_rect_mesh(
                 a=self.a, b=self.b,
                 nelements_per_axis=resolution,
-                order=mesh_order)
+                group_cls=group_cls, order=mesh_order)
 
 
 class BoxMeshBuilder1D(_BoxMeshBuilderBase):
@@ -137,7 +148,7 @@ class BoxMeshBuilder2D(_BoxMeshBuilderBase):
 
 
 class BoxMeshBuilder3D(_BoxMeshBuilderBase):
-    ambient_dim = 2
+    ambient_dim = 3
 
 
 class WarpedRectMeshBuilder(MeshBuilder):
diff --git a/test/test_dt_utils.py b/test/test_dt_utils.py
index cf3ac2021..be4592b87 100644
--- a/test/test_dt_utils.py
+++ b/test/test_dt_utils.py
@@ -38,6 +38,13 @@
          PytestPytatoPyOpenCLArrayContextFactory,
          PytestNumpyArrayContextFactory])
 
+from grudge import make_discretization_collection
+#
+# =======
+# from grudge import DiscretizationCollection
+# >>>>>>> main
+
+import grudge.op as op
 import logging
 
 import mesh_data
@@ -52,7 +59,8 @@
 
 
 @pytest.mark.parametrize("name", ["interval", "box2d", "box3d"])
-def test_geometric_factors_regular_refinement(actx_factory, name):
+@pytest.mark.parametrize("tpe", [False, True])
+def test_geometric_factors_regular_refinement(actx_factory, name, tpe):
     from grudge.dt_utils import dt_geometric_factors
 
     actx = actx_factory()
@@ -60,22 +68,35 @@ def test_geometric_factors_regular_refinement(actx_factory, name):
     # {{{ cases
 
     if name == "interval":
+        if tpe:
+            pytest.skip()
         builder = mesh_data.BoxMeshBuilder1D()
     elif name == "box2d":
-        builder = mesh_data.BoxMeshBuilder2D()
+        builder = mesh_data.BoxMeshBuilder2D(tpe=tpe)
     elif name == "box3d":
-        builder = mesh_data.BoxMeshBuilder3D()
+        builder = mesh_data.BoxMeshBuilder3D(tpe=tpe)
+
     else:
         raise ValueError(f"unknown geometry name: {name}")
 
     # }}}
 
-    order = 4
+    from meshmode.discretization.poly_element import \
+        LegendreGaussLobattoTensorProductGroupFactory as Lgl
+    test_order = 4
+    order = None if tpe else test_order
+    from grudge.dof_desc import DISCR_TAG_BASE
+    dtag_to_grp_fac = {
+        DISCR_TAG_BASE: Lgl(test_order)
+    } if tpe else None
 
     min_factors = []
     for resolution in builder.resolutions:
-        mesh = builder.get_mesh(resolution, order)
-        dcoll = make_discretization_collection(actx, mesh, order=order)
+        mesh = builder.get_mesh(resolution, test_order)
+        dcoll = make_discretization_collection(
+            actx, mesh, order=order,
+            discr_tag_to_group_factory=dtag_to_grp_fac)
+
         min_factors.append(
             actx.to_numpy(
                 op.nodal_min(dcoll, "vol", actx.thaw(dt_geometric_factors(dcoll))))
@@ -88,9 +109,12 @@ def test_geometric_factors_regular_refinement(actx_factory, name):
     assert np.all(np.isclose(ratios, 2))
 
     # Make sure it works with empty meshes
-    mesh = builder.get_mesh(0)
-    dcoll = make_discretization_collection(actx, mesh, order=order)
-    factors = actx.thaw(dt_geometric_factors(dcoll))  # noqa: F841
+    # if not tpe:
+    #    mesh = builder.get_mesh(0, order)
+    #    dcoll = make_discretization_collection(
+    #        actx, mesh, order=order,
+    #        discr_tag_to_group_factory=dtag_to_grp_fac)
+    #    factors = actx.thaw(dt_geometric_factors(dcoll))  # noqa: F841
 
 
 @pytest.mark.parametrize("name", ["interval", "box2d", "box3d"])
@@ -152,19 +176,44 @@ def rhs(x):
 
 @pytest.mark.parametrize("dim", [1, 2])
 @pytest.mark.parametrize("degree", [2, 4])
-def test_wave_dt_estimate(actx_factory, dim, degree, visualize=False):
+@pytest.mark.parametrize("tpe", [False, True])
+def test_wave_dt_estimate(actx_factory, dim, degree, tpe, visualize=False):
+
     actx = actx_factory()
 
+    if tpe:
+        if dim == 1:
+            pytest.skip()
+
+    # {{{ cases
+
+    from meshmode.mesh import TensorProductElementGroup
+    group_cls = TensorProductElementGroup if tpe else None
+
     import meshmode.mesh.generation as mgen
 
     a = [0, 0, 0]
     b = [1, 1, 1]
     mesh = mgen.generate_regular_rect_mesh(
             a=a[:dim], b=b[:dim],
-            nelements_per_axis=(3,)*dim)
+            nelements_per_axis=(3,)*dim,
+            group_cls=group_cls)
+
     assert mesh.dim == dim
 
-    dcoll = make_discretization_collection(actx, mesh, order=degree)
+    from meshmode.discretization.poly_element import \
+        LegendreGaussLobattoTensorProductGroupFactory as Lgl
+
+    from grudge.dof_desc import DISCR_TAG_BASE
+    order = degree
+    dtag_to_grp_fac = None
+    if tpe:
+        order = None
+        dtag_to_grp_fac = {
+            DISCR_TAG_BASE: Lgl(degree)
+        }
+    dcoll = make_discretization_collection(
+        actx, mesh, order=order, discr_tag_to_group_factory=dtag_to_grp_fac)
 
     from grudge.models.wave import WeakWaveOperator
     wave_op = WeakWaveOperator(dcoll, c=1)
@@ -189,6 +238,7 @@ def test_wave_dt_estimate(actx_factory, dim, degree, visualize=False):
         RK4MethodBuilder.output_coeffs))
 
     dt_est = actx.to_numpy(wave_op.estimate_rk4_timestep(actx, dcoll))
+    print(f"{dt_est=}")
 
     if visualize:
         re, im = np.mgrid[-4:1:30j, -5:5:30j]
@@ -218,9 +268,73 @@ def test_wave_dt_estimate(actx_factory, dim, degree, visualize=False):
     assert not stable_dt_factors or max(stable_dt_factors) < 1.5, stable_dt_factors
 
 
+@pytest.mark.parametrize("dim", [2])
+@pytest.mark.parametrize("degree", [1, 2])
+@pytest.mark.parametrize("tpe", [True])
+def test_charlen(actx_factory, dim, degree, tpe, visualize=False):
+
+    from grudge.dt_utils import (
+        dt_geometric_factors,
+        dt_non_geometric_factors,
+        h_min_from_volume,
+        h_max_from_volume
+    )
+    actx = actx_factory()
+
+    if tpe:
+        if dim == 1:
+            pytest.skip()
+
+    # {{{ cases
+
+    from meshmode.mesh import TensorProductElementGroup
+    group_cls = TensorProductElementGroup if tpe else None
+
+    import meshmode.mesh.generation as mgen
+
+    a = [0, 0, 0]
+    b = [1, 1, 1]
+    nels1d = [2, 3, 4]
+
+    for nel1d in nels1d:
+        print(f"{dim=},{nel1d=},{degree=}")
+        mesh = mgen.generate_regular_rect_mesh(
+            a=a[:dim], b=b[:dim],
+            nelements_per_axis=(nel1d,)*dim,
+            group_cls=group_cls)
+        print(f"{mesh=}")
+        assert mesh.dim == dim
+
+        from meshmode.discretization.poly_element import \
+            LegendreGaussLobattoTensorProductGroupFactory as Lgl
+
+        from grudge.dof_desc import DISCR_TAG_BASE
+        order = degree
+        dtag_to_grp_fac = None
+        if tpe:
+            order = None
+            dtag_to_grp_fac = {
+                DISCR_TAG_BASE: Lgl(degree)
+            }
+
+        dcoll = make_discretization_collection(actx, mesh, order=order,
+                                         discr_tag_to_group_factory=dtag_to_grp_fac)
+
+        h_min = actx.to_numpy(h_min_from_volume(dcoll))
+        h_max = actx.to_numpy(h_max_from_volume(dcoll))
+        gfac = actx.to_numpy(dt_geometric_factors(dcoll))
+        ngfac = dt_non_geometric_factors(dcoll)
+
+        print(f"{h_min=}")
+        print(f"{h_max=}")
+        print(f"{gfac=}")
+        print(f"{ngfac=}")
+
+    # assert False
+
+
 # You can test individual routines by typing
 # $ python test_grudge.py 'test_routine()'
-
 if __name__ == "__main__":
     import sys
     if len(sys.argv) > 1:
diff --git a/test/test_euler_model.py b/test/test_euler_model.py
index 6e9e9c1d2..ec7c77da1 100644
--- a/test/test_euler_model.py
+++ b/test/test_euler_model.py
@@ -26,21 +26,27 @@
 
 import pytest
 
+from grudge.array_context import \
+    PytestPyOpenCLArrayContextFactory, PytestPytatoPyOpenCLArrayContextFactory
 from arraycontext import (
-    pytest_generate_tests_for_array_contexts,
+    pytest_generate_tests_for_array_contexts, thaw,
 )
 
 from grudge import op
-from grudge.array_context import PytestPyOpenCLArrayContextFactory
-
+import numpy as np
 
 logger = logging.getLogger(__name__)
 pytest_generate_tests = pytest_generate_tests_for_array_contexts(
-        [PytestPyOpenCLArrayContextFactory])
+        [PytestPyOpenCLArrayContextFactory,
+         PytestPytatoPyOpenCLArrayContextFactory])
+
+
+logger = logging.getLogger(__name__)
 
 
 @pytest.mark.parametrize("order", [1, 2, 3])
-def test_euler_vortex_convergence(actx_factory, order):
+@pytest.mark.parametrize("esdg", [False, True])
+def test_euler_vortex_convergence(actx_factory, order, esdg):
 
     from meshmode.discretization.poly_element import (
         QuadratureSimplexGroupFactory,
@@ -52,12 +58,26 @@ def test_euler_vortex_convergence(actx_factory, order):
     from grudge.discretization import make_discretization_collection
     from grudge.dof_desc import DISCR_TAG_BASE, DISCR_TAG_QUAD
     from grudge.dt_utils import h_max_from_volume
-    from grudge.models.euler import EulerOperator, vortex_initial_condition
+    from grudge.models.euler import (
+        vortex_initial_condition,
+        EulerOperator,
+        EntropyStableEulerOperator
+    )
     from grudge.shortcuts import rk4_step
 
     actx = actx_factory()
     eoc_rec = EOCRecorder()
     quad_tag = DISCR_TAG_QUAD
+    if esdg:
+        operator_cls = EntropyStableEulerOperator
+    else:
+        operator_cls = EulerOperator
+
+    if esdg and not actx.supports_nonscalar_broadcasting:
+        pytest.xfail(
+            "Flux-differencing computations requires an array context "
+            "that supports non-scalar broadcasting"
+        )
 
     for resolution in [8, 16, 32]:
 
@@ -83,7 +103,7 @@ def test_euler_vortex_convergence(actx_factory, order):
 
         # }}}
 
-        euler_operator = EulerOperator(
+        euler_operator = operator_cls(
             dcoll,
             flux_type="lf",
             gamma=1.4,
@@ -133,8 +153,55 @@ def rhs(t, q, euler_operator=euler_operator):
 
     logger.info("\n%s", eoc_rec.pretty_print(abscissa_label="h",
                                              error_label="L2 Error"))
+    assert eoc_rec.order_estimate() >= order + 0.5
+
+
+def test_entropy_variable_roundtrip(actx_factory):
+    from grudge.models.euler import (
+        entropy_to_conservative_vars,
+        conservative_to_entropy_vars,
+        vortex_initial_condition
+    )
+
+    actx = actx_factory()
+    gamma = 1.4  # Adiabatic expansion factor for single-gas Euler model
+
+    from meshmode.mesh.generation import generate_regular_rect_mesh
+
+    dim = 2
+    res = 5
+    mesh = generate_regular_rect_mesh(
+        a=(0, -5),
+        b=(20, 5),
+        nelements_per_axis=(2*res, res),
+        periodic=(True, True))
+
+    from meshmode.discretization.poly_element import \
+        default_simplex_group_factory
+    from grudge import DiscretizationCollection
+    from grudge.dof_desc import DISCR_TAG_BASE
+
+    order = 3
+    dcoll = DiscretizationCollection(
+        actx, mesh,
+        discr_tag_to_group_factory={
+            DISCR_TAG_BASE: default_simplex_group_factory(dim, order)
+        }
+    )
+
+    # Fields in conserved variables
+    fields = vortex_initial_condition(thaw(dcoll.nodes(), actx))
+
+    # Map back and forth between entropy and conserved vars
+    fields_ev = conservative_to_entropy_vars(fields, gamma)
+    ev_fields_to_cons = entropy_to_conservative_vars(fields_ev, gamma)
+    residual = ev_fields_to_cons - fields
+
+    assert actx.to_numpy(op.norm(dcoll, residual.mass, np.inf)) < 1e-13
+    assert actx.to_numpy(op.norm(dcoll, residual.energy, np.inf)) < 1e-13
     assert (
-        eoc_rec.order_estimate() >= order + 0.5
+        actx.to_numpy(op.norm(dcoll, residual.momentum[i], np.inf)) < 1e-13
+        for i in range(dim)
     )
 
 
diff --git a/test/test_grudge.py b/test/test_grudge.py
index b0849310b..617171725 100644
--- a/test/test_grudge.py
+++ b/test/test_grudge.py
@@ -53,10 +53,10 @@
 
 # {{{ mass operator trig integration
 
+@pytest.mark.parametrize("tpe", [True, False])
 @pytest.mark.parametrize("ambient_dim", [1, 2, 3])
-@pytest.mark.parametrize("discr_tag", [dof_desc.DISCR_TAG_BASE,
-                                       dof_desc.DISCR_TAG_QUAD])
-def test_mass_mat_trig(actx_factory, ambient_dim, discr_tag):
+@pytest.mark.parametrize("use_overint", [False, True])
+def test_mass_mat_trig(actx_factory, tpe, ambient_dim, use_overint):
     """Check the integral of some trig functions on an interval using the mass
     matrix.
     """
@@ -67,23 +67,27 @@ def test_mass_mat_trig(actx_factory, ambient_dim, discr_tag):
 
     a = -4.0 * np.pi
     b = +9.0 * np.pi
-    true_integral = 13*np.pi/2 * (b - a)**(ambient_dim - 1)
 
+    true_integral = 13*np.pi/2 * (b - a)**(ambient_dim - 1)
+    discr_tag = dof_desc.DISCR_TAG_QUAD if use_overint else dof_desc.DISCR_TAG_BASE
     from meshmode.discretization.poly_element import QuadratureSimplexGroupFactory
 
     dd_quad = dof_desc.DOFDesc(dof_desc.DTAG_VOLUME_ALL, discr_tag)
+    discr_order = order
     if discr_tag is dof_desc.DISCR_TAG_BASE:
         discr_tag_to_group_factory = {}
     else:
+        discr_order = None
         discr_tag_to_group_factory = {
-            discr_tag: QuadratureSimplexGroupFactory(order=2*order)
+            dof_desc.DISCR_TAG_BASE: InterpolatoryEdgeClusteredGroupFactory(order),
+            dof_desc.DISCR_TAG_QUAD: QuadratureGroupFactory(order=2*order)
         }
 
     mesh = mgen.generate_regular_rect_mesh(
             a=(a,)*ambient_dim, b=(b,)*ambient_dim,
             nelements_per_axis=(nel_1d,)*ambient_dim, order=1)
     dcoll = make_discretization_collection(
-        actx, mesh, order=order,
+        actx, mesh, order=discr_order,
         discr_tag_to_group_factory=discr_tag_to_group_factory
     )
 
@@ -155,26 +159,35 @@ def _spheroid_surface_area(radius, aspect_ratio):
 
 
 @pytest.mark.parametrize("name", [
-    "2-1-ellipse", "spheroid", "box2d", "box3d"
+    "box2d-tpe", "box3d-tpe", "box2d", "box3d", "2-1-ellipse", "spheroid",
     ])
-def test_mass_surface_area(actx_factory, name):
+@pytest.mark.parametrize("oi", [False, True])
+def test_mass_surface_area(actx_factory, name, oi):
+    from grudge.dof_desc import DISCR_TAG_BASE, DISCR_TAG_QUAD, as_dofdesc
     actx = actx_factory()
+    qtag = DISCR_TAG_QUAD if oi else DISCR_TAG_BASE
+    vol_dd_base = as_dofdesc(dof_desc.DTAG_VOLUME_ALL)
+    vol_dd_quad = vol_dd_base.with_discr_tag(qtag)
 
     # {{{ cases
 
     order = 4
-
+    tpe = name.endswith("-tpe")
     if name == "2-1-ellipse":
+        if tpe:
+            pytest.skip()
         builder = mesh_data.EllipseMeshBuilder(radius=3.1, aspect_ratio=2.0)
         surface_area = _ellipse_surface_area(builder.radius, builder.aspect_ratio)
     elif name == "spheroid":
+        if tpe:
+            pytest.skip()
         builder = mesh_data.SpheroidMeshBuilder()
         surface_area = _spheroid_surface_area(builder.radius, builder.aspect_ratio)
-    elif name == "box2d":
-        builder = mesh_data.BoxMeshBuilder2D()
+    elif name.startswith("box2d"):
+        builder = mesh_data.BoxMeshBuilder2D(tpe=tpe)
         surface_area = 1.0
-    elif name == "box3d":
-        builder = mesh_data.BoxMeshBuilder3D()
+    elif name.startswith("box3d"):
+        builder = mesh_data.BoxMeshBuilder3D(tpe=tpe)
         surface_area = 1.0
     else:
         raise ValueError(f"unknown geometry name: {name}")
@@ -189,17 +202,24 @@ def test_mass_surface_area(actx_factory, name):
 
     for resolution in builder.resolutions:
         mesh = builder.get_mesh(resolution, order)
-        dcoll = make_discretization_collection(actx, mesh, order=order)
-        volume_discr = dcoll.discr_from_dd(dof_desc.DD_VOLUME_ALL)
+        dcoll = make_discretization_collection(
+                   actx, mesh,
+                   discr_tag_to_group_factory={
+                       DISCR_TAG_BASE: InterpolatoryEdgeClusteredGroupFactory(order),
+                       DISCR_TAG_QUAD: QuadratureGroupFactory(3 * order)
+                   })
+        volume_discr = dcoll.discr_from_dd(vol_dd_quad)
+        nodes = actx.thaw(volume_discr.nodes())
 
         logger.info("ndofs:     %d", volume_discr.ndofs)
         logger.info("nelements: %d", volume_discr.mesh.nelements)
 
         # {{{ compute surface area
 
-        dd = dof_desc.DD_VOLUME_ALL
+        dd = vol_dd_quad
         ones_volm = volume_discr.zeros(actx) + 1
         approx_surface_area = actx.to_numpy(op.integral(dcoll, dd, ones_volm))
+        print(f"approx_surface_area({name})={approx_surface_area}")
 
         logger.info(
             f"surface: got {approx_surface_area:.5e} / expected {surface_area:.5e}")  # noqa: G004
@@ -220,6 +240,154 @@ def test_mass_surface_area(actx_factory, name):
 
     assert eoc.max_error() < 3e-13 or eoc.order_estimate() > order
 
+
+@pytest.mark.parametrize("name", [
+    "interval", "box2d", "box2d-tpe", "box3d", "box3d-tpe"
+    ])
+# @pytest.mark.parametrize("discr_order", [1, 2, 3, 4, 5])
+def test_correctness_of_quadrature(actx_factory, name):
+    from grudge.dof_desc import DISCR_TAG_BASE, DISCR_TAG_QUAD, as_dofdesc
+    actx = actx_factory()
+    vol_dd_base = as_dofdesc(dof_desc.DTAG_VOLUME_ALL)
+    vol_dd_quad = vol_dd_base.with_discr_tag(DISCR_TAG_QUAD)
+
+    # {{{ cases
+
+    tol = 5e-13
+    # discr_order = 1
+    dim = None
+    mesh_order = 1
+
+    tpe = name.endswith("-tpe")
+    if name.startswith("box2d"):
+        builder = mesh_data.BoxMeshBuilder2D(tpe=tpe,
+                                             a=(0, 0),
+                                             b=(2.0, 2.0))
+        dim = 2
+    elif name.startswith("box3d"):
+        builder = mesh_data.BoxMeshBuilder3D(tpe=tpe,
+                                             a=(0, 0, 0),
+                                             b=(2.0, 2.0, 2.0))
+        dim = 3
+    elif name == "interval":
+        builder = mesh_data.BoxMeshBuilder1D(tpe=False, a=(0.0,),
+                                             b=(2.0,))
+        dim = 1.0
+    else:
+        raise ValueError(f"unknown geometry name: {name}")
+    exact_volume = 2.0**dim
+    print(f"Domain: {name} ({dim}d), {exact_volume=}")
+    print(f"======================================================")
+
+    # }}}
+
+    # {{{ convergence
+
+    from pytools.convergence import EOCRecorder
+    for discr_order in range(1, 8):
+        print(f"   {discr_order=}")
+        print("   --------------------")
+        report_discr = True
+        for field_order in range(1, max(2*discr_order+1, 8)):
+            report_field_order = True
+            eoc_base = EOCRecorder()
+            eoc_quad = EOCRecorder()
+            for resolution in builder.resolutions:
+                mesh = builder.get_mesh(resolution, mesh_order)
+                dcoll = make_discretization_collection(
+                    actx, mesh,
+                    discr_tag_to_group_factory={
+                        DISCR_TAG_BASE:
+                        InterpolatoryEdgeClusteredGroupFactory(discr_order),
+                        DISCR_TAG_QUAD: QuadratureGroupFactory(discr_order)
+                    })
+                vol_discr_base = dcoll.discr_from_dd(vol_dd_base)
+                vol_discr_quad = dcoll.discr_from_dd(vol_dd_quad)
+                if report_discr:
+                    nelem = vol_discr_base.mesh.nelements
+                    ndofs_base = vol_discr_base.ndofs
+                    ndofs_quad = vol_discr_quad.ndofs
+                    dofs_per_el_base = ndofs_base/nelem
+                    dofs_per_el_quad = ndofs_quad/nelem
+                    print(f"   - {dofs_per_el_base=}, {dofs_per_el_quad=}")
+                    report_discr = False
+                if report_field_order:
+                    print(f"      - {field_order=}")
+                    print("      - - - - - - - - - -")
+                    report_field_order = False
+                nodes_base = actx.thaw(vol_discr_base.nodes())
+                nodes_quad = actx.thaw(vol_discr_quad.nodes())
+                ones_base = 0*nodes_base[0] + 1
+                ones_quad = 0*nodes_quad[0] + 1
+
+                approx_vol_base = \
+                    actx.to_numpy(op.integral(dcoll, vol_dd_base, ones_base))
+                approx_vol_quad = \
+                    actx.to_numpy(op.integral(dcoll, vol_dd_quad, ones_quad))
+                err_vol_base = abs(approx_vol_base - exact_volume)/exact_volume
+                err_vol_quad = abs(approx_vol_quad - exact_volume)/exact_volume
+
+                logger.info(
+                    f"Name: {name} ({dim}d)\n"
+                    f"Exact volume: {exact_volume}\n"
+                    f"volume base: got {approx_vol_base:.12e}\n"  # noqa: G004
+                    f"volume quad: got {approx_vol_quad:.12e}\n")  # noqa: G004
+
+                # Quadrature should get exact volume for all discr (p >= 1)
+                assert err_vol_base < tol
+                assert err_vol_quad < tol
+
+                field_base = nodes_base[0]**field_order
+                field_quad = nodes_quad[0]**field_order
+                if dim > 1:
+                    field_base = sum(nodes_base[i]**field_order
+                                     for i in range(dim))
+                    field_quad = sum(nodes_quad[i]**field_order
+                                     for i in range(dim))
+                ofac = 1.0/2**field_order
+                field_base = ofac * field_base * (field_order + 1.0)
+                field_quad = ofac * field_quad * (field_order + 1.0)
+                exact_integral = dim*exact_volume
+
+                integral_base =  \
+                    actx.to_numpy(op.integral(dcoll, vol_dd_base, field_base))
+                integral_quad =  \
+                    actx.to_numpy(op.integral(dcoll, vol_dd_quad, field_quad))
+                err_base = \
+                    abs(integral_base - exact_integral)/exact_integral
+                err_quad = \
+                    abs(integral_quad - exact_integral)/exact_integral
+
+                if field_order <= discr_order:
+                    assert err_base < tol
+                    assert err_quad < tol
+
+                # compute max element size
+                from grudge.dt_utils import h_max_from_volume
+                h_max = h_max_from_volume(dcoll)
+
+                eoc_base.add_data_point(actx.to_numpy(h_max), err_base)
+                eoc_quad.add_data_point(actx.to_numpy(h_max), err_quad)
+
+            logger.info("volume error(base)\n%s", str(eoc_base))
+            logger.info("volume error(quad)\n%s", str(eoc_quad))
+            print("---- base -----")
+            print(f"{eoc_base.pretty_print()}")
+            print("---- quad -----")
+            print(f"{eoc_quad.pretty_print()}")
+
+            # Sanity check here: *must* be exact if discr_order is sufficient
+            if discr_order >= field_order:
+                assert eoc_base.max_error() < tol
+                assert eoc_quad.max_error() < tol
+            else:
+                if eoc_base.max_error() > tol:  # *can* be exact(ish) otherwise
+                    assert eoc_base.order_estimate() > discr_order
+                if eoc_quad.max_error() > tol:
+                    assert eoc_quad.order_estimate() > discr_order
+
+        print("-=-=-=-=-=-=-=-=-=-=-=-=-=--=-=-=-")
+    print("==============================")
 # }}}
 
 
@@ -974,13 +1142,14 @@ def rhs(t, w, maxwell_operator=maxwell_operator):
 # {{{ models: variable coefficient advection oversampling
 
 @pytest.mark.parametrize("order", [2, 3, 4])
-def test_improvement_quadrature(actx_factory, order):
+@pytest.mark.parametrize("tpe", [False, True])
+def test_improvement_quadrature(actx_factory, order, tpe):
     """Test whether quadrature improves things and converges"""
-    from meshmode.discretization.poly_element import QuadratureSimplexGroupFactory
-    from meshmode.mesh import BTAG_ALL
-    from pytools.convergence import EOCRecorder
-
     from grudge.models.advection import VariableCoefficientAdvectionOperator
+    from pytools.convergence import EOCRecorder
+    # from meshmode.discretization.poly_element \
+    #    import QuadratureSimplexGroupFactory
+    from meshmode.mesh import BTAG_ALL
 
     actx = actx_factory()
 
@@ -1001,23 +1170,30 @@ def conv_test(descr, use_quad):
         else:
             qtag = None
 
+        group_cls = TensorProductElementGroup if tpe else None
+        qfac = 2 if tpe else 4
         ns = [20, 25]
+        discr_order = order
         for n in ns:
             mesh = mgen.generate_regular_rect_mesh(
                 a=(-0.5,)*dims,
                 b=(0.5,)*dims,
                 nelements_per_axis=(n,)*dims,
-                order=order)
+                order=order, group_cls=group_cls)
 
             if use_quad:
                 discr_tag_to_group_factory = {
-                    qtag: QuadratureSimplexGroupFactory(order=4*order)
+                    dof_desc.DISCR_TAG_BASE:
+                    InterpolatoryEdgeClusteredGroupFactory(order),
+                    dof_desc.DISCR_TAG_QUAD:
+                    QuadratureGroupFactory(order=qfac*order)
                 }
+                discr_order = None
             else:
                 discr_tag_to_group_factory = {}
 
             dcoll = make_discretization_collection(
-                actx, mesh, order=order,
+                actx, mesh, order=discr_order,
                 discr_tag_to_group_factory=discr_tag_to_group_factory
             )
 
@@ -1049,9 +1225,12 @@ def zero_inflow(dtag, t=0, dcoll=dcoll):
     eoc, errs = conv_test("no quadrature", False)
     q_eoc, q_errs = conv_test("with quadrature", True)
 
-    assert q_eoc > eoc
     assert (q_errs < errs).all()
     assert q_eoc > order - 0.1
+    # Fails for all tensor-product element types
+    if not tpe:
+        assert q_eoc > eoc
+
 
 # }}}
 
diff --git a/test/test_op.py b/test/test_op.py
index 17f49a074..7359eadf8 100644
--- a/test/test_op.py
+++ b/test/test_op.py
@@ -19,14 +19,17 @@
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
-
-
 import logging
 
 import numpy as np
 import pytest
 
+from meshmode.mesh import (
+    SimplexElementGroup,
+    TensorProductElementGroup
+)
 import meshmode.mesh.generation as mgen
+from meshmode.mesh.processing import affine_map
 from arraycontext import pytest_generate_tests_for_array_contexts
 from meshmode.discretization.poly_element import (
     InterpolatoryEdgeClusteredGroupFactory,
@@ -35,7 +38,11 @@
 from meshmode.mesh import BTAG_ALL
 from pytools.obj_array import make_obj_array
 
-from grudge import geometry, op
+from grudge import (
+    geometry as geo,
+    op,
+    DiscretizationCollection,
+)
 from grudge.array_context import PytestPyOpenCLArrayContextFactory
 from grudge.discretization import make_discretization_collection
 from grudge.dof_desc import (
@@ -57,6 +64,10 @@
 
 # {{{ gradient
 
+@pytest.mark.parametrize("group_cls", [
+    SimplexElementGroup,
+    TensorProductElementGroup
+])
 @pytest.mark.parametrize("form", ["strong", "weak", "weak-overint"])
 @pytest.mark.parametrize("dim", [1, 2, 3])
 @pytest.mark.parametrize("order", [2, 3])
@@ -66,30 +77,68 @@
     (True, False),
     (True, True)
     ])
-def test_gradient(actx_factory, form, dim, order, vectorize, nested,
-                  warp_mesh, visualize=False):
+def test_gradient(actx_factory, form, dim, order, warp_mesh, vectorize, nested,
+                  group_cls, visualize=False):
     actx = actx_factory()
 
     from pytools.convergence import EOCRecorder
     eoc_rec = EOCRecorder()
 
-    for n in [8, 12, 16] if warp_mesh else [4, 6, 8]:
+    if group_cls == TensorProductElementGroup:
+        if dim == 1:
+            pytest.skip("Skipping 1D for tensor-product elements.")
+        if warp_mesh:
+            pytest.skip("Skipping warped mesh for tensor-product elements.")
+
+    mesh_n = [8, 12, 16] if warp_mesh else [4, 6, 8]
+            
+    for n in mesh_n:
+
         if warp_mesh:
+            if group_cls == TensorProductElementGroup:
+                pytest.skip("Skipping warped mesh for TPE.")
             if dim == 1:
                 pytest.skip("warped mesh in 1D not implemented")
             mesh = mgen.generate_warped_rect_mesh(
                           dim=dim, order=order, nelements_side=n)
         else:
             mesh = mgen.generate_regular_rect_mesh(
-                    a=(-1,)*dim, b=(1,)*dim,
-                    nelements_per_axis=(n,)*dim)
+                a=(-1,)*dim, b=(1,)*dim,
+                nelements_per_axis=(n,)*dim,
+                group_cls=group_cls)
+
+        # if group_cls is TensorProductElementGroup:
 
+        #    import grudge.dof_desc as dd
+        #    from meshmode.discretization.poly_element import \
+        #            LegendreGaussLobattoTensorProductGroupFactory as LGL
+
+        #    dcoll = DiscretizationCollection(
+        #        actx,
+        #        mesh,
+        #        discr_tag_to_group_factory={
+        #            dd.DISCR_TAG_BASE: LGL(order)})
+
+        # elif group_cls is SimplexElementGroup:
         dcoll = make_discretization_collection(
-                   actx, mesh,
-                   discr_tag_to_group_factory={
-                       DISCR_TAG_BASE: InterpolatoryEdgeClusteredGroupFactory(order),
-                       DISCR_TAG_QUAD: QuadratureGroupFactory(3 * order)
-                   })
+            actx, mesh,
+            discr_tag_to_group_factory={
+                DISCR_TAG_BASE: InterpolatoryEdgeClusteredGroupFactory(order),
+                DISCR_TAG_QUAD: QuadratureGroupFactory(3 * order)
+            })
+
+        # else:
+        #    raise AssertionError('Expecting TensorProductElementGroup or '
+        #                         f'SimplexElementGroup. Found {group_cls}')
+
+        alpha = 0.3
+        rot_mat = np.array([
+                [np.cos(alpha), np.sin(alpha), 0],
+                [-np.sin(alpha), np.cos(alpha), 0],
+                [0, 0, 1],
+        ])[:dim, :dim]
+
+        mesh = affine_map(mesh, A=rot_mat)
 
         def f(x):
             result = 1
@@ -123,7 +172,7 @@ def get_flux(u_tpair, dcoll=dcoll):
             dd_allfaces = dd.with_domain_tag(
                 BoundaryDomainTag(FACE_RESTR_ALL, VTAG_ALL)
                 )
-            normal = geometry.normal(actx, dcoll, dd)
+            normal = geo.normal(actx, dcoll, dd)
             u_avg = u_tpair.avg
             if vectorize:
                 if nested:
@@ -216,6 +265,10 @@ def get_flux(u_tpair, dcoll=dcoll):
 
 # {{{ divergence
 
+@pytest.mark.parametrize("group_cls", [
+    SimplexElementGroup,
+    TensorProductElementGroup
+])
 @pytest.mark.parametrize("form", ["strong", "weak"])
 @pytest.mark.parametrize("dim", [1, 2, 3])
 @pytest.mark.parametrize("order", [2, 3])
@@ -225,7 +278,7 @@ def get_flux(u_tpair, dcoll=dcoll):
     (True, True)
     ])
 def test_divergence(actx_factory, form, dim, order, vectorize, nested,
-        visualize=False):
+                    group_cls, visualize=False):
     actx = actx_factory()
 
     from pytools.convergence import EOCRecorder
@@ -233,10 +286,40 @@ def test_divergence(actx_factory, form, dim, order, vectorize, nested,
 
     for n in [4, 6, 8]:
         mesh = mgen.generate_regular_rect_mesh(
-                a=(-1,)*dim, b=(1,)*dim,
-                nelements_per_axis=(n,)*dim)
+            a=(-1,)*dim, b=(1,)*dim,
+            nelements_per_axis=(n,)*dim,
+            group_cls=group_cls)
+
+        if group_cls is TensorProductElementGroup:
+            # no reason to test 1D tensor product elements
+            if dim == 1:
+                return
+
+            import grudge.dof_desc as dd
+            from meshmode.discretization.poly_element import \
+                    LegendreGaussLobattoTensorProductGroupFactory as LGL
+
+            dcoll = make_discretization_collection(
+                actx,
+                mesh,
+                discr_tag_to_group_factory={
+                    dd.DISCR_TAG_BASE: LGL(order)})
+
+        elif group_cls is SimplexElementGroup:
+            dcoll = DiscretizationCollection(actx, mesh, order=order)
+
+        else:
+            raise AssertionError('Expecting TensorProductElementGroup or '
+                                 f'SimplexElementGroup. Found {group_cls}')
+
+        alpha = 0.3
+        rot_mat = np.array([
+                [np.cos(alpha), np.sin(alpha), 0],
+                [-np.sin(alpha), np.cos(alpha), 0],
+                [0, 0, 1],
+        ])[:dim, :dim]
 
-        dcoll = make_discretization_collection(actx, mesh, order=order)
+        mesh = affine_map(mesh, A=rot_mat)
 
         def f(x, dcoll=dcoll):
             result = make_obj_array([dcoll.zeros(actx) + (i+1) for i in range(dim)])
@@ -277,7 +360,7 @@ def get_flux(u_tpair, dcoll=dcoll):
             dd_allfaces = dd.with_domain_tag(
                 BoundaryDomainTag(FACE_RESTR_ALL, VTAG_ALL)
                 )
-            normal = geometry.normal(actx, dcoll, dd)
+            normal = geo.normal(actx, dcoll, dd)
             flux = u_tpair.avg @ normal
             return op.project(dcoll, dd, dd_allfaces, flux)
 
diff --git a/test/test_sbp_ops.py b/test/test_sbp_ops.py
new file mode 100644
index 000000000..41bf0d6d1
--- /dev/null
+++ b/test/test_sbp_ops.py
@@ -0,0 +1,171 @@
+__copyright__ = "Copyright (C) 2021 University of Illinois Board of Trustees"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+import numpy as np
+
+from grudge import DiscretizationCollection
+from grudge.dof_desc import DOFDesc, DISCR_TAG_BASE, DISCR_TAG_QUAD
+
+import pytest
+
+from grudge.array_context import PytestPyOpenCLArrayContextFactory
+from arraycontext import pytest_generate_tests_for_array_contexts
+pytest_generate_tests = pytest_generate_tests_for_array_contexts(
+        [PytestPyOpenCLArrayContextFactory])
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+@pytest.mark.parametrize("dim", [1, 2, 3])
+@pytest.mark.parametrize("order", [1, 2, 3, 4])
+def test_reference_element_sbp_operators(actx_factory, dim, order):
+    actx = actx_factory()
+
+    from meshmode.mesh.generation import generate_regular_rect_mesh
+
+    nel_1d = 5
+    box_ll = -5.0
+    box_ur = 5.0
+    mesh = generate_regular_rect_mesh(
+        a=(box_ll,)*dim,
+        b=(box_ur,)*dim,
+        nelements_per_axis=(nel_1d,)*dim)
+
+    from meshmode.discretization.poly_element import \
+        default_simplex_group_factory, QuadratureSimplexGroupFactory
+
+    dcoll = DiscretizationCollection(
+        actx, mesh,
+        discr_tag_to_group_factory={
+            DISCR_TAG_BASE: default_simplex_group_factory(dim, order),
+            DISCR_TAG_QUAD: QuadratureSimplexGroupFactory(2*order)
+        }
+    )
+
+    dd_q = DOFDesc("vol", DISCR_TAG_QUAD)
+    dd_f = DOFDesc("all_faces", DISCR_TAG_QUAD)
+
+    volm_discr = dcoll.discr_from_dd("vol")
+    quad_discr = dcoll.discr_from_dd(dd_q)
+    quad_face_discr = dcoll.discr_from_dd(dd_f)
+
+    from meshmode.discretization.poly_element import diff_matrices
+    from modepy import faces_for_shape, face_normal
+    from grudge.interpolation import (
+        volume_quadrature_interpolation_matrix,
+        surface_quadrature_interpolation_matrix
+    )
+    from grudge.op import reference_inverse_mass_matrix
+
+    for vgrp, qgrp, qfgrp in zip(volm_discr.groups,
+                                 quad_discr.groups,
+                                 quad_face_discr.groups):
+        nq_vol = qgrp.nunit_dofs
+        nq_per_face = qfgrp.nunit_dofs
+        nfaces = vgrp.shape.nfaces
+        nq_faces = nfaces * nq_per_face
+        nq_total = nq_vol + nq_faces
+
+        # {{{ Volume operators
+
+        weights = qgrp.quadrature_rule().weights
+        vdm_q = actx.to_numpy(
+            volume_quadrature_interpolation_matrix(actx, vgrp, qgrp))
+        inv_mass_mat = actx.to_numpy(
+            reference_inverse_mass_matrix(actx, vgrp))
+        p_mat = inv_mass_mat @ (vdm_q.T * weights)
+
+        # }}}
+
+        # Checks Pq @ Vq = Minv @ Vq.T @ W @ Vq = I
+        assert np.allclose(p_mat @ vdm_q,
+                           np.identity(len(inv_mass_mat)), rtol=1e-15)
+
+        # {{{ Surface operators
+
+        faces = faces_for_shape(vgrp.shape)
+        # NOTE: assumes same quadrature rule on all faces
+        face_weights = np.tile(qfgrp.quadrature_rule().weights, nfaces)
+        face_normals = [face_normal(face) for face in faces]
+        e = np.ones(shape=(nq_per_face,))
+        nrstj = [np.concatenate([np.sign(nhat[idx])*e
+                                 for nhat in face_normals])
+                 for idx in range(vgrp.dim)]
+        b_mats = [np.diag(face_weights*nrstj[d]) for d in range(vgrp.dim)]
+        vf_mat = actx.to_numpy(
+            surface_quadrature_interpolation_matrix(
+                actx,
+                base_element_group=vgrp,
+                face_quad_element_group=qfgrp
+            )
+        )
+
+        # }}}
+
+        # {{{ Hybridized (volume + surface) operators
+
+        q_mats = [p_mat.T @ (weights * vdm_q.T @ vdm_q) @ diff_mat @ p_mat
+                  for diff_mat in diff_matrices(vgrp)]
+        e_mat = vf_mat @ p_mat
+        qtilde_mats = 0.5 * np.asarray(
+            [np.block([[q_mats[d] - q_mats[d].T, e_mat.T @ b_mats[d]],
+                       [-b_mats[d] @ e_mat, b_mats[d]]])
+             for d in range(dcoll.dim)]
+        )
+
+        # }}}
+
+        ones = np.ones(shape=(nq_total,))
+        zeros = np.zeros(shape=(nq_total,))
+        for idx in range(dim):
+            # Checks the generalized SBP property:
+            # Qi + Qi.T = E.T @ Bi @ E
+            # c.f. Lemma 1. https://arxiv.org/pdf/1708.01243.pdf
+            assert np.allclose(q_mats[idx] + q_mats[idx].T,
+                               e_mat.T @ b_mats[idx] @ e_mat, rtol=1e-15)
+
+            # Checks the SBP-like property for the skew hybridized operator
+            # Qitilde + Qitilde.T = [0 0; 0 Bi]
+            # c.f. Theorem 1 and Lemma 1. https://arxiv.org/pdf/1902.01828.pdf
+            residual = qtilde_mats[idx] + qtilde_mats[idx].T
+            residual[nq_vol:nq_vol+nq_faces, nq_vol:nq_vol+nq_faces] -= b_mats[idx]
+            assert np.allclose(residual, np.zeros(residual.shape), rtol=1e-15)
+
+            # Checks quadrature condition for: Qiskew @ ones = zeros
+            # Qiskew + Qiskew.T = [0 0; 0 Bi]
+            # c.f. Lemma 2. https://arxiv.org/pdf/1902.01828.pdf
+            assert np.allclose(np.dot(qtilde_mats[idx], ones),
+                               zeros, rtol=1e-15)
+
+
+# You can test individual routines by typing
+# $ python test_grudge.py 'test_routine()'
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        pytest.main([__file__])