diff --git a/doc/operators.rst b/doc/operators.rst index 550a78023..881afd945 100644 --- a/doc/operators.rst +++ b/doc/operators.rst @@ -3,6 +3,7 @@ Discontinuous Galerkin operators .. automodule:: grudge.op .. automodule:: grudge.trace_pair +.. automodule:: grudge.flux_differencing Transfering data between discretizations diff --git a/examples/euler/acoustic_pulse.py b/examples/euler/acoustic_pulse.py index 779062910..d5017de22 100644 --- a/examples/euler/acoustic_pulse.py +++ b/examples/euler/acoustic_pulse.py @@ -35,6 +35,7 @@ from grudge.models.euler import ( ConservedEulerField, EulerOperator, + EntropyStableEulerOperator, InviscidWallBC ) from grudge.shortcuts import rk4_step @@ -111,9 +112,24 @@ def run_acoustic_pulse(actx, order=3, final_time=1, resolution=16, + esdg=False, overintegration=False, visualize=False): + logger.info( + """ + Acoustic pulse parameters:\n + order: %s\n + final_time: %s\n + resolution: %s\n + entropy stable: %s\n + overintegration: %s\n + visualize: %s + """, + order, final_time, resolution, esdg, + overintegration, visualize + ) + # eos-related parameters gamma = 1.4 @@ -135,7 +151,15 @@ def run_acoustic_pulse(actx, (default_simplex_group_factory, QuadratureSimplexGroupFactory) - exp_name = f"fld-acoustic-pulse-N{order}-K{resolution}" + if esdg: + case = "esdg-pulse" + operator_cls = EntropyStableEulerOperator + else: + case = "pulse" + operator_cls = EulerOperator + + exp_name = f"fld-{case}-N{order}-K{resolution}" + if overintegration: exp_name += "-overintegrated" quad_tag = DISCR_TAG_QUAD @@ -155,7 +179,7 @@ def run_acoustic_pulse(actx, # {{{ Euler operator - euler_operator = EulerOperator( + euler_operator = operator_cls( dcoll, bdry_conditions={BTAG_ALL: InviscidWallBC()}, flux_type="lf", @@ -212,7 +236,7 @@ def rhs(t, q): def main(ctx_factory, order=3, final_time=1, resolution=16, - overintegration=False, visualize=False, lazy=False): + esdg=False, overintegration=False, visualize=False, lazy=False): cl_ctx = ctx_factory() queue = cl.CommandQueue(cl_ctx) @@ -228,10 +252,17 @@ def main(ctx_factory, order=3, final_time=1, resolution=16, force_device_scalars=True, ) + if not actx.supports_nonscalar_broadcasting and esdg is True: + raise RuntimeError( + "Cannot use ESDG with an array context that cannot perform " + "nonscalar broadcasting. Run with --lazy instead." 
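+            # Note (editorial, a hedged clarification): flux differencing in
+            # the entropy-stable operator relies on nonscalar broadcasting in
+            # its einsum-style kernels; the lazy (pytato-based) array context
+            # supports this, while the eager PyOpenCL one does not.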
+ ) + run_acoustic_pulse( actx, order=order, resolution=resolution, + esdg=esdg, overintegration=overintegration, final_time=final_time, visualize=visualize @@ -245,6 +276,8 @@ def main(ctx_factory, order=3, final_time=1, resolution=16, parser.add_argument("--order", default=3, type=int) parser.add_argument("--tfinal", default=0.1, type=float) parser.add_argument("--resolution", default=16, type=int) + parser.add_argument("--esdg", action="store_true", + help="use entropy stable dg") parser.add_argument("--oi", action="store_true", help="use overintegration") parser.add_argument("--visualize", action="store_true", @@ -258,6 +291,7 @@ def main(ctx_factory, order=3, final_time=1, resolution=16, order=args.order, final_time=args.tfinal, resolution=args.resolution, + esdg=args.esdg, overintegration=args.oi, visualize=args.visualize, lazy=args.lazy) diff --git a/examples/euler/sod.py b/examples/euler/sod.py new file mode 100644 index 000000000..abf138f41 --- /dev/null +++ b/examples/euler/sod.py @@ -0,0 +1,227 @@ +__copyright__ = """ +Copyright (C) 2021 University of Illinois Board of Trustees +""" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + + +from grudge.dof_desc import BoundaryDomainTag +import pyopencl as cl +import pyopencl.tools as cl_tools + +from arraycontext import thaw, freeze +from grudge.array_context import PytatoPyOpenCLArrayContext +from grudge.models.euler import ( + EntropyStableEulerOperator, + ConservedEulerField, + PrescribedBC, + conservative_to_primitive_vars, +) +from grudge.shortcuts import rk4_step + +from pytools.obj_array import make_obj_array + +import grudge.op as op + +import logging +logger = logging.getLogger(__name__) + + +def sod_shock_initial_condition(nodes, t=0): + gamma = 1.4 + dim = len(nodes) + gmn1 = 1.0 / (gamma - 1.0) + x = nodes[0] + actx = x.array_context + zeros = 0*x + + _x0 = 0.5 + _rhoin = 1.0 + _rhoout = 0.125 + _pin = 1.0 + _pout = 0.1 + rhoin = zeros + _rhoin + rhoout = zeros + _rhoout + energyin = zeros + gmn1 * _pin + energyout = zeros + gmn1 * _pout + + x0 = zeros + _x0 + sigma = 1e-13 + weight = 0.5 * (1.0 - actx.np.tanh(1.0/sigma * (x - x0))) + + mass = rhoout + (rhoin - rhoout)*weight + energy = energyout + (energyin - energyout)*weight + momentum = make_obj_array([zeros for _ in range(dim)]) + + return ConservedEulerField(mass=mass, energy=energy, momentum=momentum) + + +def run_sod_shock_tube( + actx, order=4, resolution=32, final_time=0.2, visualize=False): + + logger.info( + """ + Sod 1-D parameters:\n + order: %s\n + final_time: %s\n + resolution: %s\n + visualize: %s + """, + order, final_time, resolution, visualize + ) + + # eos-related parameters + gamma = 1.4 + + # {{{ discretization + + from meshmode.mesh.generation import generate_regular_rect_mesh + + dim = 1 + box_ll = 0.0 + box_ur = 1.0 + mesh = generate_regular_rect_mesh( + a=(box_ll,)*dim, + b=(box_ur,)*dim, + nelements_per_axis=(resolution,)*dim, + boundary_tag_to_face={ + "prescribed": ["+x", "-x"], + } + ) + + from grudge import DiscretizationCollection + from grudge.dof_desc import \ + DISCR_TAG_BASE, DISCR_TAG_QUAD + from meshmode.discretization.poly_element import \ + (default_simplex_group_factory, + QuadratureSimplexGroupFactory) + + exp_name = f"fld-sod-1d-N{order}-K{resolution}" + quad_tag = DISCR_TAG_QUAD + + dcoll = DiscretizationCollection( + actx, mesh, + discr_tag_to_group_factory={ + DISCR_TAG_BASE: default_simplex_group_factory(dim, order), + DISCR_TAG_QUAD: QuadratureSimplexGroupFactory(order + 2) + } + ) + + # }}} + + # {{{ Euler operator + + dd_prescribed = BoundaryDomainTag("prescribed") + bcs = { + dd_prescribed: PrescribedBC(prescribed_state=sod_shock_initial_condition) + } + + euler_operator = EntropyStableEulerOperator( + dcoll, + bdry_conditions=bcs, + flux_type="lf", + gamma=gamma, + quadrature_tag=quad_tag + ) + + def rhs(t, q): + return euler_operator.operator(t, q) + + compiled_rhs = actx.compile(rhs) + + fields = sod_shock_initial_condition(thaw(dcoll.nodes(), actx)) + + from grudge.dt_utils import h_min_from_volume + + cfl = 0.01 + cn = 0.5*(order + 1)**2 + dt = cfl * actx.to_numpy(h_min_from_volume(dcoll)) / cn + + logger.info("Timestep size: %g", dt) + + # }}} + + from grudge.shortcuts import make_visualizer + + vis = make_visualizer(dcoll) + + # {{{ time stepping + + step = 0 + t = 0.0 + while t < final_time: + if step % 10 == 0: + norm_q = actx.to_numpy(op.norm(dcoll, fields, 2)) + logger.info("[%04d] t = %.5f |q| = %.5e", step, t, norm_q) + if visualize: + rho, velocity, pressure = \ + conservative_to_primitive_vars(fields, gamma=gamma) + vis.write_vtk_file( + f"{exp_name}-{step:04d}.vtu", + [ + ("rho", rho), + ("energy", fields.energy), + ("momentum", 
fields.momentum), + ("velocity", velocity), + ("pressure", pressure) + ] + ) + assert norm_q < 10000 + + fields = thaw(freeze(fields, actx), actx) + fields = rk4_step(fields, t, dt, compiled_rhs) + t += dt + step += 1 + + # }}} + + +def main(ctx_factory, order=4, final_time=0.2, resolution=32, visualize=False): + cl_ctx = ctx_factory() + queue = cl.CommandQueue(cl_ctx) + actx = PytatoPyOpenCLArrayContext( + queue, + allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)), + ) + + run_sod_shock_tube( + actx, order=order, + resolution=resolution, + final_time=final_time, + visualize=visualize) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--order", default=4, type=int) + parser.add_argument("--tfinal", default=0.2, type=float) + parser.add_argument("--resolution", default=32, type=int) + parser.add_argument("--visualize", action="store_true") + args = parser.parse_args() + + logging.basicConfig(level=logging.INFO) + main(cl.create_some_context, + order=args.order, + final_time=args.tfinal, + resolution=args.resolution, + visualize=args.visualize) diff --git a/examples/euler/vortex.py b/examples/euler/vortex.py index 9f00743e5..ab94529db 100644 --- a/examples/euler/vortex.py +++ b/examples/euler/vortex.py @@ -29,7 +29,8 @@ from grudge.array_context import PytatoPyOpenCLArrayContext, PyOpenCLArrayContext from grudge.models.euler import ( vortex_initial_condition, - EulerOperator + EulerOperator, + EntropyStableEulerOperator ) from grudge.shortcuts import rk4_step @@ -40,6 +41,7 @@ def run_vortex(actx, order=3, resolution=8, final_time=5, + esdg=False, overintegration=False, flux_type="central", visualize=False): @@ -50,11 +52,12 @@ def run_vortex(actx, order=3, resolution=8, final_time=5, order: %s\n final_time: %s\n resolution: %s\n + entropy stable: %s\n overintegration: %s\n flux_type: %s\n visualize: %s """, - order, final_time, resolution, + order, final_time, resolution, esdg, overintegration, flux_type, visualize ) @@ -76,7 +79,14 @@ def run_vortex(actx, order=3, resolution=8, final_time=5, from meshmode.discretization.poly_element import \ default_simplex_group_factory, QuadratureSimplexGroupFactory - exp_name = f"fld-vortex-N{order}-K{resolution}-{flux_type}" + if esdg: + case = "esdg-vortex" + operator_cls = EntropyStableEulerOperator + else: + case = "vortex" + operator_cls = EulerOperator + + exp_name = f"fld-{case}-N{order}-K{resolution}-{flux_type}" if overintegration: exp_name += "-overintegrated" @@ -97,7 +107,7 @@ def run_vortex(actx, order=3, resolution=8, final_time=5, # {{{ Euler operator - euler_operator = EulerOperator( + euler_operator = operator_cls( dcoll, flux_type=flux_type, gamma=gamma, @@ -154,6 +164,7 @@ def rhs(t, q): def main(ctx_factory, order=3, final_time=5, resolution=8, + esdg=False, overintegration=False, lf_stabilization=False, visualize=False, @@ -173,6 +184,12 @@ def main(ctx_factory, order=3, final_time=5, resolution=8, force_device_scalars=True, ) + if not actx.supports_nonscalar_broadcasting and esdg is True: + raise RuntimeError( + "Cannot use ESDG with an array context that cannot perform " + "nonscalar broadcasting. Run with --lazy instead." 
+ ) + if lf_stabilization: flux_type = "lf" else: @@ -183,6 +200,7 @@ def main(ctx_factory, order=3, final_time=5, resolution=8, order=order, resolution=resolution, final_time=final_time, + esdg=esdg, overintegration=overintegration, flux_type=flux_type, visualize=visualize @@ -196,6 +214,8 @@ def main(ctx_factory, order=3, final_time=5, resolution=8, parser.add_argument("--order", default=3, type=int) parser.add_argument("--tfinal", default=0.015, type=float) parser.add_argument("--resolution", default=8, type=int) + parser.add_argument("--esdg", action="store_true", + help="use entropy stable dg") parser.add_argument("--oi", action="store_true", help="use overintegration") parser.add_argument("--lf", action="store_true", @@ -211,6 +231,7 @@ def main(ctx_factory, order=3, final_time=5, resolution=8, order=args.order, final_time=args.tfinal, resolution=args.resolution, + esdg=args.esdg, overintegration=args.oi, lf_stabilization=args.lf, visualize=args.visualize, diff --git a/grudge/array_context.py b/grudge/array_context.py index 00e6fb0d6..d672a68ad 100644 --- a/grudge/array_context.py +++ b/grudge/array_context.py @@ -232,7 +232,21 @@ def _dag_to_compiled_func(self, dict_of_named_arrays, self.actx._compile_trace_callback(self.f, "pre_find_distributed_partition", dict_of_named_arrays) - distributed_partition = pt.find_distributed_partition(dict_of_named_arrays) + # https://github.com/inducer/pytato/pull/393 changes the function signature + try: + # pylint: disable=too-many-function-args + distributed_partition = pt.find_distributed_partition( + # pylint-ignore-reason: + # '_BasePytatoArrayContext' has no + # 'mpi_communicator' member + # pylint: disable=no-member + self.actx.mpi_communicator, dict_of_named_arrays) + except TypeError as e: + if "find_distributed_partition() takes 1 positional" in str(e): + distributed_partition = pt.find_distributed_partition( + dict_of_named_arrays) + else: + raise if __debug__: # pylint-ignore-reason: diff --git a/grudge/discretization.py b/grudge/discretization.py index 8e57ca503..25d5fa797 100644 --- a/grudge/discretization.py +++ b/grudge/discretization.py @@ -7,6 +7,8 @@ .. autofunction:: make_discretization_collection .. currentmodule:: grudge.discretization +.. autoclass:: PartID +.. autofunction:: filter_part_boundaries """ __copyright__ = """ @@ -34,10 +36,13 @@ THE SOFTWARE. """ -from typing import Mapping, Optional, Union, TYPE_CHECKING, Any +from typing import ( + Sequence, Mapping, Optional, Union, List, Tuple, TYPE_CHECKING, Any) from pytools import memoize_method, single_valued +from dataclasses import dataclass, replace + from grudge.dof_desc import ( VTAG_ALL, DD_VOLUME_ALL, @@ -71,6 +76,75 @@ import mpi4py.MPI +@dataclass(frozen=True) +class PartID: + """Unique identifier for a piece of a partitioned mesh. + + .. attribute:: volume_tag + + The volume of the part. + + .. attribute:: rank + + The (optional) MPI rank of the part. 
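+
+    For example, ``PartID(volume_tag=VTAG_ALL, rank=0)`` (an illustrative
+    usage, not taken from the call sites below) identifies the piece of the
+    default volume that resides on MPI rank 0 in a distributed run.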
+ + """ + volume_tag: VolumeTag + rank: Optional[int] = None + + +# {{{ part ID normalization + +def _normalize_mesh_part_ids( + mesh: Mesh, + self_volume_tag: VolumeTag, + all_volume_tags: Sequence[VolumeTag], + mpi_communicator: Optional["mpi4py.MPI.Intracomm"] = None): + """Convert a mesh's configuration-dependent "part ID" into a fixed type.""" + from numbers import Integral + if mpi_communicator is not None: + # Accept PartID or rank (assume intra-volume for the latter) + def as_part_id(mesh_part_id): + if isinstance(mesh_part_id, PartID): + return mesh_part_id + elif isinstance(mesh_part_id, Integral): + return PartID(self_volume_tag, int(mesh_part_id)) + else: + raise TypeError(f"Unable to convert {mesh_part_id} to PartID.") + else: + # Accept PartID or volume tag + def as_part_id(mesh_part_id): + if isinstance(mesh_part_id, PartID): + return mesh_part_id + elif mesh_part_id in all_volume_tags: + return PartID(mesh_part_id) + else: + raise TypeError(f"Unable to convert {mesh_part_id} to PartID.") + + facial_adjacency_groups = mesh.facial_adjacency_groups + + new_facial_adjacency_groups = [] + + from meshmode.mesh import InterPartAdjacencyGroup + for grp_list in facial_adjacency_groups: + new_grp_list = [] + for fagrp in grp_list: + if isinstance(fagrp, InterPartAdjacencyGroup): + part_id = as_part_id(fagrp.part_id) + new_fagrp = replace( + fagrp, + boundary_tag=BTAG_PARTITION(part_id), + part_id=part_id) + else: + new_fagrp = fagrp + new_grp_list.append(new_fagrp) + new_facial_adjacency_groups.append(new_grp_list) + + return mesh.copy(facial_adjacency_groups=new_facial_adjacency_groups) + +# }}} + + # {{{ discr_tag_to_group_factory normalization def _normalize_discr_tag_to_group_factory( @@ -156,6 +230,9 @@ def __init__(self, array_context: ArrayContext, discr_tag_to_group_factory: Optional[ Mapping[DiscretizationTag, ElementGroupFactory]] = None, mpi_communicator: Optional["mpi4py.MPI.Intracomm"] = None, + inter_part_connections: Optional[ + Mapping[Tuple[PartID, PartID], + DiscretizationConnection]] = None, ) -> None: """ :arg discr_tag_to_group_factory: A mapping from discretization tags @@ -206,6 +283,9 @@ def __init__(self, array_context: ArrayContext, mesh = volume_discrs + mesh = _normalize_mesh_part_ids( + mesh, VTAG_ALL, [VTAG_ALL], mpi_communicator=mpi_communicator) + discr_tag_to_group_factory = _normalize_discr_tag_to_group_factory( dim=mesh.dim, discr_tag_to_group_factory=discr_tag_to_group_factory, @@ -219,17 +299,32 @@ def __init__(self, array_context: ArrayContext, del mesh + if inter_part_connections is not None: + raise TypeError("may not pass inter_part_connections when " + "DiscretizationCollection constructor is called in " + "legacy mode") + + self._inter_part_connections = \ + _set_up_inter_part_connections( + array_context=self._setup_actx, + mpi_communicator=mpi_communicator, + volume_discrs=volume_discrs, + base_group_factory=( + discr_tag_to_group_factory[DISCR_TAG_BASE])) + # }}} else: assert discr_tag_to_group_factory is not None self._discr_tag_to_group_factory = discr_tag_to_group_factory - self._volume_discrs = volume_discrs + if inter_part_connections is None: + raise TypeError("inter_part_connections must be passed when " + "DiscretizationCollection constructor is called in " + "'modern' mode") + + self._inter_part_connections = inter_part_connections - self._dist_boundary_connections = { - vtag: self._set_up_distributed_communication( - vtag, mpi_communicator, array_context) - for vtag in self._volume_discrs.keys()} + self._volume_discrs = 
volume_discrs # }}} @@ -252,71 +347,6 @@ def is_management_rank(self): return self.mpi_communicator.Get_rank() \ == self.get_management_rank_index() - # {{{ distributed - - def _set_up_distributed_communication( - self, vtag, mpi_communicator, array_context): - from_dd = DOFDesc(VolumeDomainTag(vtag), DISCR_TAG_BASE) - - boundary_connections = {} - - from meshmode.distributed import get_connected_partitions - connected_parts = get_connected_partitions(self._volume_discrs[vtag].mesh) - - if connected_parts: - if mpi_communicator is None: - raise RuntimeError("must supply an MPI communicator when using a " - "distributed mesh") - - grp_factory = \ - self.group_factory_for_discretization_tag(DISCR_TAG_BASE) - - local_boundary_connections = {} - for i_remote_part in connected_parts: - local_boundary_connections[i_remote_part] = self.connection_from_dds( - from_dd, from_dd.trace(BTAG_PARTITION(i_remote_part))) - - from meshmode.distributed import MPIBoundaryCommSetupHelper - with MPIBoundaryCommSetupHelper(mpi_communicator, array_context, - local_boundary_connections, grp_factory) as bdry_setup_helper: - while True: - conns = bdry_setup_helper.complete_some() - if not conns: - break - for i_remote_part, conn in conns.items(): - boundary_connections[i_remote_part] = conn - - return boundary_connections - - def distributed_boundary_swap_connection(self, dd): - """Provides a mapping from the base volume discretization - to the exterior boundary restriction on a parallel boundary - partition described by *dd*. This connection is used to - communicate across element boundaries in different parallel - partitions during distributed runs. - - :arg dd: a :class:`~grudge.dof_desc.DOFDesc`, or a value - convertible to one. The domain tag must be a subclass - of :class:`grudge.dof_desc.BoundaryDomainTag` with an - associated :class:`meshmode.mesh.BTAG_PARTITION` - corresponding to a particular communication rank. - """ - if dd.discretization_tag is not DISCR_TAG_BASE: - # FIXME - raise NotImplementedError( - "Distributed communication with discretization tag " - f"{dd.discretization_tag} is not implemented." 
- ) - - assert isinstance(dd.domain_tag, BoundaryDomainTag) - assert isinstance(dd.domain_tag.tag, BTAG_PARTITION) - - vtag = dd.domain_tag.volume_tag - - return self._dist_boundary_connections[vtag][dd.domain_tag.tag.part_nr] - - # }}} - # {{{ discr_from_dd @memoize_method @@ -772,6 +802,105 @@ def normal(self, dd): # }}} +# {{{ distributed/multi-volume setup + +def _set_up_inter_part_connections( + array_context: ArrayContext, + mpi_communicator: Optional["mpi4py.MPI.Intracomm"], + volume_discrs: Mapping[VolumeTag, Discretization], + base_group_factory: ElementGroupFactory, + ) -> Mapping[ + Tuple[PartID, PartID], + DiscretizationConnection]: + + from meshmode.distributed import (get_connected_parts, + make_remote_group_infos, InterRankBoundaryInfo, + MPIBoundaryCommSetupHelper) + + rank = mpi_communicator.Get_rank() if mpi_communicator is not None else None + + # Save boundary restrictions as they're created to avoid potentially creating + # them twice in the loop below + cached_part_bdry_restrictions: Mapping[ + Tuple[PartID, PartID], + DiscretizationConnection] = {} + + def get_part_bdry_restriction(self_part_id, other_part_id): + cached_result = cached_part_bdry_restrictions.get( + (self_part_id, other_part_id), None) + if cached_result is not None: + return cached_result + return cached_part_bdry_restrictions.setdefault( + (self_part_id, other_part_id), + make_face_restriction( + array_context, volume_discrs[self_part_id.volume_tag], + base_group_factory, + boundary_tag=BTAG_PARTITION(other_part_id))) + + inter_part_conns: Mapping[ + Tuple[PartID, PartID], + DiscretizationConnection] = {} + + irbis = [] + + for vtag, volume_discr in volume_discrs.items(): + part_id = PartID(vtag, rank) + connected_part_ids = get_connected_parts(volume_discr.mesh) + for connected_part_id in connected_part_ids: + bdry_restr = get_part_bdry_restriction( + self_part_id=part_id, other_part_id=connected_part_id) + + if connected_part_id.rank == rank: + # {{{ rank-local interface between multiple volumes + + connected_bdry_restr = get_part_bdry_restriction( + self_part_id=connected_part_id, other_part_id=part_id) + + from meshmode.discretization.connection import \ + make_partition_connection + inter_part_conns[connected_part_id, part_id] = \ + make_partition_connection( + array_context, + local_bdry_conn=bdry_restr, + remote_bdry_discr=connected_bdry_restr.to_discr, + remote_group_infos=make_remote_group_infos( + array_context, part_id, connected_bdry_restr)) + + # }}} + else: + # {{{ cross-rank interface + + if mpi_communicator is None: + raise RuntimeError("must supply an MPI communicator " + "when using a distributed mesh") + + irbis.append( + InterRankBoundaryInfo( + local_part_id=part_id, + remote_part_id=connected_part_id, + remote_rank=connected_part_id.rank, + local_boundary_connection=bdry_restr)) + + # }}} + + if irbis: + assert mpi_communicator is not None + + with MPIBoundaryCommSetupHelper(mpi_communicator, array_context, + irbis, base_group_factory) as bdry_setup_helper: + while True: + conns = bdry_setup_helper.complete_some() + if not conns: + # We're done. 
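+                    # An empty dict from complete_some() signals that every
+                    # pending cross-rank boundary setup has been received.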
+ break + + inter_part_conns.update(conns) + + return inter_part_conns + +# }}} + + # {{{ modal group factory def _generate_modal_group_factory(nodal_group_factory): @@ -860,6 +989,8 @@ def make_discretization_collection( del order + mpi_communicator = getattr(array_context, "mpi_communicator", None) + if any( isinstance(mesh_or_discr, Discretization) for mesh_or_discr in volumes.values()): @@ -868,14 +999,60 @@ def make_discretization_collection( volume_discrs = { vtag: Discretization( array_context, - mesh, + _normalize_mesh_part_ids( + mesh, vtag, volumes.keys(), mpi_communicator=mpi_communicator), discr_tag_to_group_factory[DISCR_TAG_BASE]) for vtag, mesh in volumes.items()} return DiscretizationCollection( array_context=array_context, volume_discrs=volume_discrs, - discr_tag_to_group_factory=discr_tag_to_group_factory) + discr_tag_to_group_factory=discr_tag_to_group_factory, + inter_part_connections=_set_up_inter_part_connections( + array_context=array_context, + mpi_communicator=mpi_communicator, + volume_discrs=volume_discrs, + base_group_factory=discr_tag_to_group_factory[DISCR_TAG_BASE])) + +# }}} + + +# {{{ filter_part_boundaries + +def filter_part_boundaries( + dcoll: DiscretizationCollection, + *, + volume_dd: DOFDesc = DD_VOLUME_ALL, + neighbor_volume_dd: Optional[DOFDesc] = None, + neighbor_rank: Optional[int] = None) -> List[DOFDesc]: + """ + Retrieve tags of part boundaries that match *neighbor_volume_dd* and/or + *neighbor_rank*. + """ + vol_mesh = dcoll.discr_from_dd(volume_dd).mesh + + from meshmode.mesh import InterPartAdjacencyGroup + filtered_part_bdry_dds = [ + volume_dd.trace(fagrp.boundary_tag) + for fagrp_list in vol_mesh.facial_adjacency_groups + for fagrp in fagrp_list + if isinstance(fagrp, InterPartAdjacencyGroup)] + + if neighbor_volume_dd is not None: + filtered_part_bdry_dds = [ + bdry_dd + for bdry_dd in filtered_part_bdry_dds + if ( + bdry_dd.domain_tag.tag.part_id.volume_tag + == neighbor_volume_dd.domain_tag.tag)] + + if neighbor_rank is not None: + filtered_part_bdry_dds = [ + bdry_dd + for bdry_dd in filtered_part_bdry_dds + if bdry_dd.domain_tag.tag.part_id.rank == neighbor_rank] + + return filtered_part_bdry_dds # }}} diff --git a/grudge/eager.py b/grudge/eager.py index 626e15592..08cf08f2a 100644 --- a/grudge/eager.py +++ b/grudge/eager.py @@ -87,7 +87,8 @@ def nodal_max(self, dd, vec): return op.nodal_max(self, dd, vec) -connected_ranks = op.connected_ranks +# FIXME: Deprecate connected_ranks instead of removing +connected_parts = op.connected_parts interior_trace_pair = op.interior_trace_pair cross_rank_trace_pairs = op.cross_rank_trace_pairs diff --git a/grudge/flux_differencing.py b/grudge/flux_differencing.py new file mode 100644 index 000000000..816138df9 --- /dev/null +++ b/grudge/flux_differencing.py @@ -0,0 +1,271 @@ +"""Grudge module for flux-differencing in entropy-stable DG methods + +Flux-differencing +----------------- + +.. 
autofunction:: volume_flux_differencing +""" + +__copyright__ = """ +Copyright (C) 2021 University of Illinois Board of Trustees +""" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +from arraycontext import ( + ArrayContext, + map_array_container, + freeze +) +from arraycontext import ArrayOrContainer + +from functools import partial + +from meshmode.transform_metadata import FirstAxisIsElementsTag +from meshmode.dof_array import DOFArray + +from grudge.discretization import DiscretizationCollection +from grudge.dof_desc import DOFDesc + +from pytools import memoize_in, keyed_memoize_in + +import numpy as np + + +def _reference_skew_symmetric_hybridized_sbp_operators( + actx: ArrayContext, + base_element_group, + vol_quad_element_group, + face_quad_element_group, dtype): + @keyed_memoize_in( + actx, _reference_skew_symmetric_hybridized_sbp_operators, + lambda base_grp, quad_vol_grp, face_quad_grp: ( + base_grp.discretization_key(), + quad_vol_grp.discretization_key(), + face_quad_grp.discretization_key())) + def get_reference_skew_symetric_hybridized_diff_mats( + base_grp, quad_vol_grp, face_quad_grp): + from meshmode.discretization.poly_element import diff_matrices + from modepy import faces_for_shape, face_normal + from grudge.interpolation import ( + volume_quadrature_interpolation_matrix, + surface_quadrature_interpolation_matrix + ) + from grudge.op import reference_inverse_mass_matrix + + # {{{ Volume operators + + weights = quad_vol_grp.quadrature_rule().weights + vdm_q = actx.to_numpy( + volume_quadrature_interpolation_matrix(actx, base_grp, quad_vol_grp)) + inv_mass_mat = actx.to_numpy( + reference_inverse_mass_matrix(actx, base_grp)) + p_mat = inv_mass_mat @ (vdm_q.T * weights) + + # }}} + + # {{{ Surface operators + + faces = faces_for_shape(base_grp.shape) + nfaces = len(faces) + # NOTE: assumes same quadrature rule on all faces + face_weights = np.tile(face_quad_grp.quadrature_rule().weights, nfaces) + face_normals = [face_normal(face) for face in faces] + nnods_per_face = face_quad_grp.nunit_dofs + e = np.ones(shape=(nnods_per_face,)) + nrstj = [ + # nsrtJ = nhat * Jhatf, where nhat is the reference normal + # and Jhatf is the Jacobian det. of the transformation from + # the face of the reference element to the reference face. 
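+            # For the bi-unit reference simplex, the components of
+            # nhat * Jhatf are 0 or +-1, so the sign pattern of the unit
+            # normal recovers them exactly (a property of the simplex
+            # shapes assumed here).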
+ np.concatenate([np.sign(nhat[idx])*e for nhat in face_normals]) + for idx in range(base_grp.dim) + ] + b_mats = [np.diag(face_weights*nrstj[d]) for d in range(base_grp.dim)] + vf_mat = actx.to_numpy( + surface_quadrature_interpolation_matrix( + actx, + base_element_group=base_grp, + face_quad_element_group=face_quad_grp)) + zero_mat = np.zeros((nfaces*nnods_per_face, nfaces*nnods_per_face), + dtype=dtype) + + # }}} + + # {{{ Hybridized (volume + surface) operators + + q_mats = [p_mat.T @ (weights * vdm_q.T @ vdm_q) @ diff_mat @ p_mat + for diff_mat in diff_matrices(base_grp)] + e_mat = vf_mat @ p_mat + q_skew_hybridized = np.asarray( + [ + np.block( + [[q_mats[d] - q_mats[d].T, e_mat.T @ b_mats[d]], + [-b_mats[d] @ e_mat, zero_mat]] + ) for d in range(base_grp.dim) + ], + order="C" + ) + + # }}} + + return actx.freeze(actx.from_numpy(q_skew_hybridized)) + + return get_reference_skew_symetric_hybridized_diff_mats( + base_element_group, + vol_quad_element_group, + face_quad_element_group + ) + + +def _single_axis_hybridized_derivative_kernel( + dcoll, dd_quad, dd_face_quad, xyz_axis, flux_matrix): + if not dcoll._has_affine_groups(dd_quad.domain_tag): + raise NotImplementedError("Not implemented for non-affine elements yet.") + + if not isinstance(flux_matrix, DOFArray): + return map_array_container( + partial(_single_axis_hybridized_derivative_kernel, + dcoll, dd_quad, dd_face_quad, xyz_axis), + flux_matrix + ) + + from grudge.dof_desc import DISCR_TAG_BASE + dd_vol = dd_quad.with_discr_tag(DISCR_TAG_BASE) + + from grudge.geometry import \ + area_element, inverse_surface_metric_derivative + from grudge.interpolation import ( + volume_and_surface_interpolation_matrix, + volume_and_surface_quadrature_interpolation + ) + + actx = flux_matrix.array_context + + # FIXME: This is kinda meh + def inverse_jac_matrix(): + @memoize_in( + dcoll, + (_single_axis_hybridized_derivative_kernel, dd_quad, dd_face_quad)) + def _inv_surf_metric_deriv(): + return freeze( + actx.np.stack( + [ + actx.np.stack( + [ + volume_and_surface_quadrature_interpolation( + dcoll, dd_quad, dd_face_quad, + area_element(actx, dcoll, dd=dd_vol) + * inverse_surface_metric_derivative( + actx, dcoll, + rst_ax, xyz_axis, dd=dd_vol + ) + ) for rst_ax in range(dcoll.dim) + ] + ) for xyz_axis in range(dcoll.ambient_dim) + ] + ), + actx + ) + return _inv_surf_metric_deriv() + + return DOFArray( + actx, + data=tuple( + # r for rst axis + actx.einsum("ik,rej,rij,eij->ek", + volume_and_surface_interpolation_matrix( + actx, + base_element_group=bgrp, + vol_quad_element_group=qvgrp, + face_quad_element_group=qafgrp + ), + ijm_i[xyz_axis], + _reference_skew_symmetric_hybridized_sbp_operators( + actx, + bgrp, + qvgrp, + qafgrp, + fmat_i.dtype + ), + fmat_i, + arg_names=("Vh_mat_t", "inv_jac_t", "Q_mat", "F_mat"), + tagged=(FirstAxisIsElementsTag(),)) + + for bgrp, qvgrp, qafgrp, fmat_i, ijm_i in zip( + dcoll.discr_from_dd(dd_vol).groups, + dcoll.discr_from_dd(dd_quad).groups, + dcoll.discr_from_dd(dd_face_quad).groups, + flux_matrix, + inverse_jac_matrix() + ) + ) + ) + + +def volume_flux_differencing( + dcoll: DiscretizationCollection, + dd_quad: DOFDesc, + dd_face_quad: DOFDesc, + flux_matrices: ArrayOrContainer) -> ArrayOrContainer: + r"""Computes the volume contribution of the DG divergence operator using + flux-differencing: + + .. 
math:: + + \mathrm{VOL} = \sum_{i=1}^{d} + \begin{bmatrix} + \mathbf{V}_q \\ \mathbf{V}_f + \end{bmatrix}^T + \left( + \left( \mathbf{Q}_{i} - \mathbf{Q}^T_{i} \right) + \circ \mathbf{F}_{i} + \right)\mathbf{1} + + where :math:`\circ` denotes the + `Hadamard product `__, + :math:`\mathbf{F}_{i}` are matrices whose entries are computed + as the evaluation of an entropy-conserving two-point flux function + (e.g. :func:`grudge.models.euler.divergence_flux_chandrashekar`) + and :math:`\mathbf{Q}_{i} - \mathbf{Q}^T_{i}` are the skew-symmetric + hybridized differentiation operators defined in (15) of + `this paper `__. + + :arg flux_matrices: a :class:`~meshmode.dof_array.DOFArray` or an + :class:`~arraycontext.container.ArrayContainer` of them containing + evaluations of two-point flux. + :returns: a :class:`~meshmode.dof_array.DOFArray` or an + :class:`~arraycontext.container.ArrayContainer` of them. + """ + + def _hybridized_div(fmats): + return sum(_single_axis_hybridized_derivative_kernel( + dcoll, dd_quad, dd_face_quad, i, fmat_i) + for i, fmat_i in enumerate(fmats)) + + from grudge.tools import rec_map_subarrays + return rec_map_subarrays( + _hybridized_div, + (dcoll.ambient_dim,), (), + flux_matrices, scalar_cls=DOFArray) + + +# vim: foldmethod=marker diff --git a/grudge/interpolation.py b/grudge/interpolation.py index 61bdf1a13..a2b9d1f77 100644 --- a/grudge/interpolation.py +++ b/grudge/interpolation.py @@ -32,7 +32,24 @@ """ +import numpy as np + +from arraycontext import ( + ArrayContext, + map_array_container +) +from arraycontext import ArrayOrContainerT + +from functools import partial + +from meshmode.transform_metadata import FirstAxisIsElementsTag + from grudge.discretization import DiscretizationCollection +from grudge.dof_desc import DOFDesc + +from meshmode.dof_array import DOFArray + +from pytools import keyed_memoize_in # FIXME: Should revamp interp and make clear distinctions @@ -46,3 +63,129 @@ def interp(dcoll: DiscretizationCollection, src, tgt, vec): from grudge.projection import project return project(dcoll, src, tgt, vec) + + +# {{{ Interpolation matrices + +def volume_quadrature_interpolation_matrix( + actx: ArrayContext, base_element_group, vol_quad_element_group): + @keyed_memoize_in( + actx, volume_quadrature_interpolation_matrix, + lambda base_grp, vol_quad_grp: (base_grp.discretization_key(), + vol_quad_grp.discretization_key())) + def get_volume_vand(base_grp, vol_quad_grp): + from modepy import vandermonde + + basis = base_grp.basis_obj() + vdm_inv = np.linalg.inv(vandermonde(basis.functions, + base_grp.unit_nodes)) + vdm_q = vandermonde(basis.functions, vol_quad_grp.unit_nodes) @ vdm_inv + return actx.freeze(actx.from_numpy(vdm_q)) + + return get_volume_vand(base_element_group, vol_quad_element_group) + + +def surface_quadrature_interpolation_matrix( + actx: ArrayContext, base_element_group, face_quad_element_group): + @keyed_memoize_in( + actx, surface_quadrature_interpolation_matrix, + lambda base_grp, face_quad_grp: (base_grp.discretization_key(), + face_quad_grp.discretization_key())) + def get_surface_vand(base_grp, face_quad_grp): + nfaces = base_grp.mesh_el_group.nfaces + assert face_quad_grp.nelements == nfaces * base_grp.nelements + + from modepy import vandermonde, faces_for_shape + + basis = base_grp.basis_obj() + vdm_inv = np.linalg.inv(vandermonde(basis.functions, + base_grp.unit_nodes)) + faces = faces_for_shape(base_grp.shape) + # NOTE: Assumes same quadrature rule on each face + face_quadrature = face_quad_grp.quadrature_rule() + + 
surface_nodes = faces[0].map_to_volume(face_quadrature.nodes) + for fidx in range(1, nfaces): + surface_nodes = np.append( + surface_nodes, + faces[fidx].map_to_volume(face_quadrature.nodes), + axis=1 + ) + vdm_f = vandermonde(basis.functions, surface_nodes) @ vdm_inv + return actx.freeze(actx.from_numpy(vdm_f)) + + return get_surface_vand(base_element_group, face_quad_element_group) + + +def volume_and_surface_interpolation_matrix( + actx: ArrayContext, + base_element_group, vol_quad_element_group, face_quad_element_group): + @keyed_memoize_in( + actx, volume_and_surface_interpolation_matrix, + lambda base_grp, vol_quad_grp, face_quad_grp: ( + base_grp.discretization_key(), + vol_quad_grp.discretization_key(), + face_quad_grp.discretization_key())) + def get_vol_surf_interpolation_matrix(base_grp, vol_quad_grp, face_quad_grp): + vq_mat = actx.to_numpy( + volume_quadrature_interpolation_matrix( + actx, + base_element_group=base_grp, + vol_quad_element_group=vol_quad_grp)) + vf_mat = actx.to_numpy( + surface_quadrature_interpolation_matrix( + actx, + base_element_group=base_grp, + face_quad_element_group=face_quad_grp)) + return actx.freeze(actx.from_numpy(np.block([[vq_mat], [vf_mat]]))) + + return get_vol_surf_interpolation_matrix( + base_element_group, vol_quad_element_group, face_quad_element_group + ) + +# }}} + + +def volume_and_surface_quadrature_interpolation( + dcoll: DiscretizationCollection, + dd_quad: DOFDesc, + dd_face_quad: DOFDesc, + vec: ArrayOrContainerT) -> ArrayOrContainerT: + """todo. + """ + if not isinstance(vec, DOFArray): + return map_array_container( + partial(volume_and_surface_quadrature_interpolation, + dcoll, dd_quad, dd_face_quad), vec + ) + + from grudge.dof_desc import DISCR_TAG_BASE + dd_vol = dd_quad.with_discr_tag(DISCR_TAG_BASE) + actx = vec.array_context + discr = dcoll.discr_from_dd(dd_vol) + quad_volm_discr = dcoll.discr_from_dd(dd_quad) + quad_face_discr = dcoll.discr_from_dd(dd_face_quad) + + return DOFArray( + actx, + data=tuple( + actx.einsum("ij,ej->ei", + volume_and_surface_interpolation_matrix( + actx, + base_element_group=bgrp, + vol_quad_element_group=qvgrp, + face_quad_element_group=qfgrp + ), + vec_i, + arg_names=("Vh_mat", "vec"), + tagged=(FirstAxisIsElementsTag(),)) + + for bgrp, qvgrp, qfgrp, vec_i in zip( + discr.groups, + quad_volm_discr.groups, + quad_face_discr.groups, vec) + ) + ) + + +# vim: foldmethod=marker diff --git a/grudge/models/euler.py b/grudge/models/euler.py index f4d6f8f4c..70c765e31 100644 --- a/grudge/models/euler.py +++ b/grudge/models/euler.py @@ -4,6 +4,7 @@ ----------------- .. autoclass:: EulerOperator +.. autoclass:: EntropyStableEulerOperator Predefined initial conditions ----------------------------- @@ -20,6 +21,9 @@ .. autofunction:: euler_volume_flux .. autofunction:: euler_numerical_flux + +.. autofunction:: divergence_flux_chandrashekar +.. 
autofunction:: entropy_stable_numerical_flux_chandrashekar """ __copyright__ = """ @@ -49,9 +53,12 @@ from abc import ABCMeta, abstractmethod from dataclasses import dataclass + from arraycontext import ( dataclass_array_container, - with_container_arithmetic + with_container_arithmetic, + map_array_container, thaw, + outer ) from meshmode.dof_array import DOFArray @@ -124,14 +131,13 @@ def vortex_initial_condition( # {{{ Variable transformation and helper routines -def conservative_to_primitive_vars(cv_state: ConservedEulerField, gamma=1.4): +def conservative_to_primitive_vars(cv_state: ConservedEulerField, gamma: float): """Converts from conserved variables (density, momentum, total energy) into primitive variables (density, velocity, pressure). :arg cv_state: A :class:`ConservedEulerField` containing the conserved variables. - :arg gamma: The isentropic expansion factor for a single-species gas - (default set to 1.4). + :arg gamma: The isentropic expansion factor. :returns: A :class:`Tuple` containing the primitive variables: (density, velocity, pressure). """ @@ -141,20 +147,76 @@ def conservative_to_primitive_vars(cv_state: ConservedEulerField, gamma=1.4): u = rho_u / rho p = (gamma - 1) * (rho_e - 0.5 * sum(rho_u * u)) - return rho, u, p + return (rho, u, p) -def compute_wavespeed(cv_state: ConservedEulerField, gamma=1.4): - """Computes the total translational wavespeed. +def conservative_to_entropy_vars(cv_state: ConservedEulerField, gamma: float): + """Converts from conserved variables (density, momentum, total energy) + into entropy variables. :arg cv_state: A :class:`ConservedEulerField` containing the conserved variables. :arg gamma: The isentropic expansion factor for a single-species gas (default set to 1.4). + :returns: A :class:`ConservedEulerField` containing the entropy variables. + """ + actx = cv_state.array_context + rho, u, p = conservative_to_primitive_vars(cv_state, gamma) + + u_square = sum(v ** 2 for v in u) + s = actx.np.log(p) - gamma*actx.np.log(rho) + rho_p = rho / p + + return ConservedEulerField( + mass=((gamma - s)/(gamma - 1)) - 0.5 * rho_p * u_square, + energy=-rho_p, + momentum=rho_p * u + ) + + +def entropy_to_conservative_vars(ev_state: ConservedEulerField, gamma: float): + """Converts from entropy variables into conserved variables + (density, momentum, total energy). + + :arg ev_state: A :class:`ConservedEulerField` containing the entropy + variables. + :arg gamma: The isentropic expansion factor. + :returns: A :class:`ConservedEulerField` containing the conserved variables. + """ + actx = ev_state.array_context + # See Hughes, Franca, Mallet (1986) A new finite element + # formulation for CFD: (DOI: 10.1016/0045-7825(86)90127-1) + inv_gamma_minus_one = 1/(gamma - 1) + + # Convert to entropy `-rho * s` used by Hughes, France, Mallet (1986) + ev_state = ev_state * (gamma - 1) + v1 = ev_state.mass + v2t4 = ev_state.momentum + v5 = ev_state.energy + + v_square = sum(v**2 for v in v2t4) + s = gamma - v1 + v_square/(2*v5) + rho_iota = ( + ((gamma - 1) / (-v5)**gamma)**(inv_gamma_minus_one) + ) * actx.np.exp(-s * inv_gamma_minus_one) + + return ConservedEulerField( + mass=-rho_iota * v5, + energy=rho_iota * (1 - v_square/(2*v5)), + momentum=rho_iota * v2t4 + ) + + +def compute_wavespeed(cv_state: ConservedEulerField, gamma: float): + """Computes the total translational wavespeed. + + :arg cv_state: A :class:`ConservedEulerField` containing the conserved + variables. + :arg gamma: The isentropic expansion factor. 
:returns: A :class:`~meshmode.dof_array.DOFArray` containing local wavespeeds. """ actx = cv_state.array_context - rho, u, p = conservative_to_primitive_vars(cv_state, gamma=gamma) + rho, u, p = conservative_to_primitive_vars(cv_state, gamma) return actx.np.sqrt(np.dot(u, u)) + actx.np.sqrt(gamma * (p / rho)) @@ -173,7 +235,7 @@ def boundary_tpair( self, dcoll: DiscretizationCollection, dd_bc: DOFDesc, - state: ConservedEulerField, t=0): + restricted_state: ConservedEulerField, t=0): pass @@ -183,14 +245,13 @@ def boundary_tpair( self, dcoll: DiscretizationCollection, dd_bc: DOFDesc, - state: ConservedEulerField, t=0): - actx = state.array_context - dd_base = as_dofdesc("vol").with_discr_tag(DISCR_TAG_BASE) + restricted_state: ConservedEulerField, t=0): + actx = restricted_state.array_context return TracePair( dd_bc, - interior=op.project(dcoll, dd_base, dd_bc, state), - exterior=self.prescribed_state(actx.thaw(dcoll.nodes(dd_bc)), t=t) + interior=restricted_state, + exterior=self.prescribed_state(thaw(dcoll.nodes(dd_bc), actx), t=t) ) @@ -200,11 +261,10 @@ def boundary_tpair( self, dcoll: DiscretizationCollection, dd_bc: DOFDesc, - state: ConservedEulerField, t=0): - actx = state.array_context - dd_base = as_dofdesc("vol").with_discr_tag(DISCR_TAG_BASE) - nhat = actx.thaw(dcoll.normal(dd_bc)) - interior = op.project(dcoll, dd_base, dd_bc, state) + restricted_state: ConservedEulerField, t=0): + actx = restricted_state.array_context + nhat = thaw(dcoll.normal(dd_bc), actx) + interior = restricted_state return TracePair( dd_bc, @@ -224,19 +284,17 @@ def boundary_tpair( # {{{ Euler operator def euler_volume_flux( - dcoll: DiscretizationCollection, cv_state: ConservedEulerField, gamma=1.4): + dcoll: DiscretizationCollection, + cv_state: ConservedEulerField, gamma: float): """Computes the (non-linear) volume flux for the Euler operator. :arg cv_state: A :class:`ConservedEulerField` containing the conserved variables. - :arg gamma: The isentropic expansion factor for a single-species gas - (default set to 1.4). + :arg gamma: The isentropic expansion factor. :returns: A :class:`ConservedEulerField` containing the volume fluxes. """ - from arraycontext import outer - - rho, u, p = conservative_to_primitive_vars(cv_state, gamma=gamma) + rho, u, p = conservative_to_primitive_vars(cv_state, gamma) return ConservedEulerField( mass=cv_state.momentum, @@ -247,40 +305,35 @@ def euler_volume_flux( def euler_numerical_flux( dcoll: DiscretizationCollection, tpair: TracePair, - gamma=1.4, lf_stabilization=False): + gamma: float, dissipation=False): """Computes the interface numerical flux for the Euler operator. :arg tpair: A :class:`grudge.trace_pair.TracePair` containing the conserved variables on the interior and exterior sides of element facets. - :arg gamma: The isentropic expansion factor for a single-species gas - (default set to 1.4). - :arg lf_stabilization: A boolean denoting whether to apply Lax-Friedrichs + :arg gamma: The isentropic expansion factor. + :arg dissipation: A boolean denoting whether to apply Lax-Friedrichs dissipation. :returns: A :class:`ConservedEulerField` containing the interface fluxes. 
""" - dd_intfaces = tpair.dd - dd_allfaces = dd_intfaces.with_dtag("all_faces") q_ll = tpair.int q_rr = tpair.ext actx = q_ll.array_context flux_tpair = TracePair( tpair.dd, - interior=euler_volume_flux(dcoll, q_ll, gamma=gamma), - exterior=euler_volume_flux(dcoll, q_rr, gamma=gamma) + interior=euler_volume_flux(dcoll, q_ll, gamma), + exterior=euler_volume_flux(dcoll, q_rr, gamma) ) num_flux = flux_tpair.avg - normal = actx.thaw(dcoll.normal(dd_intfaces)) - - if lf_stabilization: - from arraycontext import outer + normal = thaw(dcoll.normal(tpair.dd), actx) + if dissipation: # Compute jump penalization parameter - lam = actx.np.maximum(compute_wavespeed(q_ll, gamma=gamma), - compute_wavespeed(q_rr, gamma=gamma)) + lam = actx.np.maximum(compute_wavespeed(q_ll, gamma), + compute_wavespeed(q_rr, gamma)) num_flux -= lam*outer(tpair.diff, normal)/2 - return op.project(dcoll, dd_intfaces, dd_allfaces, num_flux @ normal) + return num_flux @ normal class EulerOperator(HyperbolicOperator): @@ -309,59 +362,298 @@ def __init__(self, dcoll: DiscretizationCollection, def max_characteristic_velocity(self, actx, **kwargs): state = kwargs["state"] - return compute_wavespeed(state, gamma=self.gamma) + return compute_wavespeed(state, self.gamma) def operator(self, t, q): dcoll = self.dcoll gamma = self.gamma qtag = self.qtag - dq = DOFDesc("vol", qtag) - df = DOFDesc("all_faces", qtag) - def interp_to_quad(u): - return op.project(dcoll, "vol", dq, u) + dissipation = self.lf_stabilization + + dd_base = as_dofdesc("vol", DISCR_TAG_BASE) + dd_vol_quad = as_dofdesc("vol", qtag) + dd_face_quad = as_dofdesc("all_faces", qtag) + + def interp_to_quad_surf(tpair): + dd = tpair.dd + dd_quad = dd.with_discr_tag(qtag) + return TracePair( + dd_quad, + interior=op.project(dcoll, dd, dd_quad, tpair.int), + exterior=op.project(dcoll, dd, dd_quad, tpair.ext) + ) + + interior_trace_pairs = [ + interp_to_quad_surf(tpair) + for tpair in op.interior_trace_pairs(dcoll, q) + ] - # Compute volume fluxes - volume_fluxes = op.weak_local_div( - dcoll, dq, - interp_to_quad(euler_volume_flux(dcoll, q, gamma=gamma)) + # Compute volume derivatives + volume_derivs = op.weak_local_div( + dcoll, dd_vol_quad, + euler_volume_flux( + dcoll, op.project(dcoll, dd_base, dd_vol_quad, q), gamma) ) # Compute interior interface fluxes interface_fluxes = ( sum( - euler_numerical_flux( - dcoll, - op.tracepair_with_discr_tag(dcoll, qtag, tpair), - gamma=gamma, - lf_stabilization=self.lf_stabilization - ) for tpair in op.interior_trace_pairs(dcoll, q) + op.project(dcoll, qtpair.dd, dd_face_quad, + euler_numerical_flux(dcoll, qtpair, gamma, + dissipation=dissipation)) + for qtpair in interior_trace_pairs ) ) # Compute boundary fluxes if self.bdry_conditions is not None: - bc_fluxes = sum( - euler_numerical_flux( + for btag in self.bdry_conditions: + boundary_condition = self.bdry_conditions[btag] + dd_bc = as_dofdesc(btag).with_discr_tag(qtag) + bc_flux = op.project( dcoll, - self.bdry_conditions[btag].boundary_tpair( + dd_bc, + dd_face_quad, + euler_numerical_flux( dcoll, - as_dofdesc(btag).with_discr_tag(qtag), - q, - t=t - ), - gamma=gamma, - lf_stabilization=self.lf_stabilization - ) for btag in self.bdry_conditions - ) - interface_fluxes = interface_fluxes + bc_fluxes + boundary_condition.boundary_tpair( + dcoll=dcoll, + dd_bc=dd_bc, + restricted_state=op.project(dcoll, dd_base, dd_bc, q), + t=t + ), + gamma, + dissipation=dissipation + ) + ) + interface_fluxes = interface_fluxes + bc_flux return op.inverse_mass( dcoll, - volume_fluxes - 
op.face_mass(dcoll, df, interface_fluxes) + volume_derivs - op.face_mass(dcoll, dd_face_quad, interface_fluxes) ) # }}} +# {{{ Entropy stable Euler operator + +def divergence_flux_chandrashekar( + dcoll: DiscretizationCollection, + q_left: ConservedEulerField, + q_right: ConservedEulerField, gamma: float): + """Two-point volume flux based on the entropy conserving + and kinetic energy preserving two-point flux in: + + Chandrashekar (2013) Kinetic Energy Preserving and Entropy Stable Finite + Volume Schemes for Compressible Euler and Navier-Stokes Equations: + `DOI `__. + + :args q_left: A :class:`ConservedEulerField` containing the "left" state. + :args q_right: A :class:`ConservedEulerField` containing the "right" state. + :arg gamma: The isentropic expansion factor. + """ + dim = dcoll.dim + actx = q_left.array_context + + def ln_mean(x: DOFArray, y: DOFArray, epsilon=1e-4): + f2 = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y) + return actx.np.where( + actx.np.less(f2, epsilon), + (x + y) / (2 + f2*2/3 + f2*f2*2/5 + f2*f2*f2*2/7), + (y - x) / actx.np.log(y / x) + ) + + rho_left, u_left, p_left = conservative_to_primitive_vars(q_left, gamma) + rho_right, u_right, p_right = conservative_to_primitive_vars(q_right, gamma) + + beta_left = 0.5 * rho_left / p_left + beta_right = 0.5 * rho_right / p_right + specific_kin_left = 0.5 * sum(v**2 for v in u_left) + specific_kin_right = 0.5 * sum(v**2 for v in u_right) + + rho_avg = 0.5 * (rho_left + rho_right) + rho_mean = ln_mean(rho_left, rho_right) + beta_mean = ln_mean(beta_left, beta_right) + beta_avg = 0.5 * (beta_left + beta_right) + u_avg = 0.5 * (u_left + u_right) + p_mean = 0.5 * rho_avg / beta_avg + + velocity_square_avg = specific_kin_left + specific_kin_right + + mass_flux = rho_mean * u_avg + momentum_flux = outer(mass_flux, u_avg) + np.eye(dim) * p_mean + energy_flux = ( + mass_flux * 0.5 * (1/(gamma - 1)/beta_mean - velocity_square_avg) + + np.dot(momentum_flux, u_avg) + ) + + return ConservedEulerField(mass=mass_flux, + energy=energy_flux, + momentum=momentum_flux) + + +def entropy_stable_numerical_flux_chandrashekar( + dcoll: DiscretizationCollection, tpair: TracePair, + gamma: float, dissipation=False): + """Entropy stable numerical flux based on the entropy conserving + and kinetic energy preserving two-point flux in: + + Chandrashekar (2013) Kinetic Energy Preserving and Entropy Stable Finite + Volume Schemes for Compressible Euler and Navier-Stokes Equations + `DOI `__. + + :arg tpair: A :class:`grudge.trace_pair.TracePair` containing the conserved + variables on the interior and exterior sides of element facets. + :arg gamma: The isentropic expansion factor. + :arg dissipation: A boolean denoting whether to apply Lax-Friedrichs + dissipation. + :returns: A :class:`ConservedEulerField` containing the interface fluxes. + """ + q_int = tpair.int + q_ext = tpair.ext + actx = q_int.array_context + + num_flux = divergence_flux_chandrashekar( + dcoll, q_left=q_int, q_right=q_ext, gamma=gamma) + normal = thaw(dcoll.normal(tpair.dd), actx) + + if dissipation: + # Compute jump penalization parameter + lam = actx.np.maximum(compute_wavespeed(q_int, gamma), + compute_wavespeed(q_ext, gamma)) + num_flux -= lam*outer(tpair.diff, normal)/2 + + return num_flux @ normal + + +class EntropyStableEulerOperator(EulerOperator): + """Discretizes the Euler equations using an entropy-stable + discontinuous Galerkin discretization as outlined in (15) + of `this paper `__. 
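+
+    In outline, the :meth:`operator` method below (1) projects the entropy
+    variables of the quadrature-interpolated state, (2) evaluates the
+    two-point flux :func:`divergence_flux_chandrashekar` between all pairs
+    of volume and surface quadrature nodes and contracts the result via
+    :func:`grudge.flux_differencing.volume_flux_differencing`, and
+    (3) assembles entropy-stable interface and boundary fluxes with
+    :func:`entropy_stable_numerical_flux_chandrashekar`.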
+    """
+
+    def operator(self, t, q):
+        from grudge.projection import volume_quadrature_project
+        from grudge.interpolation import \
+            volume_and_surface_quadrature_interpolation
+
+        dcoll = self.dcoll
+        gamma = self.gamma
+        qtag = self.qtag
+        dissipation = self.lf_stabilization
+
+        dd_base = as_dofdesc("vol", DISCR_TAG_BASE)
+        dd_vol_quad = as_dofdesc("vol", qtag)
+        dd_face_quad = as_dofdesc("all_faces", qtag)
+
+        # Convert to projected entropy variables: v_q = P_q v(u_q)
+        proj_entropy_vars = \
+            volume_quadrature_project(
+                dcoll, dd_vol_quad,
+                conservative_to_entropy_vars(
+                    # Interpolate state to vol quad grid: u_q = V_q u
+                    op.project(dcoll, dd_base, dd_vol_quad, q), gamma))
+
+        def modified_conserved_vars_tpair(tpair):
+            dd = tpair.dd
+            dd_quad = dd.with_discr_tag(qtag)
+            # Interpolate entropy variables to the surface quadrature grid
+            ev_tpair = op.project(dcoll, dd, dd_quad, tpair)
+            return TracePair(
+                dd_quad,
+                # Convert interior and exterior states to conserved variables
+                interior=entropy_to_conservative_vars(ev_tpair.int, gamma),
+                exterior=entropy_to_conservative_vars(ev_tpair.ext, gamma)
+            )
+
+        # Compute interior trace pairs containing the modified conserved
+        # variables (in terms of projected entropy variables)
+        interior_trace_pairs = [
+            modified_conserved_vars_tpair(tpair)
+            for tpair in op.interior_trace_pairs(dcoll, proj_entropy_vars)
+        ]
+
+        from functools import partial
+        from grudge.flux_differencing import volume_flux_differencing
+
+        def _reshape(shape, ary):
+            if not isinstance(ary, DOFArray):
+                return map_array_container(partial(_reshape, shape), ary)
+
+            return DOFArray(ary.array_context, data=tuple(
+                subary.reshape(grp.nelements, *shape)
+                # Just need group for determining the number of elements
+                for grp, subary in zip(dcoll.discr_from_dd(dd_base).groups, ary)))
+
+        # Compute the (modified) conserved state in terms of the projected
+        # entropy variables on both the volume and surface nodes
+        qtilde_vol_and_surf = \
+            entropy_to_conservative_vars(
+                # Interpolate projected entropy variables to
+                # volume + surface quadrature grids
+                volume_and_surface_quadrature_interpolation(
+                    dcoll, dd_vol_quad, dd_face_quad, proj_entropy_vars), gamma)
+
+        # FIXME: These matrices are actually symmetric. Could make use
+        # of that to avoid redundant computation.
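+        # Reshaping one operand to (nelements, 1, n) and the other to
+        # (nelements, n, 1) broadcasts the two-point flux over all node
+        # pairs, yielding the dense flux matrices F_i consumed by
+        # volume_flux_differencing.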
+ flux_matrices = divergence_flux_chandrashekar( + dcoll, + _reshape((1, -1), qtilde_vol_and_surf), + _reshape((-1, 1), qtilde_vol_and_surf), + gamma + ) + + # Compute volume derivatives using flux differencing + volume_derivs = -volume_flux_differencing( + dcoll, dd_vol_quad, dd_face_quad, flux_matrices) + + # Computing interface numerical fluxes + interface_fluxes = ( + sum( + op.project(dcoll, qtpair.dd, dd_face_quad, + entropy_stable_numerical_flux_chandrashekar( + dcoll, qtpair, gamma, dissipation=dissipation)) + for qtpair in interior_trace_pairs + ) + ) + + # Compute boundary fluxes + if self.bdry_conditions is not None: + for btag in self.bdry_conditions: + boundary_condition = self.bdry_conditions[btag] + dd_bc = as_dofdesc(btag).with_discr_tag(qtag) + bc_flux = op.project( + dcoll, + dd_bc, + dd_face_quad, + entropy_stable_numerical_flux_chandrashekar( + dcoll, + boundary_condition.boundary_tpair( + dcoll=dcoll, + dd_bc=dd_bc, + # Pass modified conserved state to be used as + # the "interior" state for computing the boundary + # trace pair + restricted_state=entropy_to_conservative_vars( + op.project( + dcoll, dd_base, dd_bc, proj_entropy_vars), + gamma + ), + t=t + ), + gamma, + dissipation=dissipation + ) + ) + interface_fluxes = interface_fluxes + bc_flux + + return op.inverse_mass( + dcoll, + volume_derivs - op.face_mass(dcoll, dd_face_quad, interface_fluxes) + ) + +# }}} + # vim: foldmethod=marker diff --git a/grudge/op.py b/grudge/op.py index f5781f4be..c46f9450c 100644 --- a/grudge/op.py +++ b/grudge/op.py @@ -95,7 +95,7 @@ ) from grudge.interpolation import interp -from grudge.projection import project +from grudge.projection import project, volume_quadrature_project from grudge.reductions import ( norm, @@ -118,8 +118,11 @@ interior_trace_pair, interior_trace_pairs, local_interior_trace_pair, - connected_ranks, + connected_parts, + inter_volume_trace_pairs, + local_inter_volume_trace_pairs, cross_rank_trace_pairs, + cross_rank_inter_volume_trace_pairs, bdry_trace_pair, bv_trace_pair ) @@ -127,6 +130,7 @@ __all__ = ( "project", + "volume_quadrature_project", "interp", "norm", @@ -147,8 +151,11 @@ "interior_trace_pair", "interior_trace_pairs", "local_interior_trace_pair", - "connected_ranks", + "connected_parts", + "inter_volume_trace_pairs", + "local_inter_volume_trace_pairs", "cross_rank_trace_pairs", + "cross_rank_inter_volume_trace_pairs", "bdry_trace_pair", "bv_trace_pair", diff --git a/grudge/projection.py b/grudge/projection.py index e21e02295..093919673 100644 --- a/grudge/projection.py +++ b/grudge/projection.py @@ -5,6 +5,7 @@ ----------- .. autofunction:: project +.. autofunction:: volume_quadrature_project """ from __future__ import annotations @@ -33,8 +34,12 @@ THE SOFTWARE. 
""" +from functools import partial +from numbers import Number + +import numpy as np -from arraycontext import ArrayOrContainer +from arraycontext import ArrayOrContainer, map_array_container from grudge.discretization import DiscretizationCollection from grudge.dof_desc import ( @@ -43,7 +48,10 @@ BoundaryDomainTag, ConvertibleToDOFDesc) -from numbers import Number +from meshmode.dof_array import DOFArray +from meshmode.transform_metadata import FirstAxisIsElementsTag + +from pytools import keyed_memoize_in def project( @@ -82,3 +90,65 @@ def project( return vec return dcoll.connection_from_dds(src_dofdesc, tgt_dofdesc)(vec) + + +def volume_quadrature_project( + dcoll: DiscretizationCollection, dd_q, vec) -> ArrayOrContainer: + """Projects a field on the quadrature discreization, described by *dd_q*, + into the polynomial space described by the volume discretization. + + :arg dd_q: a :class:`~grudge.dof_desc.DOFDesc`, or a value convertible to one. + :arg vec: a :class:`~meshmode.dof_array.DOFArray` or an + :class:`~arraycontext.container.ArrayContainer` of them. + :returns: a :class:`~meshmode.dof_array.DOFArray` or an + :class:`~arraycontext.container.ArrayContainer` like *vec*. + """ + if not isinstance(vec, DOFArray): + return map_array_container( + partial(volume_quadrature_project, dcoll, dd_q), vec + ) + + from grudge.geometry import area_element + from grudge.interpolation import volume_quadrature_interpolation_matrix + from grudge.op import inverse_mass + + from grudge.dof_desc import DISCR_TAG_BASE + dd_vol = dd_q.with_discr_tag(DISCR_TAG_BASE) + + actx = vec.array_context + discr = dcoll.discr_from_dd(dd_vol) + quad_discr = dcoll.discr_from_dd(dd_q) + jacobians = area_element( + actx, dcoll, dd=dd_q, + _use_geoderiv_connection=actx.supports_nonscalar_broadcasting) + + @keyed_memoize_in( + actx, volume_quadrature_project, + lambda base_grp, vol_quad_grp: (base_grp.discretization_key(), + vol_quad_grp.discretization_key())) + def get_mat(base_grp, vol_quad_grp): + vdm_q = actx.to_numpy( + volume_quadrature_interpolation_matrix( + actx, base_grp, vol_quad_grp + ) + ) + weights = np.diag(vol_quad_grp.quadrature_rule().weights) + return actx.freeze(actx.from_numpy(vdm_q.T @ weights)) + + return inverse_mass( + dcoll, + dd_vol, + DOFArray( + actx, + data=tuple( + actx.einsum("ij,ej,ej->ei", + get_mat(bgrp, qgrp), + jac_i, + vec_i, + arg_names=("vqw_t", "jac", "vec"), + tagged=(FirstAxisIsElementsTag(),)) + for bgrp, qgrp, vec_i, jac_i in zip( + discr.groups, quad_discr.groups, vec, jacobians) + ) + ) + ) diff --git a/grudge/trace_pair.py b/grudge/trace_pair.py index 1f49ae0d6..acc086505 100644 --- a/grudge/trace_pair.py +++ b/grudge/trace_pair.py @@ -18,12 +18,15 @@ .. autofunction:: bdry_trace_pair .. autofunction:: bv_trace_pair -Interior and cross-rank trace functions ---------------------------------------- +Interior, cross-rank, and inter-volume traces +--------------------------------------------- .. autofunction:: interior_trace_pairs .. autofunction:: local_interior_trace_pair +.. autofunction:: inter_volume_trace_pairs +.. autofunction:: local_inter_volume_trace_pairs .. autofunction:: cross_rank_trace_pairs +.. 
autofunction:: cross_rank_inter_volume_trace_pairs """ __copyright__ = """ @@ -52,17 +55,18 @@ from warnings import warn -from typing import List, Hashable, Optional, Type, Any +from typing import List, Hashable, Optional, Tuple, Type, Any, Sequence, Mapping from pytools.persistent_dict import KeyBuilder from arraycontext import ( ArrayContainer, + ArrayContext, with_container_arithmetic, dataclass_array_container, - get_container_context_recursively, - flatten, to_numpy, - unflatten, from_numpy, + get_container_context_recursively_opt, + to_numpy, + from_numpy, ArrayOrContainer ) @@ -72,7 +76,7 @@ from pytools import memoize_on_first_arg -from grudge.discretization import DiscretizationCollection +from grudge.discretization import DiscretizationCollection, PartID from grudge.projection import project from meshmode.mesh import BTAG_PARTITION @@ -82,7 +86,7 @@ import grudge.dof_desc as dof_desc from grudge.dof_desc import ( DOFDesc, DD_VOLUME_ALL, FACE_RESTR_INTERIOR, DISCR_TAG_BASE, - VolumeDomainTag, + VolumeTag, VolumeDomainTag, BoundaryDomainTag, ConvertibleToDOFDesc, ) @@ -360,6 +364,124 @@ def interior_trace_pairs(dcoll: DiscretizationCollection, vec, *, # }}} +# {{{ inter-volume trace pairs + +def local_inter_volume_trace_pairs( + dcoll: DiscretizationCollection, + pairwise_volume_data: Mapping[ + Tuple[DOFDesc, DOFDesc], + Tuple[ArrayOrContainer, ArrayOrContainer]] + ) -> Mapping[Tuple[DOFDesc, DOFDesc], TracePair]: + for vol_dd_pair in pairwise_volume_data.keys(): + for vol_dd in vol_dd_pair: + if not isinstance(vol_dd.domain_tag, VolumeDomainTag): + raise ValueError( + "pairwise_volume_data keys must describe volumes, " + f"got '{vol_dd}'") + if vol_dd.discretization_tag != DISCR_TAG_BASE: + raise ValueError( + "expected base-discretized DOFDesc in pairwise_volume_data, " + f"got '{vol_dd}'") + + rank = ( + dcoll.mpi_communicator.Get_rank() + if dcoll.mpi_communicator is not None + else None) + + result: Mapping[Tuple[DOFDesc, DOFDesc], TracePair] = {} + + for vol_dd_pair, vol_data_pair in pairwise_volume_data.items(): + from meshmode.mesh import mesh_has_boundary + if not mesh_has_boundary( + dcoll.discr_from_dd(vol_dd_pair[0]).mesh, + BTAG_PARTITION(PartID(vol_dd_pair[1].domain_tag.tag, rank))): + continue + + directional_vol_dd_pairs = [ + (vol_dd_pair[1], vol_dd_pair[0]), + (vol_dd_pair[0], vol_dd_pair[1])] + + trace_dd_pair = tuple( + self_vol_dd.trace( + BTAG_PARTITION( + PartID(other_vol_dd.domain_tag.tag, rank))) + for other_vol_dd, self_vol_dd in directional_vol_dd_pairs) + + # Pre-compute the projections out here to avoid doing it twice inside + # the loop below + trace_data = { + trace_dd: project(dcoll, vol_dd, trace_dd, vol_data) + for vol_dd, trace_dd, vol_data in zip( + vol_dd_pair, trace_dd_pair, vol_data_pair)} + + for other_vol_dd, self_vol_dd in directional_vol_dd_pairs: + self_part_id = PartID(self_vol_dd.domain_tag.tag, rank) + other_part_id = PartID(other_vol_dd.domain_tag.tag, rank) + + self_trace_dd = self_vol_dd.trace(BTAG_PARTITION(other_part_id)) + other_trace_dd = other_vol_dd.trace(BTAG_PARTITION(self_part_id)) + + self_trace_data = trace_data[self_trace_dd] + unswapped_other_trace_data = trace_data[other_trace_dd] + + other_to_self = dcoll._inter_part_connections[ + other_part_id, self_part_id] + + def get_opposite_trace(ary): + if isinstance(ary, Number): + return ary + else: + return other_to_self(ary) # noqa: B023 + + from arraycontext import rec_map_array_container + from meshmode.dof_array import DOFArray + other_trace_data = 
rec_map_array_container( + get_opposite_trace, + unswapped_other_trace_data, + leaf_class=DOFArray) + + result[other_vol_dd, self_vol_dd] = TracePair( + self_trace_dd, + interior=self_trace_data, + exterior=other_trace_data) + + return result + + +def inter_volume_trace_pairs(dcoll: DiscretizationCollection, + pairwise_volume_data: Mapping[ + Tuple[DOFDesc, DOFDesc], + Tuple[ArrayOrContainer, ArrayOrContainer]], + comm_tag: Hashable = None) -> Mapping[ + Tuple[DOFDesc, DOFDesc], + List[TracePair]]: + """ + Note that :func:`local_inter_volume_trace_pairs` provides the rank-local + contributions if those are needed in isolation. Similarly, + :func:`cross_rank_inter_volume_trace_pairs` provides only the trace pairs + defined on cross-rank boundaries. + """ + # TODO documentation + + result: Mapping[ + Tuple[DOFDesc, DOFDesc], + List[TracePair]] = {} + + local_tpairs = local_inter_volume_trace_pairs(dcoll, pairwise_volume_data) + cross_rank_tpairs = cross_rank_inter_volume_trace_pairs( + dcoll, pairwise_volume_data, comm_tag=comm_tag) + + for directional_vol_dd_pair, tpair in local_tpairs.items(): + result[directional_vol_dd_pair] = [tpair] + + for directional_vol_dd_pair, tpairs in cross_rank_tpairs.items(): + result.setdefault(directional_vol_dd_pair, []).extend(tpairs) + + return result + +# }}} + + # {{{ distributed: helper functions class _TagKeyBuilder(KeyBuilder): @@ -367,16 +489,21 @@ def update_for_type(self, key_hash, key: Type[Any]): self.rec(key_hash, (key.__module__, key.__name__, key.__name__,)) +# FIXME: Deprecate connected_ranks instead of removing @memoize_on_first_arg -def connected_ranks( +def connected_parts( dcoll: DiscretizationCollection, - volume_dd: Optional[DOFDesc] = None): - if volume_dd is None: - volume_dd = DD_VOLUME_ALL + self_volume_tag: VolumeTag, + other_volume_tag: VolumeTag + ) -> Sequence[PartID]: + result: List[PartID] = [ + connected_part_id + for connected_part_id, part_id in dcoll._inter_part_connections.keys() + if ( + part_id.volume_tag == self_volume_tag + and connected_part_id.volume_tag == other_volume_tag)] - from meshmode.distributed import get_connected_partitions - return get_connected_partitions( - dcoll._volume_discrs[volume_dd.domain_tag.tag].mesh) + return result def _sym_tag_to_num_tag(comm_tag: Optional[Hashable]) -> Optional[int]: @@ -398,6 +525,7 @@ def _sym_tag_to_num_tag(comm_tag: Optional[Hashable]) -> Optional[int]: num_tag = sum(ord(ch) << i for i, ch in enumerate(digest)) % tag_ub + # FIXME: This prints the wrong numerical tag because of base_comm_tag below warn("Encountered unknown symbolic tag " f"'{comm_tag}', assigning a value of '{num_tag}'. 
" "This is a temporary workaround, please ensure that " @@ -414,80 +542,122 @@ class _RankBoundaryCommunicationEager: base_comm_tag = 1273 def __init__(self, - dcoll: DiscretizationCollection, - array_container: ArrayOrContainer, - remote_rank, comm_tag: Optional[int] = None, - volume_dd=DD_VOLUME_ALL): - actx = get_container_context_recursively(array_container) - bdry_dd = volume_dd.trace(BTAG_PARTITION(remote_rank)) - - local_bdry_data = project(dcoll, volume_dd, bdry_dd, array_container) + actx: ArrayContext, + dcoll: DiscretizationCollection, + *, + local_part_id: PartID, + remote_part_id: PartID, + local_bdry_data: ArrayOrContainer, + remote_bdry_data_template: ArrayOrContainer, + comm_tag: Optional[Hashable] = None): + comm = dcoll.mpi_communicator assert comm is not None + remote_rank = remote_part_id.rank + assert remote_rank is not None + self.dcoll = dcoll self.array_context = actx - self.remote_bdry_dd = bdry_dd - self.bdry_discr = dcoll.discr_from_dd(bdry_dd) + self.local_part_id = local_part_id + self.remote_part_id = remote_part_id + self.local_bdry_dd = DOFDesc( + BoundaryDomainTag( + BTAG_PARTITION(remote_part_id), + volume_tag=local_part_id.volume_tag), + DISCR_TAG_BASE) + self.bdry_discr = dcoll.discr_from_dd(self.local_bdry_dd) self.local_bdry_data = local_bdry_data - self.local_bdry_data_np = \ - to_numpy(flatten(self.local_bdry_data, actx), actx) - - self.comm_tag = self.base_comm_tag - comm_tag = _sym_tag_to_num_tag(comm_tag) - if comm_tag is not None: - self.comm_tag += comm_tag + self.remote_bdry_data_template = remote_bdry_data_template + + def _generate_num_comm_tag(sym_comm_tag): + result = self.base_comm_tag + num_comm_tag = _sym_tag_to_num_tag(sym_comm_tag) + if num_comm_tag is not None: + result += num_comm_tag + return result + + send_sym_comm_tag = (remote_part_id.volume_tag, comm_tag) + recv_sym_comm_tag = (local_part_id.volume_tag, comm_tag) + self.send_comm_tag = _generate_num_comm_tag(send_sym_comm_tag) + self.recv_comm_tag = _generate_num_comm_tag(recv_sym_comm_tag) del comm_tag - # Here, we initialize both send and recieve operations through - # mpi4py `Request` (MPI_Request) instances for comm.Isend (MPI_Isend) - # and comm.Irecv (MPI_Irecv) respectively. These initiate non-blocking - # point-to-point communication requests and require explicit management - # via the use of wait (MPI_Wait, MPI_Waitall, MPI_Waitany, MPI_Waitsome), - # test (MPI_Test, MPI_Testall, MPI_Testany, MPI_Testsome), and cancel - # (MPI_Cancel). The rank-local data `self.local_bdry_data_np` will have its - # associated memory buffer sent across connected ranks and must not be - # modified at the Python level during this process. Completion of the - # requests is handled in :meth:`finish`. - # - # For more details on the mpi4py semantics, see: - # https://mpi4py.readthedocs.io/en/stable/overview.html#nonblocking-communications - # # NOTE: mpi4py currently (2021-11-03) holds a reference to the send # memory buffer for (i.e. `self.local_bdry_data_np`) until the send # requests is complete, however it is not clear that this is documented # behavior. We hold on to the buffer (via the instance attribute) # as well, just in case. 
- self.send_req = comm.Isend(self.local_bdry_data_np, - remote_rank, - tag=self.comm_tag) - self.remote_data_host_numpy = np.empty_like(self.local_bdry_data_np) - self.recv_req = comm.Irecv(self.remote_data_host_numpy, - remote_rank, - tag=self.comm_tag) + self.send_reqs = [] + self.send_data = [] + + def send_single_array(key, local_subary): + if not isinstance(local_subary, Number): + local_subary_np = to_numpy(local_subary, actx) + self.send_reqs.append( + comm.Isend(local_subary_np, remote_rank, tag=self.send_comm_tag)) + self.send_data.append(local_subary_np) + return local_subary + + self.recv_reqs = [] + self.recv_data = {} + + def recv_single_array(key, remote_subary_template): + if not isinstance(remote_subary_template, Number): + remote_subary_np = np.empty( + remote_subary_template.shape, + remote_subary_template.dtype) + self.recv_reqs.append( + comm.Irecv(remote_subary_np, remote_rank, + tag=self.recv_comm_tag)) + self.recv_data[key] = remote_subary_np + return remote_subary_template + + from arraycontext.container.traversal import rec_keyed_map_array_container + rec_keyed_map_array_container(send_single_array, local_bdry_data) + rec_keyed_map_array_container(recv_single_array, remote_bdry_data_template) def finish(self): - # Wait for the nonblocking receive request to complete before + from mpi4py import MPI + + # Wait for the nonblocking receive requests to complete before # accessing the data - self.recv_req.Wait() - - # Nonblocking receive is complete, we can now access the data and apply - # the boundary-swap connection - actx = self.array_context - remote_bdry_data_flat = from_numpy(self.remote_data_host_numpy, actx) - remote_bdry_data = unflatten(self.local_bdry_data, - remote_bdry_data_flat, actx) - bdry_conn = self.dcoll.distributed_boundary_swap_connection( - self.remote_bdry_dd) - swapped_remote_bdry_data = bdry_conn(remote_bdry_data) - - # Complete the nonblocking send request associated with communicating - # `self.local_bdry_data_np` - self.send_req.Wait() - - return TracePair(self.remote_bdry_dd, - interior=self.local_bdry_data, - exterior=swapped_remote_bdry_data) + MPI.Request.waitall(self.recv_reqs) + + def finish_single_array(key, remote_subary_template): + if isinstance(remote_subary_template, Number): + # NOTE: Assumes that the same number is passed on every rank + return remote_subary_template + else: + return from_numpy(self.recv_data[key], self.array_context) + + from arraycontext.container.traversal import rec_keyed_map_array_container + unswapped_remote_bdry_data = rec_keyed_map_array_container( + finish_single_array, self.remote_bdry_data_template) + + remote_to_local = self.dcoll._inter_part_connections[ + self.remote_part_id, self.local_part_id] + + def get_opposite_trace(ary): + if isinstance(ary, Number): + return ary + else: + return remote_to_local(ary) + + from arraycontext import rec_map_array_container + from meshmode.dof_array import DOFArray + remote_bdry_data = rec_map_array_container( + get_opposite_trace, + unswapped_remote_bdry_data, + leaf_class=DOFArray) + + # Complete the nonblocking send requests + MPI.Request.waitall(self.send_reqs) + + return TracePair( + self.local_bdry_dd, + interior=self.local_bdry_data, + exterior=remote_bdry_data) # }}} @@ -496,51 +666,112 @@ def finish(self): class _RankBoundaryCommunicationLazy: def __init__(self, - dcoll: DiscretizationCollection, - array_container: ArrayOrContainer, - remote_rank: int, comm_tag: Hashable, - volume_dd=DD_VOLUME_ALL): + actx: ArrayContext, + dcoll: 
DiscretizationCollection, + *, + local_part_id: PartID, + remote_part_id: PartID, + local_bdry_data: ArrayOrContainer, + remote_bdry_data_template: ArrayOrContainer, + comm_tag: Optional[Hashable] = None) -> None: + if comm_tag is None: - raise ValueError("lazy communication requires 'tag' to be supplied") + raise ValueError("lazy communication requires 'comm_tag' to be supplied") - bdry_dd = volume_dd.trace(BTAG_PARTITION(remote_rank)) + remote_rank = remote_part_id.rank + assert remote_rank is not None self.dcoll = dcoll - self.array_context = get_container_context_recursively(array_container) - self.remote_bdry_dd = bdry_dd - self.bdry_discr = dcoll.discr_from_dd(self.remote_bdry_dd) - - self.local_bdry_data = project( - dcoll, volume_dd, bdry_dd, array_container) - - from pytato import make_distributed_recv, staple_distributed_send - - def communicate_single_array(key, local_bdry_ary): - ary_tag = (comm_tag, key) - return staple_distributed_send( - local_bdry_ary, dest_rank=remote_rank, comm_tag=ary_tag, - stapled_to=make_distributed_recv( + self.array_context = actx + self.local_bdry_dd = DOFDesc( + BoundaryDomainTag( + BTAG_PARTITION(remote_part_id), + volume_tag=local_part_id.volume_tag), + DISCR_TAG_BASE) + self.bdry_discr = dcoll.discr_from_dd(self.local_bdry_dd) + self.local_part_id = local_part_id + self.remote_part_id = remote_part_id + + from pytato import ( + make_distributed_recv, + make_distributed_send, + DistributedSendRefHolder) + + # TODO: This currently assumes that local_bdry_data and + # remote_bdry_data_template have the same structure. This is not true + # in general. Find a way to staple the sends appropriately when the number + # of recvs is not equal to the number of sends + # FIXME: Overly restrictive (just needs to be the same structure) + assert type(local_bdry_data) == type(remote_bdry_data_template) + + sends = {} + + def send_single_array(key, local_subary): + if isinstance(local_subary, Number): + return + else: + ary_tag = (remote_part_id.volume_tag, comm_tag, key) + sends[key] = make_distributed_send( + local_subary, dest_rank=remote_rank, comm_tag=ary_tag) + + def recv_single_array(key, remote_subary_template): + if isinstance(remote_subary_template, Number): + # NOTE: Assumes that the same number is passed on every rank + return remote_subary_template + else: + ary_tag = (local_part_id.volume_tag, comm_tag, key) + return DistributedSendRefHolder( + sends[key], + make_distributed_recv( src_rank=remote_rank, comm_tag=ary_tag, - shape=local_bdry_ary.shape, dtype=local_bdry_ary.dtype, - axes=local_bdry_ary.axes)) + shape=remote_subary_template.shape, + dtype=remote_subary_template.dtype, + axes=remote_subary_template.axes)) from arraycontext.container.traversal import rec_keyed_map_array_container - self.remote_data = rec_keyed_map_array_container( - communicate_single_array, self.local_bdry_data) - def finish(self): - bdry_conn = self.dcoll.distributed_boundary_swap_connection( - self.remote_bdry_dd) + rec_keyed_map_array_container(send_single_array, local_bdry_data) + self.local_bdry_data = local_bdry_data + + self.unswapped_remote_bdry_data = rec_keyed_map_array_container( + recv_single_array, remote_bdry_data_template) - return TracePair(self.remote_bdry_dd, - interior=self.local_bdry_data, - exterior=bdry_conn(self.remote_data)) + def finish(self): + remote_to_local = self.dcoll._inter_part_connections[ + self.remote_part_id, self.local_part_id] + + def get_opposite_trace(ary): + if isinstance(ary, Number): + return ary + else: + return 
remote_to_local(ary) + + from arraycontext import rec_map_array_container + from meshmode.dof_array import DOFArray + remote_bdry_data = rec_map_array_container( + get_opposite_trace, + self.unswapped_remote_bdry_data, + leaf_class=DOFArray) + + return TracePair( + self.local_bdry_dd, + interior=self.local_bdry_data, + exterior=remote_bdry_data) # }}} # {{{ cross_rank_trace_pairs +def _replace_dof_arrays(array_container, dof_array): + from arraycontext import rec_map_array_container + from meshmode.dof_array import DOFArray + return rec_map_array_container( + lambda x: dof_array if isinstance(x, DOFArray) else x, + array_container, + leaf_class=DOFArray) + + def cross_rank_trace_pairs( dcoll: DiscretizationCollection, ary: ArrayOrContainer, tag: Hashable = None, @@ -549,9 +780,9 @@ def cross_rank_trace_pairs( r"""Get a :class:`list` of *ary* trace pairs for each partition boundary. For each partition boundary, the field data values in *ary* are - communicated to/from the neighboring partition. Presumably, this - communication is MPI (but strictly speaking, may not be, and this - routine is agnostic to the underlying communication). + communicated to/from the neighboring part. Presumably, this communication + is MPI (but strictly speaking, may not be, and this routine is agnostic to + the underlying communication). For each face on each partition boundary, a :class:`TracePair` is created with the locally, and @@ -596,14 +827,36 @@ def cross_rank_trace_pairs( # }}} - if isinstance(ary, Number): - # NOTE: Assumed that the same number is passed on every rank - return [TracePair( - volume_dd.trace(BTAG_PARTITION(remote_rank)), - interior=ary, exterior=ary) - for remote_rank in connected_ranks(dcoll, volume_dd=volume_dd)] + if dcoll.mpi_communicator is None: + return [] + + rank = dcoll.mpi_communicator.Get_rank() + + local_part_id = PartID(volume_dd.domain_tag.tag, rank) + + connected_part_ids = connected_parts( + dcoll, self_volume_tag=volume_dd.domain_tag.tag, + other_volume_tag=volume_dd.domain_tag.tag) + + remote_part_ids = [ + part_id + for part_id in connected_part_ids + if part_id.rank != rank] - actx = get_container_context_recursively(ary) + # This asserts that there is only one data exchange per rank, so that + # there is no risk of mismatched data reaching the wrong recipient. + # (Since we have only a single tag.) 
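+    # Multiple volumes sharing a rank pair would need distinct tags per
+    # exchange; that case is handled by cross_rank_inter_volume_trace_pairs,
+    # which mixes the volume tags into the communication tags.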
+    assert len(remote_part_ids) == len({part_id.rank for part_id in remote_part_ids})
+
+    actx = get_container_context_recursively_opt(ary)
+
+    if actx is None:
+        # NOTE: Assumes that the same number is passed on every rank
+        return [
+            TracePair(
+                volume_dd.trace(BTAG_PARTITION(remote_part_id)),
+                interior=ary, exterior=ary)
+            for remote_part_id in remote_part_ids]

     from grudge.array_context import MPIPytatoArrayContextBase

@@ -612,14 +865,171 @@
     else:
         rbc_class = _RankBoundaryCommunicationEager

-    # Initialize and post all sends/receives
-    rank_bdry_communcators = [
-        rbc_class(dcoll, ary, remote_rank, comm_tag=comm_tag, volume_dd=volume_dd)
-        for remote_rank in connected_ranks(dcoll, volume_dd=volume_dd)
-    ]
+    rank_bdry_communicators = []
+
+    for remote_part_id in remote_part_ids:
+        bdry_dd = volume_dd.trace(BTAG_PARTITION(remote_part_id))
+
+        local_bdry_data = project(dcoll, volume_dd, bdry_dd, ary)
+
+        from arraycontext import tag_axes
+        from meshmode.transform_metadata import (
+            DiscretizationElementAxisTag,
+            DiscretizationDOFAxisTag)
+        remote_bdry_zeros = tag_axes(
+            actx, {
+                0: DiscretizationElementAxisTag(),
+                1: DiscretizationDOFAxisTag()},
+            dcoll._inter_part_connections[
+                remote_part_id, local_part_id].from_discr.zeros(actx))
+
+        remote_bdry_data_template = _replace_dof_arrays(
+            local_bdry_data, remote_bdry_zeros)
+
+        rank_bdry_communicators.append(
+            rbc_class(actx, dcoll,
+                local_part_id=local_part_id,
+                remote_part_id=remote_part_id,
+                local_bdry_data=local_bdry_data,
+                remote_bdry_data_template=remote_bdry_data_template,
+                comm_tag=comm_tag))
+
+    return [rbc.finish() for rbc in rank_bdry_communicators]
+
+# }}}
+
+
+# {{{ cross_rank_inter_volume_trace_pairs
+
+def cross_rank_inter_volume_trace_pairs(
+        dcoll: DiscretizationCollection,
+        pairwise_volume_data: Mapping[
+            Tuple[DOFDesc, DOFDesc],
+            Tuple[ArrayOrContainer, ArrayOrContainer]],
+        *, comm_tag: Hashable = None,
+        ) -> Mapping[
+            Tuple[DOFDesc, DOFDesc],
+            List[TracePair]]:
+    # FIXME: Should this interface take in boundary data instead?
+    # TODO: Docs
+    r"""Get inter-volume trace pairs for each cross-rank partition boundary.
+
+    :arg comm_tag: a hashable object used to match sent and received data
+        across ranks. Communication will only match if both endpoints specify
+        objects that compare equal. A generalization of MPI communication
+        tags to arbitrary, potentially composite objects.
+
+    :returns: a :class:`dict` mapping directional volume :class:`DOFDesc`
+        pairs to lists of :class:`TracePair` objects.
+ """ + # {{{ process arguments + + for vol_dd_pair, vol_data_pair in pairwise_volume_data.items(): + for vol_dd in vol_dd_pair: + if not isinstance(vol_dd.domain_tag, VolumeDomainTag): + raise ValueError( + "pairwise_volume_data keys must describe volumes, " + f"got '{vol_dd}'") + if vol_dd.discretization_tag != DISCR_TAG_BASE: + raise ValueError( + "expected base-discretized DOFDesc in pairwise_volume_data, " + f"got '{vol_dd}'") + # FIXME: This check could probably be made more robust + if type(vol_data_pair[0]) != type(vol_data_pair[1]): # noqa: E721 + raise ValueError("heterogeneous inter-volume data not supported.") + + # }}} + + if dcoll.mpi_communicator is None: + return {} + + rank = dcoll.mpi_communicator.Get_rank() + + for vol_data_pair in pairwise_volume_data.values(): + for vol_data in vol_data_pair: + actx = get_container_context_recursively_opt(vol_data) + if actx is not None: + break + if actx is not None: + break + + def get_remote_connected_parts(local_vol_dd, remote_vol_dd): + connected_part_ids = connected_parts( + dcoll, self_volume_tag=local_vol_dd.domain_tag.tag, + other_volume_tag=remote_vol_dd.domain_tag.tag) + return [ + part_id + for part_id in connected_part_ids + if part_id.rank != rank] + + if actx is None: + # NOTE: Assumes that the same number is passed on every rank for a + # given volume + return { + (remote_vol_dd, local_vol_dd): [ + TracePair( + local_vol_dd.trace(BTAG_PARTITION(remote_part_id)), + interior=local_vol_ary, exterior=remote_vol_ary) + for remote_part_id in get_remote_connected_parts( + local_vol_dd, remote_vol_dd)] + for (remote_vol_dd, local_vol_dd), (remote_vol_ary, local_vol_ary) + in pairwise_volume_data.items()} + + from grudge.array_context import MPIPytatoArrayContextBase + + if isinstance(actx, MPIPytatoArrayContextBase): + rbc_class = _RankBoundaryCommunicationLazy + else: + rbc_class = _RankBoundaryCommunicationEager - # Complete send/receives and return communicated data - return [rc.finish() for rc in rank_bdry_communcators] + rank_bdry_communicators = {} + + for vol_dd_pair, vol_data_pair in pairwise_volume_data.items(): + directional_volume_data = { + (vol_dd_pair[0], vol_dd_pair[1]): (vol_data_pair[0], vol_data_pair[1]), + (vol_dd_pair[1], vol_dd_pair[0]): (vol_data_pair[1], vol_data_pair[0])} + + for dd_pair, data_pair in directional_volume_data.items(): + other_vol_dd, self_vol_dd = dd_pair + other_vol_data, self_vol_data = data_pair + + self_part_id = PartID(self_vol_dd.domain_tag.tag, rank) + other_part_ids = get_remote_connected_parts(self_vol_dd, other_vol_dd) + + rbcs = [] + + for other_part_id in other_part_ids: + self_bdry_dd = self_vol_dd.trace(BTAG_PARTITION(other_part_id)) + self_bdry_data = project( + dcoll, self_vol_dd, self_bdry_dd, self_vol_data) + + from arraycontext import tag_axes + from meshmode.transform_metadata import ( + DiscretizationElementAxisTag, + DiscretizationDOFAxisTag) + other_bdry_zeros = tag_axes( + actx, { + 0: DiscretizationElementAxisTag(), + 1: DiscretizationDOFAxisTag()}, + dcoll._inter_part_connections[ + other_part_id, self_part_id].from_discr.zeros(actx)) + + other_bdry_data_template = _replace_dof_arrays( + other_vol_data, other_bdry_zeros) + + rbcs.append( + rbc_class(actx, dcoll, + local_part_id=self_part_id, + remote_part_id=other_part_id, + local_bdry_data=self_bdry_data, + remote_bdry_data_template=other_bdry_data_template, + comm_tag=comm_tag)) + + rank_bdry_communicators[other_vol_dd, self_vol_dd] = rbcs + + return { + directional_vol_dd_pair: [rbc.finish() for rbc in rbcs] + 
for directional_vol_dd_pair, rbcs in rank_bdry_communicators.items()}

 # }}}

diff --git a/test/test_euler_model.py b/test/test_euler_model.py
index 13a479d48..5fec819ba 100644
--- a/test/test_euler_model.py
+++ b/test/test_euler_model.py
@@ -24,22 +24,26 @@
 import pytest

-from grudge.array_context import PytestPyOpenCLArrayContextFactory
+from grudge.array_context import \
+    PytestPyOpenCLArrayContextFactory, PytestPytatoPyOpenCLArrayContextFactory
 from arraycontext import (
-    pytest_generate_tests_for_array_contexts,
+    pytest_generate_tests_for_array_contexts, thaw,
 )
 pytest_generate_tests = pytest_generate_tests_for_array_contexts(
-    [PytestPyOpenCLArrayContextFactory])
+    [PytestPyOpenCLArrayContextFactory,
+     PytestPytatoPyOpenCLArrayContextFactory])

 import grudge.op as op

+import numpy as np
+
 import logging
 logger = logging.getLogger(__name__)


 @pytest.mark.parametrize("order", [1, 2, 3])
-def test_euler_vortex_convergence(actx_factory, order):
-
+@pytest.mark.parametrize("esdg", [False, True])
+def test_euler_vortex_convergence(actx_factory, order, esdg):
     from meshmode.mesh.generation import generate_regular_rect_mesh
     from grudge import DiscretizationCollection
@@ -47,7 +51,8 @@ def test_euler_vortex_convergence(actx_factory, order):
     from grudge.dt_utils import h_max_from_volume
     from grudge.models.euler import (
         vortex_initial_condition,
-        EulerOperator
+        EulerOperator,
+        EntropyStableEulerOperator
     )
     from grudge.shortcuts import rk4_step
@@ -60,6 +65,16 @@ def test_euler_vortex_convergence(actx_factory, order):
     actx = actx_factory()
     eoc_rec = EOCRecorder()
     quad_tag = DISCR_TAG_QUAD
+    if esdg:
+        operator_cls = EntropyStableEulerOperator
+    else:
+        operator_cls = EulerOperator
+
+    if esdg and not actx.supports_nonscalar_broadcasting:
+        pytest.xfail(
+            "Flux-differencing computations require an array context "
+            "that supports non-scalar broadcasting"
+        )

     for resolution in [8, 16, 32]:

@@ -85,7 +100,7 @@ def test_euler_vortex_convergence(actx_factory, order):

     # }}}

-    euler_operator = EulerOperator(
+    euler_operator = operator_cls(
         dcoll,
         flux_type="lf",
         gamma=1.4,
@@ -135,8 +150,55 @@ def rhs(t, q):
     logger.info("\n%s", eoc_rec.pretty_print(abscissa_label="h",
                                              error_label="L2 Error"))
+    assert eoc_rec.order_estimate() >= order + 0.5
+
+
+def test_entropy_variable_roundtrip(actx_factory):
+    from grudge.models.euler import (
+        entropy_to_conservative_vars,
+        conservative_to_entropy_vars,
+        vortex_initial_condition
+    )
+
+    actx = actx_factory()
+    gamma = 1.4  # Adiabatic index (ratio of specific heats) of the gas
+
+    from meshmode.mesh.generation import generate_regular_rect_mesh
+
+    dim = 2
+    res = 5
+    mesh = generate_regular_rect_mesh(
+        a=(0, -5),
+        b=(20, 5),
+        nelements_per_axis=(2*res, res),
+        periodic=(True, True))
+
+    from meshmode.discretization.poly_element import \
+        default_simplex_group_factory
+    from grudge import DiscretizationCollection
+    from grudge.dof_desc import DISCR_TAG_BASE
+
+    order = 3
+    dcoll = DiscretizationCollection(
+        actx, mesh,
+        discr_tag_to_group_factory={
+            DISCR_TAG_BASE: default_simplex_group_factory(dim, order)
+        }
+    )
+
+    # Fields in conserved variables
+    fields = vortex_initial_condition(thaw(dcoll.nodes(), actx))
+
+    # Map back and forth between entropy and conserved vars
+    fields_ev = conservative_to_entropy_vars(fields, gamma)
+    ev_fields_to_cons = entropy_to_conservative_vars(fields_ev, gamma)
+    residual = ev_fields_to_cons - fields
+
+    assert actx.to_numpy(op.norm(dcoll, residual.mass, np.inf)) < 1e-13
+    assert actx.to_numpy(op.norm(dcoll, 
residual.energy, np.inf)) < 1e-13

-    assert (
-        eoc_rec.order_estimate() >= order + 0.5
+    assert all(
+        actx.to_numpy(op.norm(dcoll, residual.momentum[i], np.inf)) < 1e-13
+        for i in range(dim)
     )

diff --git a/test/test_sbp_ops.py b/test/test_sbp_ops.py
new file mode 100644
index 000000000..41bf0d6d1
--- /dev/null
+++ b/test/test_sbp_ops.py
@@ -0,0 +1,171 @@
+__copyright__ = "Copyright (C) 2021 University of Illinois Board of Trustees"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+import numpy as np
+
+from grudge import DiscretizationCollection
+from grudge.dof_desc import DOFDesc, DISCR_TAG_BASE, DISCR_TAG_QUAD
+
+import pytest
+
+from grudge.array_context import PytestPyOpenCLArrayContextFactory
+from arraycontext import pytest_generate_tests_for_array_contexts
+pytest_generate_tests = pytest_generate_tests_for_array_contexts(
+    [PytestPyOpenCLArrayContextFactory])
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+@pytest.mark.parametrize("dim", [1, 2, 3])
+@pytest.mark.parametrize("order", [1, 2, 3, 4])
+def test_reference_element_sbp_operators(actx_factory, dim, order):
+    actx = actx_factory()
+
+    from meshmode.mesh.generation import generate_regular_rect_mesh
+
+    nel_1d = 5
+    box_ll = -5.0
+    box_ur = 5.0
+    mesh = generate_regular_rect_mesh(
+        a=(box_ll,)*dim,
+        b=(box_ur,)*dim,
+        nelements_per_axis=(nel_1d,)*dim)
+
+    from meshmode.discretization.poly_element import \
+        default_simplex_group_factory, QuadratureSimplexGroupFactory
+
+    dcoll = DiscretizationCollection(
+        actx, mesh,
+        discr_tag_to_group_factory={
+            DISCR_TAG_BASE: default_simplex_group_factory(dim, order),
+            DISCR_TAG_QUAD: QuadratureSimplexGroupFactory(2*order)
+        }
+    )
+
+    dd_q = DOFDesc("vol", DISCR_TAG_QUAD)
+    dd_f = DOFDesc("all_faces", DISCR_TAG_QUAD)
+
+    volm_discr = dcoll.discr_from_dd("vol")
+    quad_discr = dcoll.discr_from_dd(dd_q)
+    quad_face_discr = dcoll.discr_from_dd(dd_f)
+
+    from meshmode.discretization.poly_element import diff_matrices
+    from modepy import faces_for_shape, face_normal
+    from grudge.interpolation import (
+        volume_quadrature_interpolation_matrix,
+        surface_quadrature_interpolation_matrix
+    )
+    from grudge.op import reference_inverse_mass_matrix
+
+    for vgrp, qgrp, qfgrp in zip(volm_discr.groups,
+                                 quad_discr.groups,
+                                 quad_face_discr.groups):
+        nq_vol = qgrp.nunit_dofs
+        nq_per_face = qfgrp.nunit_dofs
+        nfaces = vgrp.shape.nfaces
+        nq_faces = nfaces * nq_per_face
+        nq_total = nq_vol + nq_faces
+
+        # {{{ Volume operators
+
+        weights = qgrp.quadrature_rule().weights
+        vdm_q = actx.to_numpy(
+            volume_quadrature_interpolation_matrix(actx, vgrp, qgrp))
+        inv_mass_mat = actx.to_numpy(
+            reference_inverse_mass_matrix(actx, vgrp))
+        p_mat = inv_mass_mat @ (vdm_q.T * weights)
+
+        # }}}
+
+        # Checks Pq @ Vq = Minv @ Vq.T @ W @ Vq = I
+        assert np.allclose(p_mat @ vdm_q,
+                           np.identity(len(inv_mass_mat)), rtol=1e-15)
+
+        # {{{ Surface operators
+
+        faces = faces_for_shape(vgrp.shape)
+        # NOTE: assumes same quadrature rule on all faces
+        face_weights = np.tile(qfgrp.quadrature_rule().weights, nfaces)
+        face_normals = [face_normal(face) for face in faces]
+        e = np.ones(shape=(nq_per_face,))
+        nrstj = [np.concatenate([np.sign(nhat[idx])*e
+                                 for nhat in face_normals])
+                 for idx in range(vgrp.dim)]
+        b_mats = [np.diag(face_weights*nrstj[d]) for d in range(vgrp.dim)]
+        vf_mat = actx.to_numpy(
+            surface_quadrature_interpolation_matrix(
+                actx,
+                base_element_group=vgrp,
+                face_quad_element_group=qfgrp
+            )
+        )
+
+        # }}}
+
+        # {{{ Hybridized (volume + surface) operators
+
+        q_mats = [p_mat.T @ (weights * vdm_q.T @ vdm_q) @ diff_mat @ p_mat
+                  for diff_mat in diff_matrices(vgrp)]
+        e_mat = vf_mat @ p_mat
+        qtilde_mats = 0.5 * np.asarray(
+            [np.block([[q_mats[d] - q_mats[d].T, e_mat.T @ b_mats[d]],
+                       [-b_mats[d] @ e_mat, b_mats[d]]])
+             for d in range(dcoll.dim)]
+        )
+
+        # }}}
+
+        ones = np.ones(shape=(nq_total,))
+        zeros = np.zeros(shape=(nq_total,))
+        for idx in range(dim):
+            # Checks the generalized SBP property:
+            # Qi + Qi.T = E.T @ Bi @ E
+            # cf. Lemma 1, https://arxiv.org/pdf/1708.01243.pdf
+            assert np.allclose(q_mats[idx] + q_mats[idx].T,
+                               e_mat.T @ b_mats[idx] @ e_mat, rtol=1e-15)
+
+            # Checks the SBP-like property for the skew hybridized operator
+            # Qitilde + Qitilde.T = [0 0; 0 Bi]
+            # cf. Theorem 1 and Lemma 1, https://arxiv.org/pdf/1902.01828.pdf
+            residual = qtilde_mats[idx] + qtilde_mats[idx].T
+            residual[nq_vol:nq_vol+nq_faces, nq_vol:nq_vol+nq_faces] -= b_mats[idx]
+            assert np.allclose(residual, np.zeros(residual.shape), rtol=1e-15)
+
+            # Checks the quadrature condition: Qitilde @ ones = zeros
+            # (i.e. the hybridized operator differentiates constants exactly)
+            # cf. Lemma 2, https://arxiv.org/pdf/1902.01828.pdf
+            assert np.allclose(np.dot(qtilde_mats[idx], ones),
+                               zeros, rtol=1e-15)
+
+
+# You can test individual routines by typing
+# $ python test_sbp_ops.py 'test_routine()'
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        pytest.main([__file__])