diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8458ee663..45df5ed57 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -23,6 +23,7 @@ jobs:
       - name: Install system dependencies
         shell: bash
         run: |
+          sudo apt update
           sudo apt install build-essential mpich libmpich-dev \
               libblas-dev liblapack-dev gfortran
 
diff --git a/.gitignore b/.gitignore
index 089f35d42..2d3b4296b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,11 +6,9 @@ PyOP2.egg-info
 *.py[cdo]
 
 # Extension modules
-computeind.c
-computeind.so
-sparsity.cpp
 sparsity.so
-
+sparsity.c
+sparsity.cpython*.so
 # Docs
 pyop2.coffee.rst
 pyop2.rst
diff --git a/pyop2/base.py b/pyop2/base.py
index b16d18687..4d5c7bb78 100644
--- a/pyop2/base.py
+++ b/pyop2/base.py
@@ -169,6 +169,21 @@ def __init__(self, data=None, map=None, access=None, lgmaps=None, unroll_map=Fal
                 raise MapValueError(
                     "To set of %s doesn't match the set of %s." % (map, data))
 
+    def recreate(self, data=None, map=None, access=None, lgmaps=None, unroll_map=None):
+        """Creates a new Dat based on the existing Dat with the changes specified.
+
+        :param data: A data-carrying object, either :class:`Dat` or class:`Mat`
+        :param map:  A :class:`Map` to access this :class:`Arg` or the default
+                     if the identity map is to be used.
+        :param access: An access descriptor of type :class:`Access`
+        :param lgmaps: For :class:`Mat` objects, a tuple of 2-tuples of local to
+            global maps used during assembly."""
+        return type(self)(data=data or self.data,
+                          map=map or self.map,
+                          access=access or self.access,
+                          lgmaps=lgmaps or self.lgmaps,
+                          unroll_map=False if unroll_map is None else unroll_map)
+
     @cached_property
     def _kernel_args_(self):
         return self.data._kernel_args_
@@ -523,6 +538,35 @@ def layers(self):
         """Return None (not an :class:`ExtrudedSet`)."""
         return None
 
+    def _check_operands(self, other):
+        if type(other) is Set:
+            if other is not self:
+                raise ValueError("Uable to perform set operations between two unrelated sets: %s and %s." % (self, other))
+        elif type(other) is Subset:
+            if self is not other._superset:
+                raise TypeError("Superset mismatch: self (%s) != other._superset (%s)" % (self, other._superset))
+        else:
+            raise TypeError("Unable to perform set operations between `Set` and %s." % (type(other), ))
+
+    def intersection(self, other):
+        self._check_operands(other)
+        return other
+
+    def union(self, other):
+        self._check_operands(other)
+        return self
+
+    def difference(self, other):
+        self._check_operands(other)
+        if other is self:
+            return Subset(self, [])
+        else:
+            return type(other)(self, np.setdiff1d(np.asarray(range(self.total_size), dtype=IntType), other._indices))
+
+    def symmetric_difference(self, other):
+        self._check_operands(other)
+        return self.difference(other)
+
 
 class GlobalSet(Set):
 
@@ -771,6 +815,13 @@ def indices(self):
         """Returns the indices pointing in the superset."""
         return self._indices
 
+    @cached_property
+    def owned_indices(self):
+        """Return the indices that correspond to the owned entities of the
+        superset.
+        """
+        return self.indices[self.indices < self.superset.size]
+
     @cached_property
     def layers_array(self):
         if self._superset.constant_layers:
@@ -778,6 +829,44 @@ def layers_array(self):
         else:
             return self._superset.layers_array[self.indices, ...]
 
+    def _check_operands(self, other):
+        if type(other) is Set:
+            if other is not self._superset:
+                raise TypeError("Superset mismatch: self._superset (%s) != other (%s)" % (self._superset, other))
+        elif type(other) is Subset:
+            if self._superset is not other._superset:
+                raise TypeError("Unable to perform set operation between subsets of mismatching supersets (%s != %s)" % (self._superset, other._superset))
+        else:
+            raise TypeError("Unable to perform set operations between `Subset` and %s." % (type(other), ))
+
+    def intersection(self, other):
+        self._check_operands(other)
+        if other is self._superset:
+            return self
+        else:
+            return type(self)(self._superset, np.intersect1d(self._indices, other._indices))
+
+    def union(self, other):
+        self._check_operands(other)
+        if other is self._superset:
+            return other
+        else:
+            return type(self)(self._superset, np.union1d(self._indices, other._indices))
+
+    def difference(self, other):
+        self._check_operands(other)
+        if other is self._superset:
+            return Subset(other, [])
+        else:
+            return type(self)(self._superset, np.setdiff1d(self._indices, other._indices))
+
+    def symmetric_difference(self, other):
+        self._check_operands(other)
+        if other is self._superset:
+            return other.symmetric_difference(self)
+        else:
+            return type(self)(self._superset, np.setxor1d(self._indices, other._indices))
+
 
 class SetPartition(object):
     def __init__(self, set, offset, size):
@@ -1548,41 +1637,18 @@ def zero(self, subset=None):
         """Zero the data associated with this :class:`Dat`
 
         :arg subset: A :class:`Subset` of entries to zero (optional)."""
+
         # Update dat_version
         self.update_dat_version()
 
-        if hasattr(self, "_zero_parloops"):
-            loops = self._zero_parloops
+        # If there is no subset we can safely zero the halo values.
+        if subset is None:
+            self._data[:] = 0
+            self.halo_valid = True
+        elif subset.superset != self.dataset.set:
+            raise MapValueError("The subset and dataset are incompatible")
         else:
-            loops = {}
-            self._zero_parloops = loops
-
-        iterset = subset or self.dataset.set
-
-        loop = loops.get(iterset, None)
-
-        if loop is None:
-            try:
-                knl = self._zero_kernels[(self.dtype, self.cdim)]
-            except KeyError:
-                import islpy as isl
-                import pymbolic.primitives as p
-
-                inames = isl.make_zero_and_vars(["i"])
-                domain = (inames[0].le_set(inames["i"])) & (inames["i"].lt_set(inames[0] + self.cdim))
-                x = p.Variable("dat")
-                i = p.Variable("i")
-                insn = loopy.Assignment(x.index(i), 0, within_inames=frozenset(["i"]))
-                data = loopy.GlobalArg("dat", dtype=self.dtype, shape=(self.cdim,))
-                knl = loopy.make_function([domain], [insn], [data], name="zero", target=loopy.CTarget(), lang_version=(2018, 2))
-
-                knl = _make_object('Kernel', knl, 'zero')
-                self._zero_kernels[(self.dtype, self.cdim)] = knl
-            loop = _make_object('ParLoop', knl,
-                                iterset,
-                                self(WRITE))
-            loops[iterset] = loop
-        loop.compute()
+            self.data[subset.owned_indices] = 0
 
     @collective
     def copy(self, other, subset=None):
@@ -1592,28 +1658,17 @@ def copy(self, other, subset=None):
         :arg subset: A :class:`Subset` of elements to copy (optional)"""
         if other is self:
             return
-        self._copy_parloop(other, subset=subset).compute()
-
-    @collective
-    def _copy_parloop(self, other, subset=None):
-        """Create the :class:`ParLoop` implementing copy."""
-        if not hasattr(self, '_copy_kernel'):
-            import islpy as isl
-            import pymbolic.primitives as p
-            inames = isl.make_zero_and_vars(["i"])
-            domain = (inames[0].le_set(inames["i"])) & (inames["i"].lt_set(inames[0] + self.cdim))
-            _other = p.Variable("other")
-            _self = p.Variable("self")
-            i = p.Variable("i")
-            insn = loopy.Assignment(_other.index(i), _self.index(i), within_inames=frozenset(["i"]))
-            data = [loopy.GlobalArg("self", dtype=self.dtype, shape=(self.cdim,)),
-                    loopy.GlobalArg("other", dtype=other.dtype, shape=(other.cdim,))]
-            knl = loopy.make_function([domain], [insn], data, name="copy", target=loopy.CTarget(), lang_version=(2018, 2))
-
-            self._copy_kernel = _make_object('Kernel', knl, 'copy')
-        return _make_object('ParLoop', self._copy_kernel,
-                            subset or self.dataset.set,
-                            self(READ), other(WRITE))
+        if subset is None:
+            # If the current halo is valid we can also copy these values across.
+            if self.halo_valid:
+                other._data[:] = self._data
+                other.halo_valid = True
+            else:
+                other.data[:] = self.data_ro
+        elif subset.superset != self.dataset.set:
+            raise MapValueError("The subset and dataset are incompatible")
+        else:
+            other.data[subset.owned_indices] = self.data_ro[subset.owned_indices]
 
     def __iter__(self):
         """Yield self when iterated over."""
@@ -1852,8 +1907,6 @@ def __itruediv__(self, other):
         """Pointwise division or scaling of fields."""
         return self._iop(other, operator.itruediv)
 
-    __idiv__ = __itruediv__  # Python 2 compatibility
-
     @collective
     def global_to_local_begin(self, access_mode):
         """Begin a halo exchange from global to ghosted representation.
@@ -2649,6 +2702,41 @@ def __le__(self, o):
         return self == o
 
 
+class PermutedMap(Map):
+    """Composition of a standard :class:`Map` with a constant permutation.
+
+    :arg map_: The map to permute.
+    :arg permutation: The permutation of the map indices.
+
+    Where normally staging to element data is performed as
+
+    .. code-block::
+
+       local[i] = global[map[i]]
+
+    With a :class:`PermutedMap` we instead get
+
+    .. code-block::
+
+       local[i] = global[map[permutation[i]]]
+
+    This might be useful if your local kernel wants data in a
+    different order to the one that the map provides, and you don't
+    want two global-sized data structures.
+    """
+    def __init__(self, map_, permutation):
+        self.map_ = map_
+        self.permutation = np.asarray(permutation, dtype=Map.dtype)
+        assert (np.unique(permutation) == np.arange(map_.arity, dtype=Map.dtype)).all()
+
+    @cached_property
+    def _wrapper_cache_key_(self):
+        return super()._wrapper_cache_key_ + (tuple(self.permutation),)
+
+    def __getattr__(self, name):
+        return getattr(self.map_, name)
+
+
 class MixedMap(Map, ObjectCached):
     r"""A container for a bag of :class:`Map`\s."""
 
@@ -3350,7 +3438,8 @@ class Kernel(Cached):
     @classmethod
     @validate_type(('name', str, NameTypeError))
     def _cache_key(cls, code, name, opts={}, include_dirs=[], headers=[],
-                   user_code="", ldargs=None, cpp=False, requires_zeroed_output_arguments=False):
+                   user_code="", ldargs=None, cpp=False, requires_zeroed_output_arguments=False,
+                   flop_count=None):
         # Both code and name are relevant since there might be multiple kernels
         # extracting different functions from the same code
         # Also include the PyOP2 version, since the Kernel class might change
@@ -3372,7 +3461,8 @@ def _wrapper_cache_key_(self):
         return (self._key, )
 
     def __init__(self, code, name, opts={}, include_dirs=[], headers=[],
-                 user_code="", ldargs=None, cpp=False, requires_zeroed_output_arguments=False):
+                 user_code="", ldargs=None, cpp=False, requires_zeroed_output_arguments=False,
+                 flop_count=None):
         # Protect against re-initialization when retrieved from cache
         if self._initialized:
             return
@@ -3388,6 +3478,7 @@ def __init__(self, code, name, opts={}, include_dirs=[], headers=[],
         self._code = code
         self._initialized = True
         self.requires_zeroed_output_arguments = requires_zeroed_output_arguments
+        self.flop_count = flop_count
 
     @property
     def name(self):
@@ -3400,6 +3491,8 @@ def code(self):
 
     @cached_property
     def num_flops(self):
+        if self.flop_count is not None:
+            return self.flop_count
         if not configuration["compute_kernel_flops"]:
             return 0
         if isinstance(self.code, Node):
diff --git a/pyop2/codegen/builder.py b/pyop2/codegen/builder.py
index 5167e20a4..50d57ca25 100644
--- a/pyop2/codegen/builder.py
+++ b/pyop2/codegen/builder.py
@@ -16,7 +16,7 @@
                                           When, Zero)
 from pyop2.datatypes import IntType
 from pyop2.op2 import (ALL, INC, MAX, MIN, ON_BOTTOM, ON_INTERIOR_FACETS,
-                       ON_TOP, READ, RW, WRITE, Subset)
+                       ON_TOP, READ, RW, WRITE, Subset, PermutedMap)
 from pyop2.utils import cached_property
 
 
@@ -30,7 +30,7 @@ class Map(object):
 
     __slots__ = ("values", "offset", "interior_horizontal",
                  "variable", "unroll", "layer_bounds",
-                 "prefetch")
+                 "prefetch", "permutation")
 
     def __init__(self, map_, interior_horizontal, layer_bounds,
                  values=None, offset=None, unroll=False):
@@ -50,11 +50,17 @@ def __init__(self, map_, interior_horizontal, layer_bounds,
         offset = map_.offset
         shape = (None, ) + map_.shape[1:]
         values = Argument(shape, dtype=map_.dtype, pfx="map")
+        if isinstance(map_, PermutedMap):
+            self.permutation = NamedLiteral(map_.permutation, parent=values, suffix="permutation")
+            if offset is not None:
+                offset = offset[map_.permutation]
+        else:
+            self.permutation = None
         if offset is not None:
             if len(set(map_.offset)) == 1:
                 offset = Literal(offset[0], casting=True)
             else:
-                offset = NamedLiteral(offset, name=values.name + "_offset")
+                offset = NamedLiteral(offset, parent=values, suffix="offset")
 
         self.values = values
         self.offset = offset
@@ -76,7 +82,10 @@ def indexed(self, multiindex, layer=None):
             base_key = None
             if base_key not in self.prefetch:
                 j = Index()
-                base = Indexed(self.values, (n, j))
+                if self.permutation is None:
+                    base = Indexed(self.values, (n, j))
+                else:
+                    base = Indexed(self.values, (n, Indexed(self.permutation, (j,))))
                 self.prefetch[base_key] = Materialise(PackInst(), base, MultiIndex(j))
 
             base = self.prefetch[base_key]
@@ -103,7 +112,10 @@ def indexed(self, multiindex, layer=None):
             return Indexed(self.prefetch[key], (f, i)), (f, i)
         else:
             assert f.extent == 1 or f.extent is None
-            base = Indexed(self.values, (n, i))
+            if self.permutation is None:
+                base = Indexed(self.values, (n, i))
+            else:
+                base = Indexed(self.values, (n, Indexed(self.permutation, (i,))))
             return base, (f, i)
 
     def indexed_vector(self, n, shape, layer=None):
diff --git a/pyop2/codegen/loopycompat.py b/pyop2/codegen/loopycompat.py
index d8a3aa9ec..3eeec83cc 100644
--- a/pyop2/codegen/loopycompat.py
+++ b/pyop2/codegen/loopycompat.py
@@ -85,7 +85,7 @@ def _shape_1_if_empty(shape_caller, shape_callee):
                 get_kw_pos_association)
         _, pos_to_kw = get_kw_pos_association(callee_knl)
         arg_id_to_shape = {}
-        for arg_id, arg in insn.arg_id_to_val().items():
+        for arg_id, arg in insn.arg_id_to_arg().items():
             arg_id = pos_to_kw[arg_id]
 
             arg_descr = get_arg_descriptor_for_expression(caller_knl, arg)
diff --git a/pyop2/codegen/optimise.py b/pyop2/codegen/optimise.py
index f7a3550e5..f0a7b58b9 100644
--- a/pyop2/codegen/optimise.py
+++ b/pyop2/codegen/optimise.py
@@ -1,8 +1,7 @@
 from pyop2.codegen.node import traversal, reuse_if_untouched, Memoizer
 from functools import singledispatch
 from pyop2.codegen.representation import (Index, RuntimeIndex, Node,
-                                          FunctionCall, Variable, Argument,
-                                          NamedLiteral)
+                                          FunctionCall, Variable, Argument)
 
 
 def collect_indices(expressions):
@@ -90,7 +89,7 @@ def index_merger(instructions, cache=None):
 
 @singledispatch
 def _rename_node(node, self):
-    """Replace division with multiplication
+    """Rename nodes
 
     :param node: root of expression
     :param self: function for recursive calls
@@ -103,7 +102,7 @@ def _rename_node(node, self):
 
 @_rename_node.register(Index)
 def _rename_node_index(node, self):
-    name = self.replace.get(node, node.name)
+    name = self.renamer(node)
     return Index(extent=node.extent, name=name)
 
 
@@ -114,38 +113,25 @@ def _rename_node_func(node, self):
     return FunctionCall(node.name, node.label, node.access, free_indices, *children)
 
 
-@_rename_node.register(RuntimeIndex)
-def _rename_node_rtindex(node, self):
-    children = tuple(map(self, node.children))
-    name = self.replace.get(node, node.name)
-    return RuntimeIndex(*children, name=name)
-
-
-@_rename_node.register(NamedLiteral)
-def _rename_node_namedliteral(node, self):
-    name = self.replace.get(node, node.name)
-    return NamedLiteral(node.value, name)
-
-
 @_rename_node.register(Variable)
 def _rename_node_variable(node, self):
-    name = self.replace.get(node, node.name)
+    name = self.renamer(node)
     return Variable(name, node.shape, node.dtype)
 
 
 @_rename_node.register(Argument)
 def _rename_node_argument(node, self):
-    name = self.replace.get(node, node.name)
+    name = self.renamer(node)
     return Argument(node.shape, node.dtype, name=name)
 
 
-def rename_nodes(instructions, replace):
+def rename_nodes(instructions, renamer):
     """Rename the nodes in the instructions.
 
     :param instructions: Iterable of nodes.
-    :param replace: Dictionary matching old names to new names.
+    :param renamer: Function that maps nodes to new names
     :return: List of instructions with nodes renamed.
     """
     mapper = Memoizer(_rename_node)
-    mapper.replace = replace
+    mapper.renamer = renamer
     return list(map(mapper, instructions))
diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py
index af703f693..2dd21310e 100644
--- a/pyop2/codegen/rep2loopy.py
+++ b/pyop2/codegen/rep2loopy.py
@@ -15,7 +15,6 @@
 from collections import OrderedDict, defaultdict
 from functools import singledispatch, reduce
 import itertools
-import re
 import operator
 
 from pyop2.codegen.node import traversal, Node, Memoizer, reuse_if_untouched
@@ -430,24 +429,35 @@ def generate(builder, wrapper_name=None):
     instructions = instructions + initialiser
     mapper.initialisers = [tuple(merger(i) for i in inits) for inits in mapper.initialisers]
 
+    def name_generator(prefix):
+        yield from (f"{prefix}{i}" for i in itertools.count())
+
     # rename indices and nodes (so that the counters start from zero)
-    pattern = re.compile(r"^([a-zA-Z_]+)([0-9]+)(_offset)?$")
-    replacements = {}
-    counter = defaultdict(itertools.count)
-    for node in traversal(instructions):
-        if isinstance(node, (Index, RuntimeIndex, Variable, Argument, NamedLiteral)):
-            match = pattern.match(node.name)
-            if match is None:
-                continue
-            prefix, _, postfix = match.groups()
-            if postfix is None:
-                postfix = ""
-            replacements[node] = "%s%d%s" % (prefix, next(counter[(prefix, postfix)]), postfix)
-
-    instructions = rename_nodes(instructions, replacements)
-    mapper.initialisers = [rename_nodes(inits, replacements) for inits in mapper.initialisers]
-    parameters.wrapper_arguments = rename_nodes(parameters.wrapper_arguments, replacements)
-    s, e = rename_nodes([mapper(e) for e in builder.layer_extents], replacements)
+    node_names = {}
+    node_namers = dict((cls, name_generator(prefix))
+                       for cls, prefix in [(Index, "i"), (Variable, "t")])
+
+    def renamer(expr):
+        if isinstance(expr, Argument):
+            if expr._name is not None:
+                # Some arguments have given names
+                return expr._name
+            else:
+                # Otherwise generate one with their given prefix.
+                namer = node_namers.setdefault((type(expr), expr.prefix),
+                                               name_generator(expr.prefix))
+        else:
+            namer = node_namers[type(expr)]
+        try:
+            return node_names[expr]
+        except KeyError:
+            return node_names.setdefault(expr, next(namer))
+
+    instructions = rename_nodes(instructions, renamer)
+    mapper.initialisers = [rename_nodes(inits, renamer)
+                           for inits in mapper.initialisers]
+    parameters.wrapper_arguments = rename_nodes(parameters.wrapper_arguments, renamer)
+    s, e = rename_nodes([mapper(e) for e in builder.layer_extents], renamer)
     parameters.layer_start = s.name
     parameters.layer_end = e.name
 
diff --git a/pyop2/codegen/representation.py b/pyop2/codegen/representation.py
index 9a4025505..58f5b18f9 100644
--- a/pyop2/codegen/representation.py
+++ b/pyop2/codegen/representation.py
@@ -83,8 +83,8 @@ class IndexBase(metaclass=ABCMeta):
 
 class Index(Terminal, Scalar):
     _count = itertools.count()
-    __slots__ = ("name", "extent", "merge")
-    __front__ = ("name", "extent", "merge")
+    __slots__ = ("extent", "merge", "name")
+    __front__ = ("extent", "merge", "name")
 
     def __init__(self, extent=None, merge=True, name=None):
         self.name = name or "i%d" % next(Index._count)
@@ -118,11 +118,12 @@ def __init__(self, value):
 
 class RuntimeIndex(Scalar):
     _count = itertools.count()
-    __slots__ = ("name", "children")
+    __slots__ = ("children", "name")
     __back__ = ("name", )
 
-    def __init__(self, lo, hi, constraint, name=None):
-        self.name = name or "r%d" % next(RuntimeIndex._count)
+    def __init__(self, lo, hi, constraint, name):
+        assert name is not None, "runtime indices need a name"
+        self.name = name
         self.children = lo, hi, constraint
 
     @cached_property
@@ -173,17 +174,23 @@ def __init__(self, name):
 class Argument(Terminal):
     _count = defaultdict(partial(itertools.count))
 
-    __slots__ = ("shape", "dtype", "name")
-    __front__ = ("shape", "dtype", "name")
+    __slots__ = ("shape", "dtype", "_name", "prefix", "_gen_name")
+    __front__ = ("shape", "dtype", "_name", "prefix")
 
     def __init__(self, shape, dtype, name=None, pfx=None):
         self.dtype = dtype
         self.shape = shape
-        if name is None:
-            if pfx is None:
-                pfx = "v"
-            name = "%s%d" % (pfx, next(Argument._count[pfx]))
-        self.name = name
+        self._name = name
+        pfx = pfx or "v"
+        self.prefix = pfx
+        self._gen_name = name or "%s%d" % (pfx, next(Argument._count[pfx]))
+
+    def get_hash(self):
+        return hash((type(self),) + self._cons_args(self.children) + (self.name,))
+
+    @property
+    def name(self):
+        return self._name or self._gen_name
 
 
 class Literal(Terminal, Scalar):
@@ -218,19 +225,22 @@ def dtype(self):
 
 
 class NamedLiteral(Terminal):
-    __slots__ = ("value", "name")
-    __front__ = ("value", "name")
+    __slots__ = ("value", "parent", "suffix")
+    __front__ = ("value", "parent", "suffix")
 
-    def __init__(self, value, name):
+    def __init__(self, value, parent, suffix):
         self.value = value
-        self.name = name
+        self.parent = parent
+        self.suffix = suffix
 
     def is_equal(self, other):
         if type(self) != type(other):
             return False
         if self.shape != other.shape:
             return False
-        if self.name != other.name:
+        if self.parent != other.parent:
+            return False
+        if self.suffix != other.suffix:
             return False
         return tuple(self.value.flat) == tuple(other.value.flat)
 
@@ -245,6 +255,10 @@ def shape(self):
     def dtype(self):
         return self.value.dtype
 
+    @property
+    def name(self):
+        return f"{self.parent.name}_{self.suffix}"
+
 
 class Min(Scalar):
     __slots__ = ("children", )
diff --git a/pyop2/compilation.py b/pyop2/compilation.py
index e5a9fefdd..2d72dbb11 100644
--- a/pyop2/compilation.py
+++ b/pyop2/compilation.py
@@ -33,6 +33,7 @@
 
 
 import os
+import shutil
 import subprocess
 import sys
 import ctypes
@@ -500,31 +501,27 @@ def clear_cache(prompt=False):
     :arg prompt: if ``True`` prompt before removing any files
     """
     cachedir = configuration['cache_dir']
+
     if not os.path.exists(cachedir):
+        print("Cache directory could not be found")
         return
-
-    files = [os.path.join(cachedir, f) for f in os.listdir(cachedir)
-             if os.path.isfile(os.path.join(cachedir, f))]
-    nfiles = len(files)
-
-    if nfiles == 0:
+    if len(os.listdir(cachedir)) == 0:
         print("No cached libraries to remove")
         return
 
     remove = True
     if prompt:
-
-        user = input("Remove %d cached libraries from %s? [Y/n]: " % (nfiles, cachedir))
+        user = input(f"Remove cached libraries from {cachedir}? [Y/n]: ")
 
         while user.lower() not in ['', 'y', 'n']:
             print("Please answer y or n.")
-            user = input("Remove %d cached libraries from %s? [Y/n]: " % (nfiles, cachedir))
+            user = input(f"Remove cached libraries from {cachedir}? [Y/n]: ")
 
         if user.lower() == 'n':
             remove = False
 
     if remove:
-        print("Removing %d cached libraries from %s" % (nfiles, cachedir))
-        [os.remove(f) for f in files]
+        print(f"Removing cached libraries from {cachedir}")
+        shutil.rmtree(cachedir)
     else:
         print("Not removing cached libraries")
diff --git a/pyop2/op2.py b/pyop2/op2.py
index 3082b5556..84ac26056 100644
--- a/pyop2/op2.py
+++ b/pyop2/op2.py
@@ -43,7 +43,7 @@
 from pyop2.sequential import READ, WRITE, RW, INC, MIN, MAX  # noqa: F401
 from pyop2.base import ON_BOTTOM, ON_TOP, ON_INTERIOR_FACETS, ALL  # noqa: F401
 from pyop2.sequential import Set, ExtrudedSet, MixedSet, Subset, DataSet, MixedDataSet  # noqa: F401
-from pyop2.sequential import Map, MixedMap, Sparsity, Halo  # noqa: F401
+from pyop2.sequential import Map, MixedMap, PermutedMap, Sparsity, Halo  # noqa: F401
 from pyop2.sequential import Global, GlobalDataSet        # noqa: F401
 from pyop2.sequential import Dat, MixedDat, DatView, Mat  # noqa: F401
 from pyop2.sequential import ParLoop as SeqParLoop
@@ -59,7 +59,7 @@
            'MixedSet', 'Subset', 'DataSet', 'GlobalDataSet', 'MixedDataSet',
            'Halo', 'Dat', 'MixedDat', 'Mat', 'Global', 'Map', 'MixedMap',
            'Sparsity', 'par_loop', 'ParLoop',
-           'DatView']
+           'DatView', 'PermutedMap']
 
 
 def ParLoop(kernel, *args, **kwargs):
diff --git a/pyop2/petsc_base.py b/pyop2/petsc_base.py
index 592d03ab4..1cdfe9502 100644
--- a/pyop2/petsc_base.py
+++ b/pyop2/petsc_base.py
@@ -986,7 +986,7 @@ def mult(self, mat, x, y):
                 # Column matrix
                 if x.sizes[1] == 1:
                     v.copy(y)
-                    a = np.zeros(1)
+                    a = np.zeros(1, dtype=ScalarType)
                     if x.comm.rank == 0:
                         a[0] = x.array_r
                     else:
@@ -1002,7 +1002,7 @@ def multTranspose(self, mat, x, y):
                 # Row matrix
                 if x.sizes[1] == 1:
                     v.copy(y)
-                    a = np.zeros(1)
+                    a = np.zeros(1, dtype=ScalarType)
                     if x.comm.rank == 0:
                         a[0] = x.array_r
                     else:
@@ -1026,7 +1026,7 @@ def multTransposeAdd(self, mat, x, y, z):
                 # Row matrix
                 if x.sizes[1] == 1:
                     v.copy(z)
-                    a = np.zeros(1)
+                    a = np.zeros(1, dtype=ScalarType)
                     if x.comm.rank == 0:
                         a[0] = x.array_r
                     else:
diff --git a/pyop2/sequential.py b/pyop2/sequential.py
index 1dbab1c18..ff8189be0 100644
--- a/pyop2/sequential.py
+++ b/pyop2/sequential.py
@@ -45,7 +45,7 @@
 from pyop2.base import par_loop                          # noqa: F401
 from pyop2.base import READ, WRITE, RW, INC, MIN, MAX    # noqa: F401
 from pyop2.base import ALL
-from pyop2.base import Map, MixedMap, Sparsity, Halo      # noqa: F401
+from pyop2.base import Map, MixedMap, PermutedMap, Sparsity, Halo  # noqa: F401
 from pyop2.base import Set, ExtrudedSet, MixedSet, Subset  # noqa: F401
 from pyop2.base import DatView                           # noqa: F401
 from pyop2.base import Kernel                            # noqa: F401
@@ -58,6 +58,7 @@
 from pyop2.profiling import timed_region
 from pyop2.utils import cached_property, get_petsc_dir
 
+from petsc4py import PETSc
 import loopy
 
 
@@ -131,6 +132,7 @@ def code_to_compile(self):
             return preamble + "\nextern \"C\" {\n" + device_code + "\n}\n"
         return code.device_code()
 
+    @PETSc.Log.EventDecorator()
     @collective
     def compile(self):
         # If we weren't in the cache we /must/ have arguments
diff --git a/pyop2/utils.py b/pyop2/utils.py
index 85190e7d9..0fc59901d 100644
--- a/pyop2/utils.py
+++ b/pyop2/utils.py
@@ -327,8 +327,11 @@ def get_petsc_dir():
         return (dir, dir + arch)
     except KeyError:
         try:
-            import petsc
-            return (petsc.get_petsc_dir(), )
+            import petsc4py
+            config = petsc4py.get_config()
+            petsc_dir = config["PETSC_DIR"]
+            petsc_arch = config["PETSC_ARCH"]
+            return petsc_dir, petsc_dir + petsc_arch
         except ImportError:
             sys.exit("""Error: Could not find PETSc library.
 
diff --git a/requirements-git.txt b/requirements-git.txt
index 26bdc5abc..438205043 100644
--- a/requirements-git.txt
+++ b/requirements-git.txt
@@ -1,2 +1,2 @@
 git+https://github.com/coneoproject/COFFEE.git#egg=coffee
-git+https://github.com/firedrakeproject/loopy.git@firedrake#egg=loopy
+git+https://github.com/firedrakeproject/loopy.git@main#egg=loopy
diff --git a/test/unit/test_api.py b/test/unit/test_api.py
index 5c3f85aa1..eee28bb35 100644
--- a/test/unit/test_api.py
+++ b/test/unit/test_api.py
@@ -809,7 +809,7 @@ def test_dat_float(self, dset):
     def test_dat_int(self, dset):
         "Data type for int data should be numpy.int."
         d = op2.Dat(dset, [1] * dset.size * dset.cdim)
-        assert d.dtype == np.int
+        assert d.dtype == np.asarray(1).dtype
 
     def test_dat_convert_int_float(self, dset):
         "Explicit float type should override NumPy's default choice of int."
@@ -909,7 +909,7 @@ def test_mixed_dat_illegal_arg(self):
     def test_mixed_dat_illegal_dtype(self, set):
         """Constructing a MixedDat from Dats of different dtype should fail."""
         with pytest.raises(exceptions.DataValueError):
-            op2.MixedDat((op2.Dat(set, dtype=np.int), op2.Dat(set)))
+            op2.MixedDat((op2.Dat(set, dtype=np.int32), op2.Dat(set, dtype=np.float64)))
 
     def test_mixed_dat_dats(self, dats):
         """Constructing a MixedDat from an iterable of Dats should leave them
@@ -1303,22 +1303,22 @@ def test_global_dim_list(self):
     def test_global_float(self):
         "Data type for float data should be numpy.float64."
         g = op2.Global(1, 1.0)
-        assert g.dtype == np.double
+        assert g.dtype == np.asarray(1.0).dtype
 
     def test_global_int(self):
         "Data type for int data should be numpy.int."
         g = op2.Global(1, 1)
-        assert g.dtype == np.int
+        assert g.dtype == np.asarray(1).dtype
 
     def test_global_convert_int_float(self):
         "Explicit float type should override NumPy's default choice of int."
-        g = op2.Global(1, 1, 'double')
+        g = op2.Global(1, 1, dtype=np.float64)
         assert g.dtype == np.float64
 
     def test_global_convert_float_int(self):
         "Explicit int type should override NumPy's default choice of float."
-        g = op2.Global(1, 1.5, 'int')
-        assert g.dtype == np.int
+        g = op2.Global(1, 1.5, dtype=np.int64)
+        assert g.dtype == np.int64
 
     def test_global_illegal_dtype(self):
         "Illegal data type should raise DataValueError."
diff --git a/test/unit/test_caching.py b/test/unit/test_caching.py
index 11a6c343c..f3c68e0ef 100644
--- a/test/unit/test_caching.py
+++ b/test/unit/test_caching.py
@@ -34,7 +34,6 @@
 
 import pytest
 import numpy
-import random
 from pyop2 import op2, base
 
 from coffee.base import *
@@ -99,15 +98,13 @@ def y(dindset):
 
 @pytest.fixture
 def iter2ind1(iterset, indset):
-    u_map = numpy.array(list(range(nelems)), dtype=numpy.uint32)
-    random.shuffle(u_map, _seed)
+    u_map = numpy.array(list(range(nelems)), dtype=numpy.uint32)[::-1]
     return op2.Map(iterset, indset, 1, u_map, "iter2ind1")
 
 
 @pytest.fixture
 def iter2ind2(iterset, indset):
-    u_map = numpy.array(list(range(nelems)) * 2, dtype=numpy.uint32)
-    random.shuffle(u_map, _seed)
+    u_map = numpy.array(list(range(nelems)) * 2, dtype=numpy.uint32)[::-1]
     return op2.Map(iterset, indset, 2, u_map, "iter2ind2")
 
 
diff --git a/test/unit/test_callables.py b/test/unit/test_callables.py
index c42e19c97..98be8ff0f 100644
--- a/test/unit/test_callables.py
+++ b/test/unit/test_callables.py
@@ -75,7 +75,7 @@ def test_inverse_callable(self, zero_mat, inv_mat):
         loopy.set_caching_enabled(False)
 
         k = loopy.make_kernel(
-            ["{[i,j] : 0 <= i,j < 2}"],
+            ["{ : }"],
             """
             B[:,:] = inverse(A[:,:])
             """,
@@ -99,7 +99,7 @@ def test_solve_callable(self, zero_vec, solve_mat, solve_vec):
         loopy.set_caching_enabled(False)
 
         k = loopy.make_kernel(
-            ["{[i,j] : 0 <= i,j < 2}"],
+            ["{ : }"],
             """
             x[:] = solve(A[:,:], b[:])
             """,
diff --git a/test/unit/test_extrusion.py b/test/unit/test_extrusion.py
index 8f84621db..dfae39b60 100644
--- a/test/unit/test_extrusion.py
+++ b/test/unit/test_extrusion.py
@@ -50,7 +50,7 @@ def compute_ind_extr(nums,
                      map,
                      lsize):
     count = 0
-    ind = numpy.zeros(lsize, dtype=numpy.int)
+    ind = numpy.zeros(lsize, dtype=numpy.int32)
     len1 = len(mesh2d)
     for mm in range(lins):
         offset = 0
diff --git a/test/unit/test_indirect_loop.py b/test/unit/test_indirect_loop.py
index 09347cfed..b1f4e3cbe 100644
--- a/test/unit/test_indirect_loop.py
+++ b/test/unit/test_indirect_loop.py
@@ -34,7 +34,6 @@
 
 import pytest
 import numpy as np
-import random
 
 from pyop2 import op2
 from pyop2.exceptions import MapValueError
@@ -81,8 +80,7 @@ def x2(indset):
 @pytest.fixture
 def mapd():
     mapd = list(range(nelems))
-    random.shuffle(mapd, lambda: 0.02041724)
-    return mapd
+    return mapd[::-1]
 
 
 @pytest.fixture
diff --git a/test/unit/test_subset.py b/test/unit/test_subset.py
index 156ae9b7b..a2c2f9f9b 100644
--- a/test/unit/test_subset.py
+++ b/test/unit/test_subset.py
@@ -57,7 +57,7 @@ class TestSubSet:
 
     def test_direct_loop(self, iterset):
         """Test a direct ParLoop on a subset"""
-        indices = np.array([i for i in range(nelems) if not i % 2], dtype=np.int)
+        indices = np.array([i for i in range(nelems) if not i % 2], dtype=np.int32)
         ss = op2.Subset(iterset, indices)
 
         d = op2.Dat(iterset ** 1, data=None, dtype=np.uint32)
@@ -77,8 +77,8 @@ def test_direct_loop_empty(self, iterset):
 
     def test_direct_complementary_subsets(self, iterset):
         """Test direct par_loop over two complementary subsets"""
-        even = np.array([i for i in range(nelems) if not i % 2], dtype=np.int)
-        odd = np.array([i for i in range(nelems) if i % 2], dtype=np.int)
+        even = np.array([i for i in range(nelems) if not i % 2], dtype=np.int32)
+        odd = np.array([i for i in range(nelems) if i % 2], dtype=np.int32)
 
         sseven = op2.Subset(iterset, even)
         ssodd = op2.Subset(iterset, odd)
@@ -91,8 +91,8 @@ def test_direct_complementary_subsets(self, iterset):
 
     def test_direct_complementary_subsets_with_indexing(self, iterset):
         """Test direct par_loop over two complementary subsets"""
-        even = np.arange(0, nelems, 2, dtype=np.int)
-        odd = np.arange(1, nelems, 2, dtype=np.int)
+        even = np.arange(0, nelems, 2, dtype=np.int32)
+        odd = np.arange(1, nelems, 2, dtype=np.int32)
 
         sseven = iterset(even)
         ssodd = iterset(odd)
@@ -104,16 +104,16 @@ def test_direct_complementary_subsets_with_indexing(self, iterset):
         assert (d.data == 1).all()
 
     def test_direct_loop_sub_subset(self, iterset):
-        indices = np.arange(0, nelems, 2, dtype=np.int)
+        indices = np.arange(0, nelems, 2, dtype=np.int32)
         ss = op2.Subset(iterset, indices)
-        indices = np.arange(0, nelems//2, 2, dtype=np.int)
+        indices = np.arange(0, nelems//2, 2, dtype=np.int32)
         sss = op2.Subset(ss, indices)
 
         d = op2.Dat(iterset ** 1, data=None, dtype=np.uint32)
         k = op2.Kernel("static void inc(unsigned int* v) { *v += 1; }", "inc")
         op2.par_loop(k, sss, d(op2.RW))
 
-        indices = np.arange(0, nelems, 4, dtype=np.int)
+        indices = np.arange(0, nelems, 4, dtype=np.int32)
         ss2 = op2.Subset(iterset, indices)
         d2 = op2.Dat(iterset ** 1, data=None, dtype=np.uint32)
         op2.par_loop(k, ss2, d2(op2.RW))
@@ -121,16 +121,16 @@ def test_direct_loop_sub_subset(self, iterset):
         assert (d.data == d2.data).all()
 
     def test_direct_loop_sub_subset_with_indexing(self, iterset):
-        indices = np.arange(0, nelems, 2, dtype=np.int)
+        indices = np.arange(0, nelems, 2, dtype=np.int32)
         ss = iterset(indices)
-        indices = np.arange(0, nelems//2, 2, dtype=np.int)
+        indices = np.arange(0, nelems//2, 2, dtype=np.int32)
         sss = ss(indices)
 
         d = op2.Dat(iterset ** 1, data=None, dtype=np.uint32)
         k = op2.Kernel("static void inc(unsigned int* v) { *v += 1; }", "inc")
         op2.par_loop(k, sss, d(op2.RW))
 
-        indices = np.arange(0, nelems, 4, dtype=np.int)
+        indices = np.arange(0, nelems, 4, dtype=np.int32)
         ss2 = iterset(indices)
         d2 = op2.Dat(iterset ** 1, data=None, dtype=np.uint32)
         op2.par_loop(k, ss2, d2(op2.RW))
@@ -139,7 +139,7 @@ def test_direct_loop_sub_subset_with_indexing(self, iterset):
 
     def test_indirect_loop(self, iterset):
         """Test a indirect ParLoop on a subset"""
-        indices = np.array([i for i in range(nelems) if not i % 2], dtype=np.int)
+        indices = np.array([i for i in range(nelems) if not i % 2], dtype=np.int32)
         ss = op2.Subset(iterset, indices)
 
         indset = op2.Set(2, "indset")
@@ -167,7 +167,7 @@ def test_indirect_loop_empty(self, iterset):
 
     def test_indirect_loop_with_direct_dat(self, iterset):
         """Test a indirect ParLoop on a subset"""
-        indices = np.array([i for i in range(nelems) if not i % 2], dtype=np.int)
+        indices = np.array([i for i in range(nelems) if not i % 2], dtype=np.int32)
         ss = op2.Subset(iterset, indices)
 
         indset = op2.Set(2, "indset")
@@ -185,8 +185,8 @@ def test_indirect_loop_with_direct_dat(self, iterset):
 
     def test_complementary_subsets(self, iterset):
         """Test par_loop on two complementary subsets"""
-        even = np.array([i for i in range(nelems) if not i % 2], dtype=np.int)
-        odd = np.array([i for i in range(nelems) if i % 2], dtype=np.int)
+        even = np.array([i for i in range(nelems) if not i % 2], dtype=np.int32)
+        odd = np.array([i for i in range(nelems) if i % 2], dtype=np.int32)
 
         sseven = op2.Subset(iterset, even)
         ssodd = op2.Subset(iterset, odd)
@@ -216,7 +216,7 @@ def test_matrix(self):
         ss10 = op2.Subset(iterset, [1, 0])
         indset = op2.Set(4)
 
-        dat = op2.Dat(idset ** 1, data=[0, 1], dtype=np.float)
+        dat = op2.Dat(idset ** 1, data=[0, 1], dtype=np.float64)
         map = op2.Map(iterset, indset, 4, [0, 1, 2, 3, 0, 1, 2, 3])
         idmap = op2.Map(iterset, idset, 1, [0, 1])
         sparsity = op2.Sparsity((indset, indset), (map, map))
@@ -253,3 +253,62 @@ def test_matrix(self):
 
         assert (mat01.values == mat.values).all()
         assert (mat10.values == mat.values).all()
+
+
+class TestSetOperations:
+
+    """
+    Set operation tests
+    """
+
+    def test_set_set_operations(self):
+        """Test standard set operations between a set and itself"""
+        a = op2.Set(10)
+        u = a.union(a)
+        i = a.intersection(a)
+        d = a.difference(a)
+        s = a.symmetric_difference(a)
+        assert u is a
+        assert i is a
+        assert d._indices.size == 0
+        assert s._indices.size == 0
+
+    def test_set_subset_operations(self):
+        """Test standard set operations between a set and a subset"""
+        a = op2.Set(10)
+        b = op2.Subset(a, np.array([2, 3, 5, 7], dtype=np.int32))
+        u = a.union(b)
+        i = a.intersection(b)
+        d = a.difference(b)
+        s = a.symmetric_difference(b)
+        assert u is a
+        assert i is b
+        assert (d._indices == [0, 1, 4, 6, 8, 9]).all()
+        assert (s._indices == d._indices).all()
+
+    def test_subset_set_operations(self):
+        """Test standard set operations between a subset and a set"""
+        a = op2.Set(10)
+        b = op2.Subset(a, np.array([2, 3, 5, 7], dtype=np.int32))
+        u = b.union(a)
+        i = b.intersection(a)
+        d = b.difference(a)
+        s = b.symmetric_difference(a)
+        assert u is a
+        assert i is b
+        assert d._indices.size == 0
+        assert (s._indices == [0, 1, 4, 6, 8, 9]).all()
+
+    def test_subset_subset_operations(self):
+        """Test standard set operations between two subsets"""
+        a = op2.Set(10)
+        b = op2.Subset(a, np.array([2, 3, 5, 7], dtype=np.int32))
+        c = op2.Subset(a, np.array([2, 4, 6, 8], dtype=np.int32))
+        u = b.union(c)
+        i = b.intersection(c)
+        d = b.difference(c)
+        s = b.symmetric_difference(c)
+        assert (u._indices == [2, 3, 4, 5, 6, 7, 8]).all()
+        assert (i._indices == [2, ]).all()
+        assert (d._indices == [3, 5, 7]).all()
+        assert (s._indices == [3, 4, 5, 6, 7, 8]).all()