From 5dc3ac7ec700d85886eda3d53a03abcf5c7efc9c Mon Sep 17 00:00:00 2001
From: Charles Cooper <cooper.charles.m@gmail.com>
Date: Sat, 15 Jul 2023 11:20:53 -0400
Subject: [PATCH] feat: improve batch copy performance (#3483)

Per Cancun (EIP-5656), this commit uses the MCOPY opcode for
memory-to-memory copies when the target EVM version supports it. It also:
- adds heuristics for choosing between batch copies (loops) and unrolled
  copies.
- adds helper functions `vyper.codegen.core._opt_[gas,codesize,none]()`
  to detect the optimization mode during codegen.
- adds `--optimize none` to the CLI options, with the intent of phasing out
  `--no-optimize` if the ergonomics are better.
---
 .github/workflows/era-tester.yml         |   4 +-
 setup.cfg                                |   1 -
 tests/compiler/test_opcodes.py           |   7 +-
 tests/parser/functions/test_slice.py     |  89 ++++++++--------
 tests/parser/types/test_dynamic_array.py |  12 +--
 vyper/cli/vyper_compile.py               |   2 +-
 vyper/codegen/core.py                    | 128 ++++++++++++++++++++---
 vyper/codegen/ir_node.py                 |  16 +--
 vyper/compiler/phases.py                 |   8 +-
 vyper/evm/opcodes.py                     |   5 +-
 vyper/ir/compile_ir.py                   |   1 +
 vyper/ir/optimizer.py                    |  44 +++++---
 vyper/utils.py                           |   3 +-
 13 files changed, 221 insertions(+), 99 deletions(-)

diff --git a/.github/workflows/era-tester.yml b/.github/workflows/era-tester.yml
index 8a2a3e50ce..187b5c03a2 100644
--- a/.github/workflows/era-tester.yml
+++ b/.github/workflows/era-tester.yml
@@ -101,11 +101,11 @@ jobs:
       if: ${{ github.ref != 'refs/heads/master' }}
       run: |
         cd era-compiler-tester
-        cargo run --release --bin compiler-tester -- -v --path=tests/vyper/ --mode="M0B0 ${{ env.VYPER_VERSION }}"
+        cargo run --release --bin compiler-tester -- --path=tests/vyper/ --mode="M0B0 ${{ env.VYPER_VERSION }}"
 
     - name: Run tester (slow)
       # Run era tester across the LLVM optimization matrix
       if: ${{ github.ref == 'refs/heads/master' }}
       run: |
         cd era-compiler-tester
-        cargo run --release --bin compiler-tester -- -v --path=tests/vyper/ --mode="M*B* ${{ env.VYPER_VERSION }}"
+        cargo run --release --bin compiler-tester -- --path=tests/vyper/ --mode="M*B* ${{ env.VYPER_VERSION }}"
diff --git a/setup.cfg b/setup.cfg
index d18ffe2ac7..dd4a32a3ac 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -31,7 +31,6 @@ addopts = -n auto
 	--cov-report html
 	--cov-report xml
 	--cov=vyper
-	--hypothesis-show-statistics
 python_files = test_*.py
 testpaths = tests
 markers =
diff --git a/tests/compiler/test_opcodes.py b/tests/compiler/test_opcodes.py
index b9841b92f0..20f45ced6b 100644
--- a/tests/compiler/test_opcodes.py
+++ b/tests/compiler/test_opcodes.py
@@ -59,5 +59,8 @@ def test_get_opcodes(evm_version):
         assert "PUSH0" in ops
 
     if evm_version in ("cancun",):
-        assert "TLOAD" in ops
-        assert "TSTORE" in ops
+        for op in ("TLOAD", "TSTORE", "MCOPY"):
+            assert op in ops
+    else:
+        for op in ("TLOAD", "TSTORE", "MCOPY"):
+            assert op not in ops
diff --git a/tests/parser/functions/test_slice.py b/tests/parser/functions/test_slice.py
index 11d834bf42..f1b642b28d 100644
--- a/tests/parser/functions/test_slice.py
+++ b/tests/parser/functions/test_slice.py
@@ -1,4 +1,6 @@
+import hypothesis.strategies as st
 import pytest
+from hypothesis import given, settings
 
 from vyper.exceptions import ArgumentException
 
@@ -9,14 +11,6 @@ def _generate_bytes(length):
     return bytes(list(range(length)))
 
 
-# good numbers to try
-_fun_numbers = [0, 1, 5, 31, 32, 33, 64, 99, 100, 101]
-
-
-# [b"", b"\x01", b"\x02"...]
-_bytes_examples = [_generate_bytes(i) for i in _fun_numbers if i <= 100]
-
-
 def test_basic_slice(get_contract_with_gas_estimation):
     code = """
 @external
@@ -31,12 +25,16 @@ def slice_tower_test(inp1: Bytes[50]) -> Bytes[50]:
     assert x == b"klmnopqrst", x
 
 
-@pytest.mark.parametrize("bytesdata", _bytes_examples)
-@pytest.mark.parametrize("start", _fun_numbers)
+# note: optimization boundaries at 32, 64 and 320 depending on mode
+_draw_1024 = st.integers(min_value=0, max_value=1024)
+_draw_1024_1 = st.integers(min_value=1, max_value=1024)
+_bytes_1024 = st.binary(min_size=0, max_size=1024)
+
+
 @pytest.mark.parametrize("literal_start", (True, False))
-@pytest.mark.parametrize("length", _fun_numbers)
 @pytest.mark.parametrize("literal_length", (True, False))
-@pytest.mark.fuzzing
+@given(start=_draw_1024, length=_draw_1024, length_bound=_draw_1024_1, bytesdata=_bytes_1024)
+@settings(max_examples=25, deadline=None)
 def test_slice_immutable(
     get_contract,
     assert_compile_failed,
@@ -46,47 +44,48 @@ def test_slice_immutable(
     literal_start,
     length,
     literal_length,
+    length_bound,
 ):
     _start = start if literal_start else "start"
     _length = length if literal_length else "length"
 
     code = f"""
-IMMUTABLE_BYTES: immutable(Bytes[100])
-IMMUTABLE_SLICE: immutable(Bytes[100])
+IMMUTABLE_BYTES: immutable(Bytes[{length_bound}])
+IMMUTABLE_SLICE: immutable(Bytes[{length_bound}])
 
 @external
-def __init__(inp: Bytes[100], start: uint256, length: uint256):
+def __init__(inp: Bytes[{length_bound}], start: uint256, length: uint256):
     IMMUTABLE_BYTES = inp
     IMMUTABLE_SLICE = slice(IMMUTABLE_BYTES, {_start}, {_length})
 
 @external
-def do_splice() -> Bytes[100]:
+def do_splice() -> Bytes[{length_bound}]:
     return IMMUTABLE_SLICE
     """
 
+    def _get_contract():
+        return get_contract(code, bytesdata, start, length)
+
     if (
-        (start + length > 100 and literal_start and literal_length)
-        or (literal_length and length > 100)
-        or (literal_start and start > 100)
+        (start + length > length_bound and literal_start and literal_length)
+        or (literal_length and length > length_bound)
+        or (literal_start and start > length_bound)
         or (literal_length and length < 1)
     ):
-        assert_compile_failed(
-            lambda: get_contract(code, bytesdata, start, length), ArgumentException
-        )
-    elif start + length > len(bytesdata):
-        assert_tx_failed(lambda: get_contract(code, bytesdata, start, length))
+        assert_compile_failed(lambda: _get_contract(), ArgumentException)
+    elif start + length > len(bytesdata) or (len(bytesdata) > length_bound):
+        # deploy fail
+        assert_tx_failed(lambda: _get_contract())
     else:
-        c = get_contract(code, bytesdata, start, length)
+        c = _get_contract()
         assert c.do_splice() == bytesdata[start : start + length]
 
 
 @pytest.mark.parametrize("location", ("storage", "calldata", "memory", "literal", "code"))
-@pytest.mark.parametrize("bytesdata", _bytes_examples)
-@pytest.mark.parametrize("start", _fun_numbers)
 @pytest.mark.parametrize("literal_start", (True, False))
-@pytest.mark.parametrize("length", _fun_numbers)
 @pytest.mark.parametrize("literal_length", (True, False))
-@pytest.mark.fuzzing
+@given(start=_draw_1024, length=_draw_1024, length_bound=_draw_1024_1, bytesdata=_bytes_1024)
+@settings(max_examples=25, deadline=None)
 def test_slice_bytes(
     get_contract,
     assert_compile_failed,
@@ -97,9 +96,10 @@ def test_slice_bytes(
     literal_start,
     length,
     literal_length,
+    length_bound,
 ):
     if location == "memory":
-        spliced_code = "foo: Bytes[100] = inp"
+        spliced_code = f"foo: Bytes[{length_bound}] = inp"
         foo = "foo"
     elif location == "storage":
         spliced_code = "self.foo = inp"
@@ -120,31 +120,38 @@ def test_slice_bytes(
     _length = length if literal_length else "length"
 
     code = f"""
-foo: Bytes[100]
-IMMUTABLE_BYTES: immutable(Bytes[100])
+foo: Bytes[{length_bound}]
+IMMUTABLE_BYTES: immutable(Bytes[{length_bound}])
 @external
-def __init__(foo: Bytes[100]):
+def __init__(foo: Bytes[{length_bound}]):
     IMMUTABLE_BYTES = foo
 
 @external
-def do_slice(inp: Bytes[100], start: uint256, length: uint256) -> Bytes[100]:
+def do_slice(inp: Bytes[{length_bound}], start: uint256, length: uint256) -> Bytes[{length_bound}]:
     {spliced_code}
     return slice({foo}, {_start}, {_length})
     """
 
-    length_bound = len(bytesdata) if location == "literal" else 100
+    def _get_contract():
+        return get_contract(code, bytesdata)
+
+    data_length = len(bytesdata) if location == "literal" else length_bound
     if (
-        (start + length > length_bound and literal_start and literal_length)
-        or (literal_length and length > length_bound)
-        or (literal_start and start > length_bound)
+        (start + length > data_length and literal_start and literal_length)
+        or (literal_length and length > data_length)
+        or (location == "literal" and len(bytesdata) > length_bound)
+        or (literal_start and start > data_length)
         or (literal_length and length < 1)
     ):
-        assert_compile_failed(lambda: get_contract(code, bytesdata), ArgumentException)
+        assert_compile_failed(lambda: _get_contract(), ArgumentException)
+    elif len(bytesdata) > data_length:
+        # deploy fail
+        assert_tx_failed(lambda: _get_contract())
     elif start + length > len(bytesdata):
-        c = get_contract(code, bytesdata)
+        c = _get_contract()
         assert_tx_failed(lambda: c.do_slice(bytesdata, start, length))
     else:
-        c = get_contract(code, bytesdata)
+        c = _get_contract()
         assert c.do_slice(bytesdata, start, length) == bytesdata[start : start + length], code
 
 
diff --git a/tests/parser/types/test_dynamic_array.py b/tests/parser/types/test_dynamic_array.py
index cbae183fe4..9231d1979f 100644
--- a/tests/parser/types/test_dynamic_array.py
+++ b/tests/parser/types/test_dynamic_array.py
@@ -2,7 +2,6 @@
 
 import pytest
 
-from vyper.compiler.settings import OptimizationLevel
 from vyper.exceptions import (
     ArgumentException,
     ArrayIndexException,
@@ -1585,14 +1584,9 @@ def bar2() -> uint256:
         newFoo.b1[1][0][0].a1[0][1][1] + \\
         newFoo.b1[0][1][0].a1[0][0][0]
     """
-
-    if optimize == OptimizationLevel.NONE:
-        # fails at assembly stage with too many stack variables
-        assert_compile_failed(lambda: get_contract(code), Exception)
-    else:
-        c = get_contract(code)
-        assert c.bar() == [[[3, 7], [7, 3]], [[7, 3], [0, 0]]]
-        assert c.bar2() == 0
+    c = get_contract(code)
+    assert c.bar() == [[[3, 7], [7, 3]], [[7, 3], [0, 0]]]
+    assert c.bar2() == 0
 
 
 def test_tuple_of_lists(get_contract):
diff --git a/vyper/cli/vyper_compile.py b/vyper/cli/vyper_compile.py
index 71e78dd666..55e0fc82b2 100755
--- a/vyper/cli/vyper_compile.py
+++ b/vyper/cli/vyper_compile.py
@@ -105,7 +105,7 @@ def _parse_args(argv):
         dest="evm_version",
     )
     parser.add_argument("--no-optimize", help="Do not optimize", action="store_true")
-    parser.add_argument("--optimize", help="Optimization flag", choices=["gas", "codesize"])
+    parser.add_argument("--optimize", help="Optimization flag", choices=["gas", "codesize", "none"])
     parser.add_argument(
         "--no-bytecode-metadata", help="Do not add metadata to bytecode", action="store_true"
     )
diff --git a/vyper/codegen/core.py b/vyper/codegen/core.py
index 58d9db9889..5b16938e99 100644
--- a/vyper/codegen/core.py
+++ b/vyper/codegen/core.py
@@ -1,6 +1,11 @@
+import contextlib
+from typing import Generator
+
 from vyper import ast as vy_ast
 from vyper.codegen.ir_node import Encoding, IRnode
+from vyper.compiler.settings import OptimizationLevel
 from vyper.evm.address_space import CALLDATA, DATA, IMMUTABLES, MEMORY, STORAGE, TRANSIENT
+from vyper.evm.opcodes import version_check
 from vyper.exceptions import CompilerPanic, StructureException, TypeCheckFailure, TypeMismatch
 from vyper.semantics.types import (
     AddressT,
@@ -19,13 +24,7 @@
 from vyper.semantics.types.shortcuts import BYTES32_T, INT256_T, UINT256_T
 from vyper.semantics.types.subscriptable import SArrayT
 from vyper.semantics.types.user import EnumT
-from vyper.utils import (
-    GAS_CALLDATACOPY_WORD,
-    GAS_CODECOPY_WORD,
-    GAS_IDENTITY,
-    GAS_IDENTITYWORD,
-    ceil32,
-)
+from vyper.utils import GAS_COPY_WORD, GAS_IDENTITY, GAS_IDENTITYWORD, ceil32
 
 DYNAMIC_ARRAY_OVERHEAD = 1
 
@@ -90,12 +89,16 @@ def _identity_gas_bound(num_bytes):
     return GAS_IDENTITY + GAS_IDENTITYWORD * (ceil32(num_bytes) // 32)
 
 
+def _mcopy_gas_bound(num_bytes):
+    return GAS_COPY_WORD * ceil32(num_bytes) // 32
+
+
 def _calldatacopy_gas_bound(num_bytes):
-    return GAS_CALLDATACOPY_WORD * ceil32(num_bytes) // 32
+    return GAS_COPY_WORD * ceil32(num_bytes) // 32
 
 
 def _codecopy_gas_bound(num_bytes):
-    return GAS_CODECOPY_WORD * ceil32(num_bytes) // 32
+    return GAS_COPY_WORD * ceil32(num_bytes) // 32
 
 
 # Copy byte array word-for-word (including layout)
@@ -258,7 +261,6 @@ def copy_bytes(dst, src, length, length_bound):
         assert src.is_pointer and dst.is_pointer
 
         # fast code for common case where num bytes is small
-        # TODO expand this for more cases where num words is less than ~8
         if length_bound <= 32:
             copy_op = STORE(dst, LOAD(src))
             ret = IRnode.from_list(copy_op, annotation=annotation)
@@ -268,8 +270,12 @@ def copy_bytes(dst, src, length, length_bound):
             # special cases: batch copy to memory
             # TODO: iloadbytes
             if src.location == MEMORY:
-                copy_op = ["staticcall", "gas", 4, src, length, dst, length]
-                gas_bound = _identity_gas_bound(length_bound)
+                if version_check(begin="cancun"):
+                    copy_op = ["mcopy", dst, src, length]
+                    gas_bound = _mcopy_gas_bound(length_bound)
+                else:
+                    copy_op = ["staticcall", "gas", 4, src, length, dst, length]
+                    gas_bound = _identity_gas_bound(length_bound)
             elif src.location == CALLDATA:
                 copy_op = ["calldatacopy", dst, src, length]
                 gas_bound = _calldatacopy_gas_bound(length_bound)
@@ -876,6 +882,38 @@ def make_setter(left, right):
     return _complex_make_setter(left, right)
 
 
+_opt_level = OptimizationLevel.GAS
+
+
+@contextlib.contextmanager
+def anchor_opt_level(new_level: OptimizationLevel) -> Generator:
+    """
+    Set the global optimization level variable for the duration of this
+    context manager; the previous level is restored on exit.
+    """
+    assert isinstance(new_level, OptimizationLevel)
+
+    global _opt_level
+    prev_level = _opt_level  # save before `try` so `finally` can always restore
+    _opt_level = new_level
+    try:
+        yield
+    finally:
+        _opt_level = prev_level
+
+
+def _opt_codesize():
+    return _opt_level == OptimizationLevel.CODESIZE
+
+
+def _opt_gas():
+    return _opt_level == OptimizationLevel.GAS
+
+
+def _opt_none():
+    return _opt_level == OptimizationLevel.NONE
+
+
 def _complex_make_setter(left, right):
     if right.value == "~empty" and left.location == MEMORY:
         # optimized memzero
@@ -891,11 +929,69 @@ def _complex_make_setter(left, right):
         assert is_tuple_like(left.typ)
         keys = left.typ.tuple_keys()
 
-    # if len(keyz) == 0:
-    #    return IRnode.from_list(["pass"])
+    if left.is_pointer and right.is_pointer and right.encoding == Encoding.VYPER:
+        # both left and right are pointers, see if we want to batch copy
+        # instead of unrolling the loop.
+        assert left.encoding == Encoding.VYPER
+        len_ = left.typ.memory_bytes_required
+
+        has_storage = STORAGE in (left.location, right.location)
+        if has_storage:
+            if _opt_codesize():
+                # assuming PUSH2, a single sstore(dst (sload src)) is 8 bytes,
+                # sstore(add (dst ofst), (sload (add (src ofst)))) is 16 bytes,
+                # whereas loop overhead is 16-17 bytes.
+                base_cost = 3
+                if left._optimized.is_literal:
+                    # code size is smaller since add is performed at compile-time
+                    base_cost += 1
+                if right._optimized.is_literal:
+                    base_cost += 1
+                # the formula is a heuristic, but it works.
+                # (CMC 2023-07-14 could get more detailed for PUSH1 vs
+                # PUSH2 etc but not worried about that too much now,
+                # it's probably better to add a proper unroll rule in the
+                # optimizer.)
+                should_batch_copy = len_ >= 32 * base_cost
+            elif _opt_gas():
+                # kind of arbitrary, but cut off when code used > ~160 bytes
+                should_batch_copy = len_ >= 32 * 10
+            else:
+                assert _opt_none()
+                # don't care, just generate the most readable version
+                should_batch_copy = True
+        else:
+            # find a cutoff for memory copy where identity is cheaper
+            # than unrolled mloads/mstores
+            # if MCOPY is available, mcopy is *always* better (except in
+            # the 1 word case, but that is already handled by copy_bytes).
+            if right.location == MEMORY and _opt_gas() and not version_check(begin="cancun"):
+                # cost for 0th word - (mstore dst (mload src))
+                base_unroll_cost = 12
+                nth_word_cost = base_unroll_cost
+                if not left._optimized.is_literal:
+                    # (mstore (add N dst) (mload src))
+                    nth_word_cost += 6
+                if not right._optimized.is_literal:
+                    # (mstore dst (mload (add N src)))
+                    nth_word_cost += 6
+
+                identity_base_cost = 115  # staticcall 4 gas dst len src len
+
+                n_words = ceil32(len_) // 32
+                should_batch_copy = (
+                    base_unroll_cost + (nth_word_cost * (n_words - 1)) >= identity_base_cost
+                )
+
+            # calldata to memory, code to memory, cancun, or a non-gas
+            # optimization mode (codesize, none) - always batch copy.
+            else:
+                should_batch_copy = True
+
+        if should_batch_copy:
+            return copy_bytes(left, right, len_, len_)
 
-    # general case
-    # TODO use copy_bytes when the generated code is above a certain size
+    # general case, unroll
     with left.cache_when_complex("_L") as (b1, left), right.cache_when_complex("_R") as (b2, right):
         for k in keys:
             l_i = get_element_ptr(left, k, array_bounds_check=False)
diff --git a/vyper/codegen/ir_node.py b/vyper/codegen/ir_node.py
index f7698fbabb..0895e5f02d 100644
--- a/vyper/codegen/ir_node.py
+++ b/vyper/codegen/ir_node.py
@@ -49,10 +49,7 @@ class Encoding(Enum):
 # this creates a magical block which maps to IR `with`
 class _WithBuilder:
     def __init__(self, ir_node, name, should_inline=False):
-        # TODO figure out how to fix this circular import
-        from vyper.ir.optimizer import optimize
-
-        if should_inline and optimize(ir_node).is_complex_ir:
+        if should_inline and ir_node._optimized.is_complex_ir:
             # this can only mean trouble
             raise CompilerPanic("trying to inline a complex IR node")
 
@@ -366,6 +363,13 @@ def is_pointer(self):
         # eventually
         return self.location is not None
 
+    @property  # probably could be cached_property but be paranoid
+    def _optimized(self):
+        # TODO figure out how to fix this circular import
+        from vyper.ir.optimizer import optimize
+
+        return optimize(self)
+
     # This function is slightly confusing but abstracts a common pattern:
     # when an IR value needs to be computed once and then cached as an
     # IR value (if it is expensive, or more importantly if its computation
@@ -382,13 +386,11 @@ def is_pointer(self):
     #   return builder.resolve(ret)
     # ```
     def cache_when_complex(self, name):
-        from vyper.ir.optimizer import optimize
-
         # for caching purposes, see if the ir_node will be optimized
         # because a non-literal expr could turn into a literal,
         # (e.g. `(add 1 2)`)
         # TODO this could really be moved into optimizer.py
-        should_inline = not optimize(self).is_complex_ir
+        should_inline = not self._optimized.is_complex_ir
 
         return _WithBuilder(self, name, should_inline)
 
diff --git a/vyper/compiler/phases.py b/vyper/compiler/phases.py
index 99465809bd..4e1bd9e6c3 100644
--- a/vyper/compiler/phases.py
+++ b/vyper/compiler/phases.py
@@ -5,6 +5,7 @@
 
 from vyper import ast as vy_ast
 from vyper.codegen import module
+from vyper.codegen.core import anchor_opt_level
 from vyper.codegen.global_context import GlobalContext
 from vyper.codegen.ir_node import IRnode
 from vyper.compiler.settings import OptimizationLevel, Settings
@@ -268,7 +269,9 @@ def generate_folded_ast(
     return vyper_module_folded, symbol_tables
 
 
-def generate_ir_nodes(global_ctx: GlobalContext, optimize: bool) -> tuple[IRnode, IRnode]:
+def generate_ir_nodes(
+    global_ctx: GlobalContext, optimize: OptimizationLevel
+) -> tuple[IRnode, IRnode]:
     """
     Generate the intermediate representation (IR) from the contextualized AST.
 
@@ -288,7 +291,8 @@ def generate_ir_nodes(global_ctx: GlobalContext, optimize: bool) -> tuple[IRnode
         IR to generate deployment bytecode
         IR to generate runtime bytecode
     """
-    ir_nodes, ir_runtime = module.generate_ir_for_module(global_ctx)
+    with anchor_opt_level(optimize):
+        ir_nodes, ir_runtime = module.generate_ir_for_module(global_ctx)
     if optimize != OptimizationLevel.NONE:
         ir_nodes = optimizer.optimize(ir_nodes)
         ir_runtime = optimizer.optimize(ir_runtime)
diff --git a/vyper/evm/opcodes.py b/vyper/evm/opcodes.py
index 4fec13e897..767d634c89 100644
--- a/vyper/evm/opcodes.py
+++ b/vyper/evm/opcodes.py
@@ -89,6 +89,7 @@
     "MSIZE": (0x59, 0, 1, 2),
     "GAS": (0x5A, 0, 1, 2),
     "JUMPDEST": (0x5B, 0, 0, 1),
+    "MCOPY": (0x5E, 3, 0, (None, None, None, None, None, 3)),
     "PUSH0": (0x5F, 0, 1, 2),
     "PUSH1": (0x60, 0, 1, 3),
     "PUSH2": (0x61, 0, 1, 3),
@@ -171,8 +172,8 @@
     "INVALID": (0xFE, 0, 0, 0),
     "DEBUG": (0xA5, 1, 0, 0),
     "BREAKPOINT": (0xA6, 0, 0, 0),
-    "TLOAD": (0x5C, 1, 1, 100),
-    "TSTORE": (0x5D, 2, 0, 100),
+    "TLOAD": (0x5C, 1, 1, (None, None, None, None, None, 100)),
+    "TSTORE": (0x5D, 2, 0, (None, None, None, None, None, 100)),
 }
 
 PSEUDO_OPCODES: OpcodeMap = {
diff --git a/vyper/ir/compile_ir.py b/vyper/ir/compile_ir.py
index 15a68a5079..a9064a44fa 100644
--- a/vyper/ir/compile_ir.py
+++ b/vyper/ir/compile_ir.py
@@ -297,6 +297,7 @@ def _height_of(witharg):
         return o
 
     # batch copy from data section of the currently executing code to memory
+    # (probably should have named this dcopy but oh well)
     elif code.value == "dloadbytes":
         dst = code.args[0]
         src = code.args[1]
diff --git a/vyper/ir/optimizer.py b/vyper/ir/optimizer.py
index b13c6f79f8..40e02e79c7 100644
--- a/vyper/ir/optimizer.py
+++ b/vyper/ir/optimizer.py
@@ -2,6 +2,7 @@
 from typing import List, Optional, Tuple, Union
 
 from vyper.codegen.ir_node import IRnode
+from vyper.evm.opcodes import version_check
 from vyper.exceptions import CompilerPanic, StaticAssertionException
 from vyper.utils import (
     ceil32,
@@ -472,6 +473,7 @@ def finalize(val, args):
     if value == "seq":
         changed |= _merge_memzero(argz)
         changed |= _merge_calldataload(argz)
+        changed |= _merge_mload(argz)
         changed |= _remove_empty_seqs(argz)
 
         # (seq x) => (x) for cleanliness and
@@ -636,12 +638,26 @@ def _remove_empty_seqs(argz):
 
 
 def _merge_calldataload(argz):
-    # look for sequential operations copying from calldata to memory
-    # and merge them into a single calldatacopy operation
+    return _merge_load(argz, "calldataload", "calldatacopy")
+
+
+def _merge_dload(argz):
+    return _merge_load(argz, "dload", "dloadbytes")
+
+
+def _merge_mload(argz):
+    if not version_check(begin="cancun"):
+        return False
+    return _merge_load(argz, "mload", "mcopy")
+
+
+def _merge_load(argz, _LOAD, _COPY):
+    # look for sequential `(mstore <dst> (_LOAD <src>))` operations with literal,
+    # contiguous offsets and merge them into a single `_COPY` operation
     changed = False
     mstore_nodes: List = []
-    initial_mem_offset = 0
-    initial_calldata_offset = 0
+    initial_dst_offset = 0
+    initial_src_offset = 0
     total_length = 0
     idx = None
     for i, ir_node in enumerate(argz):
@@ -649,19 +665,19 @@ def _merge_calldataload(argz):
         if (
             ir_node.value == "mstore"
             and isinstance(ir_node.args[0].value, int)
-            and ir_node.args[1].value == "calldataload"
+            and ir_node.args[1].value == _LOAD
             and isinstance(ir_node.args[1].args[0].value, int)
         ):
             # mstore of a zero value
-            mem_offset = ir_node.args[0].value
-            calldata_offset = ir_node.args[1].args[0].value
+            dst_offset = ir_node.args[0].value
+            src_offset = ir_node.args[1].args[0].value
             if not mstore_nodes:
-                initial_mem_offset = mem_offset
-                initial_calldata_offset = calldata_offset
+                initial_dst_offset = dst_offset
+                initial_src_offset = src_offset
                 idx = i
             if (
-                initial_mem_offset + total_length == mem_offset
-                and initial_calldata_offset + total_length == calldata_offset
+                initial_dst_offset + total_length == dst_offset
+                and initial_src_offset + total_length == src_offset
             ):
                 mstore_nodes.append(ir_node)
                 total_length += 32
@@ -676,7 +692,7 @@ def _merge_calldataload(argz):
         if len(mstore_nodes) > 1:
             changed = True
             new_ir = IRnode.from_list(
-                ["calldatacopy", initial_mem_offset, initial_calldata_offset, total_length],
+                [_COPY, initial_dst_offset, initial_src_offset, total_length],
                 source_pos=mstore_nodes[0].source_pos,
             )
             # replace first copy operation with optimized node and remove the rest
@@ -684,8 +700,8 @@ def _merge_calldataload(argz):
             # note: del xs[k:l] deletes l - k items
             del argz[idx + 1 : idx + len(mstore_nodes)]
 
-        initial_mem_offset = 0
-        initial_calldata_offset = 0
+        initial_dst_offset = 0
+        initial_src_offset = 0
         total_length = 0
         mstore_nodes.clear()
 
diff --git a/vyper/utils.py b/vyper/utils.py
index 2440117d0c..3d9d9cb416 100644
--- a/vyper/utils.py
+++ b/vyper/utils.py
@@ -196,8 +196,7 @@ def calc_mem_gas(memsize):
 # Specific gas usage
 GAS_IDENTITY = 15
 GAS_IDENTITYWORD = 3
-GAS_CODECOPY_WORD = 3
-GAS_CALLDATACOPY_WORD = 3
+GAS_COPY_WORD = 3  # i.e., W_copy from YP
 
 # A decimal value can store multiples of 1/DECIMAL_DIVISOR
 MAX_DECIMAL_PLACES = 10