diff --git a/example.py b/example.py
index fc133550..9043fa24 100644
--- a/example.py
+++ b/example.py
@@ -581,7 +581,9 @@ def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55):
     def core(self,slothy):
         slothy.config.variable_size=True
         slothy.config.constraints.stalls_first_attempt=32
-        slothy.optimize()
+        slothy.config.selfcheck = False  # NOTE(review): self-check is switched off for this example -- confirm this is intended
+        slothy.config.allow_useless_instructions = True
+        slothy.fusion_region("start", "end", ssa=False)  # NOTE(review): replaces slothy.optimize(); this example now only runs fusion callbacks -- confirm it is not a test leftover
 
 class AArch64Example0Equ(Example):
     def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55):
@@ -639,6 +641,21 @@ def core(self,slothy):
         slothy.config.sw_pipelining.optimize_postamble = False
         slothy.optimize_loop("start")
 
+class AArch64Split0(Example):
+    """Example running the fusion callbacks on the start..end region of aarch64_split0 without SSA."""
+    def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55):
+        name = "aarch64_split0"
+        infile = name
+        if var != "":
+            name += f"_{var}"
+            infile += f"_{var}"
+        name += f"_{target_label_dict[target]}"
+        super().__init__(infile, name, rename=True, arch=arch, target=target)
+
+    def core(self,slothy):
+        slothy.config.allow_useless_instructions = True
+        slothy.fusion_region("start", "end", ssa=False)
+
 class Armv7mExample0(Example):
     def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7):
         name = "armv7m_simple0"
@@ -1211,7 +1228,7 @@ def core(self, slothy):
         slothy.config.constraints.stalls_first_attempt = 110
         slothy.optimize_loop("layer123_start")
 
-        
+
 
 
 class ntt_dilithium_123(Example):
@@ -1334,7 +1351,7 @@ def core(self, slothy):
         slothy.optimize_loop("layer5678_start")
 
         slothy.config = conf.copy()
-        
+
         if self.timeout is not None:
             slothy.config.timeout = self.timeout // 12
 
@@ -1351,7 +1368,7 @@ def core(self, slothy):
         slothy.config.split_heuristic_stepsize = 0.1
         slothy.config.constraints.stalls_first_attempt = 14
         slothy.optimize_loop("layer1234_start")
-            
+
 
 class ntt_dilithium_1234(Example):
     def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA72):
@@ -1495,6 +1512,8 @@ def main():
                  AArch64Example2(),
                  AArch64Example2(target=Target_CortexA72),
 
+                 AArch64Split0(),
+
                 # Armv7m examples
                  Armv7mExample0(),
 
diff --git a/examples/naive/aarch64/aarch64_split0.s b/examples/naive/aarch64/aarch64_split0.s
new file mode 100644
index 00000000..a448f25c
--- /dev/null
+++ b/examples/naive/aarch64/aarch64_split0.s
@@ -0,0 +1,15 @@
+start:
+ldr q0, [x1, #0]
+ldr q1, [x2, #0]
+eor3 v5.16b, v1.16b, v2.16b, v3.16b // @slothy:some_tag // some comment
+eor3 v3.16b, v1.16b, v2.16b, v3.16b // Cannot we split naively
+ldr q8,  [x0]
+ldr q9,  [x0, #1*16]
+ldr q10, [x0, #2*16]
+ldr q11, [x0, #3*16]
+mul v24.8h, v9.8h, v0.h[0]
+sqrdmulh v9.8h, v9.8h, v0.h[1]
+mls v24.8h, v9.8h, v1.h[0]
+sub     v9.8h,    v8.8h, v24.8h
+add     v8.8h,    v8.8h, v24.8h
+end:
diff --git a/examples/opt/aarch64/aarch64_split0_opt_a55.s b/examples/opt/aarch64/aarch64_split0_opt_a55.s
new file mode 100644
index 00000000..d0df6a96
--- /dev/null
+++ b/examples/opt/aarch64/aarch64_split0_opt_a55.s
@@ -0,0 +1,17 @@
+start:
+ldr q0, [x1, #0]
+ldr q1, [x2, #0]
+eor v5.16B, v1.16B, v2.16B// some comment // @slothy:some_tag
+eor v5.16B, v5.16B, v3.16B// some comment // @slothy:some_tag
+eor3 v3.16B, v1.16B, v2.16B, v3.16B// Cannot we split naively
+ldr q8, [x0]
+ldr q9, [x0, #16]
+ldr q10, [x0, #32]
+ldr q11, [x0, #48]
+mul v24.8H, v9.8H, v0.H[0]
+sqrdmulh v9.8H, v9.8H, v0.H[1]
+mls v24.8H, v9.8H, v1.H[0]
+sub v9.8H, v8.8H, v24.8H
+add v8.8H, v8.8H, v24.8H
+end:
+
diff --git a/slothy/core/dataflow.py b/slothy/core/dataflow.py
index e5f27005..2e333dd5 100644
--- a/slothy/core/dataflow.py
+++ b/slothy/core/dataflow.py
@@ -525,7 +525,18 @@ def apply_cbs(self, cb, logger, one_a_time=False):
                 break
 
             z = filter(lambda x: x.delete is False, self.nodes)
-            z = map(lambda x: ([x.inst], x.inst.source_line), z)
+
+            # A fusion callback may replace a node's instruction by a list of
+            # instructions (e.g. when splitting one instruction into several),
+            # so normalise every node to a list of instructions.
+            def instructions_of(node):
+                inst = node.inst
+                return inst if isinstance(inst, list) else [inst]
+
+            # One ([instruction], source_line) entry per resulting instruction.
+            z = [([inst], inst.source_line)
+                 for node in z
+                 for inst in instructions_of(node)]
 
             self.src = list(z)
 
diff --git a/slothy/core/slothy.py b/slothy/core/slothy.py
index b0b776ee..2fbfdf8a 100644
--- a/slothy/core/slothy.py
+++ b/slothy/core/slothy.py
@@ -324,7 +324,7 @@ def get_input_from_output(self, start, end, outputs=None):
         dfgc = DFGConfig(c)
         return list(DFG(body, logger.getChild("dfg_find_deps"), dfgc).inputs)
 
-    def _fusion_core(self, pre, body, post, logger):
+    def _fusion_core(self, pre, body, post, logger, ssa=True):
         c = self.config.copy()
 
         if c.with_preprocessor:
@@ -342,9 +342,10 @@ def _fusion_core(self, pre, body, post, logger):
         body = AsmAllocation.unfold_all_aliases(c.register_aliases, body)
         dfgc = DFGConfig(c)
 
-        dfg = DFG(body, logger.getChild("ssa"), dfgc, parsing_cb=False)
-        dfg.ssa()
-        body = [ ComputationNode.to_source_line(t) for t in dfg.nodes ]
+        if ssa:  # SSA conversion is optional; any truthy value enables it, ssa=False skips it
+            dfg = DFG(body, logger.getChild("ssa"), dfgc, parsing_cb=False)
+            dfg.ssa()
+            body = [ ComputationNode.to_source_line(t) for t in dfg.nodes ]
 
         dfg = DFG(body, logger.getChild("fusion"), dfgc, parsing_cb=False)
         dfg.apply_fusion_cbs()
@@ -352,18 +353,18 @@ def _fusion_core(self, pre, body, post, logger):
 
         return body
 
-    def fusion_region(self, start, end):
+    def fusion_region(self, start, end, **kwargs):
         """Run fusion callbacks on straightline code"""
         logger = self.logger.getChild(f"ssa_{start}_{end}")
         pre, body, post = AsmHelper.extract(self.source, start, end)
 
         body_ssa = [ SourceLine(f"{start}:") ] +\
-             self._fusion_core(pre, body, logger) + \
+             self._fusion_core(pre, body, post, logger, **kwargs) + \
             [ SourceLine(f"{end}:") ]
         self.source = pre + body_ssa + post
         assert SourceLine.is_source(self.source)
 
-    def fusion_loop(self, loop_lbl):
+    def fusion_loop(self, loop_lbl, **kwargs):
         """Run fusion callbacks on loop body"""
         logger = self.logger.getChild(f"ssa_loop_{loop_lbl}")
 
@@ -373,25 +374,12 @@ def fusion_loop(self, loop_lbl):
         indentation = AsmHelper.find_indentation(body)
 
         body_ssa = SourceLine.read_multiline(loop.start(loop_cnt)) + \
-            SourceLine.apply_indentation(self._fusion_core(pre, body, logger), indentation) + \
+            SourceLine.apply_indentation(self._fusion_core(pre, body, post, logger, **kwargs), indentation) + \
             SourceLine.read_multiline(loop.end(other_data))
 
         self.source = pre + body_ssa + post
         assert SourceLine.is_source(self.source)
 
-        c = self.config.copy()
-        self.config.keep_tags = True
-        self.config.constraints.functional_only = True
-        self.config.constraints.allow_reordering = False
-        self.config.sw_pipelining.enabled = False
-        self.config.split_heuristic = False
-        self.config.inputs_are_outputs = True
-        self.config.sw_pipelining.unknown_iteration_count = False
-        self.optimize_loop(loop_lbl)
-        self.config = c
-
-        assert SourceLine.is_source(self.source)
-
     def optimize_loop(self, loop_lbl, postamble_label=None):
         """Optimize the loop starting at a given label
             The postamble_label marks the end of the loop kernel.
diff --git a/slothy/targets/aarch64/aarch64_neon.py b/slothy/targets/aarch64/aarch64_neon.py
index 68fdf3c0..8422a5fd 100644
--- a/slothy/targets/aarch64/aarch64_neon.py
+++ b/slothy/targets/aarch64/aarch64_neon.py
@@ -44,6 +44,7 @@ class which generates instruction parsers and writers from instruction templates
 from enum import Enum
 from functools import cache
 
+from slothy.helper import SourceLine
 from sympy import simplify
 
 from slothy.targets.common import *
@@ -3175,7 +3176,50 @@ def core(inst,t,log=None):
 
     return core
 
-veor.global_fusion_cb  = eor3_fusion_cb()
+# TODO: Test only -- EOR -> EOR3 fusion is disabled below while the EOR3 splitting callback is registered instead.
+# veor.global_fusion_cb  = eor3_fusion_cb()
+
+def eor3_splitting_cb():
+    """Create a fusion callback that splits EOR3 into two chained EORs."""
+    def core(inst,t,log=None):
+        """Split `inst` into two EORs stored in `t.inst`; return whether it was split."""
+        dst = inst.args_out[0]
+        src0 = inst.args_in[0]
+        src1 = inst.args_in[1]
+        src2 = inst.args_in[2]
+
+        # The destination doubles as the temporary, so it must not alias an input.
+        if dst in [src0, src1, src2]:
+            return False
+
+        def make_eor(va, vb):
+            """Build `eor dst, va, vb` with 16b datatypes."""
+            return AArch64Instruction.build(veor, { "Vd": dst, "Va" : va, "Vb" : vb,
+                                                    "datatype0":"16b",
+                                                    "datatype1":"16b",
+                                                    "datatype2":"16b" })
+
+        def inherit_source_line(new_inst):
+            """Attach a source line carrying the original EOR3's tags and comments."""
+            new_inst.source_line = SourceLine(new_inst.write()).\
+                add_tags(inst.source_line.tags).\
+                add_comments(inst.source_line.comments)
+
+        eor0 = make_eor(src0, src1)
+        eor1 = make_eor(dst, src2)
+        inherit_source_line(eor0)
+        inherit_source_line(eor1)
+
+        if log is not None:
+            log(f"EOR3 splitting: {t.inst}; {eor0} + {eor1}")
+
+        t.changed = True
+        t.inst = [eor0, eor1]
+        return True
+
+    return core
+
+veor3.global_fusion_cb  = eor3_splitting_cb()
 
 def iter_aarch64_instructions():
     yield from all_subclass_leaves(Instruction)