Implement global self test

This commit prototypes an implementation of a 'global' selftest, building and emulating the entire source code loaded into SLOTHY and checking that input and output behave equivalently. To use, call `slothy.global_selftest(...)` at any point during stateful SLOTHY operations. Then, the global selftest will compare the current state of the code with the original state. See ntt_kyber_123_4567 for an example.
slothy-optimizer · hanno-becker · Dec 9, 2024 · Nov 25, 2024 · Dec 8, 2024 · Dec 9, 2024
commit 0e04f488534a84bdc5e2a73e742e5fdba075566f
diff --git a/example.py b/example.py
@@ -656,6 +656,24 @@ def core(self,slothy):
         slothy.config.inputs_are_outputs = True
         slothy.optimize(start="start", end="end")
 
+class Armv7mExample0Func(Example):
+    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7):
+        name = "armv7m_simple0_func"
+        infile = name
+
+        if var != "":
+            name += f"_{var}"
+            infile += f"_{var}"
+        name += f"_{target_label_dict[target]}"
+
+        super().__init__(infile, name, rename=True, arch=arch, target=target)
+
+    def core(self,slothy):
+        slothy.config.variable_size=True
+        slothy.config.inputs_are_outputs = True
+        slothy.optimize(start="start", end="end")
+        slothy.global_selftest("my_func", {"r0": 1024 })
+
 class Armv7mLoopSubs(Example):
     def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7):
         name = "loop_subs"
@@ -688,7 +706,7 @@ def core(self,slothy):
         slothy.config.variable_size=True
         slothy.config.outputs = ["r6"]
         slothy.optimize_loop("start")
-        
+
 class Armv7mLoopVmovCmp(Example):
     def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7):
         name = "loop_vmov_cmp"
@@ -720,12 +738,13 @@ def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55):
 
     def core(self,slothy):
         slothy.optimize()
-        
+
 class ntt_kyber_123_4567(Example):
     def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55, timeout=None):
         name = "ntt_kyber_123_4567"
         infile = name
 
+        self.var = var
         if var != "":
             name += f"_{var}"
             infile += f"_{var}"
@@ -744,6 +763,10 @@ def core(self, slothy):
         slothy.config.constraints.stalls_first_attempt = 64
         slothy.optimize_loop("layer123_start")
         slothy.optimize_loop("layer4567_start")
+        # Build + emulate entire function to test that behaviour has not changed
+        if self.var == "":
+            slothy.global_selftest("ntt_kyber_123_4567",
+                                   {"x0": 1024, "x1": 1024, "x3": 1024, "x4": 1024, "x5": 1024})
 
 class intt_kyber_123_4567(Example):
     def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55, timeout=None):
@@ -1226,7 +1249,7 @@ def core(self, slothy):
         slothy.config.constraints.stalls_first_attempt = 110
         slothy.optimize_loop("layer123_start")
 
-        
+
 
 
 class ntt_dilithium_123(Example):
@@ -1349,7 +1372,7 @@ def core(self, slothy):
         slothy.optimize_loop("layer5678_start")
 
         slothy.config = conf.copy()
-        
+
         if self.timeout is not None:
             slothy.config.timeout = self.timeout // 12
 
@@ -1366,7 +1389,7 @@ def core(self, slothy):
         slothy.config.split_heuristic_stepsize = 0.1
         slothy.config.constraints.stalls_first_attempt = 14
         slothy.optimize_loop("layer1234_start")
-            
+
 
 class ntt_dilithium_1234(Example):
     def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA72):
@@ -1513,6 +1536,7 @@ def main():
 
                 # Armv7m examples
                  Armv7mExample0(),
+                 Armv7mExample0Func(),
 
                 # Loop examples
                  AArch64LoopSubs(),

diff --git a/examples/naive/aarch64/ntt_kyber_123_4567.s b/examples/naive/aarch64/ntt_kyber_123_4567.s
@@ -23,8 +23,13 @@
 /// SOFTWARE.
 ///
 
+// Commented out for simple standalone emulation not
+// requiring correct constant data
+//
+// Should be commented when used.
+//
 // Needed to provide ASM_LOAD directive
-#include <hal_env.h>
+// #include <hal_envh>
 
 .macro mulmodq dst, src, const, idx0, idx1
         sqrdmulh t2.8h,   \src\().8h, \const\().h[\idx1]
@@ -154,7 +159,12 @@
 .data
 .p2align 4
 roots:
-        #include "ntt_kyber_123_45_67_twiddles.s"
+// Commented out for simple standalone emulation not
+// requiring correct constant data
+//
+// Should be commented when used.
+//
+//        #include "ntt_kyber_123_45_67_twiddles.s"
 
         in      .req x0
         inp     .req x1
@@ -223,9 +233,14 @@ ntt_kyber_123_4567:
 _ntt_kyber_123_4567:
         push_stack
 
-        ASM_LOAD(r_ptr0, roots)
-        ASM_LOAD(r_ptr1, roots_l56)
-        ASM_LOAD(xtmp, const_addr)
+// Commented out for simple standalone emulation not
+// requiring correct constant data.
+//
+// Should be commented when used.
+//
+//        ASM_LOAD(r_ptr0, roots)
+//        ASM_LOAD(r_ptr1, roots_l56)
+//        ASM_LOAD(xtmp, const_addr)
 
         ld1 {consts.8h}, [xtmp]
 

diff --git a/examples/naive/armv7m/armv7m_simple0_func.s b/examples/naive/armv7m/armv7m_simple0_func.s
@@ -0,0 +1,20 @@
+.syntax unified
+//.cpu cortex-m4 // llvm-mc does not like this...
+//.thumb // unicorn seems to get confused by this...
+
+.align 2
+.global my_func
+// .type my_func, %function // llvm-mc does not like this...
+my_func:
+  push {r4-r11, lr}
+
+start:
+ldr r8, [r0, #4]
+add r8, r2, r8
+eor.w r8, r8, r3
+smlabt r3, r2, r2, r8
+asrs r3, r3, #1
+str r3, [r0, #4]
+end:
+
+  pop {r4-r11, pc}
diff --git a/slothy/core/config.py b/slothy/core/config.py
@@ -122,8 +122,8 @@ def selftest(self):
           equivalence-check the loop-form (including the compare+branch instructions
           at the loop boundary) rather than the unrolled code.
 
-        DEPENDENCY: To run this, you need `llvm-mc` the binary in your path or configured
-        as via `llvm_mc_binary`, and `unicorn-engine` Python bindings setup.
+        DEPENDENCY: To run this, you need `llvm-nm`, `llvm-readobj`, `llvm-mc`
+                    in your PATH. Those are part of a standard LLVM setup.
 
         NOTE: This is so far implemented as a repeated randomized test -- nothing clever.
         """
@@ -416,8 +416,8 @@ def with_llvm_mca(self):
         """Indicates whether LLVM MCA should be run prior and after optimization
         to obtain approximate performance data based on LLVM's scheduling models.
 
-        If this is set, both Config.llvm_mca_binary and Config.compiler_binary
-        need to be set.
+        If this is set, Config.compiler_binary need to be set, and llcm-mca in
+        your PATH.
         """
         return self._with_llvm_mca_before and self._with_llvm_mca_after
 
@@ -438,8 +438,8 @@ def with_llvm_mca_before(self):
         """Indicates whether LLVM MCA should be run prior to optimization
         to obtain approximate performance data based on LLVM's scheduling models.
 
-        If this is set, both Config.llvm_mca_binary and Config.compiler_binary
-        need to be set.
+        If this is set, Config.compiler_binary need to be set, and llcm-mca in
+        your PATH.
         """
         return self._with_llvm_mca_before
 
@@ -448,8 +448,8 @@ def with_llvm_mca_after(self):
         """Indicates whether LLVM MCA should be run after optimization
         to obtain approximate performance data based on LLVM's scheduling models.
 
-        If this is set, both Config.llvm_mca_binary and Config.compiler_binary
-        need to be set.
+        If this is set, Config.compiler_binary need to be set, and llcm-mca in
+        your PATH.
         """
         return self._with_llvm_mca_after
 
@@ -469,21 +469,6 @@ def compiler_include_paths(self):
         or `with_llvm_mca_after` are set."""
         return self._compiler_include_paths
 
-    @property
-    def llvm_mca_binary(self):
-        """The llvm-mca binary to be used for estimated performance annotations
-
-        This is only relevant if `with_llvm_mca_before` or `with_llvm_mca_after`
-        is set."""
-        return self._llvm_mca_binary
-
-    @property
-    def llvm_mc_binary(self):
-        """The llvm-mc binary to be used for assembling output data
-
-        This is only relevant if `selftest` is set."""
-        return self._llvm_mc_binary
-
     @property
     def timeout(self):
         """The timeout in seconds after which the underlying constraint solver stops
@@ -1228,8 +1213,6 @@ def __init__(self, Arch, Target):
 
         self._compiler_binary = "gcc"
         self._compiler_include_paths = None
-        self._llvm_mca_binary = "llvm-mca"
-        self._llvm_mc_binary = "llvm-mc"
 
         self.keep_tags = True
         self.inherit_macro_comments = False
@@ -1377,12 +1360,6 @@ def compiler_binary(self, val):
     @compiler_include_paths.setter
     def compiler_include_paths(self, val):
         self._compiler_include_paths = val
-    @llvm_mca_binary.setter
-    def llvm_mca_binary(self, val):
-        self._llvm_mca_binary = val
-    @llvm_mc_binary.setter
-    def llvm_mc_binary(self, val):
-        self._llvm_mc_binary = val
     @timeout.setter
     def timeout(self, val):
         self._timeout = val

diff --git a/slothy/core/core.py b/slothy/core/core.py
@@ -42,7 +42,8 @@
 from slothy.helper import LockAttributes, Permutation, DeferHandler, SourceLine, LLVM_Mc
 
 try:
-    from unicorn import Uc
+    from unicorn import *
+    from unicorn.arm_const import *
 except ImportError:
     Uc = None
 
@@ -877,11 +878,10 @@ def selftest(self, log):
             self._config.arch.RegisterType.list_registers(ty)]
 
         def run_code(code, txt=None):
-            objcode = LLVM_Mc.assemble(code, self._config.llvm_mc_binary,
+            objcode, offset = LLVM_Mc.assemble(code,
                                        self._config.arch.llvm_mc_arch,
                                        self._config.arch.llvm_mc_attr,
                                        log)
-
             # Setup emulator
             mu = Uc(self.config.arch.unicorn_arch, self.config.arch.unicorn_mode)
             # Copy initial register contents into emulator
@@ -897,7 +897,7 @@ def run_code(code, txt=None):
             mu.mem_map(RAM_BASE, RAM_SZ)
             mu.mem_write(RAM_BASE, initial_memory)
             # Run emulator
-            mu.emu_start(CODE_BASE, CODE_BASE + len(objcode))
+            mu.emu_start(CODE_BASE + offset, CODE_BASE + len(objcode))
 
             final_register_contents = {}
             for r in regs:
@@ -937,7 +937,7 @@ def run_code(code, txt=None):
                 if final_regs_old[r] != final_regs_new[r]:
                     raise SlothySelfTestException(f"Selftest failed: Register mismatch for {r}: {hex(final_regs_old[r])} != {hex(final_regs_new[r])}")
 
-        log.info("Selftest: OK")
+        log.info("Local selftest: OK")
 
     def selfcheck_with_fixup(self, log):
         """Do selfcheck, and consider preamble/postamble fixup in case of SW pipelining