diff --git a/.github/workflows/c.yml b/.github/workflows/c.yml
index c3535185b..5345cad14 100644
--- a/.github/workflows/c.yml
+++ b/.github/workflows/c.yml
@@ -118,7 +118,7 @@ jobs:
           
       - name: 🔨 Build
         run: |
-          cmake -B build
+          LIBCRUX_BENCHMARKS=1 cmake -B build
           cmake --build build
 
       - name: 🏃🏻‍♀️ Test
@@ -132,7 +132,7 @@ jobs:
       - name: 🔨 Build Release
         run: |
           rm -rf build
-          cmake -B build -DCMAKE_BUILD_TYPE=Release
+          LIBCRUX_BENCHMARKS=1 cmake -B build -DCMAKE_BUILD_TYPE=Release
           cmake --build build --config Release
         if: ${{ matrix.os != 'windows-latest' }}
 
@@ -159,13 +159,6 @@ jobs:
           cmake -B build
           cmake --build build
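-      # FIXME: Benchmark build for cg on Windows CI is not working right now.
+      # Benchmarks are opt-in (LIBCRUX_BENCHMARKS) and not requested here, so no OS filter is needed.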
-        if: ${{ matrix.os != 'windows-latest' }}
-
-      # FIXME: Benchmark build for cg on Windows CI are not working right now.
-      # - name: 🏃🏻‍♀️ Test (cg)
-      #   working-directory: libcrux-ml-kem/cg
-      #   run: ./build/Debug/ml_kem_test
-      #   if: ${{ matrix.os == 'windows-latest' }}
 
       - name: 🏃🏻‍♀️ Test
         run: ./build/ml_kem_test
diff --git a/.github/workflows/s390x.yml b/.github/workflows/s390x.yml
new file mode 100644
index 000000000..e76c37b62
--- /dev/null
+++ b/.github/workflows/s390x.yml
@@ -0,0 +1,50 @@
+name: s390x - Build & Test
+
+on:
+  push:
+  pull_request:
+    branches: ["main", "dev"]
+  workflow_dispatch:
+  merge_group:
+
+env:
+  CARGO_TERM_COLOR: always
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  s390x:
+    runs-on: ubuntu-latest
+    name: Build on ubuntu-22.04 s390x
+    steps:
+      - uses: actions/checkout@v4
+      - uses: uraimo/run-on-arch-action@v2
+        name: Run
+        id: runcmd
+        with:
+          arch: s390x
+          distro: ubuntu22.04
+
+          # Speed up builds by storing container images in
+          # a GitHub package registry.
+          githubToken: ${{ github.token }}
+
+          # Install the build dependencies through the action's `install`
+          # input so they become part of the stored container image rather
+          # than being re-installed by the `run` script on every workflow run.
+          install: |
+            apt-get -y update
+            apt-get install -y curl gcc g++ make cmake ninja-build git
+
+          # Configure, build and run the ML-KEM tests for the c and cg trees.
+          run: |
+            cd libcrux-ml-kem/c
+            cmake -B build -G"Ninja Multi-Config"
+            cmake --build build
+            ./build/Debug/ml_kem_test
+            cd ../cg
+            cmake -B build -G"Ninja Multi-Config"
+            cmake --build build
+            ./build/Debug/ml_kem_test
diff --git a/libcrux-ml-dsa/proofs/fstar/extraction/Libcrux_ml_dsa.Hash_functions.Neon.fsti b/libcrux-ml-dsa/proofs/fstar/extraction/Libcrux_ml_dsa.Hash_functions.Neon.fsti
index 6805e0d00..9ad6829f1 100644
--- a/libcrux-ml-dsa/proofs/fstar/extraction/Libcrux_ml_dsa.Hash_functions.Neon.fsti
+++ b/libcrux-ml-dsa/proofs/fstar/extraction/Libcrux_ml_dsa.Hash_functions.Neon.fsti
@@ -24,19 +24,19 @@ val shake256_x4
       Prims.l_True
       (fun _ -> Prims.l_True)
 
-val squeeze_first_block_x4 (x: t_Shake256x4)
+val squeeze_first_block_x4 (state: t_Shake256x4)
     : Prims.Pure
       (t_Shake256x4 &
         (t_Array u8 (sz 136) & t_Array u8 (sz 136) & t_Array u8 (sz 136) & t_Array u8 (sz 136)))
       Prims.l_True
       (fun _ -> Prims.l_True)
 
-val squeeze_first_five_blocks (x: t_Shake128x4) (out0 out1 out2 out3: t_Array u8 (sz 840))
+val squeeze_first_five_blocks (state: t_Shake128x4) (out0 out1 out2 out3: t_Array u8 (sz 840))
     : Prims.Pure
       (t_Shake128x4 & t_Array u8 (sz 840) & t_Array u8 (sz 840) & t_Array u8 (sz 840) &
         t_Array u8 (sz 840)) Prims.l_True (fun _ -> Prims.l_True)
 
-val squeeze_next_block (x: t_Shake128x4)
+val squeeze_next_block (state: t_Shake128x4)
     : Prims.Pure
       (t_Shake128x4 &
         (t_Array u8 (sz 168) & t_Array u8 (sz 168) & t_Array u8 (sz 168) & t_Array u8 (sz 168)))
@@ -140,7 +140,7 @@ let impl: Libcrux_ml_dsa.Hash_functions.Shake128.t_XofX4 t_Shake128x4 =
         (t_Array u8 (sz 168) & t_Array u8 (sz 168) & t_Array u8 (sz 168) & t_Array u8 (sz 168)))
   }
 
-val squeeze_next_block_x4 (x: t_Shake256x4)
+val squeeze_next_block_x4 (state: t_Shake256x4)
     : Prims.Pure
       (t_Shake256x4 &
         (t_Array u8 (sz 136) & t_Array u8 (sz 136) & t_Array u8 (sz 136) & t_Array u8 (sz 136)))
diff --git a/libcrux-ml-dsa/proofs/fstar/extraction/Libcrux_ml_dsa.Hash_functions.Portable.fsti b/libcrux-ml-dsa/proofs/fstar/extraction/Libcrux_ml_dsa.Hash_functions.Portable.fsti
index 2d75db5dd..19bf6bae1 100644
--- a/libcrux-ml-dsa/proofs/fstar/extraction/Libcrux_ml_dsa.Hash_functions.Portable.fsti
+++ b/libcrux-ml-dsa/proofs/fstar/extraction/Libcrux_ml_dsa.Hash_functions.Portable.fsti
@@ -21,6 +21,9 @@ val t_Shake256Absorb:Type0
 
 val t_Shake256Squeeze:Type0
 
+val init_absorb__init_absorb (input: t_Slice u8)
+    : Prims.Pure Libcrux_sha3.Portable.t_KeccakState Prims.l_True (fun _ -> Prims.l_True)
+
 val init_absorb (input0 input1 input2 input3: t_Slice u8)
     : Prims.Pure t_Shake128X4 Prims.l_True (fun _ -> Prims.l_True)
 
@@ -69,22 +72,22 @@ val shake256_init: Prims.unit -> Prims.Pure t_Shake256Absorb Prims.l_True (fun _
 val shake256_squeeze (st: t_Shake256Squeeze) (out: t_Slice u8)
     : Prims.Pure (t_Shake256Squeeze & t_Slice u8) Prims.l_True (fun _ -> Prims.l_True)
 
-val squeeze_first_block_shake256 (x: t_Shake256)
+val squeeze_first_block_shake256 (state: t_Shake256)
     : Prims.Pure (t_Shake256 & t_Array u8 (sz 136)) Prims.l_True (fun _ -> Prims.l_True)
 
-val squeeze_first_block_x4 (x: t_Shake256X4)
+val squeeze_first_block_x4 (state: t_Shake256X4)
     : Prims.Pure
       (t_Shake256X4 &
         (t_Array u8 (sz 136) & t_Array u8 (sz 136) & t_Array u8 (sz 136) & t_Array u8 (sz 136)))
       Prims.l_True
       (fun _ -> Prims.l_True)
 
-val squeeze_first_five_blocks (x: t_Shake128X4) (out0 out1 out2 out3: t_Array u8 (sz 840))
+val squeeze_first_five_blocks (state: t_Shake128X4) (out0 out1 out2 out3: t_Array u8 (sz 840))
     : Prims.Pure
       (t_Shake128X4 & t_Array u8 (sz 840) & t_Array u8 (sz 840) & t_Array u8 (sz 840) &
         t_Array u8 (sz 840)) Prims.l_True (fun _ -> Prims.l_True)
 
-val squeeze_next_block (x: t_Shake128X4)
+val squeeze_next_block (state: t_Shake128X4)
     : Prims.Pure
       (t_Shake128X4 &
         (t_Array u8 (sz 168) & t_Array u8 (sz 168) & t_Array u8 (sz 168) & t_Array u8 (sz 168)))
@@ -188,7 +191,7 @@ let impl: Libcrux_ml_dsa.Hash_functions.Shake128.t_XofX4 t_Shake128X4 =
         (t_Array u8 (sz 168) & t_Array u8 (sz 168) & t_Array u8 (sz 168) & t_Array u8 (sz 168)))
   }
 
-val squeeze_next_block_shake256 (x: t_Shake256)
+val squeeze_next_block_shake256 (state: t_Shake256)
     : Prims.Pure (t_Shake256 & t_Array u8 (sz 136)) Prims.l_True (fun _ -> Prims.l_True)
 
 [@@ FStar.Tactics.Typeclasses.tcinstance]
@@ -238,7 +241,7 @@ let impl_2: Libcrux_ml_dsa.Hash_functions.Shake256.t_Xof t_Shake256 =
       self, hax_temp_output <: (t_Shake256 & t_Array u8 (sz 136))
   }
 
-val squeeze_next_block_x4 (x: t_Shake256X4)
+val squeeze_next_block_x4 (state: t_Shake256X4)
     : Prims.Pure
       (t_Shake256X4 &
         (t_Array u8 (sz 136) & t_Array u8 (sz 136) & t_Array u8 (sz 136) & t_Array u8 (sz 136)))
diff --git a/libcrux-ml-dsa/proofs/fstar/extraction/Libcrux_ml_dsa.Hash_functions.Simd256.fsti b/libcrux-ml-dsa/proofs/fstar/extraction/Libcrux_ml_dsa.Hash_functions.Simd256.fsti
index 3ff04ac43..a9b24b26a 100644
--- a/libcrux-ml-dsa/proofs/fstar/extraction/Libcrux_ml_dsa.Hash_functions.Simd256.fsti
+++ b/libcrux-ml-dsa/proofs/fstar/extraction/Libcrux_ml_dsa.Hash_functions.Simd256.fsti
@@ -27,19 +27,19 @@ val shake256_x4
       Prims.l_True
       (fun _ -> Prims.l_True)
 
-val squeeze_first_block_x4 (x: t_Shake256x4)
+val squeeze_first_block_x4 (state: t_Shake256x4)
     : Prims.Pure
       (t_Shake256x4 &
         (t_Array u8 (sz 136) & t_Array u8 (sz 136) & t_Array u8 (sz 136) & t_Array u8 (sz 136)))
       Prims.l_True
       (fun _ -> Prims.l_True)
 
-val squeeze_first_five_blocks (x: t_Shake128x4) (out0 out1 out2 out3: t_Array u8 (sz 840))
+val squeeze_first_five_blocks (state: t_Shake128x4) (out0 out1 out2 out3: t_Array u8 (sz 840))
     : Prims.Pure
       (t_Shake128x4 & t_Array u8 (sz 840) & t_Array u8 (sz 840) & t_Array u8 (sz 840) &
         t_Array u8 (sz 840)) Prims.l_True (fun _ -> Prims.l_True)
 
-val squeeze_next_block (x: t_Shake128x4)
+val squeeze_next_block (state: t_Shake128x4)
     : Prims.Pure
       (t_Shake128x4 &
         (t_Array u8 (sz 168) & t_Array u8 (sz 168) & t_Array u8 (sz 168) & t_Array u8 (sz 168)))
@@ -143,7 +143,7 @@ let impl: Libcrux_ml_dsa.Hash_functions.Shake128.t_XofX4 t_Shake128x4 =
         (t_Array u8 (sz 168) & t_Array u8 (sz 168) & t_Array u8 (sz 168) & t_Array u8 (sz 168)))
   }
 
-val squeeze_next_block_x4 (x: t_Shake256x4)
+val squeeze_next_block_x4 (state: t_Shake256x4)
     : Prims.Pure
       (t_Shake256x4 &
         (t_Array u8 (sz 136) & t_Array u8 (sz 136) & t_Array u8 (sz 136) & t_Array u8 (sz 136)))
diff --git a/libcrux-ml-dsa/src/hash_functions.rs b/libcrux-ml-dsa/src/hash_functions.rs
index 028b1906c..11b2461c1 100644
--- a/libcrux-ml-dsa/src/hash_functions.rs
+++ b/libcrux-ml-dsa/src/hash_functions.rs
@@ -76,19 +76,19 @@ pub(crate) mod shake128 {
 
 /// A portable implementation of [`shake128::Xof`] and [`shake256::Xof`].
 pub(crate) mod portable {
-    use libcrux_sha3::portable::incremental;
-
     use super::{shake128, shake256};
+    use libcrux_sha3::portable::incremental;
+    use libcrux_sha3::portable::KeccakState;
 
     /// Portable SHAKE 128 x4 state.
     ///
     /// We're using a portable implementation so this is actually sequential.
     #[cfg_attr(hax, hax_lib::opaque_type)]
     pub(crate) struct Shake128X4 {
-        state0: libcrux_sha3::portable::KeccakState,
-        state1: libcrux_sha3::portable::KeccakState,
-        state2: libcrux_sha3::portable::KeccakState,
-        state3: libcrux_sha3::portable::KeccakState,
+        state0: KeccakState,
+        state1: KeccakState,
+        state2: KeccakState,
+        state3: KeccakState,
     }
 
     fn init_absorb(input0: &[u8], input1: &[u8], input2: &[u8], input3: &[u8]) -> Shake128X4 {
@@ -113,20 +113,20 @@ pub(crate) mod portable {
     }
 
     fn squeeze_first_five_blocks(
-        x: &mut Shake128X4,
+        state: &mut Shake128X4,
         out0: &mut [u8; shake128::FIVE_BLOCKS_SIZE],
         out1: &mut [u8; shake128::FIVE_BLOCKS_SIZE],
         out2: &mut [u8; shake128::FIVE_BLOCKS_SIZE],
         out3: &mut [u8; shake128::FIVE_BLOCKS_SIZE],
     ) {
-        incremental::shake128_squeeze_first_five_blocks(&mut x.state0, out0);
-        incremental::shake128_squeeze_first_five_blocks(&mut x.state1, out1);
-        incremental::shake128_squeeze_first_five_blocks(&mut x.state2, out2);
-        incremental::shake128_squeeze_first_five_blocks(&mut x.state3, out3);
+        incremental::shake128_squeeze_first_five_blocks(&mut state.state0, out0);
+        incremental::shake128_squeeze_first_five_blocks(&mut state.state1, out1);
+        incremental::shake128_squeeze_first_five_blocks(&mut state.state2, out2);
+        incremental::shake128_squeeze_first_five_blocks(&mut state.state3, out3);
     }
 
     fn squeeze_next_block(
-        x: &mut Shake128X4,
+        state: &mut Shake128X4,
     ) -> (
         [u8; shake128::BLOCK_SIZE],
         [u8; shake128::BLOCK_SIZE],
@@ -134,13 +134,13 @@ pub(crate) mod portable {
         [u8; shake128::BLOCK_SIZE],
     ) {
         let mut out0 = [0u8; shake128::BLOCK_SIZE];
-        incremental::shake128_squeeze_next_block(&mut x.state0, &mut out0);
+        incremental::shake128_squeeze_next_block(&mut state.state0, &mut out0);
         let mut out1 = [0u8; shake128::BLOCK_SIZE];
-        incremental::shake128_squeeze_next_block(&mut x.state1, &mut out1);
+        incremental::shake128_squeeze_next_block(&mut state.state1, &mut out1);
         let mut out2 = [0u8; shake128::BLOCK_SIZE];
-        incremental::shake128_squeeze_next_block(&mut x.state2, &mut out2);
+        incremental::shake128_squeeze_next_block(&mut state.state2, &mut out2);
         let mut out3 = [0u8; shake128::BLOCK_SIZE];
-        incremental::shake128_squeeze_next_block(&mut x.state3, &mut out3);
+        incremental::shake128_squeeze_next_block(&mut state.state3, &mut out3);
 
         (out0, out1, out2, out3)
     }
@@ -187,7 +187,7 @@ pub(crate) mod portable {
     /// Portable SHAKE 256 state
     #[cfg_attr(hax, hax_lib::opaque_type)]
     pub(crate) struct Shake256 {
-        state: libcrux_sha3::portable::KeccakState,
+        state: KeccakState,
     }
 
     fn shake256<const OUTPUT_LENGTH: usize>(input: &[u8], out: &mut [u8; OUTPUT_LENGTH]) {
@@ -200,15 +200,15 @@ pub(crate) mod portable {
         Shake256 { state }
     }
 
-    fn squeeze_first_block_shake256(x: &mut Shake256) -> [u8; shake256::BLOCK_SIZE] {
+    fn squeeze_first_block_shake256(state: &mut Shake256) -> [u8; shake256::BLOCK_SIZE] {
         let mut out = [0u8; shake256::BLOCK_SIZE];
-        incremental::shake256_squeeze_first_block(&mut x.state, &mut out);
+        incremental::shake256_squeeze_first_block(&mut state.state, &mut out);
         out
     }
 
-    fn squeeze_next_block_shake256(x: &mut Shake256) -> [u8; shake256::BLOCK_SIZE] {
+    fn squeeze_next_block_shake256(state: &mut Shake256) -> [u8; shake256::BLOCK_SIZE] {
         let mut out = [0u8; shake256::BLOCK_SIZE];
-        incremental::shake256_squeeze_next_block(&mut x.state, &mut out);
+        incremental::shake256_squeeze_next_block(&mut state.state, &mut out);
         out
     }
 
@@ -262,7 +262,7 @@ pub(crate) mod portable {
     }
 
     fn squeeze_first_block_x4(
-        x: &mut Shake256X4,
+        state: &mut Shake256X4,
     ) -> (
         [u8; shake256::BLOCK_SIZE],
         [u8; shake256::BLOCK_SIZE],
@@ -270,19 +270,19 @@ pub(crate) mod portable {
         [u8; shake256::BLOCK_SIZE],
     ) {
         let mut out0 = [0u8; shake256::BLOCK_SIZE];
-        incremental::shake256_squeeze_first_block(&mut x.state0, &mut out0);
+        incremental::shake256_squeeze_first_block(&mut state.state0, &mut out0);
         let mut out1 = [0u8; shake256::BLOCK_SIZE];
-        incremental::shake256_squeeze_first_block(&mut x.state1, &mut out1);
+        incremental::shake256_squeeze_first_block(&mut state.state1, &mut out1);
         let mut out2 = [0u8; shake256::BLOCK_SIZE];
-        incremental::shake256_squeeze_first_block(&mut x.state2, &mut out2);
+        incremental::shake256_squeeze_first_block(&mut state.state2, &mut out2);
         let mut out3 = [0u8; shake256::BLOCK_SIZE];
-        incremental::shake256_squeeze_first_block(&mut x.state3, &mut out3);
+        incremental::shake256_squeeze_first_block(&mut state.state3, &mut out3);
 
         (out0, out1, out2, out3)
     }
 
     fn squeeze_next_block_x4(
-        x: &mut Shake256X4,
+        state: &mut Shake256X4,
     ) -> (
         [u8; shake256::BLOCK_SIZE],
         [u8; shake256::BLOCK_SIZE],
@@ -290,13 +290,13 @@ pub(crate) mod portable {
         [u8; shake256::BLOCK_SIZE],
     ) {
         let mut out0 = [0u8; shake256::BLOCK_SIZE];
-        incremental::shake256_squeeze_next_block(&mut x.state0, &mut out0);
+        incremental::shake256_squeeze_next_block(&mut state.state0, &mut out0);
         let mut out1 = [0u8; shake256::BLOCK_SIZE];
-        incremental::shake256_squeeze_next_block(&mut x.state1, &mut out1);
+        incremental::shake256_squeeze_next_block(&mut state.state1, &mut out1);
         let mut out2 = [0u8; shake256::BLOCK_SIZE];
-        incremental::shake256_squeeze_next_block(&mut x.state2, &mut out2);
+        incremental::shake256_squeeze_next_block(&mut state.state2, &mut out2);
         let mut out3 = [0u8; shake256::BLOCK_SIZE];
-        incremental::shake256_squeeze_next_block(&mut x.state3, &mut out3);
+        incremental::shake256_squeeze_next_block(&mut state.state3, &mut out3);
 
         (out0, out1, out2, out3)
     }
@@ -399,17 +399,23 @@ pub(crate) mod simd256 {
     }
 
     fn squeeze_first_five_blocks(
-        x: &mut Shake128x4,
+        state: &mut Shake128x4,
         out0: &mut [u8; shake128::FIVE_BLOCKS_SIZE],
         out1: &mut [u8; shake128::FIVE_BLOCKS_SIZE],
         out2: &mut [u8; shake128::FIVE_BLOCKS_SIZE],
         out3: &mut [u8; shake128::FIVE_BLOCKS_SIZE],
     ) {
-        x4::incremental::shake128_squeeze_first_five_blocks(&mut x.state, out0, out1, out2, out3);
+        x4::incremental::shake128_squeeze_first_five_blocks(
+            &mut state.state,
+            out0,
+            out1,
+            out2,
+            out3,
+        );
     }
 
     fn squeeze_next_block(
-        x: &mut Shake128x4,
+        state: &mut Shake128x4,
     ) -> (
         [u8; shake128::BLOCK_SIZE],
         [u8; shake128::BLOCK_SIZE],
@@ -421,7 +427,7 @@ pub(crate) mod simd256 {
         let mut out2 = [0u8; shake128::BLOCK_SIZE];
         let mut out3 = [0u8; shake128::BLOCK_SIZE];
         x4::incremental::shake128_squeeze_next_block(
-            &mut x.state,
+            &mut state.state,
             &mut out0,
             &mut out1,
             &mut out2,
@@ -514,7 +520,7 @@ pub(crate) mod simd256 {
     }
 
     fn squeeze_first_block_x4(
-        x: &mut Shake256x4,
+        state: &mut Shake256x4,
     ) -> (
         [u8; shake256::BLOCK_SIZE],
         [u8; shake256::BLOCK_SIZE],
@@ -526,7 +532,7 @@ pub(crate) mod simd256 {
         let mut out2 = [0u8; shake256::BLOCK_SIZE];
         let mut out3 = [0u8; shake256::BLOCK_SIZE];
         x4::incremental::shake256_squeeze_first_block(
-            &mut x.state,
+            &mut state.state,
             &mut out0,
             &mut out1,
             &mut out2,
@@ -537,7 +543,7 @@ pub(crate) mod simd256 {
     }
 
     fn squeeze_next_block_x4(
-        x: &mut Shake256x4,
+        state: &mut Shake256x4,
     ) -> (
         [u8; shake256::BLOCK_SIZE],
         [u8; shake256::BLOCK_SIZE],
@@ -549,7 +555,7 @@ pub(crate) mod simd256 {
         let mut out2 = [0u8; shake256::BLOCK_SIZE];
         let mut out3 = [0u8; shake256::BLOCK_SIZE];
         x4::incremental::shake256_squeeze_next_block(
-            &mut x.state,
+            &mut state.state,
             &mut out0,
             &mut out1,
             &mut out2,
@@ -641,18 +647,18 @@ pub(crate) mod neon {
     }
 
     fn squeeze_first_five_blocks(
-        x: &mut Shake128x4,
+        state: &mut Shake128x4,
         out0: &mut [u8; shake128::FIVE_BLOCKS_SIZE],
         out1: &mut [u8; shake128::FIVE_BLOCKS_SIZE],
         out2: &mut [u8; shake128::FIVE_BLOCKS_SIZE],
         out3: &mut [u8; shake128::FIVE_BLOCKS_SIZE],
     ) {
-        x2::incremental::shake128_squeeze_first_five_blocks(&mut x.state[0], out0, out1);
-        x2::incremental::shake128_squeeze_first_five_blocks(&mut x.state[1], out2, out3);
+        x2::incremental::shake128_squeeze_first_five_blocks(&mut state.state[0], out0, out1);
+        x2::incremental::shake128_squeeze_first_five_blocks(&mut state.state[1], out2, out3);
     }
 
     fn squeeze_next_block(
-        x: &mut Shake128x4,
+        state: &mut Shake128x4,
     ) -> (
         [u8; shake128::BLOCK_SIZE],
         [u8; shake128::BLOCK_SIZE],
@@ -663,8 +669,8 @@ pub(crate) mod neon {
         let mut out1 = [0u8; shake128::BLOCK_SIZE];
         let mut out2 = [0u8; shake128::BLOCK_SIZE];
         let mut out3 = [0u8; shake128::BLOCK_SIZE];
-        x2::incremental::shake128_squeeze_next_block(&mut x.state[0], &mut out0, &mut out1);
-        x2::incremental::shake128_squeeze_next_block(&mut x.state[1], &mut out2, &mut out3);
+        x2::incremental::shake128_squeeze_next_block(&mut state.state[0], &mut out0, &mut out1);
+        x2::incremental::shake128_squeeze_next_block(&mut state.state[1], &mut out2, &mut out3);
 
         (out0, out1, out2, out3)
     }
@@ -711,7 +717,7 @@ pub(crate) mod neon {
     }
 
     fn squeeze_first_block_x4(
-        x: &mut Shake256x4,
+        state: &mut Shake256x4,
     ) -> (
         [u8; shake256::BLOCK_SIZE],
         [u8; shake256::BLOCK_SIZE],
@@ -722,14 +728,14 @@ pub(crate) mod neon {
         let mut out1 = [0u8; shake256::BLOCK_SIZE];
         let mut out2 = [0u8; shake256::BLOCK_SIZE];
         let mut out3 = [0u8; shake256::BLOCK_SIZE];
-        x2::incremental::shake256_squeeze_first_block(&mut x.state[0], &mut out0, &mut out1);
-        x2::incremental::shake256_squeeze_first_block(&mut x.state[1], &mut out2, &mut out3);
+        x2::incremental::shake256_squeeze_first_block(&mut state.state[0], &mut out0, &mut out1);
+        x2::incremental::shake256_squeeze_first_block(&mut state.state[1], &mut out2, &mut out3);
 
         (out0, out1, out2, out3)
     }
 
     fn squeeze_next_block_x4(
-        x: &mut Shake256x4,
+        state: &mut Shake256x4,
     ) -> (
         [u8; shake256::BLOCK_SIZE],
         [u8; shake256::BLOCK_SIZE],
@@ -740,8 +746,8 @@ pub(crate) mod neon {
         let mut out1 = [0u8; shake256::BLOCK_SIZE];
         let mut out2 = [0u8; shake256::BLOCK_SIZE];
         let mut out3 = [0u8; shake256::BLOCK_SIZE];
-        x2::incremental::shake256_squeeze_next_block(&mut x.state[0], &mut out0, &mut out1);
-        x2::incremental::shake256_squeeze_next_block(&mut x.state[1], &mut out2, &mut out3);
+        x2::incremental::shake256_squeeze_next_block(&mut state.state[0], &mut out0, &mut out1);
+        x2::incremental::shake256_squeeze_next_block(&mut state.state[1], &mut out2, &mut out3);
 
         (out0, out1, out2, out3)
     }
diff --git a/libcrux-ml-kem/c/CMakeLists.txt b/libcrux-ml-kem/c/CMakeLists.txt
index 121558310..7eb5cd5ca 100644
--- a/libcrux-ml-kem/c/CMakeLists.txt
+++ b/libcrux-ml-kem/c/CMakeLists.txt
@@ -17,6 +17,7 @@ if(NOT MSVC)
     # TODO: Clean up
     add_compile_options(
         -Wall
+
         # -Wextra
         # -pedantic
         # -Wconversion
@@ -29,6 +30,7 @@ if(NOT MSVC)
 endif(NOT MSVC)
 
 set(CMAKE_COLOR_DIAGNOSTICS "ON")
+
 # For LSP-based editors
 set(CMAKE_EXPORT_COMPILE_COMMANDS 1)
 include_directories(
@@ -101,12 +103,10 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|arm64v8" AND DEFINED ENV{LIBCRU
 endif()
 
 # --- Tests
-
 if(DEFINED ENV{LIBCRUX_UNPACKED})
     add_compile_definitions(LIBCRUX_UNPACKED)
 endif(DEFINED ENV{LIBCRUX_UNPACKED})
 
-
 # Get gtests
 include(FetchContent)
 FetchContent_Declare(googletest
@@ -144,52 +144,54 @@ target_link_libraries(sha3_test PRIVATE
 )
 
 # --- Benchmarks
-FetchContent_Declare(benchmark
-    GIT_REPOSITORY https://github.com/google/benchmark.git
-    GIT_TAG v1.8.4
-)
-FetchContent_MakeAvailable(benchmark)
+if(DEFINED ENV{LIBCRUX_BENCHMARKS})
+    FetchContent_Declare(benchmark
+        GIT_REPOSITORY https://github.com/google/benchmark.git
+        GIT_TAG v1.8.4
+    )
+    FetchContent_MakeAvailable(benchmark)
 
-add_executable(ml_kem_bench
-    ${PROJECT_SOURCE_DIR}/benches/mlkem768.cc
-)
-target_link_libraries(ml_kem_bench PRIVATE
-    ml_kem_static
-    benchmark::benchmark
-)
+    add_executable(ml_kem_bench
+        ${PROJECT_SOURCE_DIR}/benches/mlkem768.cc
+    )
+    target_link_libraries(ml_kem_bench PRIVATE
+        ml_kem_static
+        benchmark::benchmark
+    )
 
-if(DEFINED ENV{SYMCRYPT_PATH})
-    message("Symcrypt path: $ENV{SYMCRYPT_PATH}")
-    add_compile_definitions(LIBCRUX_SYMCRYPT)
-    target_include_directories(ml_kem_bench PRIVATE $ENV{SYMCRYPT_PATH})
-    target_link_directories(ml_kem_bench PRIVATE $ENV{SYMCRYPT_PATH}/bin/lib)
-    target_link_libraries(ml_kem_bench PRIVATE symcrypt)
-endif(DEFINED ENV{SYMCRYPT_PATH})
+    if(DEFINED ENV{SYMCRYPT_PATH})
+        message("Symcrypt path: $ENV{SYMCRYPT_PATH}")
+        add_compile_definitions(LIBCRUX_SYMCRYPT)
+        target_include_directories(ml_kem_bench PRIVATE $ENV{SYMCRYPT_PATH})
+        target_link_directories(ml_kem_bench PRIVATE $ENV{SYMCRYPT_PATH}/bin/lib)
+        target_link_libraries(ml_kem_bench PRIVATE symcrypt)
+    endif(DEFINED ENV{SYMCRYPT_PATH})
 
-add_executable(ml_kem_keygen
-    ${PROJECT_SOURCE_DIR}/benches/mlkem768_keygen.cc
-)
-target_link_libraries(ml_kem_keygen PRIVATE
-    ml_kem_static
-    benchmark::benchmark
-)
-
-add_executable(ml_kem_encaps
-    ${PROJECT_SOURCE_DIR}/benches/mlkem768_encaps.cc
-)
-target_link_libraries(ml_kem_encaps PRIVATE
-    ml_kem_static
-    benchmark::benchmark
-)
+    add_executable(ml_kem_keygen
+        ${PROJECT_SOURCE_DIR}/benches/mlkem768_keygen.cc
+    )
+    target_link_libraries(ml_kem_keygen PRIVATE
+        ml_kem_static
+        benchmark::benchmark
+    )
 
-if(NOT MSVC)
-    # We benchmark internal functions here that are inlined and thus not available
-    # in MSVC.
-    add_executable(sha3_bench
-        ${PROJECT_SOURCE_DIR}/benches/sha3.cc
+    add_executable(ml_kem_encaps
+        ${PROJECT_SOURCE_DIR}/benches/mlkem768_encaps.cc
     )
-    target_link_libraries(sha3_bench PRIVATE
+    target_link_libraries(ml_kem_encaps PRIVATE
         ml_kem_static
         benchmark::benchmark
     )
-endif(NOT MSVC)
+
+    if(NOT MSVC)
+        # We benchmark internal functions here that are inlined and thus not available
+        # in MSVC.
+        add_executable(sha3_bench
+            ${PROJECT_SOURCE_DIR}/benches/sha3.cc
+        )
+        target_link_libraries(sha3_bench PRIVATE
+            ml_kem_static
+            benchmark::benchmark
+        )
+    endif(NOT MSVC)
+endif(DEFINED ENV{LIBCRUX_BENCHMARKS})
diff --git a/libcrux-ml-kem/cg/CMakeLists.txt b/libcrux-ml-kem/cg/CMakeLists.txt
index ce8ed53c2..e18520d55 100644
--- a/libcrux-ml-kem/cg/CMakeLists.txt
+++ b/libcrux-ml-kem/cg/CMakeLists.txt
@@ -26,10 +26,10 @@ if(NOT MSVC)
 endif(NOT MSVC)
 
 if((CMAKE_C_COMPILER_ID STREQUAL "Clang" AND
-      CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0.0") OR
-     (CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND
-      CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "13.1.6"))
-      add_compile_options(-Werror -Wframe-larger-than=25344)
+    CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0.0") OR
+    (CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND
+    CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "13.1.6"))
+    add_compile_options(-Werror -Wframe-larger-than=25344)
 endif()
 
 set(CMAKE_COLOR_DIAGNOSTICS "ON")
@@ -95,48 +95,50 @@ target_link_libraries(sha3_test PRIVATE
 )
 
 # --- Benchmarks
-FetchContent_Declare(benchmark
-    GIT_REPOSITORY https://github.com/google/benchmark.git
-    GIT_TAG v1.8.4
-)
-FetchContent_MakeAvailable(benchmark)
-
-add_executable(ml_kem_bench
-    ${PROJECT_SOURCE_DIR}/benches/mlkem768.cc
-)
-target_link_libraries(ml_kem_bench PRIVATE
-    benchmark::benchmark
-)
+if(DEFINED ENV{LIBCRUX_BENCHMARKS})
+    FetchContent_Declare(benchmark
+        GIT_REPOSITORY https://github.com/google/benchmark.git
+        GIT_TAG v1.8.4
+    )
+    FetchContent_MakeAvailable(benchmark)
 
-if(DEFINED ENV{SYMCRYPT_PATH})
-    message("Symcrypt path: $ENV{SYMCRYPT_PATH}")
-    add_compile_definitions(LIBCRUX_SYMCRYPT)
-    target_include_directories(ml_kem_bench PRIVATE $ENV{SYMCRYPT_PATH})
-    target_link_directories(ml_kem_bench PRIVATE $ENV{SYMCRYPT_PATH}/bin/lib)
-    target_link_libraries(ml_kem_bench PRIVATE symcrypt)
-endif(DEFINED ENV{SYMCRYPT_PATH})
+    add_executable(ml_kem_bench
+        ${PROJECT_SOURCE_DIR}/benches/mlkem768.cc
+    )
+    target_link_libraries(ml_kem_bench PRIVATE
+        benchmark::benchmark
+    )
 
-add_executable(ml_kem_keygen
-    ${PROJECT_SOURCE_DIR}/benches/mlkem768_keygen.cc
-)
-target_link_libraries(ml_kem_keygen PRIVATE
-    benchmark::benchmark
-)
+    if(DEFINED ENV{SYMCRYPT_PATH})
+        message("Symcrypt path: $ENV{SYMCRYPT_PATH}")
+        add_compile_definitions(LIBCRUX_SYMCRYPT)
+        target_include_directories(ml_kem_bench PRIVATE $ENV{SYMCRYPT_PATH})
+        target_link_directories(ml_kem_bench PRIVATE $ENV{SYMCRYPT_PATH}/bin/lib)
+        target_link_libraries(ml_kem_bench PRIVATE symcrypt)
+    endif(DEFINED ENV{SYMCRYPT_PATH})
 
-add_executable(ml_kem_encaps
-    ${PROJECT_SOURCE_DIR}/benches/mlkem768_encaps.cc
-)
-target_link_libraries(ml_kem_encaps PRIVATE
-    benchmark::benchmark
-)
+    add_executable(ml_kem_keygen
+        ${PROJECT_SOURCE_DIR}/benches/mlkem768_keygen.cc
+    )
+    target_link_libraries(ml_kem_keygen PRIVATE
+        benchmark::benchmark
+    )
 
-if(NOT MSVC)
-    # We benchmark internal functions here that are inlined and thus not available
-    # in MSVC.
-    add_executable(sha3_bench
-        ${PROJECT_SOURCE_DIR}/benches/sha3.cc
+    add_executable(ml_kem_encaps
+        ${PROJECT_SOURCE_DIR}/benches/mlkem768_encaps.cc
     )
-    target_link_libraries(sha3_bench PRIVATE
+    target_link_libraries(ml_kem_encaps PRIVATE
         benchmark::benchmark
     )
-endif(NOT MSVC)
+
+    if(NOT MSVC)
+        # We benchmark internal functions here that are inlined and thus not available
+        # in MSVC.
+        add_executable(sha3_bench
+            ${PROJECT_SOURCE_DIR}/benches/sha3.cc
+        )
+        target_link_libraries(sha3_bench PRIVATE
+            benchmark::benchmark
+        )
+    endif(NOT MSVC)
+endif(DEFINED ENV{LIBCRUX_BENCHMARKS})
diff --git a/libcrux-ml-kem/cg/eurydice_glue.h b/libcrux-ml-kem/cg/eurydice_glue.h
index cdd27af77..3f9b35cc2 100644
--- a/libcrux-ml-kem/cg/eurydice_glue.h
+++ b/libcrux-ml-kem/cg/eurydice_glue.h
@@ -17,6 +17,7 @@ extern "C" {
 #include <stdlib.h>
 #include <string.h>
 
+#include "karamel/endianness.h"
 #include "karamel/target.h"
 
 // SLICES, ARRAYS, ETC.
@@ -88,7 +89,7 @@ typedef struct {
 #define Eurydice_slice_copy(dst, src, t) \
   memcpy(dst.ptr, src.ptr, dst.len * sizeof(t))
 #define core_array___Array_T__N__23__as_slice(len_, ptr_, t, _ret_t) \
-  ((Eurydice_slice){.ptr = ptr_, .len = len_})
+  (CLITERAL(Eurydice_slice){.ptr = ptr_, .len = len_})
 
 #define core_array___core__clone__Clone_for__Array_T__N___20__clone( \
     len, src, dst, elem_type, _ret_t)                                \
@@ -130,18 +131,14 @@ static inline void Eurydice_slice_to_array3(uint8_t *dst_tag, char *dst_ok,
 // CORE STUFF (conversions, endianness, ...)
 
 static inline void core_num__u64_9__to_le_bytes(uint64_t v, uint8_t buf[8]) {
-  memcpy(buf, &v, sizeof(v));
+  store64_le(buf, v);
 }
 static inline uint64_t core_num__u64_9__from_le_bytes(uint8_t buf[8]) {
-  uint64_t v;
-  memcpy(&v, buf, sizeof(v));
-  return v;
+  return load64_le(buf);
 }
 
 static inline uint32_t core_num__u32_8__from_le_bytes(uint8_t buf[4]) {
-  uint32_t v;
-  memcpy(&v, buf, sizeof(v));
-  return v;
+  return load32_le(buf);
 }
 
 static inline uint32_t core_num__u8_6__count_ones(uint8_t x0) {
diff --git a/libcrux-ml-kem/cg/karamel/endianness.h b/libcrux-ml-kem/cg/karamel/endianness.h
new file mode 100644
index 000000000..d59d9854d
--- /dev/null
+++ b/libcrux-ml-kem/cg/karamel/endianness.h
@@ -0,0 +1,228 @@
+/* Copyright (c) INRIA and Microsoft Corporation. All rights reserved.
+   Licensed under the Apache 2.0 and MIT Licenses. */
+
+#ifndef __LOWSTAR_ENDIANNESS_H
+#define __LOWSTAR_ENDIANNESS_H
+
+#include <inttypes.h>
+#include <string.h>
+
+/******************************************************************************/
+/* Implementing C.fst (part 2: endian-ness macros)                            */
+/******************************************************************************/
+
+/* ... for Linux */
+#if defined(__linux__) || defined(__CYGWIN__) || \
+    defined(__USE_SYSTEM_ENDIAN_H__) || defined(__GLIBC__)
+#include <endian.h>
+
+/* ... for OSX */
+#elif defined(__APPLE__)
+#include <libkern/OSByteOrder.h>
+#define htole64(x) OSSwapHostToLittleInt64(x)
+#define le64toh(x) OSSwapLittleToHostInt64(x)
+#define htobe64(x) OSSwapHostToBigInt64(x)
+#define be64toh(x) OSSwapBigToHostInt64(x)
+
+#define htole16(x) OSSwapHostToLittleInt16(x)
+#define le16toh(x) OSSwapLittleToHostInt16(x)
+#define htobe16(x) OSSwapHostToBigInt16(x)
+#define be16toh(x) OSSwapBigToHostInt16(x)
+
+#define htole32(x) OSSwapHostToLittleInt32(x)
+#define le32toh(x) OSSwapLittleToHostInt32(x)
+#define htobe32(x) OSSwapHostToBigInt32(x)
+#define be32toh(x) OSSwapBigToHostInt32(x)
+
+/* ... for Solaris */
+#elif defined(__sun__)
+#include <sys/byteorder.h>
+#define htole64(x) LE_64(x)
+#define le64toh(x) LE_64(x)
+#define htobe64(x) BE_64(x)
+#define be64toh(x) BE_64(x)
+
+#define htole16(x) LE_16(x)
+#define le16toh(x) LE_16(x)
+#define htobe16(x) BE_16(x)
+#define be16toh(x) BE_16(x)
+
+#define htole32(x) LE_32(x)
+#define le32toh(x) LE_32(x)
+#define htobe32(x) BE_32(x)
+#define be32toh(x) BE_32(x)
+
+/* ... for the BSDs */
+#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__DragonFly__)
+#include <sys/endian.h>
+#elif defined(__OpenBSD__)
+#include <endian.h>
+
+/* ... for Windows (MSVC)... not targeting XBOX 360! */
+#elif defined(_MSC_VER)
+
+#include <stdlib.h>
+#define htobe16(x) _byteswap_ushort(x)
+#define htole16(x) (x)
+#define be16toh(x) _byteswap_ushort(x)
+#define le16toh(x) (x)
+
+#define htobe32(x) _byteswap_ulong(x)
+#define htole32(x) (x)
+#define be32toh(x) _byteswap_ulong(x)
+#define le32toh(x) (x)
+
+#define htobe64(x) _byteswap_uint64(x)
+#define htole64(x) (x)
+#define be64toh(x) _byteswap_uint64(x)
+#define le64toh(x) (x)
+
+/* ... for Windows (GCC-like, e.g. mingw or clang) */
+#elif (defined(_WIN32) || defined(_WIN64) || defined(__EMSCRIPTEN__)) && \
+    (defined(__GNUC__) || defined(__clang__))
+
+#define htobe16(x) __builtin_bswap16(x)
+#define htole16(x) (x)
+#define be16toh(x) __builtin_bswap16(x)
+#define le16toh(x) (x)
+
+#define htobe32(x) __builtin_bswap32(x)
+#define htole32(x) (x)
+#define be32toh(x) __builtin_bswap32(x)
+#define le32toh(x) (x)
+
+#define htobe64(x) __builtin_bswap64(x)
+#define htole64(x) (x)
+#define be64toh(x) __builtin_bswap64(x)
+#define le64toh(x) (x)
+
+/* ... generic big-endian fallback code */
+/* ... AIX doesn't have __BYTE_ORDER__ (with XLC compiler) & is always
+ * big-endian */
+#elif (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) || \
+    defined(_AIX)
+
+/* byte swapping code inspired by:
+ * https://github.com/rweather/arduinolibs/blob/master/libraries/Crypto/utility/EndianUtil.h
+ * */
+
+#define htobe32(x) (x)
+#define be32toh(x) (x)
+#define htole32(x) /* BE host: reverse the four bytes of x */       \
+  (__extension__({                                                  \
+    uint32_t _temp = (x);                                           \
+    ((_temp >> 24) & 0x000000FF) | ((_temp >> 8) & 0x0000FF00) |    \
+        ((_temp << 8) & 0x00FF0000) | ((_temp << 24) & 0xFF000000); \
+  }))
+#define le32toh(x) (htole32((x)))
+
+#define htobe64(x) (x)
+#define be64toh(x) (x)
+#define htole64(x) /* swap each half, then the halves */ \
+  (__extension__({                                       \
+    uint64_t __temp = (x);                               \
+    uint32_t __low = htole32((uint32_t)__temp);          \
+    uint32_t __high = htole32((uint32_t)(__temp >> 32)); \
+    (((uint64_t)__low) << 32) | __high;                  \
+  }))
+#define le64toh(x) (htole64((x)))
+
+/* ... generic little-endian fallback code */
+#elif defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+
+#define htole32(x) (x)
+#define le32toh(x) (x)
+#define htobe32(x) /* LE host: reverse the four bytes of x */       \
+  (__extension__({                                                  \
+    uint32_t _temp = (x);                                           \
+    ((_temp >> 24) & 0x000000FF) | ((_temp >> 8) & 0x0000FF00) |    \
+        ((_temp << 8) & 0x00FF0000) | ((_temp << 24) & 0xFF000000); \
+  }))
+#define be32toh(x) (htobe32((x)))
+
+#define htole64(x) (x)
+#define le64toh(x) (x)
+#define htobe64(x) /* swap each half, then the halves */ \
+  (__extension__({                                       \
+    uint64_t __temp = (x);                               \
+    uint32_t __low = htobe32((uint32_t)__temp);          \
+    uint32_t __high = htobe32((uint32_t)(__temp >> 32)); \
+    (((uint64_t)__low) << 32) | __high;                  \
+  }))
+#define be64toh(x) (htobe64((x)))
+
+/* ... couldn't determine endian-ness of the target platform */
+#else
+#error "Please define __BYTE_ORDER__!"
+
+#endif /* defined(__linux__) || ... */
+
+/* Loads and stores. These avoid undefined behavior due to unaligned memory
+ * accesses, via memcpy. */
+
+inline static uint16_t load16(const uint8_t *b) { /* b is only read */
+  uint16_t x; /* receives the 2 bytes at b, in host byte order */
+  memcpy(&x, b, sizeof(x)); /* memcpy: b may be unaligned */
+  return x;
+}
+
+inline static uint32_t load32(const uint8_t *b) { /* b is only read */
+  uint32_t x; /* receives the 4 bytes at b, in host byte order */
+  memcpy(&x, b, sizeof(x)); /* memcpy: b may be unaligned */
+  return x;
+}
+
+inline static uint64_t load64(const uint8_t *b) { /* b is only read */
+  uint64_t x; /* receives the 8 bytes at b, in host byte order */
+  memcpy(&x, b, sizeof(x)); /* memcpy: b may be unaligned */
+  return x;
+}
+
+/* Stores: copy the object representation of i (host byte order) to b with
+ * memcpy, so b does not need to be aligned for the integer type. */
+inline static void store16(uint8_t *b, uint16_t i) { memcpy(b, &i, sizeof i); }
+inline static void store32(uint8_t *b, uint32_t i) { memcpy(b, &i, sizeof i); }
+inline static void store64(uint8_t *b, uint64_t i) { memcpy(b, &i, sizeof i); }
+
+/* Legacy accessors so that this header can serve as an implementation of
+ * C.Endianness */
+#define load16_le(b) (le16toh(load16(b)))
+#define store16_le(b, i) (store16(b, htole16(i)))
+#define load16_be(b) (be16toh(load16(b)))
+#define store16_be(b, i) (store16(b, htobe16(i)))
+
+#define load32_le(b) (le32toh(load32(b)))
+#define store32_le(b, i) (store32(b, htole32(i)))
+#define load32_be(b) (be32toh(load32(b)))
+#define store32_be(b, i) (store32(b, htobe32(i)))
+
+#define load64_le(b) (le64toh(load64(b)))
+#define store64_le(b, i) (store64(b, htole64(i)))
+#define load64_be(b) (be64toh(load64(b)))
+#define store64_be(b, i) (store64(b, htobe64(i)))
+
+/* Co-existence of LowStar.Endianness and FStar.Endianness generates name
+ * conflicts, because neither module prefixes its names. Until a prefix is
+ * added, or FStar.Endianness is retired, alias the 0-suffixed names onto the
+ * unsuffixed accessors. */
+#define load16_le0 load16_le
+#define store16_le0 store16_le
+#define load16_be0 load16_be
+#define store16_be0 store16_be
+
+#define load32_le0 load32_le
+#define store32_le0 store32_le
+#define load32_be0 load32_be
+#define store32_be0 store32_be
+
+#define load64_le0 load64_le
+#define store64_le0 store64_le
+#define load64_be0 load64_be
+#define store64_be0 store64_be
+
+#define load128_le0 load128_le /* load128_* / store128_* targets are */
+#define store128_le0 store128_le /* not defined in this header; these */
+#define load128_be0 load128_be /* aliases only expand if another header */
+#define store128_be0 store128_be /* provides them */
+
+#endif