From 4bcb11f0610302110ae8109af01d5b652191b2f6 Mon Sep 17 00:00:00 2001
From: Marcin M <128217157+mm-zk@users.noreply.github.com>
Date: Wed, 29 May 2024 14:22:45 +0200
Subject: [PATCH] feat: Update boojum nightly - feature gate packed simd
 (Attempt 2) (#50)

---
 .github/workflows/ci.yaml                   |  19 +
 Cargo.toml                                  |   8 +-
 rust-toolchain.toml                         |   2 +-
 src/cs/traits/cs.rs                         |   2 +-
 src/dag/guide.rs                            |   4 +-
 src/dag/resolver_box.rs                     |  12 +-
 src/dag/resolvers/mt/mod.rs                 |  12 +-
 src/dag/resolvers/mt/registrar.rs           |   2 +-
 src/dag/resolvers/mt/resolution_window.rs   |  34 +-
 src/dag/resolvers/mt/sorters/sorter_live.rs |  14 +-
 src/field/goldilocks/arm_asm_impl.rs        |  95 +-
 src/field/goldilocks/arm_asm_packed_impl.rs | 858 ++++++++++++++++++
 src/field/goldilocks/mod.rs                 |  17 +
 src/implementations/poseidon2/mod.rs        |  26 +-
 .../poseidon2/state_generic_impl.rs         |   2 +
 src/lib.rs                                  |   5 +-
 16 files changed, 1020 insertions(+), 92 deletions(-)
 create mode 100644 src/field/goldilocks/arm_asm_packed_impl.rs

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index a984dd9..f5200a1 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -17,6 +17,25 @@ jobs:
       - uses: actions-rust-lang/setup-rust-toolchain@v1
       - run: cargo build --verbose
       - run: cargo test --verbose --all
+
+  build_old:
+    name: cargo build and test (packed_simd)
+    strategy:
+      matrix:
+        # Needs big runners to run tests
+        # Only macos-13-xlarge is Apple Silicon, as per:
+        # https://docs.github.com/en/actions/using-github-hosted-runners/about-larger-runners/about-larger-runners#about-macos-larger-runners
+        os: [ubuntu-22.04-github-hosted-16core, macos-13-xlarge]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          toolchain: nightly-2023-05-31
+
+      # Still compile with the old Rust nightly and packed_simd, until we have a good replacement in Poseidon.
+      - run: RUSTFLAGS=-Awarnings cargo +nightly-2023-05-31 build --features include_packed_simd
+      - run: RUSTFLAGS=-Awarnings cargo +nightly-2023-05-31 test --features include_packed_simd
 
   formatting:
     name: cargo fmt
diff --git a/Cargo.toml b/Cargo.toml
index 17c9067..e44d1ba 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,7 +23,7 @@ itertools = "0.10"
 blake2 = "0.10"
 sha2 = "0.10"
 num-modular = "0.5.1"
-packed_simd = { version = "0.3.9" }
+packed_simd = { version = "0.3.9", optional = true }
 pairing = { package = "pairing_ce", git = "https://github.com/matter-labs/pairing.git" }
 crypto-bigint = "0.5"
 convert_case = "*"
@@ -54,3 +54,9 @@ opt-level = 3
 [features]
 # If enabled, logs will be using trace, if disabled, they will be printed to stdout.
 log_tracing = ["tracing"]
+# packed_simd no longer builds with the newest nightly.
+# We still keep it as a feature, since not all of the code has been migrated yet, and
+# some people might want to use an older Rust nightly to gain some performance.
+include_packed_simd = ["packed_simd"]
+cr_paranoia_mode = []
+debug_track = []
\ No newline at end of file
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index 6b48c00..a671fa6 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,2 +1,2 @@
 [toolchain]
-channel = "nightly-2023-06-25"
+channel = "nightly-2024-05-07"
diff --git a/src/cs/traits/cs.rs b/src/cs/traits/cs.rs
index da3b51d..5d4d786 100644
--- a/src/cs/traits/cs.rs
+++ b/src/cs/traits/cs.rs
@@ -20,7 +20,7 @@ impl<'set, 'tgt: 'set, T: SmallField> DstBuffer<'set, 'tgt, T> {
                 *offset += 1;
             }
             DstBuffer::MutSliceIndirect(dst, debug_track, offset) => {
-                if cfg!(debug_track) && *debug_track {
+                if cfg!(feature = "debug_track") && *debug_track {
                     log!(" set out {} <- {}", *offset, value.as_raw_u64())
                 }
 
diff --git a/src/dag/guide.rs b/src/dag/guide.rs
index 0eba95b..47a6315 100644
--- a/src/dag/guide.rs
+++ b/src/dag/guide.rs
@@ -384,7 +384,7 @@ impl<'a, T: Copy + Debug, F: SmallField, Cfg: CSResolverConfig> GuideOrder<'a, T
             pos += span.buffer.len();
         }
 
-        if cfg!(cr_paranoia_mode) && self.guide.tracing {
+        if cfg!(feature = "cr_paranoia_mode") && self.guide.tracing {
             log!(
                 "Released span {}: {:?}",
                 self.guide.spans[0].id.0,
@@ -684,7 +684,7 @@ impl BufferGuide {
     }
 
     pub(crate) fn flush(&mut self) -> BufferGuideFinalization<'_, T, F, Cfg> {
-        if cfg!(cr_paranoia_mode) && self.tracing {
+        if cfg!(feature = "cr_paranoia_mode") && self.tracing {
             log!("CRG: flush.");
         }
 
diff --git a/src/dag/resolver_box.rs b/src/dag/resolver_box.rs
index 397d551..d0f7dd8 100644
--- a/src/dag/resolver_box.rs
+++ b/src/dag/resolver_box.rs
@@ -424,7 +424,7 @@ pub(crate) fn invocation_binder(
     // Safety: This is the actual type of the provided function.
     let bound = resolver.resolve_fn::();
 
-    if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA) && false {
+    if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA) && false {
         log!(
             "Ivk: Ins [{}], Out [{}], Out-addr [{}], Thread [{}]",
             resolver
@@ -448,7 +448,10 @@ pub(crate) fn invocation_binder(
         )
     }
 
-    if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA) && debug_track && false {
+    if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA)
+        && debug_track
+        && false
+    {
         log!(
             "Ivk: provided inputs:\n - {:?}",
             ins.iter().map(|x| x.as_raw_u64()).collect_vec()
@@ -457,7 +460,10 @@ pub(crate) fn invocation_binder(
 
     bound(ins, &mut DstBuffer::MutSliceIndirect(out, debug_track, 0));
 
-    if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA) && debug_track && true {
+    if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA)
+        && debug_track
+        && true
+    {
         log!(
             "Ivk: calculated outputs:\n - {:?}",
             out.iter().map(|x| x.as_raw_u64()).collect_vec()
diff --git a/src/dag/resolvers/mt/mod.rs b/src/dag/resolvers/mt/mod.rs
index cad5822..8de30f5 100644
--- a/src/dag/resolvers/mt/mod.rs
+++ b/src/dag/resolvers/mt/mod.rs
@@ -169,7 +169,7 @@ impl, CFG: CSResolverConfig>
 
         let debug_track = vec![];
 
-        if cfg!(cr_paranoia_mode) || PARANOIA {
+        if cfg!(feature = "cr_paranoia_mode") || PARANOIA {
             log!("Contains tracked keys {:?} ", debug_track);
         }
 
@@ -269,7 +269,7 @@ impl, CFG: CSResolverConfig>
 
         self.sorter.write_sequence();
 
-        if cfg!(cr_paranoia_mode) || PARANOIA {
+        if cfg!(feature = "cr_paranoia_mode") || PARANOIA {
             log!("CR {:?}", unsafe {
                 self.common.awaiters_broker.stats.u_deref()
             });
@@ -1487,7 +1487,7 @@ mod test {
 
         storage.wait_till_resolved();
 
-        if cfg!(cr_paranoia_mode) {
+        if cfg!(feature = "cr_paranoia_mode") {
             log!("Test: total value result: \n - {}", unsafe {
                 (*storage.common.values.get())
                     .variables
@@ -1509,7 +1509,7 @@ mod test {
             let act = Place::from_variable(Variable::from_variable_index(ix as u64))
                 .to(|x| storage.get_value_unchecked(x));
 
-            if cfg!(cr_paranoia_mode) {
+            if cfg!(feature = "cr_paranoia_mode") {
                 log!("Test: per item value: ix {}, value {}", ix, act);
             }
 
@@ -1542,7 +1542,7 @@ mod test {
 
         storage.wait_till_resolved();
 
-        if cfg!(cr_paranoia_mode) {
+        if cfg!(feature = "cr_paranoia_mode") {
             log!("Test: total value result: \n - {}", unsafe {
                 (*storage.common.values.get())
                     .variables
@@ -1564,7 +1564,7 @@ mod test {
             let act = Place::from_variable(Variable::from_variable_index(ix as u64))
                 .to(|x| storage.get_value_unchecked(x));
 
-            if cfg!(cr_paranoia_mode) {
+            if cfg!(feature = "cr_paranoia_mode") {
                 log!("Test: per item value: ix {}, value {}", ix, act);
             }
 
diff --git a/src/dag/resolvers/mt/registrar.rs b/src/dag/resolvers/mt/registrar.rs
index 3c43c6b..257703f 100644
--- a/src/dag/resolvers/mt/registrar.rs
+++ b/src/dag/resolvers/mt/registrar.rs
@@ -116,7 +116,7 @@ impl Registrar {
     }
 
     pub(crate) fn is_empty(&self) -> bool {
-        if cfg!(cr_paranoia_mode) {
+        if cfg!(feature = "cr_paranoia_mode") {
             log!(
                 "CRR: total remaining resolvers: {}",
                 self.vars.values().map(|x| x.len()).sum::()
diff --git a/src/dag/resolvers/mt/resolution_window.rs b/src/dag/resolvers/mt/resolution_window.rs
index 9efe2cb..60d1d23 100644
--- a/src/dag/resolvers/mt/resolution_window.rs
+++ b/src/dag/resolvers/mt/resolution_window.rs
@@ -163,8 +163,12 @@ impl + 'static>
             comms,
             track_list: Vec::new(),
 
-            execution_list: if cfg!(cr_paranoia_mode) { 1 << 26 } else { 0 }
-                .to(|x| Vec::with_capacity(x).op(|v| v.resize(x, 0))),
+            execution_list: if cfg!(feature = "cr_paranoia_mode") {
+                1 << 26
+            } else {
+                0
+            }
+            .to(|x| Vec::with_capacity(x).op(|v| v.resize(x, 0))),
 
             phantom: PhantomData,
         };
@@ -207,7 +211,7 @@ impl + 'static>
 
                 data[data_ix].push(order_ix.into(), task.order_info.value);
 
-                if cfg!(cr_paranoia_mode) {
+                if cfg!(feature = "cr_paranoia_mode") {
                     self.execution_list[order_ix] += 1;
 
                     if self.execution_list[order_ix] > 1 {
@@ -238,7 +242,7 @@ impl + 'static>
             }
         }
 
-        if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA) && true {
+        if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA) && true {
            log!("RW: Batch! {} tasks.", count);
         }
 
@@ -264,7 +268,7 @@ impl + 'static>
             .for_each(|x| {
                 x.state = ResolverState::Done;
 
-                if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA {
+                if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA {
                     unsafe {
                         let r = self.common.resolvers.u_deref().get(x.order_info.value);
 
@@ -291,7 +295,7 @@ impl + 'static>
                 }
             });
 
-        if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA {
+        if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA {
             if self
                 .exec_order_buffer
                 .iter()
@@ -343,7 +347,7 @@ impl + 'static>
 
         drop(awaiters);
 
-        if cfg!(cr_paranoia_mode) && count > 0 {
+        if cfg!(feature = "cr_paranoia_mode") && count > 0 {
             log!(
                 "RW: Shifted by {}, new range is: {}..{}, buffer len: {}",
                 count,
@@ -412,7 +416,7 @@ impl + 'static>
 
         self.stats.total_consumption = extend_to as u64;
 
-        if crate::dag::resolvers::mt::PARANOIA || cfg!(cr_paranoia_mode) {
+        if crate::dag::resolvers::mt::PARANOIA || cfg!(feature = "cr_paranoia_mode") {
             log!(
                 "RW: Extended range by {}, new range {}..{}",
                 extend_to,
@@ -474,7 +478,7 @@ impl + 'static>
             }
         }
 
-        if crate::dag::resolvers::mt::PARANOIA || cfg!(cr_paranoia_mode) {
+        if crate::dag::resolvers::mt::PARANOIA || cfg!(feature = "cr_paranoia_mode") {
             log!("[{:?}] RW: Exit conditions met.", std::time::Instant::now())
         }
 
@@ -484,7 +488,7 @@ impl + 'static>
 
         self.stats.total_time = start_instant.elapsed();
 
-        if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA {
+        if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA {
             log!("CR {:#?}", self.stats);
 
             log!("CR {:#?}", unsafe { &*self.channel.stats.get() });
@@ -554,7 +558,7 @@ impl, const SIZE: usize>
             // here, as this is an unsynchronizd access.
             let resolver = this.common.resolvers.u_deref().get(*resolver_ix);
 
-            if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA {
+            if cfg!(feature="cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA {
                 std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
                     this.invoke(resolver, *order_ix);
 
@@ -590,7 +594,7 @@ impl, const SIZE: usize>
             });
         }
 
-        if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA {
+        if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA {
             log!(
                 "{}\n{:#?}\n{:#?}",
                 std::thread::current().name().unwrap_or_default(),
@@ -629,7 +633,7 @@ impl, const SIZE: usize>
             .map(|x| {
                 let (vs, md) = self.common.values.u_deref().get_item_ref(*x);
 
-                if cfg!(cr_paranoia_mode) || true {
+                if cfg!(feature = "cr_paranoia_mode") || true {
                     if Cfg::ASSERT_TRACKED_VALUES {
                         assert!(md.is_tracked());
                     }
@@ -678,7 +682,7 @@ impl, const SIZE: usize>
 
         let mut track = false;
 
-        if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA {
+        if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA {
             if let Some(x) = self
                 .debug_track
                 .iter()
@@ -831,7 +835,7 @@ impl LockStepChannel {
     fn execute(&self) {
         use std::sync::atomic::Ordering::*;
 
-        if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA) && false {
+        if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA) && false {
             log!("RW: batch sent {:#?}", unsafe { self.data.u_deref() });
         }
 
diff --git a/src/dag/resolvers/mt/sorters/sorter_live.rs b/src/dag/resolvers/mt/sorters/sorter_live.rs
index c0c1298..6b1e423 100644
--- a/src/dag/resolvers/mt/sorters/sorter_live.rs
+++ b/src/dag/resolvers/mt/sorters/sorter_live.rs
@@ -191,7 +191,7 @@ impl
             }
         }
 
-        if cfg!(cr_paranoia_mode) {
+        if cfg!(feature = "cr_paranoia_mode") {
             // This ugly block checks that the calculated parallelism is
             // correct. It's a bit slower than O(n^2). Also note, that it
             // checks only the last 1050 items, so it's not a full check,
@@ -297,7 +297,7 @@ impl ResolverS
     }
 
     fn set_value(&mut self, key: crate::cs::Place, value: F) {
-        if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA)
+        if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA)
             && self.debug_track.contains(&key)
             && false
         {
@@ -378,7 +378,7 @@ impl ResolverS
 
         let mut hit = false;
 
-        if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA) && true {
+        if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA) && true {
             if let Some(x) = self.debug_track.iter().find(|x| inputs.contains(x)) {
                 log!("CR: added resolution with tracked input {:?}", x);
 
@@ -498,7 +498,7 @@ impl ResolverS
         outputs: &[Place],
         added_at: RegistrationNum,
     ) -> Vec {
-        if cfg!(cr_paranoia_mode) {
+        if cfg!(feature = "cr_paranoia_mode") {
             if let Some(x) = self.debug_track.iter().find(|x| inputs.contains(x)) {
                 log!("CR: internalized resolution with tracked input {:?}", x);
             }
@@ -519,7 +519,7 @@ impl ResolverS
 
         let deps = inputs.iter().map(|x| &values.get_item_ref(*x).1);
 
-        if cfg!(cr_paranoia_mode) {
+        if cfg!(feature = "cr_paranoia_mode") {
             debug_assert!(
                 deps.clone().all(|x| { x.is_tracked() }),
                 "Attempting to internalize a resolution with an untracked input. All inputs must be tracked."
@@ -610,14 +610,14 @@ impl ResolverS
         self.record.values_count = unsafe { self.common.values.u_deref().max_tracked + 1 } as usize;
         self.record.registrations_count = self.stats.registrations_added as usize;
 
-        if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA {
+        if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA {
             log!(
                 "CR: Final order written. Order len {}",
                 self.common.exec_order.lock().unwrap().items.len()
             );
         }
 
-        if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA {
+        if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA {
             self.guide.stats.finalize();
 
             log!("CR {:?}", self.guide.stats);
diff --git a/src/field/goldilocks/arm_asm_impl.rs b/src/field/goldilocks/arm_asm_impl.rs
index 03399c4..369b881 100644
--- a/src/field/goldilocks/arm_asm_impl.rs
+++ b/src/field/goldilocks/arm_asm_impl.rs
@@ -2,8 +2,10 @@ use crate::cs::implementations::utils::precompute_twiddles_for_fft;
 use crate::cs::traits::GoodAllocator;
 use crate::field::{Field, PrimeField};
 use crate::worker::Worker;
-use packed_simd::shuffle;
+use std::intrinsics::simd::simd_shuffle;
 use std::ops::{Add, BitOr, Sub};
+use std::simd::cmp::{SimdPartialEq, SimdPartialOrd};
+use std::simd::{u64x4, u64x8};
 use std::usize;
 
 use super::GoldilocksField;
@@ -17,7 +19,7 @@ pub struct MixedGL(pub [GoldilocksField; 16]);
 // we also need holder for SIMD targets, because u64x4 has smaller alignment than u64x8
 #[derive(Clone, Copy)]
 #[repr(C, align(64))]
-struct U64x4Holder([packed_simd::u64x4; 4]);
+struct U64x4Holder([u64x4; 4]);
 
 impl std::fmt::Debug for MixedGL {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -38,8 +40,8 @@ impl MixedGL {
     pub const T: u64 = (Self::ORDER - 1) >> Self::TWO_ADICITY;
     pub const BARRETT: u128 = 18446744078004518912; // 0x10000000100000000
     pub const EPSILON: u64 = (1 << 32) - 1;
-    pub const EPSILON_VECTOR: packed_simd::u64x4 = packed_simd::u64x4::splat(Self::EPSILON);
-    pub const EPSILON_VECTOR_D: packed_simd::u64x8 = packed_simd::u64x8::splat(Self::EPSILON);
+    pub const EPSILON_VECTOR: u64x4 = u64x4::from_array([Self::EPSILON; 4]);
+    pub const EPSILON_VECTOR_D: u64x8 = u64x8::from_array([Self::EPSILON; 8]);
 
     #[inline(always)]
     pub fn new() -> Self {
@@ -64,7 +66,7 @@ impl MixedGL {
         for i in 0..4 {
             let a = a_u64.0[i];
             let a_reduced = a.add(Self::EPSILON_VECTOR);
-            let cmp = a_reduced.lt(Self::EPSILON_VECTOR);
+            let cmp = a_reduced.simd_lt(Self::EPSILON_VECTOR);
             let res = cmp.select(a_reduced, a);
 
             a_u64.0[i] = res;
@@ -108,13 +110,13 @@ impl MixedGL {
             let b = b_u64.0[i];
             //additional reduction over b
             let b_reduced = b.add(Self::EPSILON_VECTOR);
-            let cmp = b_reduced.lt(Self::EPSILON_VECTOR);
+            let cmp = b_reduced.simd_lt(Self::EPSILON_VECTOR);
             let b = cmp.select(b_reduced, b);
             //a+b
             let sum = a.add(b);
             let sum_reduced = sum.add(Self::EPSILON_VECTOR);
-            let cmp0 = sum_reduced.lt(sum);
-            let cmp1 = sum.lt(a);
+            let cmp0 = sum_reduced.simd_lt(sum);
+            let cmp1 = sum.simd_lt(a);
             let reduce_flag = cmp0.bitor(cmp1);
             let res = reduce_flag.select(sum_reduced, sum);
 
@@ -139,12 +141,12 @@ impl MixedGL {
             let b = b_u64.0[i];
             //additional reduction over b
             let b_reduced = b.add(Self::EPSILON_VECTOR);
-            let cmp = b_reduced.lt(Self::EPSILON_VECTOR);
+            let cmp = b_reduced.simd_lt(Self::EPSILON_VECTOR);
             let b = cmp.select(b_reduced, b);
             //a-b
             let diff = a.sub(b);
             let diff_reduced = diff.sub(Self::EPSILON_VECTOR);
-            let cmp = a.lt(b);
+            let cmp = a.simd_lt(b);
             let res = cmp.select(diff_reduced, diff);
 
             a_u64.0[i] = res;
@@ -159,27 +161,28 @@ impl MixedGL {
     pub unsafe fn butterfly_1x1_impl(&mut self) -> &mut Self {
         let [part1, part2] = MixedGL::as_u64x8_arrays(&*self);
-        let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 2, 4, 6, 8, 10, 12, 14]);
-        let v: packed_simd::u64x8 = shuffle!(part1, part2, [1, 3, 5, 7, 9, 11, 13, 15]);
+
+        let u: u64x8 = simd_shuffle(part1, part2, const { [0u32, 2, 4, 6, 8, 10, 12, 14] });
+        let v: u64x8 = simd_shuffle(part1, part2, const { [1u32, 3, 5, 7, 9, 11, 13, 15] });
         //additional reduction over v
         let v_reduced = v.add(Self::EPSILON_VECTOR_D);
-        let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D);
+        let cmp = v_reduced.simd_lt(Self::EPSILON_VECTOR_D);
         let v = cmp.select(v_reduced, v);
         // u + v
         let sum = u.add(v);
         let sum_reduced = sum.add(Self::EPSILON_VECTOR_D);
-        let cmp0 = sum_reduced.lt(sum);
-        let cmp1 = sum.lt(u);
+        let cmp0 = sum_reduced.simd_lt(sum);
+        let cmp1 = sum.simd_lt(u);
         let reduce_flag = cmp0.bitor(cmp1);
         let res1 = reduce_flag.select(sum_reduced, sum);
         // u - v
         let diff = u.sub(v);
         let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D);
-        let cmp = u.lt(v);
+        let cmp = u.simd_lt(v);
         let res2 = cmp.select(diff_reduced, diff);
 
-        let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 8, 1, 9, 2, 10, 3, 11]);
-        let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 12, 5, 13, 6, 14, 7, 15]);
+        let part1: u64x8 = simd_shuffle(res1, res2, const { [0u32, 8, 1, 9, 2, 10, 3, 11] });
+        let part2: u64x8 = simd_shuffle(res1, res2, const { [4u32, 12, 5, 13, 6, 14, 7, 15] });
 
         *self = MixedGL::from_u64x8_arrays([part1, part2]);
 
@@ -188,27 +191,27 @@ impl MixedGL {
     pub unsafe fn butterfly_2x2_impl(&mut self) -> &mut Self {
         let [part1, part2] = MixedGL::as_u64x8_arrays(&*self);
-        let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 1, 4, 5, 8, 9, 12, 13]);
-        let v: packed_simd::u64x8 = shuffle!(part1, part2, [2, 3, 6, 7, 10, 11, 14, 15]);
+        let u: u64x8 = simd_shuffle(part1, part2, const { [0u32, 1, 4, 5, 8, 9, 12, 13] });
+        let v: u64x8 = simd_shuffle(part1, part2, const { [2u32, 3, 6, 7, 10, 11, 14, 15] });
         //additional reduction over v
         let v_reduced = v.add(Self::EPSILON_VECTOR_D);
-        let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D);
+        let cmp = v_reduced.simd_lt(Self::EPSILON_VECTOR_D);
         let v = cmp.select(v_reduced, v);
         // u + v
         let sum = u.add(v);
         let sum_reduced = sum.add(Self::EPSILON_VECTOR_D);
-        let cmp0 = sum_reduced.lt(sum);
-        let cmp1 = sum.lt(u);
+        let cmp0 = sum_reduced.simd_lt(sum);
+        let cmp1 = sum.simd_lt(u);
         let reduce_flag = cmp0.bitor(cmp1);
         let res1 = reduce_flag.select(sum_reduced, sum);
         // u - v
         let diff = u.sub(v);
         let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D);
-        let cmp = u.lt(v);
+        let cmp = u.simd_lt(v);
         let res2 = cmp.select(diff_reduced, diff);
 
-        let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 1, 8, 9, 2, 3, 10, 11]);
-        let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 5, 12, 13, 6, 7, 14, 15]);
+        let part1: u64x8 = simd_shuffle(res1, res2, const { [0u32, 1, 8, 9, 2, 3, 10, 11] });
+        let part2: u64x8 = simd_shuffle(res1, res2, const { [4u32, 5, 12, 13, 6, 7, 14, 15] });
 
         *self = MixedGL::from_u64x8_arrays([part1, part2]);
 
@@ -217,27 +220,27 @@ impl MixedGL {
     pub unsafe fn butterfly_4x4_impl(&mut self) -> &mut Self {
         let [part1, part2] = MixedGL::as_u64x8_arrays(&*self);
-        let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 1, 2, 3, 8, 9, 10, 11]);
-        let v: packed_simd::u64x8 = shuffle!(part1, part2, [4, 5, 6, 7, 12, 13, 14, 15]);
+        let u: u64x8 = simd_shuffle(part1, part2, const { [0u32, 1, 2, 3, 8, 9, 10, 11] });
+        let v: u64x8 = simd_shuffle(part1, part2, const { [4u32, 5, 6, 7, 12, 13, 14, 15] });
         //additional reduction over v
         let v_reduced = v.add(Self::EPSILON_VECTOR_D);
-        let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D);
+        let cmp = v_reduced.simd_lt(Self::EPSILON_VECTOR_D);
         let v = cmp.select(v_reduced, v);
         // u + v
         let sum = u.add(v);
         let sum_reduced = sum.add(Self::EPSILON_VECTOR_D);
-        let cmp0 = sum_reduced.lt(sum);
-        let cmp1 = sum.lt(u);
+        let cmp0 = sum_reduced.simd_lt(sum);
+        let cmp1 = sum.simd_lt(u);
         let reduce_flag = cmp0.bitor(cmp1);
         let res1 = reduce_flag.select(sum_reduced, sum);
         // u - v
         let diff = u.sub(v);
         let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D);
-        let cmp = u.lt(v);
+        let cmp = u.simd_lt(v);
         let res2 = cmp.select(diff_reduced, diff);
 
-        let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 1, 2, 3, 8, 9, 10, 11]);
-        let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 5, 6, 7, 12, 13, 14, 15]);
+        let part1: u64x8 = simd_shuffle(res1, res2, const { [0u32, 1, 2, 3, 8, 9, 10, 11] });
+        let part2: u64x8 = simd_shuffle(res1, res2, const { [4u32, 5, 6, 7, 12, 13, 14, 15] });
 
         *self = MixedGL::from_u64x8_arrays([part1, part2]);
 
@@ -256,27 +259,27 @@ impl MixedGL {
         let u = std::slice::from_raw_parts_mut(this as *mut u64, 8);
         let v = std::slice::from_raw_parts_mut(other as *mut u64, 8);
-        let a = packed_simd::u64x8::from_slice_aligned(u);
-        let b = packed_simd::u64x8::from_slice_aligned(v);
+        let a = u64x8::from_slice(u);
+        let b = u64x8::from_slice(v);
         //additional reduction over b
         let b_reduced = b.add(Self::EPSILON_VECTOR_D);
-        let cmp = b_reduced.lt(Self::EPSILON_VECTOR_D);
+        let cmp = b_reduced.simd_lt(Self::EPSILON_VECTOR_D);
         let b = cmp.select(b_reduced, b);
         // u + v
         let sum = a.add(b);
         let sum_reduced = sum.add(Self::EPSILON_VECTOR_D);
-        let cmp0 = sum_reduced.lt(sum);
-        let cmp1 = sum.lt(a);
+        let cmp0 = sum_reduced.simd_lt(sum);
+        let cmp1 = sum.simd_lt(a);
         let reduce_flag = cmp0.bitor(cmp1);
         let res1 = reduce_flag.select(sum_reduced, sum);
         // u - v
         let diff = a.sub(b);
         let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D);
-        let cmp = a.lt(b);
+        let cmp = a.simd_lt(b);
         let res2 = cmp.select(diff_reduced, diff);
 
-        res1.write_to_slice_aligned(u);
-        res2.write_to_slice_aligned(v);
+        res1.copy_to_slice(u);
+        res2.copy_to_slice(v);
     }
 
     /// # Safety
@@ -323,7 +326,7 @@ impl MixedGL {
     }
 
     #[inline(always)]
-    pub(crate) fn as_u64x8_arrays(input: &Self) -> [packed_simd::u64x8; 2] {
+    pub(crate) fn as_u64x8_arrays(input: &Self) -> [u64x8; 2] {
         // this preserves an alignment
         unsafe { std::mem::transmute(*input) }
     }
@@ -335,7 +338,7 @@ impl MixedGL {
     }
 
     #[inline(always)]
-    pub(crate) unsafe fn from_u64x8_arrays(input: [packed_simd::u64x8; 2]) -> Self {
+    pub(crate) unsafe fn from_u64x8_arrays(input: [u64x8; 2]) -> Self {
         // this preserves an alignment
         std::mem::transmute(input)
     }
@@ -412,8 +415,8 @@ impl crate::field::traits::field_like::PrimeFieldLike for MixedGL {
         for i in 0..4 {
             let a = a_u64.0[i];
 
-            let is_zero = a.eq(packed_simd::u64x4::splat(0));
-            let neg = packed_simd::u64x4::splat(Self::ORDER).sub(a);
+            let is_zero = a.simd_eq(u64x4::splat(0));
+            let neg = u64x4::splat(Self::ORDER).sub(a);
             let res = is_zero.select(a, neg);
 
             a_u64.0[i] = res;
diff --git a/src/field/goldilocks/arm_asm_packed_impl.rs b/src/field/goldilocks/arm_asm_packed_impl.rs
new file mode 100644
index 0000000..03399c4
--- /dev/null
+++ b/src/field/goldilocks/arm_asm_packed_impl.rs
@@ -0,0 +1,858 @@
+use crate::cs::implementations::utils::precompute_twiddles_for_fft;
+use crate::cs::traits::GoodAllocator;
+use crate::field::{Field, PrimeField};
+use crate::worker::Worker;
+use packed_simd::shuffle;
+use std::ops::{Add, BitOr, Sub};
+use std::usize;
+
+use super::GoldilocksField;
+
+// we need max of an alignment of u64x4 and u64x8 in this implementation, so 64
+
+#[derive(PartialEq, Eq, Hash, Clone, Copy)]
+#[repr(C, align(64))]
+pub struct MixedGL(pub [GoldilocksField; 16]);
+
+// we also need holder for SIMD targets, because u64x4 has smaller alignment than u64x8
+#[derive(Clone, Copy)]
+#[repr(C, align(64))]
+struct U64x4Holder([packed_simd::u64x4; 4]);
+
+impl std::fmt::Debug for MixedGL {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:?}", self.0)
+    }
+}
+
+impl std::fmt::Display for MixedGL {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:?}", self.0)
+    }
+}
+
+impl MixedGL {
+    pub const ORDER_BITS: usize = GoldilocksField::ORDER_BITS;
+    pub const ORDER: u64 = GoldilocksField::ORDER;
+    pub const TWO_ADICITY: usize = GoldilocksField::TWO_ADICITY;
+    pub const T: u64 = (Self::ORDER - 1) >> Self::TWO_ADICITY;
+    pub const BARRETT: u128 = 18446744078004518912; // 0x10000000100000000
+    pub const EPSILON: u64 = (1 << 32) - 1;
+    pub const EPSILON_VECTOR: packed_simd::u64x4 = packed_simd::u64x4::splat(Self::EPSILON);
+    pub const EPSILON_VECTOR_D: packed_simd::u64x8 = packed_simd::u64x8::splat(Self::EPSILON);
+
+    #[inline(always)]
+    pub fn new() -> Self {
+        Self([GoldilocksField::ZERO; 16])
+    }
+
+    #[inline(always)]
+    pub fn from_constant(value: GoldilocksField) -> Self {
+        Self([value; 16])
+    }
+
+    #[inline(always)]
+    pub fn from_array(value: [GoldilocksField; 16]) -> Self {
+        Self(value)
+    }
+
+    #[inline(always)]
+    #[unroll::unroll_for_loops]
+    pub fn to_reduced(&mut self) -> &mut Self {
+        let mut a_u64 = Self::as_u64x4_arrays(self);
+
+        for i in 0..4 {
+            let a = a_u64.0[i];
+            let a_reduced = a.add(Self::EPSILON_VECTOR);
+            let cmp = a_reduced.lt(Self::EPSILON_VECTOR);
+            let res = cmp.select(a_reduced, a);
+
+            a_u64.0[i] = res;
+        }
+
+        unsafe {
+            *self = Self::from_u64x4_arrays(a_u64);
+        }
+
+        self
+    }
+
+    #[inline(always)]
+    #[unroll::unroll_for_loops]
+    pub fn mul_constant_assign(&'_ mut self, other: &GoldilocksField) -> &mut Self {
+        for i in 0..16 {
+            self.0[i].mul_assign(other);
+        }
+
+        self
+    }
+
+    #[inline(always)]
+    #[unroll::unroll_for_loops]
+    fn mul_assign_impl(&mut self, other: &Self) -> &mut Self {
+        for i in 0..16 {
+            self.0[i].mul_assign(&other.0[i]);
+        }
+
+        self
+    }
+
+    #[inline(always)]
+    #[unroll::unroll_for_loops]
+    fn add_assign_impl(&mut self, other: &Self) -> &mut Self {
+        let mut a_u64 = Self::as_u64x4_arrays(self);
+        let b_u64 = Self::as_u64x4_arrays(other);
+
+        for i in 0..4 {
+            let a = a_u64.0[i];
+            let b = b_u64.0[i];
+            //additional reduction over b
+            let b_reduced = b.add(Self::EPSILON_VECTOR);
+            let cmp = b_reduced.lt(Self::EPSILON_VECTOR);
+            let b = cmp.select(b_reduced, b);
+            //a+b
+            let sum = a.add(b);
+            let sum_reduced = sum.add(Self::EPSILON_VECTOR);
+            let cmp0 = sum_reduced.lt(sum);
+            let cmp1 = sum.lt(a);
+            let reduce_flag = cmp0.bitor(cmp1);
+            let res = reduce_flag.select(sum_reduced, sum);
+
+            a_u64.0[i] = res;
+        }
+
+        unsafe {
+            *self = Self::from_u64x4_arrays(a_u64);
+        }
+
+        self
+    }
+
+    #[inline(always)]
+    #[unroll::unroll_for_loops]
+    fn sub_assign_impl(&'_ mut self, other: &Self) -> &mut Self {
+        let mut a_u64 = Self::as_u64x4_arrays(self);
+        let b_u64 = Self::as_u64x4_arrays(other);
+
+        for i in 0..4 {
+            let a = a_u64.0[i];
+            let b = b_u64.0[i];
+            //additional reduction over b
+            let b_reduced = b.add(Self::EPSILON_VECTOR);
+            let cmp = b_reduced.lt(Self::EPSILON_VECTOR);
+            let b = cmp.select(b_reduced, b);
+            //a-b
+            let diff = a.sub(b);
+            let diff_reduced = diff.sub(Self::EPSILON_VECTOR);
+            let cmp = a.lt(b);
+            let res = cmp.select(diff_reduced, diff);
+
+            a_u64.0[i] = res;
+        }
+
+        unsafe {
+            *self = Self::from_u64x4_arrays(a_u64);
+        }
+
+        self
+    }
+
+    pub unsafe fn butterfly_1x1_impl(&mut self) -> &mut Self {
+        let [part1, part2] = MixedGL::as_u64x8_arrays(&*self);
+        let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 2, 4, 6, 8, 10, 12, 14]);
+        let v: packed_simd::u64x8 = shuffle!(part1, part2, [1, 3, 5, 7, 9, 11, 13, 15]);
+        //additional reduction over v
+        let v_reduced = v.add(Self::EPSILON_VECTOR_D);
+        let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D);
+        let v = cmp.select(v_reduced, v);
+        // u + v
+        let sum = u.add(v);
+        let sum_reduced = sum.add(Self::EPSILON_VECTOR_D);
+        let cmp0 = sum_reduced.lt(sum);
+        let cmp1 = sum.lt(u);
+        let reduce_flag = cmp0.bitor(cmp1);
+        let res1 = reduce_flag.select(sum_reduced, sum);
+        // u - v
+        let diff = u.sub(v);
+        let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D);
+        let cmp = u.lt(v);
+        let res2 = cmp.select(diff_reduced, diff);
+
+        let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 8, 1, 9, 2, 10, 3, 11]);
+        let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 12, 5, 13, 6, 14, 7, 15]);
+
+        *self = MixedGL::from_u64x8_arrays([part1, part2]);
+
+        self
+    }
+
+    pub unsafe fn butterfly_2x2_impl(&mut self) -> &mut Self {
+        let [part1, part2] = MixedGL::as_u64x8_arrays(&*self);
+        let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 1, 4, 5, 8, 9, 12, 13]);
+        let v: packed_simd::u64x8 = shuffle!(part1, part2, [2, 3, 6, 7, 10, 11, 14, 15]);
+        //additional reduction over v
+        let v_reduced = v.add(Self::EPSILON_VECTOR_D);
+        let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D);
+        let v = cmp.select(v_reduced, v);
+        // u + v
+        let sum = u.add(v);
+        let sum_reduced = sum.add(Self::EPSILON_VECTOR_D);
+        let cmp0 = sum_reduced.lt(sum);
+        let cmp1 = sum.lt(u);
+        let reduce_flag = cmp0.bitor(cmp1);
+        let res1 = reduce_flag.select(sum_reduced, sum);
+        // u - v
+        let diff = u.sub(v);
+        let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D);
+        let cmp = u.lt(v);
+        let res2 = cmp.select(diff_reduced, diff);
+
+        let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 1, 8, 9, 2, 3, 10, 11]);
+        let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 5, 12, 13, 6, 7, 14, 15]);
+
+        *self = MixedGL::from_u64x8_arrays([part1, part2]);
+
+        self
+    }
+
+    pub unsafe fn butterfly_4x4_impl(&mut self) -> &mut Self {
+        let [part1, part2] = MixedGL::as_u64x8_arrays(&*self);
+        let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 1, 2, 3, 8, 9, 10, 11]);
+        let v: packed_simd::u64x8 = shuffle!(part1, part2, [4, 5, 6, 7, 12, 13, 14, 15]);
+        //additional reduction over v
+        let v_reduced = v.add(Self::EPSILON_VECTOR_D);
+        let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D);
+        let v = cmp.select(v_reduced, v);
+        // u + v
+        let sum = u.add(v);
+        let sum_reduced = sum.add(Self::EPSILON_VECTOR_D);
+        let cmp0 = sum_reduced.lt(sum);
+        let cmp1 = sum.lt(u);
+        let reduce_flag = cmp0.bitor(cmp1);
+        let res1 = reduce_flag.select(sum_reduced, sum);
+        // u - v
+        let diff = u.sub(v);
+        let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D);
+        let cmp = u.lt(v);
+        let res2 = cmp.select(diff_reduced, diff);
+
+        let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 1, 2, 3, 8, 9, 10, 11]);
+        let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 5, 6, 7, 12, 13, 14, 15]);
+
+        *self = MixedGL::from_u64x8_arrays([part1, part2]);
+
+        self
+    }
+
+    /// # Safety
+    ///
+    /// Pointers must be properly aligned for `MixedGL` type, should point to arrays of length 8, and should point
+    /// to memory that can be mutated.
+    /// No references to the same memory should exist when this function is called.
+    /// Pointers should be different.
+    pub unsafe fn butterfly_8x8_impl(this: *const u64, other: *const u64) {
+        debug_assert!(this.addr() % std::mem::align_of::() == 0);
+        debug_assert!(other.addr() % std::mem::align_of::() == 0);
+
+        let u = std::slice::from_raw_parts_mut(this as *mut u64, 8);
+        let v = std::slice::from_raw_parts_mut(other as *mut u64, 8);
+        let a = packed_simd::u64x8::from_slice_aligned(u);
+        let b = packed_simd::u64x8::from_slice_aligned(v);
+        //additional reduction over b
+        let b_reduced = b.add(Self::EPSILON_VECTOR_D);
+        let cmp = b_reduced.lt(Self::EPSILON_VECTOR_D);
+        let b = cmp.select(b_reduced, b);
+        // u + v
+        let sum = a.add(b);
+        let sum_reduced = sum.add(Self::EPSILON_VECTOR_D);
+        let cmp0 = sum_reduced.lt(sum);
+        let cmp1 = sum.lt(a);
+        let reduce_flag = cmp0.bitor(cmp1);
+        let res1 = reduce_flag.select(sum_reduced, sum);
+        // u - v
+        let diff = a.sub(b);
+        let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D);
+        let cmp = a.lt(b);
+        let res2 = cmp.select(diff_reduced, diff);
+
+        res1.write_to_slice_aligned(u);
+        res2.write_to_slice_aligned(v);
+    }
+
+    /// # Safety
+    ///
+    /// Pointers must be properly aligned for `MixedGL` type, should point to arrays of length 16, and should point
+    /// to memory that can be mutated.
+    /// No references to the same memory should exist when this function is called.
+    /// Pointers should be different.
+    pub unsafe fn butterfly_16x16_impl(mut this: *mut u64, mut other: *mut u64) {
+        debug_assert!(this.addr() % std::mem::align_of::() == 0);
+        debug_assert!(other.addr() % std::mem::align_of::() == 0);
+
+        Self::butterfly_8x8_impl(this, other);
+        this = this.offset(8);
+        other = other.offset(8);
+        Self::butterfly_8x8_impl(this, other);
+    }
+
+    // pub unsafe fn butterfly_16x16_impl(
+    //     this: &mut Self,
+    //     other: &mut Self,
+    // ) {
+    //     let mut this_ptr = this.0.as_ptr() as *mut u64;
+    //     let mut other_ptr = other.0.as_ptr() as *mut u64;
+
+    //     debug_assert!(this_ptr.addr() % std::mem::align_of::() == 0);
+    //     debug_assert!(other_ptr.addr() % std::mem::align_of::() == 0);
+
+    //     Self::butterfly_8x8_impl(this_ptr, other_ptr);
+    //     this_ptr = this_ptr.offset(8);
+    //     other_ptr = other_ptr.offset(8);
+    //     Self::butterfly_8x8_impl(this_ptr, other_ptr);
+    // }
+
+    #[inline(always)]
+    pub fn from_field_array(input: [GoldilocksField; 16]) -> Self {
+        Self(input)
+    }
+
+    #[inline(always)]
+    fn as_u64x4_arrays(input: &Self) -> U64x4Holder {
+        // this preserves an alignment
+        unsafe { std::mem::transmute(*input) }
+    }
+
+    #[inline(always)]
+    pub(crate) fn as_u64x8_arrays(input: &Self) -> [packed_simd::u64x8; 2] {
+        // this preserves an alignment
+        unsafe { std::mem::transmute(*input) }
+    }
+
+    #[inline(always)]
+    unsafe fn from_u64x4_arrays(input: U64x4Holder) -> Self {
+        // this preserves an alignment
+        std::mem::transmute(input)
+    }
+
+    #[inline(always)]
+    pub(crate) unsafe fn from_u64x8_arrays(input: [packed_simd::u64x8; 2]) -> Self {
+        // this preserves an alignment
+        std::mem::transmute(input)
+    }
+
+    #[inline(always)]
+    pub fn vec_add_assign(a: &mut [Self], b: &[Self]) {
+        use crate::field::traits::field_like::PrimeFieldLike;
+        for (a, b) in a.iter_mut().zip(b.iter()) {
+            a.add_assign(b, &mut ());
+        }
+    }
+
+    #[inline(always)]
+    pub fn vec_mul_assign(a: &mut [Self], b: &[Self]) {
+        use crate::field::traits::field_like::PrimeFieldLike;
+        for (a, b) in a.iter_mut().zip(b.iter()) {
+            a.mul_assign(b, &mut ());
+        }
+    }
+}
+
+impl Default for MixedGL {
+    fn default() -> Self {
+        Self([GoldilocksField::ZERO; 16])
+    }
+}
+
+impl crate::field::traits::field_like::PrimeFieldLike for MixedGL {
+    type Base = GoldilocksField;
+    type Context = ();
+
+    #[inline(always)]
+    fn zero(_ctx: &mut Self::Context) -> Self {
+        Self([GoldilocksField::ZERO; 16])
+    }
+    #[inline(always)]
+    fn one(_ctx: &mut Self::Context) -> Self {
+        Self([GoldilocksField::ONE; 16])
+    }
+    #[inline(always)]
+    fn minus_one(_ctx: &mut Self::Context) -> Self {
+        Self([GoldilocksField::MINUS_ONE; 16])
+    }
+
+    #[inline(always)]
+    fn add_assign(&mut self, other: &Self, _ctx: &mut Self::Context) -> &mut Self {
+        Self::add_assign_impl(self, other)
+    }
+
+    #[inline(always)]
+    fn sub_assign(&'_ mut self, other: &Self, _ctx: &mut Self::Context) -> &mut Self {
+        Self::sub_assign_impl(self, other)
+    }
+
+    #[inline(always)]
+    #[unroll::unroll_for_loops]
+    fn mul_assign(&'_ mut self, other: &Self, _ctx: &mut Self::Context) -> &mut Self {
+        Self::mul_assign_impl(self, other)
+    }
+
+    #[inline(always)]
+    fn square(&'_ mut self, _ctx: &mut Self::Context) -> &'_ mut Self {
+        let t = *self;
+        self.mul_assign(&t, _ctx);
+
+        self
+    }
+
+    #[inline(always)]
+    #[unroll::unroll_for_loops]
+    fn negate(&'_ mut self, _ctx: &mut Self::Context) -> &'_ mut Self {
+        let mut a_u64 = Self::as_u64x4_arrays(self);
+
+        for i in 0..4 {
+            let a = a_u64.0[i];
+
+            let is_zero = a.eq(packed_simd::u64x4::splat(0));
+            let neg = packed_simd::u64x4::splat(Self::ORDER).sub(a);
+            let res = is_zero.select(a, neg);
+
+            a_u64.0[i] = res;
+        }
+
+        unsafe {
+            *self = Self::from_u64x4_arrays(a_u64);
+        }
+
+        self
+    }
+
+    #[inline(always)]
+    fn double(&'_ mut self, _ctx: &mut Self::Context) -> &'_ mut Self {
+        let t = *self;
+        self.add_assign(&t, _ctx);
+
+        self
+    }
+
+    #[inline(always)]
+    #[unroll::unroll_for_loops]
+    fn inverse(&self, _ctx: &mut Self::Context) -> Self {
+        let mut result = *self;
+        for i in 0..16 {
+            result.0[i] = PrimeField::inverse(&result.0[i]).expect("inverse must exist");
+        }
+
+        result
+    }
+
+    #[inline(always)]
+    fn constant(value: Self::Base, _ctx: &mut Self::Context) -> Self {
+        Self([value; 16])
+    }
+}
+
+impl crate::field::traits::field_like::PrimeFieldLikeVectorized for MixedGL {
+    type Twiddles = Vec;
+    type InverseTwiddles = Vec;
+    #[inline(always)]
+    fn is_zero(&self) -> bool {
+        self.0 == [GoldilocksField::ZERO; 16]
+    }
+
+    #[inline(always)]
+    fn equals(&self, other: &Self) -> bool {
+        self.eq(other)
+    }
+
+    #[inline(always)]
+    fn mul_all_by_base(&'_ mut self, other: &Self::Base, _ctx: &mut Self::Context) -> &'_ mut Self {
+        Self::mul_constant_assign(self, other)
+    }
+
+    #[inline(always)]
+    fn slice_from_base_slice(input: &[Self::Base]) -> &[Self] {
+        if input.len() < Self::SIZE_FACTOR {
+            panic!("too small input size to cast");
+        }
+        debug_assert!(input.len() % Self::SIZE_FACTOR == 0);
+        debug_assert!(input.as_ptr().addr() % std::mem::align_of::() == 0);
+        let result_len = input.len() / 16;
+        unsafe { std::slice::from_raw_parts(input.as_ptr() as *mut Self, result_len) }
+    }
+
+    #[inline(always)]
+    fn slice_into_base_slice(input: &[Self]) -> &[Self::Base] {
+        let result_len = input.len() * 16;
+        unsafe { std::slice::from_raw_parts(input.as_ptr() as *mut GoldilocksField, result_len) }
+    }
+
+    #[inline(always)]
+    fn slice_into_base_slice_mut(input: &mut [Self]) -> &mut [Self::Base] {
+        let result_len = input.len() * 16;
+        unsafe {
+            std::slice::from_raw_parts_mut(input.as_ptr() as *mut GoldilocksField, result_len)
+        }
+    }
+
+    #[inline(always)]
+    fn vec_from_base_vec(input: Vec) -> Vec {
+        if input.len() < Self::SIZE_FACTOR {
+            panic!("too small input size to cast");
+        }
+        let (ptr, len, capacity, allocator) = input.into_raw_parts_with_alloc();
+        debug_assert!(ptr.addr() % std::mem::align_of::() == 0);
+        debug_assert!(len % Self::SIZE_FACTOR == 0);
+        debug_assert!(capacity % Self::SIZE_FACTOR == 0);
+
+        unsafe {
+            Vec::from_raw_parts_in(
+                ptr as _,
+                len / Self::SIZE_FACTOR,
+                capacity / Self::SIZE_FACTOR,
+                allocator,
+            )
+        }
+    }
+
+    #[inline(always)]
+    fn vec_into_base_vec(input: Vec) -> Vec {
+        let (ptr, len, capacity, allocator) = input.into_raw_parts_with_alloc();
+
+        unsafe {
+            Vec::from_raw_parts_in(
+                ptr as _,
+                len * Self::SIZE_FACTOR,
+                capacity * Self::SIZE_FACTOR,
+                allocator,
+            )
+        }
+    }
+
+    #[inline(always)]
+    fn fft_natural_to_bitreversed(
+        input: &mut [Self],
+        coset: Self::Base,
+        twiddles: &Self::Twiddles,
+        _ctx: &mut Self::Context,
+    ) {
+        // let input = crate::utils::cast_check_alignment_ref_mut_unpack::(input);
+        // crate::fft::fft_natural_to_bitreversed_cache_friendly(input, coset, twiddles);
+
+        crate::fft::fft_natural_to_bitreversed_mixedgl(input, coset, twiddles);
+    }
+
+    #[inline(always)]
+    fn ifft_natural_to_natural(
+        input: &mut [Self],
+        coset: Self::Base,
+        twiddles: &Self::InverseTwiddles,
+        _ctx: &mut Self::Context,
+    ) {
+        // let input = crate::utils::cast_check_alignment_ref_mut_unpack::(input);
+        // crate::fft::ifft_natural_to_natural_cache_friendly(input, coset, twiddles);
+
+        crate::fft::ifft_natural_to_natural_mixedgl(input, coset, twiddles);
+    }
+
+    #[inline(always)]
+    fn precompute_forward_twiddles_for_fft(
+        fft_size: usize,
+        worker: &Worker,
+        ctx: &mut Self::Context,
+    ) -> Self::Twiddles {
+        precompute_twiddles_for_fft::(
+            fft_size, worker, ctx,
+        )
+    }
+
+    #[inline(always)]
+    fn precompute_inverse_twiddles_for_fft(
+        fft_size: usize,
+        worker: &Worker,
+        ctx: &mut Self::Context,
+    ) -> Self::Twiddles {
+        precompute_twiddles_for_fft::(
+            fft_size, worker, ctx,
+        )
+    }
+}
+
+#[cfg(test)]
+mod test {
+
+    use crate::field::goldilocks::MixedGL;
+    use crate::field::rand_from_rng;
+    use crate::field::traits::field_like::PrimeFieldLike;
+    use crate::field::traits::field_like::PrimeFieldLikeVectorized;
+    use crate::field::{goldilocks::GoldilocksField, Field};
+    use crate::utils::clone_respecting_allignment;
+
+    #[test]
+    fn test_mixedgl_negate() {
+        let mut ctx = ();
+        const POLY_SIZE: usize = 1 << 20;
+        let mut rng = rand::thread_rng();
+
+        // Generate random Vec
+        let a: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
+
+        let mut ag = a.clone();
+
+        for aa in ag.iter_mut() {
+            Field::negate(aa);
+        }
+
+        let mut av: Vec =
+            MixedGL::vec_from_base_vec(clone_respecting_allignment::(
+                &a,
+            ));
+
+        // Test over GLPS
+        for aa in av.iter_mut() {
+            aa.negate(&mut ctx);
+        }
+
+        assert_eq!(MixedGL::vec_into_base_vec(av), ag);
+    }
+
+    use rand::Rng;
+
+    #[test]
+    fn test_mixedgl_add_assign() {
+        let mut ctx = ();
+        const POLY_SIZE: usize = 1 << 24;
+        let mut rng = rand::thread_rng();
+        let _s = GoldilocksField(0x0000000001000000);
+
+        // Generate random Vec
+        // let a: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
+        // let b: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
+        // let a: Vec = (0..POLY_SIZE).map(|_| GoldilocksField(0x0000000000000001)).collect();
+        // let b: Vec = (0..POLY_SIZE).map(|_| GoldilocksField(0x0000000001000000)).collect();
+        let b: Vec = (0..POLY_SIZE)
+            .map(|_| GoldilocksField(rng.gen_range(GoldilocksField::ORDER..u64::MAX)))
+            .collect();
+        let a: Vec = (0..POLY_SIZE)
+            .map(|_| GoldilocksField(rng.gen_range(GoldilocksField::ORDER..u64::MAX)))
+            .collect();
+        // let a: Vec = (0..POLY_SIZE).map(|_| GoldilocksField(0xfffffffff67f1442)).collect();
+        // let b: Vec = (0..POLY_SIZE).map(|_| GoldilocksField(0xffffffff9c1d065d)).collect();
+
+        // dbg!(&a);
+        // dbg!(&b);
+
+        let mut ag = a.clone();
+        let bg = b.clone();
+
+        for (aa, bb) in ag.iter_mut().zip(bg.iter()) {
+            Field::add_assign(aa, bb);
+        }
+
+        let mut av: Vec =
+            MixedGL::vec_from_base_vec(clone_respecting_allignment::(
+                &a,
+            ));
+        let bv: Vec =
+            MixedGL::vec_from_base_vec(clone_respecting_allignment::(
+                &b,
+            ));
+
+        // Test over GLPS
+        for (aa, bb) in av.iter_mut().zip(bv.iter()) {
+            aa.add_assign(bb, &mut ctx);
+        }
+
+        let avv = MixedGL::vec_into_base_vec(av);
+        // for i in 0..avv.len() {
+        //     assert_eq!(avv[i], ag[i], "error {}", i);
+        // }
+
+        // dbg!(&ag[0]);
+        // dbg!(&avv[0]);
+
+        assert_eq!(avv, ag);
+    }
+
+    #[test]
+    fn test_mixedgl_sub_assign() {
+        let mut ctx = ();
+        const POLY_SIZE: usize = 1 << 20;
+        let _rng = rand::thread_rng();
+
+        // Generate random Vec
+        // let a: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
+        // let b: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
+        let a: Vec = (0..POLY_SIZE)
+            .map(|_| GoldilocksField(0x0000000000000001))
+            .collect();
+        let b: Vec = (0..POLY_SIZE)
+            .map(|_| GoldilocksField(0x0000000001000000))
+            .collect();
+
+        // Test over Goldilocks
+        let mut ag = a.clone();
+        let bg = b.clone();
+
+        for (aa, bb) in ag.iter_mut().zip(bg.iter()) {
+            Field::sub_assign(aa, bb);
+        }
+
+        let mut av: Vec =
+            MixedGL::vec_from_base_vec(clone_respecting_allignment::(
+                &a,
+            ));
+        let bv: Vec =
+            MixedGL::vec_from_base_vec(clone_respecting_allignment::(
+                &b,
+            ));
+
+        // Test over GLPS
+        for (aa, bb) in av.iter_mut().zip(bv.iter()) {
+            aa.sub_assign(bb, &mut ctx);
+        }
+
+        // dbg!(&ag);
+        // dbg!(&av);
+
+        assert_eq!(ag, MixedGL::vec_into_base_vec(av));
+    }
+
+    #[test]
+    fn test_mixedgl_mul_assign() {
+        let mut ctx = ();
+        const POLY_SIZE: usize = 1 << 20;
+        let mut rng = rand::thread_rng();
+
+        // Generate random Vec
+        let a: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
+        let b: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
+
+        // Test over Goldilocks
+        let mut ag = a.clone();
+        let bg = b.clone();
+
+        for (aa, bb) in ag.iter_mut().zip(bg.iter()) {
+            Field::mul_assign(aa, bb);
+        }
+
+        let mut av: Vec =
+            MixedGL::vec_from_base_vec(clone_respecting_allignment::(
+                &a,
+            ));
+        let bv: Vec =
+            MixedGL::vec_from_base_vec(clone_respecting_allignment::(
+                &b,
+            ));
+
+        // Test over GLPS
+        for (aa, bb) in av.iter_mut().zip(bv.iter()) {
+            aa.mul_assign(bb, &mut ctx);
+        }
+
+        // dbg!(&ag);
+        // dbg!(&av);
+
+        assert_eq!(ag, MixedGL::vec_into_base_vec(av));
+    }
+
+    #[test]
+    fn test_mixedgl_butterfly16x16() {
+        // let mut ctx = ();
+
+        // let am: [u64;32] = [0x0001000000000000, 0x0000000000000001, 0x0001000000000000, 0x0000000000000001, 0x0000000000000000, 0xffffffff00000000, 0x0000000000000001, 0x0000ffffffffffff, 0x0000000000000000, 0x0001000000000000, 0xffffffff00000000, 0xffffffff00000000, 0xffffffff00000000, 0xfffeffff00000001, 0xfffeffff00000002, 0xfffeffff00000002,
+        // 0x0000000000000000, 0x0000000000000001, 0x0000000000000000, 0x0001000000000001, 0xfffeffff00000001, 0xffffffff00000000, 0x0001000000000000, 0xfffeffff00000002, 0x0000000000000000, 0xfffeffff00000001, 0xffffffff00000000, 0x0000000000000001, 0x0000ffffffffffff, 0x0000000000000000, 0x0000000000000001, 0x0001000000000000];
+
+        let am: [u64; 32] = [
+            0x0001000000000000,
+            0x0000000000000001,
+            0x0001000000000000,
+            0x0000000000000001,
+            0x0000000000000000,
+            0xffffffff00000000,
+            0x0000000000000001,
+            0x0000ffffffffffff,
+            0x0000000000000000,
+            0x0001000000000000,
+            0xffffffff00000000,
+            0xffffffff00000000,
+            0xffffffff00000000,
+            0xfffeffff00000001,
+            0xfffeffff00000002,
+            0xfffeffff00000002,
+            0x0000000000000000,
+            0xffffffff01000001,
+            0x0000000000000000,
+            0x0000010000ffff00,
+            0xfffffeff00000101,
+            0xfffffffeff000001,
+            0x000000ffffffff00,
+            0xfffffeff01000101,
+            0x0000000000000000,
+            0xfffffeff00000101,
+            0xfffffffeff000001,
+            0xffffffff01000001,
+            0x000000fffeffff00,
+            0x0000000000000000,
+            0xffffffff01000001,
+            0x000000ffffffff00,
+        ];
+
+        let a: Vec = am.into_iter().map(GoldilocksField).collect();
+        // let b: Vec = bm.into_iter().map(GoldilocksField).collect();
+        let _s = GoldilocksField(0x0000000001000000);
+
+        // Test over Goldilocks
+        let mut ag = a.clone();
+        // let mut bg = b.clone();
+        let distance_in_cache = 16;
+
+        let mut j = 0;
+        while j < 16 {
+            let mut u = ag[j];
+            let v = ag[j + distance_in_cache];
+            // Field::mul_assign(&mut v, &s);
+            Field::sub_assign(&mut u, &v);
+            ag[j + distance_in_cache] = u;
+            Field::add_assign(&mut ag[j], &v);
+
+            j += 1;
+        }
+
+        let av: Vec =
+            MixedGL::vec_from_base_vec(clone_respecting_allignment::(
+                &a,
+            ));
+        // let mut bv: Vec = MixedGL::vec_from_base_vec(clone_respecting_allignment::(&b));
+        // let mut av = av[0];
+        // let mut bv = bv[0];
+
+        // Test over MixedGL
+        // av[1].mul_constant_assign(&s);
+        unsafe {
+            MixedGL::butterfly_16x16_impl(
+                av[0].0.as_ptr() as *mut u64,
+                av[1].0.as_ptr() as *mut u64,
+            );
+        }
+        // let mut u = av[0];
+        // let mut v = av[1];
+        // unsafe { MixedGL::butterfly_16x16_impl(&mut u, &mut v); }
+        // av[0] = u;
+        // av[1] = v;
+
+        let ag =
+            MixedGL::vec_from_base_vec(clone_respecting_allignment::(
+                &ag,
+            ));
+        // let bg = MixedGL::vec_from_base_vec(clone_respecting_allignment::(&bg));
+
+        dbg!(&ag);
+        dbg!(&av);
+
+        // dbg!(&bg);
+        // dbg!(&bv);
+
+        assert_eq!(ag, av);
+        // assert_eq!(bg, bv);
+    }
+}
diff --git a/src/field/goldilocks/mod.rs b/src/field/goldilocks/mod.rs
index 10daec1..82fa6be 100644
--- a/src/field/goldilocks/mod.rs
+++ b/src/field/goldilocks/mod.rs
@@ -12,10 +12,18 @@ mod extension;
 mod inversion;
 
 #[cfg(all(
+    not(feature = "include_packed_simd"),
     any(target_feature = "neon", target_feature = "avx2"),
     not(all(target_feature = "avx512f", target_feature = "avx512vl"))
 ))]
 pub mod arm_asm_impl;
+
+#[cfg(all(
+    feature = "include_packed_simd",
+    any(target_feature = "neon", target_feature = "avx2"),
+    not(all(target_feature = "avx512f", target_feature = "avx512vl"))
+))]
+pub mod arm_asm_packed_impl;
 #[cfg(not(any(
     all(target_feature = "avx512f", target_feature = "avx512vl"),
     target_feature = "neon",
@@ -43,10 +51,19 @@ pub mod x86_64_asm_impl;
 pub mod avx512_impl;
 
 #[cfg(all(
+    not(feature = "include_packed_simd"),
     any(target_feature = "neon", target_feature = "avx2"),
     not(all(target_feature = "avx512f", target_feature = "avx512vl"))
 ))]
 pub use arm_asm_impl::*;
+
+#[cfg(all(
+    feature = "include_packed_simd",
+    any(target_feature = "neon", target_feature = "avx2"),
+    not(all(target_feature = "avx512f", target_feature = "avx512vl"))
+))]
+pub use arm_asm_packed_impl::*;
+
 #[cfg(not(any(
     all(target_feature = "avx512f", target_feature = "avx512vl"),
     target_feature = "neon",
diff --git a/src/implementations/poseidon2/mod.rs b/src/implementations/poseidon2/mod.rs
index ecb1326..6dbb7e0 100644
--- a/src/implementations/poseidon2/mod.rs
+++ b/src/implementations/poseidon2/mod.rs
@@ -4,18 +4,25 @@ use crate::field::goldilocks::GoldilocksField;
 pub mod params;
 pub mod state_generic_impl;
 
-#[cfg(not(any(
-    target_feature = "neon",
-    target_feature = "avx2",
-    target_feature = "avx512bw",
-    target_feature = "avx512cd",
-    target_feature = "avx512dq",
-    target_feature = "avx512f",
-    target_feature = "avx512vl"
+#[cfg(not(all(
+    feature = "include_packed_simd",
+    any(
+        target_feature = "neon",
+        target_feature = "avx2",
+        target_feature = "avx512bw",
+        target_feature = "avx512cd",
+        target_feature = "avx512dq",
+        target_feature = "avx512f",
+        target_feature = "avx512vl",
+    )
 )))]
 pub use state_generic_impl::*;
 
+// Other poseidon implementations depend on packed_simd 128
+// which is no longer available in std::simd (and packed_simd is no longer
+// supported in the newest rust nightly).
 #[cfg(all(
+    feature = "include_packed_simd",
     any(target_feature = "neon", target_feature = "avx2"),
     not(any(
         target_feature = "avx512bw",
@@ -28,6 +35,7 @@ pub use state_generic_impl::*;
 pub mod state_vectorized_double;
 
 #[cfg(all(
+    feature = "include_packed_simd",
     any(target_feature = "neon", target_feature = "avx2"),
     not(any(
         target_feature = "avx512bw",
@@ -40,6 +48,7 @@ pub mod state_vectorized_double;
 pub use state_vectorized_double::*;
 
 #[cfg(all(
+    feature = "include_packed_simd",
     target_feature = "avx512bw",
     target_feature = "avx512cd",
     target_feature = "avx512dq",
@@ -49,6 +58,7 @@ pub use state_vectorized_double::*;
 pub mod state_avx512;
 
 #[cfg(all(
+    feature = "include_packed_simd",
     target_feature = "avx512bw",
     target_feature = "avx512cd",
     target_feature = "avx512dq",
diff --git a/src/implementations/poseidon2/state_generic_impl.rs b/src/implementations/poseidon2/state_generic_impl.rs
index 02cb079..c9b74e8 100644
--- a/src/implementations/poseidon2/state_generic_impl.rs
+++ b/src/implementations/poseidon2/state_generic_impl.rs
@@ -29,7 +29,9 @@ impl State {
     pub const T: u64 = (Self::ORDER - 1) >> Self::TWO_ADICITY;
     pub const BARRETT: u128 = 18446744078004518912; // 0x10000000100000000
     pub const EPSILON: u64 = (1 << 32) - 1;
+    #[cfg(feature = "include_packed_simd")]
     pub const EPSILON_VECTOR: packed_simd::u64x4 = packed_simd::u64x4::splat(Self::EPSILON);
+    #[cfg(feature = "include_packed_simd")]
     pub const EPSILON_VECTOR_D: packed_simd::u64x8 = packed_simd::u64x8::splat(Self::EPSILON);
 
     pub const RATE: usize = poseidon_goldilocks_params::RATE;
diff --git a/src/lib.rs b/src/lib.rs
index 72775d5..4f2e1ee 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -21,6 +21,9 @@
 #![allow(dead_code)]
 #![allow(dropping_references)] // Required to explicitly show that mutable references are dropped.
 #![allow(incomplete_features)]
+#![allow(internal_features)] // Required for core_intrinsics
+#![allow(stable_features)]
+#![allow(unused_unsafe)]
 // Enabled features
 #![feature(allocator_api)]
 #![feature(const_mut_refs)]
@@ -43,7 +46,6 @@
 #![feature(generic_const_exprs)]
 #![feature(iter_array_chunks)]
 // #![recursion_limit = "1024"]
-#![feature(stdsimd)]
 #![feature(avx512_target_feature)]
 #![feature(associated_type_defaults)]
 #![feature(trait_alias)]
@@ -51,6 +53,7 @@
 #![feature(return_position_impl_trait_in_trait)]
 #![feature(type_changing_struct_update)]
 #![feature(slice_flatten)]
+#![cfg_attr(feature = "include_packed_simd", feature(stdsimd))]
 
 pub mod algebraic_props;
 pub mod config;
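Note on the migration pattern (illustrative, not part of the patch): the port swaps packed_simd's `lt`/`select` for std::simd's `simd_lt`/`Mask::select`, the const `splat` for `from_array`, `from_slice_aligned`/`write_to_slice_aligned` for `from_slice`/`copy_to_slice`, and the `shuffle!` macro for the `simd_shuffle` intrinsic. The sketch below shows the branchless Goldilocks reduction all of these kernels rely on, written against std::simd on a recent nightly; `add_mod` and `main` are hypothetical names introduced here for illustration, not identifiers from this codebase.

#![feature(portable_simd)]
use std::simd::{cmp::SimdPartialOrd, u64x8};

const ORDER: u64 = 0xffff_ffff_0000_0001; // Goldilocks prime
const EPSILON: u64 = (1 << 32) - 1; // 2^64 mod ORDER
const EPSILON_D: u64x8 = u64x8::from_array([EPSILON; 8]);

// Branchless (a + b) mod ORDER, mirroring the compare-and-select pattern of
// add_assign_impl in the diff above: SIMD lanes cannot branch, so wraparound
// is detected by comparing the wrapped results instead.
fn add_mod(a: u64x8, b: u64x8) -> u64x8 {
    // Canonicalize b: if b >= ORDER, then b + EPSILON wraps past 2^64 and
    // lands below EPSILON, which both detects the case and yields b - ORDER.
    let b_red = b + EPSILON_D;
    let b = b_red.simd_lt(EPSILON_D).select(b_red, b);
    // Add, then subtract ORDER (i.e. add EPSILON mod 2^64) if either the sum
    // itself or the pre-reduced candidate wrapped around.
    let sum = a + b;
    let sum_red = sum + EPSILON_D;
    let wrapped = sum_red.simd_lt(sum) | sum.simd_lt(a);
    wrapped.select(sum_red, sum)
}

fn main() {
    // (ORDER - 1) + 2 == 1 (mod ORDER) in every lane.
    let a = u64x8::splat(ORDER - 1);
    let b = u64x8::splat(2);
    assert_eq!(add_mod(a, b).to_array(), [1u64; 8]);
}

Keeping the reduction as compare-and-select rather than a data-dependent branch is what lets both the packed_simd and std::simd variants compile to straight-line NEON/AVX2 code, which is presumably why the patch preserves the pattern verbatim and only changes the API surface.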