From 4bcb11f0610302110ae8109af01d5b652191b2f6 Mon Sep 17 00:00:00 2001
From: Marcin M <128217157+mm-zk@users.noreply.github.com>
Date: Wed, 29 May 2024 14:22:45 +0200
Subject: [PATCH] feat: Update boojum nightly - feature gate packed simd
 (Attempt 2) (#50)

---
 .github/workflows/ci.yaml                   |  19 +
 Cargo.toml                                  |   8 +-
 rust-toolchain.toml                         |   2 +-
 src/cs/traits/cs.rs                         |   2 +-
 src/dag/guide.rs                            |   4 +-
 src/dag/resolver_box.rs                     |  12 +-
 src/dag/resolvers/mt/mod.rs                 |  12 +-
 src/dag/resolvers/mt/registrar.rs           |   2 +-
 src/dag/resolvers/mt/resolution_window.rs   |  34 +-
 src/dag/resolvers/mt/sorters/sorter_live.rs |  14 +-
 src/field/goldilocks/arm_asm_impl.rs        |  95 +-
 src/field/goldilocks/arm_asm_packed_impl.rs | 858 ++++++++++++++++++
 src/field/goldilocks/mod.rs                 |  17 +
 src/implementations/poseidon2/mod.rs        |  26 +-
 .../poseidon2/state_generic_impl.rs         |   2 +
 src/lib.rs                                  |   5 +-
 16 files changed, 1020 insertions(+), 92 deletions(-)
 create mode 100644 src/field/goldilocks/arm_asm_packed_impl.rs

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index a984dd9..f5200a1 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -17,6 +17,25 @@ jobs:
       - uses: actions-rust-lang/setup-rust-toolchain@v1
       - run: cargo build --verbose
       - run: cargo test --verbose --all
+
+  build_old:
+    name: cargo build and test (packed_simd)
+    strategy:
+      matrix:
+        # Needs big runners to run tests
+        # Only macos-13-xlarge is Apple Silicon, as per:
+        # https://docs.github.com/en/actions/using-github-hosted-runners/about-larger-runners/about-larger-runners#about-macos-larger-runners
+        os: [ubuntu-22.04-github-hosted-16core, macos-13-xlarge]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          toolchain: nightly-2023-05-31
+
+      # Still compile with the old Rust nightly and packed_simd, until we have a good replacement in Poseidon.
+      - run: RUSTFLAGS=-Awarnings cargo +nightly-2023-05-31 build --features include_packed_simd
+      - run: RUSTFLAGS=-Awarnings cargo +nightly-2023-05-31 test --features include_packed_simd
 
   formatting:
     name: cargo fmt
diff --git a/Cargo.toml b/Cargo.toml
index 17c9067..e44d1ba 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,7 +23,7 @@ itertools = "0.10"
 blake2 = "0.10"
 sha2 = "0.10"
 num-modular = "0.5.1"
-packed_simd = { version = "0.3.9" }
+packed_simd = { version = "0.3.9", optional = true }
 pairing = { package = "pairing_ce", git = "https://github.com/matter-labs/pairing.git" }
 crypto-bigint = "0.5"
 convert_case = "*"
@@ -54,3 +54,9 @@ opt-level = 3
 [features]
 # If enabled, logs will be using trace, if disabled, they will be printed to stdout.
 log_tracing = ["tracing"]
+# packed_simd no longer builds with the newest nightly.
+# We still keep it as a feature, since not all of the code has been migrated yet, and
+# some people might want to use an older Rust nightly to gain some performance.
+include_packed_simd = ["packed_simd"]
+cr_paranoia_mode = []
+debug_track = []
\ No newline at end of file
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index 6b48c00..a671fa6 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,2 +1,2 @@
 [toolchain]
-channel = "nightly-2023-06-25"
+channel = "nightly-2024-05-07"
diff --git a/src/cs/traits/cs.rs b/src/cs/traits/cs.rs
index da3b51d..5d4d786 100644
--- a/src/cs/traits/cs.rs
+++ b/src/cs/traits/cs.rs
@@ -20,7 +20,7 @@ impl<'set, 'tgt: 'set, T: SmallField> DstBuffer<'set, 'tgt, T> {
                 *offset += 1;
             }
             DstBuffer::MutSliceIndirect(dst, debug_track, offset) => {
-                if cfg!(debug_track) && *debug_track {
+                if cfg!(feature = "debug_track") && *debug_track {
                     log!(" set out {} <- {}", *offset, value.as_raw_u64())
                 }
 
diff --git a/src/dag/guide.rs b/src/dag/guide.rs
index 0eba95b..47a6315 100644
--- a/src/dag/guide.rs
+++ b/src/dag/guide.rs
@@ -384,7 +384,7 @@ impl<'a, T: Copy + Debug, F: SmallField, Cfg: CSResolverConfig> GuideOrder<'a, T
             pos += span.buffer.len();
         }
 
-        if cfg!(cr_paranoia_mode) && self.guide.tracing {
+        if cfg!(feature = "cr_paranoia_mode") && self.guide.tracing {
             log!(
                 "Released span {}: {:?}",
                 self.guide.spans[0].id.0,
@@ -684,7 +684,7 @@ impl BufferGuide {
     }
 
     pub(crate) fn flush(&mut self) -> BufferGuideFinalization<'_, T, F, Cfg> {
-        if cfg!(cr_paranoia_mode) && self.tracing {
+        if cfg!(feature = "cr_paranoia_mode") && self.tracing {
             log!("CRG: flush.");
         }
 
diff --git a/src/dag/resolver_box.rs b/src/dag/resolver_box.rs
index 397d551..d0f7dd8 100644
--- a/src/dag/resolver_box.rs
+++ b/src/dag/resolver_box.rs
@@ -424,7 +424,7 @@ pub(crate) fn invocation_binder(
     // Safety: This is the actual type of the provided function.
     let bound = resolver.resolve_fn::();
 
-    if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA) && false {
+    if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA) && false {
         log!(
             "Ivk: Ins [{}], Out [{}], Out-addr [{}], Thread [{}]",
             resolver
@@ -448,7 +448,10 @@ pub(crate) fn invocation_binder(
         )
     }
 
-    if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA) && debug_track && false {
+    if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA)
+        && debug_track
+        && false
+    {
         log!(
             "Ivk: provided inputs:\n - {:?}",
             ins.iter().map(|x| x.as_raw_u64()).collect_vec()
@@ -457,7 +460,10 @@ pub(crate) fn invocation_binder(
 
     bound(ins, &mut DstBuffer::MutSliceIndirect(out, debug_track, 0));
 
-    if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA) && debug_track && true {
+    if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA)
+        && debug_track
+        && true
+    {
         log!(
             "Ivk: calculated outputs:\n - {:?}",
             out.iter().map(|x| x.as_raw_u64()).collect_vec()
diff --git a/src/dag/resolvers/mt/mod.rs b/src/dag/resolvers/mt/mod.rs
index cad5822..8de30f5 100644
--- a/src/dag/resolvers/mt/mod.rs
+++ b/src/dag/resolvers/mt/mod.rs
@@ -169,7 +169,7 @@ impl, CFG: CSResolverConfig>
 
         let debug_track = vec![];
 
-        if cfg!(cr_paranoia_mode) || PARANOIA {
+        if cfg!(feature = "cr_paranoia_mode") || PARANOIA {
             log!("Contains tracked keys {:?} ", debug_track);
         }
 
@@ -269,7 +269,7 @@ impl, CFG: CSResolverConfig>
 
         self.sorter.write_sequence();
 
-        if cfg!(cr_paranoia_mode) || PARANOIA {
+        if cfg!(feature = "cr_paranoia_mode") || PARANOIA {
             log!("CR {:?}", unsafe {
                 self.common.awaiters_broker.stats.u_deref()
             });
@@ -1487,7 +1487,7 @@ mod test {
 
         storage.wait_till_resolved();
 
-        if cfg!(cr_paranoia_mode) {
+        if cfg!(feature = "cr_paranoia_mode") {
             log!("Test: total value result: \n - {}", unsafe {
                 (*storage.common.values.get())
                     .variables
@@ -1509,7 +1509,7 @@ mod test {
             let act = Place::from_variable(Variable::from_variable_index(ix as u64))
                 .to(|x| storage.get_value_unchecked(x));
 
-            if cfg!(cr_paranoia_mode) {
+            if cfg!(feature = "cr_paranoia_mode") {
                 log!("Test: per item value: ix {}, value {}", ix, act);
             }
 
@@ -1542,7 +1542,7 @@ mod test {
 
         storage.wait_till_resolved();
 
-        if cfg!(cr_paranoia_mode) {
+        if cfg!(feature = "cr_paranoia_mode") {
             log!("Test: total value result: \n - {}", unsafe {
                 (*storage.common.values.get())
                     .variables
@@ -1564,7 +1564,7 @@ mod test {
             let act = Place::from_variable(Variable::from_variable_index(ix as u64))
                 .to(|x| storage.get_value_unchecked(x));
 
-            if cfg!(cr_paranoia_mode) {
+            if cfg!(feature = "cr_paranoia_mode") {
                 log!("Test: per item value: ix {}, value {}", ix, act);
             }
 
diff --git a/src/dag/resolvers/mt/registrar.rs b/src/dag/resolvers/mt/registrar.rs
index 3c43c6b..257703f 100644
--- a/src/dag/resolvers/mt/registrar.rs
+++ b/src/dag/resolvers/mt/registrar.rs
@@ -116,7 +116,7 @@ impl Registrar {
     }
 
     pub(crate) fn is_empty(&self) -> bool {
-        if cfg!(cr_paranoia_mode) {
+        if cfg!(feature = "cr_paranoia_mode") {
             log!(
                 "CRR: total remaining resolvers: {}",
                 self.vars.values().map(|x| x.len()).sum::()
diff --git a/src/dag/resolvers/mt/resolution_window.rs b/src/dag/resolvers/mt/resolution_window.rs
index 9efe2cb..60d1d23 100644
--- a/src/dag/resolvers/mt/resolution_window.rs
+++ b/src/dag/resolvers/mt/resolution_window.rs
@@ -163,8 +163,12 @@ impl + 'static>
             comms,
             track_list: Vec::new(),
 
-            execution_list: if cfg!(cr_paranoia_mode) { 1 << 26 } else { 0 }
-                .to(|x| Vec::with_capacity(x).op(|v| v.resize(x, 0))),
+            execution_list: if cfg!(feature = "cr_paranoia_mode") {
+                1 << 26
+            } else {
+                0
+            }
+            .to(|x| Vec::with_capacity(x).op(|v| v.resize(x, 0))),
 
             phantom: PhantomData,
         };
@@ -207,7 +211,7 @@ impl + 'static>
 
                 data[data_ix].push(order_ix.into(), task.order_info.value);
 
-                if cfg!(cr_paranoia_mode) {
+                if cfg!(feature = "cr_paranoia_mode") {
                     self.execution_list[order_ix] += 1;
 
                     if self.execution_list[order_ix] > 1 {
@@ -238,7 +242,7 @@ impl + 'static>
             }
         }
 
-        if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA) && true {
+        if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA) && true {
            log!("RW: Batch! {} tasks.", count);
         }
 
@@ -264,7 +268,7 @@ impl + 'static>
             .for_each(|x| {
                 x.state = ResolverState::Done;
 
-                if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA {
+                if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA {
                     unsafe {
                         let r = self.common.resolvers.u_deref().get(x.order_info.value);
 
@@ -291,7 +295,7 @@ impl + 'static>
                 }
             });
 
-        if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA {
+        if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA {
             if self
                 .exec_order_buffer
                 .iter()
@@ -343,7 +347,7 @@ impl + 'static>
 
         drop(awaiters);
 
-        if cfg!(cr_paranoia_mode) && count > 0 {
+        if cfg!(feature = "cr_paranoia_mode") && count > 0 {
             log!(
                 "RW: Shifted by {}, new range is: {}..{}, buffer len: {}",
                 count,
@@ -412,7 +416,7 @@ impl + 'static>
 
         self.stats.total_consumption = extend_to as u64;
 
-        if crate::dag::resolvers::mt::PARANOIA || cfg!(cr_paranoia_mode) {
+        if crate::dag::resolvers::mt::PARANOIA || cfg!(feature = "cr_paranoia_mode") {
             log!(
                 "RW: Extended range by {}, new range {}..{}",
                 extend_to,
@@ -474,7 +478,7 @@ impl + 'static>
             }
         }
 
-        if crate::dag::resolvers::mt::PARANOIA || cfg!(cr_paranoia_mode) {
+        if crate::dag::resolvers::mt::PARANOIA || cfg!(feature = "cr_paranoia_mode") {
             log!("[{:?}] RW: Exit conditions met.", std::time::Instant::now())
         }
 
@@ -484,7 +488,7 @@ impl + 'static>
 
         self.stats.total_time = start_instant.elapsed();
 
-        if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA {
+        if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA {
             log!("CR {:#?}", self.stats);
 
             log!("CR {:#?}", unsafe { &*self.channel.stats.get() });
@@ -554,7 +558,7 @@ impl, const SIZE: usize>
             // here, as this is an unsynchronizd access.
             let resolver = this.common.resolvers.u_deref().get(*resolver_ix);
 
-            if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA {
+            if cfg!(feature="cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA {
                 std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
                     this.invoke(resolver, *order_ix);
 
@@ -590,7 +594,7 @@ impl, const SIZE: usize>
             });
         }
 
-        if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA {
+        if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA {
             log!(
                 "{}\n{:#?}\n{:#?}",
                 std::thread::current().name().unwrap_or_default(),
@@ -629,7 +633,7 @@ impl, const SIZE: usize>
             .map(|x| {
                 let (vs, md) = self.common.values.u_deref().get_item_ref(*x);
 
-                if cfg!(cr_paranoia_mode) || true {
+                if cfg!(feature = "cr_paranoia_mode") || true {
                     if Cfg::ASSERT_TRACKED_VALUES {
                         assert!(md.is_tracked());
                     }
@@ -678,7 +682,7 @@ impl, const SIZE: usize>
 
         let mut track = false;
 
-        if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA {
+        if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA {
             if let Some(x) = self
                 .debug_track
                 .iter()
@@ -831,7 +835,7 @@ impl LockStepChannel {
     fn execute(&self) {
         use std::sync::atomic::Ordering::*;
 
-        if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA) && false {
+        if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA) && false {
             log!("RW: batch sent {:#?}", unsafe { self.data.u_deref() });
         }
 
diff --git a/src/dag/resolvers/mt/sorters/sorter_live.rs b/src/dag/resolvers/mt/sorters/sorter_live.rs
index c0c1298..6b1e423 100644
--- a/src/dag/resolvers/mt/sorters/sorter_live.rs
+++ b/src/dag/resolvers/mt/sorters/sorter_live.rs
@@ -191,7 +191,7 @@ impl
             }
         }
 
-        if cfg!(cr_paranoia_mode) {
+        if cfg!(feature = "cr_paranoia_mode") {
             // This ugly block checks that the calculated parallelism is
             // correct. It's a bit slower than O(n^2). Also note, that it
             // checks only the last 1050 items, so it's not a full check,
@@ -297,7 +297,7 @@ impl ResolverS
     }
 
     fn set_value(&mut self, key: crate::cs::Place, value: F) {
-        if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA)
+        if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA)
             && self.debug_track.contains(&key)
             && false
         {
@@ -378,7 +378,7 @@ impl ResolverS
 
         let mut hit = false;
 
-        if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA) && true {
+        if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA) && true {
             if let Some(x) = self.debug_track.iter().find(|x| inputs.contains(x)) {
                 log!("CR: added resolution with tracked input {:?}", x);
 
@@ -498,7 +498,7 @@ impl ResolverS
         outputs: &[Place],
         added_at: RegistrationNum,
     ) -> Vec {
-        if cfg!(cr_paranoia_mode) {
+        if cfg!(feature = "cr_paranoia_mode") {
             if let Some(x) = self.debug_track.iter().find(|x| inputs.contains(x)) {
                 log!("CR: internalized resolution with tracked input {:?}", x);
             }
@@ -519,7 +519,7 @@ impl ResolverS
 
         let deps = inputs.iter().map(|x| &values.get_item_ref(*x).1);
 
-        if cfg!(cr_paranoia_mode) {
+        if cfg!(feature = "cr_paranoia_mode") {
             debug_assert!(
                 deps.clone().all(|x| { x.is_tracked() }),
                 "Attempting to internalize a resolution with an untracked input. All inputs must be tracked."
@@ -610,14 +610,14 @@ impl ResolverS
         self.record.values_count = unsafe { self.common.values.u_deref().max_tracked + 1 } as usize;
         self.record.registrations_count = self.stats.registrations_added as usize;
 
-        if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA {
+        if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA {
             log!(
                 "CR: Final order written. Order len {}",
                 self.common.exec_order.lock().unwrap().items.len()
             );
         }
 
-        if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA {
+        if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA {
             self.guide.stats.finalize();
 
             log!("CR {:?}", self.guide.stats);
diff --git a/src/field/goldilocks/arm_asm_impl.rs b/src/field/goldilocks/arm_asm_impl.rs
index 03399c4..369b881 100644
--- a/src/field/goldilocks/arm_asm_impl.rs
+++ b/src/field/goldilocks/arm_asm_impl.rs
@@ -2,8 +2,10 @@ use crate::cs::implementations::utils::precompute_twiddles_for_fft;
 use crate::cs::traits::GoodAllocator;
 use crate::field::{Field, PrimeField};
 use crate::worker::Worker;
-use packed_simd::shuffle;
+use std::intrinsics::simd::simd_shuffle;
 use std::ops::{Add, BitOr, Sub};
+use std::simd::cmp::{SimdPartialEq, SimdPartialOrd};
+use std::simd::{u64x4, u64x8};
 use std::usize;
 
 use super::GoldilocksField;
@@ -17,7 +19,7 @@ pub struct MixedGL(pub [GoldilocksField; 16]);
 // we also need holder for SIMD targets, because u64x4 has smaller alignment than u64x8
 #[derive(Clone, Copy)]
 #[repr(C, align(64))]
-struct U64x4Holder([packed_simd::u64x4; 4]);
+struct U64x4Holder([u64x4; 4]);
 
 impl std::fmt::Debug for MixedGL {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -38,8 +40,8 @@ impl MixedGL {
     pub const T: u64 = (Self::ORDER - 1) >> Self::TWO_ADICITY;
     pub const BARRETT: u128 = 18446744078004518912; // 0x10000000100000000
     pub const EPSILON: u64 = (1 << 32) - 1;
-    pub const EPSILON_VECTOR: packed_simd::u64x4 = packed_simd::u64x4::splat(Self::EPSILON);
-    pub const EPSILON_VECTOR_D: packed_simd::u64x8 = packed_simd::u64x8::splat(Self::EPSILON);
+    pub const EPSILON_VECTOR: u64x4 = u64x4::from_array([Self::EPSILON; 4]);
+    pub const EPSILON_VECTOR_D: u64x8 = u64x8::from_array([Self::EPSILON; 8]);
 
     #[inline(always)]
     pub fn new() -> Self {
@@ -64,7 +66,7 @@ impl MixedGL {
         for i in 0..4 {
             let a = a_u64.0[i];
             let a_reduced = a.add(Self::EPSILON_VECTOR);
-            let cmp = a_reduced.lt(Self::EPSILON_VECTOR);
+            let cmp = a_reduced.simd_lt(Self::EPSILON_VECTOR);
             let res = cmp.select(a_reduced, a);
 
             a_u64.0[i] = res;
@@ -108,13 +110,13 @@ impl MixedGL {
             let b = b_u64.0[i];
             //additional reduction over b
             let b_reduced = b.add(Self::EPSILON_VECTOR);
-            let cmp = b_reduced.lt(Self::EPSILON_VECTOR);
+            let cmp = b_reduced.simd_lt(Self::EPSILON_VECTOR);
             let b = cmp.select(b_reduced, b);
             //a+b
             let sum = a.add(b);
             let sum_reduced = sum.add(Self::EPSILON_VECTOR);
-            let cmp0 = sum_reduced.lt(sum);
-            let cmp1 = sum.lt(a);
+            let cmp0 = sum_reduced.simd_lt(sum);
+            let cmp1 = sum.simd_lt(a);
             let reduce_flag = cmp0.bitor(cmp1);
             let res = reduce_flag.select(sum_reduced, sum);
 
@@ -139,12 +141,12 @@ impl MixedGL {
             let b = b_u64.0[i];
             //additional reduction over b
             let b_reduced = b.add(Self::EPSILON_VECTOR);
-            let cmp = b_reduced.lt(Self::EPSILON_VECTOR);
+            let cmp = b_reduced.simd_lt(Self::EPSILON_VECTOR);
             let b = cmp.select(b_reduced, b);
             //a-b
             let diff = a.sub(b);
             let diff_reduced = diff.sub(Self::EPSILON_VECTOR);
-            let cmp = a.lt(b);
+            let cmp = a.simd_lt(b);
             let res = cmp.select(diff_reduced, diff);
 
             a_u64.0[i] = res;
@@ -159,27 +161,28 @@ impl MixedGL {
     pub unsafe fn butterfly_1x1_impl(&mut self) -> &mut Self {
         let [part1, part2] = MixedGL::as_u64x8_arrays(&*self);
-        let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 2, 4, 6, 8, 10, 12, 14]);
-        let v: packed_simd::u64x8 = shuffle!(part1, part2, [1, 3, 5, 7, 9, 11, 13, 15]);
+
+        let u: u64x8 = simd_shuffle(part1, part2, const { [0u32, 2, 4, 6, 8, 10, 12, 14] });
+        let v: u64x8 = simd_shuffle(part1, part2, const { [1u32, 3, 5, 7, 9, 11, 13, 15] });
         //additional reduction over v
         let v_reduced = v.add(Self::EPSILON_VECTOR_D);
-        let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D);
+        let cmp = v_reduced.simd_lt(Self::EPSILON_VECTOR_D);
         let v = cmp.select(v_reduced, v);
         // u + v
         let sum = u.add(v);
         let sum_reduced = sum.add(Self::EPSILON_VECTOR_D);
-        let cmp0 = sum_reduced.lt(sum);
-        let cmp1 = sum.lt(u);
+        let cmp0 = sum_reduced.simd_lt(sum);
+        let cmp1 = sum.simd_lt(u);
         let reduce_flag = cmp0.bitor(cmp1);
         let res1 = reduce_flag.select(sum_reduced, sum);
         // u - v
         let diff = u.sub(v);
         let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D);
-        let cmp = u.lt(v);
+        let cmp = u.simd_lt(v);
         let res2 = cmp.select(diff_reduced, diff);
 
-        let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 8, 1, 9, 2, 10, 3, 11]);
-        let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 12, 5, 13, 6, 14, 7, 15]);
+        let part1: u64x8 = simd_shuffle(res1, res2, const { [0u32, 8, 1, 9, 2, 10, 3, 11] });
+        let part2: u64x8 = simd_shuffle(res1, res2, const { [4u32, 12, 5, 13, 6, 14, 7, 15] });
 
         *self = MixedGL::from_u64x8_arrays([part1, part2]);
 
@@ -188,27 +191,27 @@ impl MixedGL {
     pub unsafe fn butterfly_2x2_impl(&mut self) -> &mut Self {
         let [part1, part2] = MixedGL::as_u64x8_arrays(&*self);
-        let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 1, 4, 5, 8, 9, 12, 13]);
-        let v: packed_simd::u64x8 = shuffle!(part1, part2, [2, 3, 6, 7, 10, 11, 14, 15]);
+        let u: u64x8 = simd_shuffle(part1, part2, const { [0u32, 1, 4, 5, 8, 9, 12, 13] });
+        let v: u64x8 = simd_shuffle(part1, part2, const { [2u32, 3, 6, 7, 10, 11, 14, 15] });
         //additional reduction over v
         let v_reduced = v.add(Self::EPSILON_VECTOR_D);
-        let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D);
+        let cmp = v_reduced.simd_lt(Self::EPSILON_VECTOR_D);
         let v = cmp.select(v_reduced, v);
         // u + v
         let sum = u.add(v);
         let sum_reduced = sum.add(Self::EPSILON_VECTOR_D);
-        let cmp0 = sum_reduced.lt(sum);
-        let cmp1 = sum.lt(u);
+        let cmp0 = sum_reduced.simd_lt(sum);
+        let cmp1 = sum.simd_lt(u);
         let reduce_flag = cmp0.bitor(cmp1);
         let res1 = reduce_flag.select(sum_reduced, sum);
         // u - v
         let diff = u.sub(v);
         let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D);
-        let cmp = u.lt(v);
+        let cmp = u.simd_lt(v);
         let res2 = cmp.select(diff_reduced, diff);
 
-        let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 1, 8, 9, 2, 3, 10, 11]);
-        let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 5, 12, 13, 6, 7, 14, 15]);
+        let part1: u64x8 = simd_shuffle(res1, res2, const { [0u32, 1, 8, 9, 2, 3, 10, 11] });
+        let part2: u64x8 = simd_shuffle(res1, res2, const { [4u32, 5, 12, 13, 6, 7, 14, 15] });
 
         *self = MixedGL::from_u64x8_arrays([part1, part2]);
 
@@ -217,27 +220,27 @@ impl MixedGL {
     pub unsafe fn butterfly_4x4_impl(&mut self) -> &mut Self {
         let [part1, part2] = MixedGL::as_u64x8_arrays(&*self);
-        let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 1, 2, 3, 8, 9, 10, 11]);
-        let v: packed_simd::u64x8 = shuffle!(part1, part2, [4, 5, 6, 7, 12, 13, 14, 15]);
+        let u: u64x8 = simd_shuffle(part1, part2, const { [0u32, 1, 2, 3, 8, 9, 10, 11] });
+        let v: u64x8 = simd_shuffle(part1, part2, const { [4u32, 5, 6, 7, 12, 13, 14, 15] });
         //additional reduction over v
         let v_reduced = v.add(Self::EPSILON_VECTOR_D);
-        let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D);
+        let cmp = v_reduced.simd_lt(Self::EPSILON_VECTOR_D);
         let v = cmp.select(v_reduced, v);
         // u + v
         let sum = u.add(v);
         let sum_reduced = sum.add(Self::EPSILON_VECTOR_D);
-        let cmp0 = sum_reduced.lt(sum);
-        let cmp1 = sum.lt(u);
+        let cmp0 = sum_reduced.simd_lt(sum);
+        let cmp1 = sum.simd_lt(u);
         let reduce_flag = cmp0.bitor(cmp1);
         let res1 = reduce_flag.select(sum_reduced, sum);
         // u - v
         let diff = u.sub(v);
         let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D);
-        let cmp = u.lt(v);
+        let cmp = u.simd_lt(v);
         let res2 = cmp.select(diff_reduced, diff);
 
-        let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 1, 2, 3, 8, 9, 10, 11]);
-        let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 5, 6, 7, 12, 13, 14, 15]);
+        let part1: u64x8 = simd_shuffle(res1, res2, const { [0u32, 1, 2, 3, 8, 9, 10, 11] });
+        let part2: u64x8 = simd_shuffle(res1, res2, const { [4u32, 5, 6, 7, 12, 13, 14, 15] });
 
         *self = MixedGL::from_u64x8_arrays([part1, part2]);
 
@@ -256,27 +259,27 @@ impl MixedGL {
         let u = std::slice::from_raw_parts_mut(this as *mut u64, 8);
         let v = std::slice::from_raw_parts_mut(other as *mut u64, 8);
-        let a = packed_simd::u64x8::from_slice_aligned(u);
-        let b = packed_simd::u64x8::from_slice_aligned(v);
+        let a = u64x8::from_slice(u);
+        let b = u64x8::from_slice(v);
         //additional reduction over b
         let b_reduced = b.add(Self::EPSILON_VECTOR_D);
-        let cmp = b_reduced.lt(Self::EPSILON_VECTOR_D);
+        let cmp = b_reduced.simd_lt(Self::EPSILON_VECTOR_D);
         let b = cmp.select(b_reduced, b);
         // u + v
         let sum = a.add(b);
         let sum_reduced = sum.add(Self::EPSILON_VECTOR_D);
-        let cmp0 = sum_reduced.lt(sum);
-        let cmp1 = sum.lt(a);
+        let cmp0 = sum_reduced.simd_lt(sum);
+        let cmp1 = sum.simd_lt(a);
         let reduce_flag = cmp0.bitor(cmp1);
         let res1 = reduce_flag.select(sum_reduced, sum);
         // u - v
         let diff = a.sub(b);
         let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D);
-        let cmp = a.lt(b);
+        let cmp = a.simd_lt(b);
         let res2 = cmp.select(diff_reduced, diff);
 
-        res1.write_to_slice_aligned(u);
-        res2.write_to_slice_aligned(v);
+        res1.copy_to_slice(u);
+        res2.copy_to_slice(v);
     }
 
     /// # Safety
@@ -323,7 +326,7 @@ impl MixedGL {
     }
 
     #[inline(always)]
-    pub(crate) fn as_u64x8_arrays(input: &Self) -> [packed_simd::u64x8; 2] {
+    pub(crate) fn as_u64x8_arrays(input: &Self) -> [u64x8; 2] {
         // this preserves an alignment
         unsafe { std::mem::transmute(*input) }
     }
@@ -335,7 +338,7 @@ impl MixedGL {
     }
 
     #[inline(always)]
-    pub(crate) unsafe fn from_u64x8_arrays(input: [packed_simd::u64x8; 2]) -> Self {
+    pub(crate) unsafe fn from_u64x8_arrays(input: [u64x8; 2]) -> Self {
         // this preserves an alignment
         std::mem::transmute(input)
     }
@@ -412,8 +415,8 @@ impl crate::field::traits::field_like::PrimeFieldLike for MixedGL {
         for i in 0..4 {
             let a = a_u64.0[i];
 
-            let is_zero = a.eq(packed_simd::u64x4::splat(0));
-            let neg = packed_simd::u64x4::splat(Self::ORDER).sub(a);
+            let is_zero = a.simd_eq(u64x4::splat(0));
+            let neg = u64x4::splat(Self::ORDER).sub(a);
             let res = is_zero.select(a, neg);
 
             a_u64.0[i] = res;
diff --git a/src/field/goldilocks/arm_asm_packed_impl.rs b/src/field/goldilocks/arm_asm_packed_impl.rs
new file mode 100644
index 0000000..03399c4
--- /dev/null
+++ b/src/field/goldilocks/arm_asm_packed_impl.rs
@@ -0,0 +1,858 @@
+use crate::cs::implementations::utils::precompute_twiddles_for_fft;
+use crate::cs::traits::GoodAllocator;
+use crate::field::{Field, PrimeField};
+use crate::worker::Worker;
+use packed_simd::shuffle;
+use std::ops::{Add, BitOr, Sub};
+use std::usize;
+
+use super::GoldilocksField;
+
+// we need max of an alignment of u64x4 and u64x8 in this implementation, so 64
+
+#[derive(PartialEq, Eq, Hash, Clone, Copy)]
+#[repr(C, align(64))]
+pub struct MixedGL(pub [GoldilocksField; 16]);
+
+// we also need holder for SIMD targets, because u64x4 has smaller alignment than u64x8
+#[derive(Clone, Copy)]
+#[repr(C, align(64))]
+struct U64x4Holder([packed_simd::u64x4; 4]);
+
+impl std::fmt::Debug for MixedGL {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:?}", self.0)
+    }
+}
+
+impl std::fmt::Display for MixedGL {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:?}", self.0)
+    }
+}
+
+impl MixedGL {
+    pub const ORDER_BITS: usize = GoldilocksField::ORDER_BITS;
+    pub const ORDER: u64 = GoldilocksField::ORDER;
+    pub const TWO_ADICITY: usize = GoldilocksField::TWO_ADICITY;
+    pub const T: u64 = (Self::ORDER - 1) >> Self::TWO_ADICITY;
+    pub const BARRETT: u128 = 18446744078004518912; // 0x10000000100000000
+    pub const EPSILON: u64 = (1 << 32) - 1;
+    pub const EPSILON_VECTOR: packed_simd::u64x4 = packed_simd::u64x4::splat(Self::EPSILON);
+    pub const EPSILON_VECTOR_D: packed_simd::u64x8 = packed_simd::u64x8::splat(Self::EPSILON);
+
+    #[inline(always)]
+    pub fn new() -> Self {
+        Self([GoldilocksField::ZERO; 16])
+    }
+
+    #[inline(always)]
+    pub fn from_constant(value: GoldilocksField) -> Self {
+        Self([value; 16])
+    }
+
+    #[inline(always)]
+    pub fn from_array(value: [GoldilocksField; 16]) -> Self {
+        Self(value)
+    }
+
+    #[inline(always)]
+    #[unroll::unroll_for_loops]
+    pub fn to_reduced(&mut self) -> &mut Self {
+        let mut a_u64 = Self::as_u64x4_arrays(self);
+
+        for i in 0..4 {
+            let a = a_u64.0[i];
+            let a_reduced = a.add(Self::EPSILON_VECTOR);
+            let cmp = a_reduced.lt(Self::EPSILON_VECTOR);
+            let res = cmp.select(a_reduced, a);
+
+            a_u64.0[i] = res;
+        }
+
+        unsafe {
+            *self = Self::from_u64x4_arrays(a_u64);
+        }
+
+        self
+    }
+
+    #[inline(always)]
+    #[unroll::unroll_for_loops]
+    pub fn mul_constant_assign(&'_ mut self, other: &GoldilocksField) -> &mut Self {
+        for i in 0..16 {
+            self.0[i].mul_assign(other);
+        }
+
+        self
+    }
+
+    #[inline(always)]
+    #[unroll::unroll_for_loops]
+    fn mul_assign_impl(&mut self, other: &Self) -> &mut Self {
+        for i in 0..16 {
+            self.0[i].mul_assign(&other.0[i]);
+        }
+
+        self
+    }
+
+    #[inline(always)]
+    #[unroll::unroll_for_loops]
+    fn add_assign_impl(&mut self, other: &Self) -> &mut Self {
+        let mut a_u64 = Self::as_u64x4_arrays(self);
+        let b_u64 = Self::as_u64x4_arrays(other);
+
+        for i in 0..4 {
+            let a = a_u64.0[i];
+            let b = b_u64.0[i];
+            //additional reduction over b
+            let b_reduced = b.add(Self::EPSILON_VECTOR);
+            let cmp = b_reduced.lt(Self::EPSILON_VECTOR);
+            let b = cmp.select(b_reduced, b);
+            //a+b
+            let sum = a.add(b);
+            let sum_reduced = sum.add(Self::EPSILON_VECTOR);
+            let cmp0 = sum_reduced.lt(sum);
+            let cmp1 = sum.lt(a);
+            let reduce_flag = cmp0.bitor(cmp1);
+            let res = reduce_flag.select(sum_reduced, sum);
+
+            a_u64.0[i] = res;
+        }
+
+        unsafe {
+            *self = Self::from_u64x4_arrays(a_u64);
+        }
+
+        self
+    }
+
+    #[inline(always)]
+    #[unroll::unroll_for_loops]
+    fn sub_assign_impl(&'_ mut self, other: &Self) -> &mut Self {
+        let mut a_u64 = Self::as_u64x4_arrays(self);
+        let b_u64 = Self::as_u64x4_arrays(other);
+
+        for i in 0..4 {
+            let a = a_u64.0[i];
+            let b = b_u64.0[i];
+            //additional reduction over b
+            let b_reduced = b.add(Self::EPSILON_VECTOR);
+            let cmp = b_reduced.lt(Self::EPSILON_VECTOR);
+            let b = cmp.select(b_reduced, b);
+            //a-b
+            let diff = a.sub(b);
+            let diff_reduced = diff.sub(Self::EPSILON_VECTOR);
+            let cmp = a.lt(b);
+            let res = cmp.select(diff_reduced, diff);
+
+            a_u64.0[i] = res;
+        }
+
+        unsafe {
+            *self = Self::from_u64x4_arrays(a_u64);
+        }
+
+        self
+    }
+
+    pub unsafe fn butterfly_1x1_impl(&mut self) -> &mut Self {
+        let [part1, part2] = MixedGL::as_u64x8_arrays(&*self);
+        let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 2, 4, 6, 8, 10, 12, 14]);
+        let v: packed_simd::u64x8 = shuffle!(part1, part2, [1, 3, 5, 7, 9, 11, 13, 15]);
+        //additional reduction over v
+        let v_reduced = v.add(Self::EPSILON_VECTOR_D);
+        let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D);
+        let v = cmp.select(v_reduced, v);
+        // u + v
+        let sum = u.add(v);
+        let sum_reduced = sum.add(Self::EPSILON_VECTOR_D);
+        let cmp0 = sum_reduced.lt(sum);
+        let cmp1 = sum.lt(u);
+        let reduce_flag = cmp0.bitor(cmp1);
+        let res1 = reduce_flag.select(sum_reduced, sum);
+        // u - v
+        let diff = u.sub(v);
+        let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D);
+        let cmp = u.lt(v);
+        let res2 = cmp.select(diff_reduced, diff);
+
+        let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 8, 1, 9, 2, 10, 3, 11]);
+        let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 12, 5, 13, 6, 14, 7, 15]);
+
+        *self = MixedGL::from_u64x8_arrays([part1, part2]);
+
+        self
+    }
+
+    pub unsafe fn butterfly_2x2_impl(&mut self) -> &mut Self {
+        let [part1, part2] = MixedGL::as_u64x8_arrays(&*self);
+        let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 1, 4, 5, 8, 9, 12, 13]);
+        let v: packed_simd::u64x8 = shuffle!(part1, part2, [2, 3, 6, 7, 10, 11, 14, 15]);
+        //additional reduction over v
+        let v_reduced = v.add(Self::EPSILON_VECTOR_D);
+        let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D);
+        let v = cmp.select(v_reduced, v);
+        // u + v
+        let sum = u.add(v);
+        let sum_reduced = sum.add(Self::EPSILON_VECTOR_D);
+        let cmp0 = sum_reduced.lt(sum);
+        let cmp1 = sum.lt(u);
+        let reduce_flag = cmp0.bitor(cmp1);
+        let res1 = reduce_flag.select(sum_reduced, sum);
+        // u - v
+        let diff = u.sub(v);
+        let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D);
+        let cmp = u.lt(v);
+        let res2 = cmp.select(diff_reduced, diff);
+
+        let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 1, 8, 9, 2, 3, 10, 11]);
+        let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 5, 12, 13, 6, 7, 14, 15]);
+
+        *self = MixedGL::from_u64x8_arrays([part1, part2]);
+
+        self
+    }
+
+    pub unsafe fn butterfly_4x4_impl(&mut self) -> &mut Self {
+        let [part1, part2] = MixedGL::as_u64x8_arrays(&*self);
+        let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 1, 2, 3, 8, 9, 10, 11]);
+        let v: packed_simd::u64x8 = shuffle!(part1, part2, [4, 5, 6, 7, 12, 13, 14, 15]);
+        //additional reduction over v
+        let v_reduced = v.add(Self::EPSILON_VECTOR_D);
+        let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D);
+        let v = cmp.select(v_reduced, v);
+        // u + v
+        let sum = u.add(v);
+        let sum_reduced = sum.add(Self::EPSILON_VECTOR_D);
+        let cmp0 = sum_reduced.lt(sum);
+        let cmp1 = sum.lt(u);
+        let reduce_flag = cmp0.bitor(cmp1);
+        let res1 = reduce_flag.select(sum_reduced, sum);
+        // u - v
+        let diff = u.sub(v);
+        let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D);
+        let cmp = u.lt(v);
+        let res2 = cmp.select(diff_reduced, diff);
+
+        let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 1, 2, 3, 8, 9, 10, 11]);
+        let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 5, 6, 7, 12, 13, 14, 15]);
+
+        *self = MixedGL::from_u64x8_arrays([part1, part2]);
+
+        self
+    }
+
+    /// # Safety
+    ///
+    /// Pointers must be properly aligned for `MixedGL` type, should point to arrays of length 8, and should point
+    /// to memory that can be mutated.
+    /// No references to the same memory should exist when this function is called.
+    /// Pointers should be different.
+    pub unsafe fn butterfly_8x8_impl(this: *const u64, other: *const u64) {
+        debug_assert!(this.addr() % std::mem::align_of::() == 0);
+        debug_assert!(other.addr() % std::mem::align_of::() == 0);
+
+        let u = std::slice::from_raw_parts_mut(this as *mut u64, 8);
+        let v = std::slice::from_raw_parts_mut(other as *mut u64, 8);
+        let a = packed_simd::u64x8::from_slice_aligned(u);
+        let b = packed_simd::u64x8::from_slice_aligned(v);
+        //additional reduction over b
+        let b_reduced = b.add(Self::EPSILON_VECTOR_D);
+        let cmp = b_reduced.lt(Self::EPSILON_VECTOR_D);
+        let b = cmp.select(b_reduced, b);
+        // u + v
+        let sum = a.add(b);
+        let sum_reduced = sum.add(Self::EPSILON_VECTOR_D);
+        let cmp0 = sum_reduced.lt(sum);
+        let cmp1 = sum.lt(a);
+        let reduce_flag = cmp0.bitor(cmp1);
+        let res1 = reduce_flag.select(sum_reduced, sum);
+        // u - v
+        let diff = a.sub(b);
+        let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D);
+        let cmp = a.lt(b);
+        let res2 = cmp.select(diff_reduced, diff);
+
+        res1.write_to_slice_aligned(u);
+        res2.write_to_slice_aligned(v);
+    }
+
+    /// # Safety
+    ///
+    /// Pointers must be properly aligned for `MixedGL` type, should point to arrays of length 16, and should point
+    /// to memory that can be mutated.
+    /// No references to the same memory should exist when this function is called.
+    /// Pointers should be different.
+    pub unsafe fn butterfly_16x16_impl(mut this: *mut u64, mut other: *mut u64) {
+        debug_assert!(this.addr() % std::mem::align_of::() == 0);
+        debug_assert!(other.addr() % std::mem::align_of::() == 0);
+
+        Self::butterfly_8x8_impl(this, other);
+        this = this.offset(8);
+        other = other.offset(8);
+        Self::butterfly_8x8_impl(this, other);
+    }
+
+    // pub unsafe fn butterfly_16x16_impl(
+    //     this: &mut Self,
+    //     other: &mut Self,
+    // ) {
+    //     let mut this_ptr = this.0.as_ptr() as *mut u64;
+    //     let mut other_ptr = other.0.as_ptr() as *mut u64;
+
+    //     debug_assert!(this_ptr.addr() % std::mem::align_of::() == 0);
+    //     debug_assert!(other_ptr.addr() % std::mem::align_of::() == 0);
+
+    //     Self::butterfly_8x8_impl(this_ptr, other_ptr);
+    //     this_ptr = this_ptr.offset(8);
+    //     other_ptr = other_ptr.offset(8);
+    //     Self::butterfly_8x8_impl(this_ptr, other_ptr);
+    // }
+
+    #[inline(always)]
+    pub fn from_field_array(input: [GoldilocksField; 16]) -> Self {
+        Self(input)
+    }
+
+    #[inline(always)]
+    fn as_u64x4_arrays(input: &Self) -> U64x4Holder {
+        // this preserves an alignment
+        unsafe { std::mem::transmute(*input) }
+    }
+
+    #[inline(always)]
+    pub(crate) fn as_u64x8_arrays(input: &Self) -> [packed_simd::u64x8; 2] {
+        // this preserves an alignment
+        unsafe { std::mem::transmute(*input) }
+    }
+
+    #[inline(always)]
+    unsafe fn from_u64x4_arrays(input: U64x4Holder) -> Self {
+        // this preserves an alignment
+        std::mem::transmute(input)
+    }
+
+    #[inline(always)]
+    pub(crate) unsafe fn from_u64x8_arrays(input: [packed_simd::u64x8; 2]) -> Self {
+        // this preserves an alignment
+        std::mem::transmute(input)
+    }
+
+    #[inline(always)]
+    pub fn vec_add_assign(a: &mut [Self], b: &[Self]) {
+        use crate::field::traits::field_like::PrimeFieldLike;
+        for (a, b) in a.iter_mut().zip(b.iter()) {
+            a.add_assign(b, &mut ());
+        }
+    }
+
+    #[inline(always)]
+    pub fn vec_mul_assign(a: &mut [Self], b: &[Self]) {
+        use crate::field::traits::field_like::PrimeFieldLike;
+        for (a, b) in a.iter_mut().zip(b.iter()) {
+            a.mul_assign(b, &mut ());
+        }
+    }
+}
+
+impl Default for MixedGL {
+    fn default() -> Self {
+        Self([GoldilocksField::ZERO; 16])
+    }
+}
+
+impl crate::field::traits::field_like::PrimeFieldLike for MixedGL {
+    type Base = GoldilocksField;
+    type Context = ();
+
+    #[inline(always)]
+    fn zero(_ctx: &mut Self::Context) -> Self {
+        Self([GoldilocksField::ZERO; 16])
+    }
+    #[inline(always)]
+    fn one(_ctx: &mut Self::Context) -> Self {
+        Self([GoldilocksField::ONE; 16])
+    }
+    #[inline(always)]
+    fn minus_one(_ctx: &mut Self::Context) -> Self {
+        Self([GoldilocksField::MINUS_ONE; 16])
+    }
+
+    #[inline(always)]
+    fn add_assign(&mut self, other: &Self, _ctx: &mut Self::Context) -> &mut Self {
+        Self::add_assign_impl(self, other)
+    }
+
+    #[inline(always)]
+    fn sub_assign(&'_ mut self, other: &Self, _ctx: &mut Self::Context) -> &mut Self {
+        Self::sub_assign_impl(self, other)
+    }
+
+    #[inline(always)]
+    #[unroll::unroll_for_loops]
+    fn mul_assign(&'_ mut self, other: &Self, _ctx: &mut Self::Context) -> &mut Self {
+        Self::mul_assign_impl(self, other)
+    }
+
+    #[inline(always)]
+    fn square(&'_ mut self, _ctx: &mut Self::Context) -> &'_ mut Self {
+        let t = *self;
+        self.mul_assign(&t, _ctx);
+
+        self
+    }
+
+    #[inline(always)]
+    #[unroll::unroll_for_loops]
+    fn negate(&'_ mut self, _ctx: &mut Self::Context) -> &'_ mut Self {
+        let mut a_u64 = Self::as_u64x4_arrays(self);
+
+        for i in 0..4 {
+            let a = a_u64.0[i];
+
+            let is_zero = a.eq(packed_simd::u64x4::splat(0));
+            let neg = packed_simd::u64x4::splat(Self::ORDER).sub(a);
+            let res = is_zero.select(a, neg);
+
+            a_u64.0[i] = res;
+        }
+
+        unsafe {
+            *self = Self::from_u64x4_arrays(a_u64);
+        }
+
+        self
+    }
+
+    #[inline(always)]
+    fn double(&'_ mut self, _ctx: &mut Self::Context) -> &'_ mut Self {
+        let t = *self;
+        self.add_assign(&t, _ctx);
+
+        self
+    }
+
+    #[inline(always)]
+    #[unroll::unroll_for_loops]
+    fn inverse(&self, _ctx: &mut Self::Context) -> Self {
+        let mut result = *self;
+        for i in 0..16 {
+            result.0[i] = PrimeField::inverse(&result.0[i]).expect("inverse must exist");
+        }
+
+        result
+    }
+
+    #[inline(always)]
+    fn constant(value: Self::Base, _ctx: &mut Self::Context) -> Self {
+        Self([value; 16])
+    }
+}
+
+impl crate::field::traits::field_like::PrimeFieldLikeVectorized for MixedGL {
+    type Twiddles = Vec;
+    type InverseTwiddles = Vec;
+    #[inline(always)]
+    fn is_zero(&self) -> bool {
+        self.0 == [GoldilocksField::ZERO; 16]
+    }
+
+    #[inline(always)]
+    fn equals(&self, other: &Self) -> bool {
+        self.eq(other)
+    }
+
+    #[inline(always)]
+    fn mul_all_by_base(&'_ mut self, other: &Self::Base, _ctx: &mut Self::Context) -> &'_ mut Self {
+        Self::mul_constant_assign(self, other)
+    }
+
+    #[inline(always)]
+    fn slice_from_base_slice(input: &[Self::Base]) -> &[Self] {
+        if input.len() < Self::SIZE_FACTOR {
+            panic!("too small input size to cast");
+        }
+        debug_assert!(input.len() % Self::SIZE_FACTOR == 0);
+        debug_assert!(input.as_ptr().addr() % std::mem::align_of::() == 0);
+        let result_len = input.len() / 16;
+        unsafe { std::slice::from_raw_parts(input.as_ptr() as *mut Self, result_len) }
+    }
+
+    #[inline(always)]
+    fn slice_into_base_slice(input: &[Self]) -> &[Self::Base] {
+        let result_len = input.len() * 16;
+        unsafe { std::slice::from_raw_parts(input.as_ptr() as *mut GoldilocksField, result_len) }
+    }
+
+    #[inline(always)]
+    fn slice_into_base_slice_mut(input: &mut [Self]) -> &mut [Self::Base] {
+        let result_len = input.len() * 16;
+        unsafe {
+            std::slice::from_raw_parts_mut(input.as_ptr() as *mut GoldilocksField, result_len)
+        }
+    }
+
+    #[inline(always)]
+    fn vec_from_base_vec(input: Vec) -> Vec {
+        if input.len() < Self::SIZE_FACTOR {
+            panic!("too small input size to cast");
+        }
+        let (ptr, len, capacity, allocator) = input.into_raw_parts_with_alloc();
+        debug_assert!(ptr.addr() % std::mem::align_of::() == 0);
+        debug_assert!(len % Self::SIZE_FACTOR == 0);
+        debug_assert!(capacity % Self::SIZE_FACTOR == 0);
+
+        unsafe {
+            Vec::from_raw_parts_in(
+                ptr as _,
+                len / Self::SIZE_FACTOR,
+                capacity / Self::SIZE_FACTOR,
+                allocator,
+            )
+        }
+    }
+
+    #[inline(always)]
+    fn vec_into_base_vec(input: Vec) -> Vec {
+        let (ptr, len, capacity, allocator) = input.into_raw_parts_with_alloc();
+
+        unsafe {
+            Vec::from_raw_parts_in(
+                ptr as _,
+                len * Self::SIZE_FACTOR,
+                capacity * Self::SIZE_FACTOR,
+                allocator,
+            )
+        }
+    }
+
+    #[inline(always)]
+    fn fft_natural_to_bitreversed(
+        input: &mut [Self],
+        coset: Self::Base,
+        twiddles: &Self::Twiddles,
+        _ctx: &mut Self::Context,
+    ) {
+        // let input = crate::utils::cast_check_alignment_ref_mut_unpack::(input);
+        // crate::fft::fft_natural_to_bitreversed_cache_friendly(input, coset, twiddles);
+
+        crate::fft::fft_natural_to_bitreversed_mixedgl(input, coset, twiddles);
+    }
+
+    #[inline(always)]
+    fn ifft_natural_to_natural(
+        input: &mut [Self],
+        coset: Self::Base,
+        twiddles: &Self::InverseTwiddles,
+        _ctx: &mut Self::Context,
+    ) {
+        // let input = crate::utils::cast_check_alignment_ref_mut_unpack::(input);
+        // crate::fft::ifft_natural_to_natural_cache_friendly(input, coset, twiddles);
+
+        crate::fft::ifft_natural_to_natural_mixedgl(input, coset, twiddles);
+    }
+
+    #[inline(always)]
+    fn precompute_forward_twiddles_for_fft(
+        fft_size: usize,
+        worker: &Worker,
+        ctx: &mut Self::Context,
+    ) -> Self::Twiddles {
+        precompute_twiddles_for_fft::(
+            fft_size, worker, ctx,
+        )
+    }
+
+    #[inline(always)]
+    fn precompute_inverse_twiddles_for_fft(
+        fft_size: usize,
+        worker: &Worker,
+        ctx: &mut Self::Context,
+    ) -> Self::Twiddles {
+        precompute_twiddles_for_fft::(
+            fft_size, worker, ctx,
+        )
+    }
+}
+
+#[cfg(test)]
+mod test {
+
+    use crate::field::goldilocks::MixedGL;
+    use crate::field::rand_from_rng;
+    use crate::field::traits::field_like::PrimeFieldLike;
+    use crate::field::traits::field_like::PrimeFieldLikeVectorized;
+    use crate::field::{goldilocks::GoldilocksField, Field};
+    use crate::utils::clone_respecting_allignment;
+
+    #[test]
+    fn test_mixedgl_negate() {
+        let mut ctx = ();
+        const POLY_SIZE: usize = 1 << 20;
+        let mut rng = rand::thread_rng();
+
+        // Generate random Vec
+        let a: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
+
+        let mut ag = a.clone();
+
+        for aa in ag.iter_mut() {
+            Field::negate(aa);
+        }
+
+        let mut av: Vec =
+            MixedGL::vec_from_base_vec(clone_respecting_allignment::(
+                &a,
+            ));
+
+        // Test over GLPS
+        for aa in av.iter_mut() {
+            aa.negate(&mut ctx);
+        }
+
+        assert_eq!(MixedGL::vec_into_base_vec(av), ag);
+    }
+
+    use rand::Rng;
+
+    #[test]
+    fn test_mixedgl_add_assign() {
+        let mut ctx = ();
+        const POLY_SIZE: usize = 1 << 24;
+        let mut rng = rand::thread_rng();
+        let _s = GoldilocksField(0x0000000001000000);
+
+        // Generate random Vec
+        // let a: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
+        // let b: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
+        // let a: Vec = (0..POLY_SIZE).map(|_| GoldilocksField(0x0000000000000001)).collect();
+        // let b: Vec = (0..POLY_SIZE).map(|_| GoldilocksField(0x0000000001000000)).collect();
+        let b: Vec = (0..POLY_SIZE)
+            .map(|_| GoldilocksField(rng.gen_range(GoldilocksField::ORDER..u64::MAX)))
+            .collect();
+        let a: Vec = (0..POLY_SIZE)
+            .map(|_| GoldilocksField(rng.gen_range(GoldilocksField::ORDER..u64::MAX)))
+            .collect();
+        // let a: Vec = (0..POLY_SIZE).map(|_| GoldilocksField(0xfffffffff67f1442)).collect();
+        // let b: Vec = (0..POLY_SIZE).map(|_| GoldilocksField(0xffffffff9c1d065d)).collect();
+
+        // dbg!(&a);
+        // dbg!(&b);
+
+        let mut ag = a.clone();
+        let bg = b.clone();
+
+        for (aa, bb) in ag.iter_mut().zip(bg.iter()) {
+            Field::add_assign(aa, bb);
+        }
+
+        let mut av: Vec =
+            MixedGL::vec_from_base_vec(clone_respecting_allignment::(
+                &a,
+            ));
+        let bv: Vec =
+            MixedGL::vec_from_base_vec(clone_respecting_allignment::(
+                &b,
+            ));
+
+        // Test over GLPS
+        for (aa, bb) in av.iter_mut().zip(bv.iter()) {
+            aa.add_assign(bb, &mut ctx);
+        }
+
+        let avv = MixedGL::vec_into_base_vec(av);
+        // for i in 0..avv.len() {
+        //     assert_eq!(avv[i], ag[i], "error {}", i);
+        // }
+
+        // dbg!(&ag[0]);
+        // dbg!(&avv[0]);
+
+        assert_eq!(avv, ag);
+    }
+
+    #[test]
+    fn test_mixedgl_sub_assign() {
+        let mut ctx = ();
+        const POLY_SIZE: usize = 1 << 20;
+        let _rng = rand::thread_rng();
+
+        // Generate random Vec
+        // let a: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
+        // let b: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
+        let a: Vec = (0..POLY_SIZE)
+            .map(|_| GoldilocksField(0x0000000000000001))
+            .collect();
+        let b: Vec = (0..POLY_SIZE)
+            .map(|_| GoldilocksField(0x0000000001000000))
+            .collect();
+
+        // Test over Goldilocks
+        let mut ag = a.clone();
+        let bg = b.clone();
+
+        for (aa, bb) in ag.iter_mut().zip(bg.iter()) {
+            Field::sub_assign(aa, bb);
+        }
+
+        let mut av: Vec =
+            MixedGL::vec_from_base_vec(clone_respecting_allignment::(
+                &a,
+            ));
+        let bv: Vec =
+            MixedGL::vec_from_base_vec(clone_respecting_allignment::(
+                &b,
+            ));
+
+        // Test over GLPS
+        for (aa, bb) in av.iter_mut().zip(bv.iter()) {
+            aa.sub_assign(bb, &mut ctx);
+        }
+
+        // dbg!(&ag);
+        // dbg!(&av);
+
+        assert_eq!(ag, MixedGL::vec_into_base_vec(av));
+    }
+
+    #[test]
+    fn test_mixedgl_mul_assign() {
+        let mut ctx = ();
+        const POLY_SIZE: usize = 1 << 20;
+        let mut rng = rand::thread_rng();
+
+        // Generate random Vec
+        let a: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
+        let b: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
+
+        // Test over Goldilocks
+        let mut ag = a.clone();
+        let bg = b.clone();
+
+        for (aa, bb) in ag.iter_mut().zip(bg.iter()) {
+            Field::mul_assign(aa, bb);
+        }
+
+        let mut av: Vec =
+            MixedGL::vec_from_base_vec(clone_respecting_allignment::(
+                &a,
+            ));
+        let bv: Vec =
+            MixedGL::vec_from_base_vec(clone_respecting_allignment::(
+                &b,
+            ));
+
+        // Test over GLPS
+        for (aa, bb) in av.iter_mut().zip(bv.iter()) {
+            aa.mul_assign(bb, &mut ctx);
+        }
+
+        // dbg!(&ag);
+        // dbg!(&av);
+
+        assert_eq!(ag, MixedGL::vec_into_base_vec(av));
+    }
+
+    #[test]
+    fn test_mixedgl_butterfly16x16() {
+        // let mut ctx = ();
+
+        // let am: [u64;32] = [0x0001000000000000, 0x0000000000000001, 0x0001000000000000, 0x0000000000000001, 0x0000000000000000, 0xffffffff00000000, 0x0000000000000001, 0x0000ffffffffffff, 0x0000000000000000, 0x0001000000000000, 0xffffffff00000000, 0xffffffff00000000, 0xffffffff00000000, 0xfffeffff00000001, 0xfffeffff00000002, 0xfffeffff00000002,
+        // 0x0000000000000000, 0x0000000000000001, 0x0000000000000000, 0x0001000000000001, 0xfffeffff00000001, 0xffffffff00000000, 0x0001000000000000, 0xfffeffff00000002, 0x0000000000000000, 0xfffeffff00000001, 0xffffffff00000000, 0x0000000000000001, 0x0000ffffffffffff, 0x0000000000000000, 0x0000000000000001, 0x0001000000000000];
+
+        let am: [u64; 32] = [
+            0x0001000000000000,
+            0x0000000000000001,
+            0x0001000000000000,
+            0x0000000000000001,
+            0x0000000000000000,
+            0xffffffff00000000,
+            0x0000000000000001,
+            0x0000ffffffffffff,
+            0x0000000000000000,
+            0x0001000000000000,
+            0xffffffff00000000,
+            0xffffffff00000000,
+            0xffffffff00000000,
+            0xfffeffff00000001,
+            0xfffeffff00000002,
+            0xfffeffff00000002,
+            0x0000000000000000,
+            0xffffffff01000001,
+            0x0000000000000000,
+            0x0000010000ffff00,
+            0xfffffeff00000101,
+            0xfffffffeff000001,
+            0x000000ffffffff00,
+            0xfffffeff01000101,
+            0x0000000000000000,
+            0xfffffeff00000101,
+            0xfffffffeff000001,
+            0xffffffff01000001,
+            0x000000fffeffff00,
+            0x0000000000000000,
+            0xffffffff01000001,
+            0x000000ffffffff00,
+        ];
+
+        let a: Vec = am.into_iter().map(GoldilocksField).collect();
+        // let b: Vec = bm.into_iter().map(GoldilocksField).collect();
+        let _s = GoldilocksField(0x0000000001000000);
+
+        // Test over Goldilocks
+        let mut ag = a.clone();
+        // let mut bg = b.clone();
+        let distance_in_cache = 16;
+
+        let mut j = 0;
+        while j < 16 {
+            let mut u = ag[j];
+            let v = ag[j + distance_in_cache];
+            // Field::mul_assign(&mut v, &s);
+            Field::sub_assign(&mut u, &v);
+            ag[j + distance_in_cache] = u;
+            Field::add_assign(&mut ag[j], &v);
+
+            j += 1;
+        }
+
+        let av: Vec =
+            MixedGL::vec_from_base_vec(clone_respecting_allignment::(
+                &a,
+            ));
+        // let mut bv: Vec = MixedGL::vec_from_base_vec(clone_respecting_allignment::(&b));
+        // let mut av = av[0];
+        // let mut bv = bv[0];
+
+        // Test over MixedGL
+        // av[1].mul_constant_assign(&s);
+        unsafe {
+            MixedGL::butterfly_16x16_impl(
+                av[0].0.as_ptr() as *mut u64,
+                av[1].0.as_ptr() as *mut u64,
+            );
+        }
+        // let mut u = av[0];
+        // let mut v = av[1];
+        // unsafe { MixedGL::butterfly_16x16_impl(&mut u, &mut v); }
+        // av[0] = u;
+        // av[1] = v;
+
+        let ag =
+            MixedGL::vec_from_base_vec(clone_respecting_allignment::(
+                &ag,
+            ));
+        // let bg = MixedGL::vec_from_base_vec(clone_respecting_allignment::(&bg));
+
+        dbg!(&ag);
+        dbg!(&av);
+
+        // dbg!(&bg);
+        // dbg!(&bv);
+
+        assert_eq!(ag, av);
+        // assert_eq!(bg, bv);
+    }
+}
diff --git a/src/field/goldilocks/mod.rs b/src/field/goldilocks/mod.rs
index 10daec1..82fa6be 100644
--- a/src/field/goldilocks/mod.rs
+++ b/src/field/goldilocks/mod.rs
@@ -12,10 +12,18 @@ mod extension;
 mod inversion;
 
 #[cfg(all(
+    not(feature = "include_packed_simd"),
     any(target_feature = "neon", target_feature = "avx2"),
     not(all(target_feature = "avx512f", target_feature = "avx512vl"))
 ))]
 pub mod arm_asm_impl;
+
+#[cfg(all(
+    feature = "include_packed_simd",
+    any(target_feature = "neon", target_feature = "avx2"),
+    not(all(target_feature = "avx512f", target_feature = "avx512vl"))
+))]
+pub mod arm_asm_packed_impl;
 #[cfg(not(any(
     all(target_feature = "avx512f", target_feature = "avx512vl"),
     target_feature = "neon",
@@ -43,10 +51,19 @@ pub mod x86_64_asm_impl;
 pub mod avx512_impl;
 
 #[cfg(all(
+    not(feature = "include_packed_simd"),
     any(target_feature = "neon", target_feature = "avx2"),
     not(all(target_feature = "avx512f", target_feature = "avx512vl"))
 ))]
 pub use arm_asm_impl::*;
+
+#[cfg(all(
+    feature = "include_packed_simd",
+    any(target_feature = "neon", target_feature = "avx2"),
+    not(all(target_feature = "avx512f", target_feature = "avx512vl"))
+))]
+pub use arm_asm_packed_impl::*;
+
 #[cfg(not(any(
     all(target_feature = "avx512f", target_feature = "avx512vl"),
     target_feature = "neon",
diff --git a/src/implementations/poseidon2/mod.rs b/src/implementations/poseidon2/mod.rs
index ecb1326..6dbb7e0 100644
--- a/src/implementations/poseidon2/mod.rs
+++ b/src/implementations/poseidon2/mod.rs
@@ -4,18 +4,25 @@ use crate::field::goldilocks::GoldilocksField;
 pub mod params;
 pub mod state_generic_impl;
 
-#[cfg(not(any(
-    target_feature = "neon",
-    target_feature = "avx2",
-    target_feature = "avx512bw",
-    target_feature = "avx512cd",
-    target_feature = "avx512dq",
-    target_feature = "avx512f",
-    target_feature = "avx512vl"
+#[cfg(not(all(
+    feature = "include_packed_simd",
+    any(
+        target_feature = "neon",
+        target_feature = "avx2",
+        target_feature = "avx512bw",
+        target_feature = "avx512cd",
+        target_feature = "avx512dq",
+        target_feature = "avx512f",
+        target_feature = "avx512vl",
+    )
 )))]
 pub use state_generic_impl::*;
 
+// Other poseidon implementations depend on packed_simd 128
+// which is no longer available in std::simd (and packed_simd is no longer
+// supported in the newest rust nightly).
 #[cfg(all(
+    feature = "include_packed_simd",
     any(target_feature = "neon", target_feature = "avx2"),
     not(any(
         target_feature = "avx512bw",
@@ -28,6 +35,7 @@ pub use state_generic_impl::*;
 pub mod state_vectorized_double;
 
 #[cfg(all(
+    feature = "include_packed_simd",
     any(target_feature = "neon", target_feature = "avx2"),
     not(any(
         target_feature = "avx512bw",
@@ -40,6 +48,7 @@ pub mod state_vectorized_double;
 pub use state_vectorized_double::*;
 
 #[cfg(all(
+    feature = "include_packed_simd",
     target_feature = "avx512bw",
     target_feature = "avx512cd",
     target_feature = "avx512dq",
@@ -49,6 +58,7 @@ pub use state_vectorized_double::*;
 pub mod state_avx512;
 
 #[cfg(all(
+    feature = "include_packed_simd",
     target_feature = "avx512bw",
     target_feature = "avx512cd",
     target_feature = "avx512dq",
diff --git a/src/implementations/poseidon2/state_generic_impl.rs b/src/implementations/poseidon2/state_generic_impl.rs
index 02cb079..c9b74e8 100644
--- a/src/implementations/poseidon2/state_generic_impl.rs
+++ b/src/implementations/poseidon2/state_generic_impl.rs
@@ -29,7 +29,9 @@ impl State {
     pub const T: u64 = (Self::ORDER - 1) >> Self::TWO_ADICITY;
     pub const BARRETT: u128 = 18446744078004518912; // 0x10000000100000000
     pub const EPSILON: u64 = (1 << 32) - 1;
+    #[cfg(feature = "include_packed_simd")]
     pub const EPSILON_VECTOR: packed_simd::u64x4 = packed_simd::u64x4::splat(Self::EPSILON);
+    #[cfg(feature = "include_packed_simd")]
     pub const EPSILON_VECTOR_D: packed_simd::u64x8 = packed_simd::u64x8::splat(Self::EPSILON);
 
     pub const RATE: usize = poseidon_goldilocks_params::RATE;
diff --git a/src/lib.rs b/src/lib.rs
index 72775d5..4f2e1ee 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -21,6 +21,9 @@
 #![allow(dead_code)]
 #![allow(dropping_references)] // Required to explicitly show that mutable references are dropped.
 #![allow(incomplete_features)]
+#![allow(internal_features)] // Required for core_intrinsics
+#![allow(stable_features)]
+#![allow(unused_unsafe)]
 // Enabled features
 #![feature(allocator_api)]
 #![feature(const_mut_refs)]
@@ -43,7 +46,6 @@
 #![feature(generic_const_exprs)]
 #![feature(iter_array_chunks)]
 // #![recursion_limit = "1024"]
-#![feature(stdsimd)]
 #![feature(avx512_target_feature)]
 #![feature(associated_type_defaults)]
 #![feature(trait_alias)]
@@ -51,6 +53,7 @@
 #![feature(return_position_impl_trait_in_trait)]
 #![feature(type_changing_struct_update)]
 #![feature(slice_flatten)]
+#![cfg_attr(feature = "include_packed_simd", feature(stdsimd))]
 
 pub mod algebraic_props;
 pub mod config;
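Note on the migration pattern (illustrative, not part of the patch): the port swaps packed_simd's `lt`/`select` for std::simd's `simd_lt`/`Mask::select`, the const `splat` for `from_array`, `from_slice_aligned`/`write_to_slice_aligned` for `from_slice`/`copy_to_slice`, and the `shuffle!` macro for the `simd_shuffle` intrinsic. The sketch below shows the branchless Goldilocks reduction all of these kernels rely on, written against std::simd on a recent nightly; `add_mod` and `main` are hypothetical names introduced here for illustration, not identifiers from this codebase.

#![feature(portable_simd)]
use std::simd::{cmp::SimdPartialOrd, u64x8};

const ORDER: u64 = 0xffff_ffff_0000_0001; // Goldilocks prime
const EPSILON: u64 = (1 << 32) - 1; // 2^64 mod ORDER
const EPSILON_D: u64x8 = u64x8::from_array([EPSILON; 8]);

// Branchless (a + b) mod ORDER, mirroring the compare-and-select pattern of
// add_assign_impl in the diff above: SIMD lanes cannot branch, so wraparound
// is detected by comparing the wrapped results instead.
fn add_mod(a: u64x8, b: u64x8) -> u64x8 {
    // Canonicalize b: if b >= ORDER, then b + EPSILON wraps past 2^64 and
    // lands below EPSILON, which both detects the case and yields b - ORDER.
    let b_red = b + EPSILON_D;
    let b = b_red.simd_lt(EPSILON_D).select(b_red, b);
    // Add, then subtract ORDER (i.e. add EPSILON mod 2^64) if either the sum
    // itself or the pre-reduced candidate wrapped around.
    let sum = a + b;
    let sum_red = sum + EPSILON_D;
    let wrapped = sum_red.simd_lt(sum) | sum.simd_lt(a);
    wrapped.select(sum_red, sum)
}

fn main() {
    // (ORDER - 1) + 2 == 1 (mod ORDER) in every lane.
    let a = u64x8::splat(ORDER - 1);
    let b = u64x8::splat(2);
    assert_eq!(add_mod(a, b).to_array(), [1u64; 8]);
}

Keeping the reduction as compare-and-select rather than a data-dependent branch is what lets both the packed_simd and std::simd variants compile to straight-line NEON/AVX2 code, which is presumably why the patch preserves the pattern verbatim and only changes the API surface.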