From 48c983db0920d0963d900ddddcdc853bad084208 Mon Sep 17 00:00:00 2001
From: David Cook <dcook@letsencrypt.org>
Date: Thu, 9 May 2024 10:17:56 -0500
Subject: [PATCH] Reimplement Field64 with u64 as internal storage (#1018)

* Remove unnecessary subtraction

This extra subtraction only has the effect of copying one borrow flag to
another, since the subtraction result is discarded.

* Update Sage script to support different radixes

* Reimplement Field64 using a smaller radix
---
 documentation/field_parameters.sage |  46 +++-
 src/field.rs                        |  44 ++--
 src/fp.rs                           | 381 +++++++++++++++++-----------
 src/fp64.rs                         | 307 ++++++++++++++++++++++
 src/lib.rs                          |   1 +
 5 files changed, 594 insertions(+), 185 deletions(-)
 create mode 100644 src/fp64.rs

diff --git a/documentation/field_parameters.sage b/documentation/field_parameters.sage
index b83505baa..328984d57 100755
--- a/documentation/field_parameters.sage
+++ b/documentation/field_parameters.sage
@@ -21,11 +21,26 @@ class Field:
     # subgroup. The generator element will be a 2^num_roots-th root of unity.
     num_roots: Integer
 
-    def __init__(self, name, modulus, generator_element):
+    # The base used for multiprecision arithmetic. This should be a power of
+    # two, and it should ideally be the machine word size of the target
+    # architecture.
+    r: Integer
+
+    # The radix. This must be a multiple of the base "r", and must be coprime
+    # to the prime "p".
+    R: Integer
+
+    def __init__(self, name, modulus, generator_element, r, R):
         assert is_prime(modulus)
+        assert R % r == 0
+        assert R % modulus != 0
+        assert modulus % R != 0
+
         self.name = name
         self.modulus = modulus
         self.generator_element = generator_element
+        self.r = r
+        self.R = R
 
         self.num_roots = None
         for (prime, power) in factor(modulus - 1):
@@ -43,8 +58,7 @@ class Field:
         mu = (-p)^-1 mod r, where r is the modulus implicitly used in wrapping
         machine word operations.
         """
-        r = 2 ^ 64
-        return (-self.modulus).inverse_mod(r)
+        return (-self.modulus).inverse_mod(self.r)
 
     def r2(self):
         """
@@ -52,15 +66,14 @@ class Field:
         Montgomery representation. R is the machine word-friendly modulus
         used in the Montgomery representation.
         """
-        R = 2 ^ 128
-        return R ^ 2 % self.modulus
+        return self.R ^ 2 % self.modulus
 
     def to_montgomery(self, element):
         """
         Transforms an element into its Montgomery representation.
         """
-        R = 2 ^ 128
-        return element * R % self.modulus
+
+        return element * self.R % self.modulus
 
     def bit_mask(self):
         """
@@ -89,19 +102,32 @@ class Field:
 
 FIELDS = [
     Field(
-        "FieldPrio2",
+        "FieldPrio2, u128",
         2 ^ 20 * 4095 + 1,
         3925978153,
+        2 ^ 64,
+        2 ^ 128,
+    ),
+    Field(
+        "Field64, u128",
+        2 ^ 32 * 4294967295 + 1,
+        pow(7, 4294967295, 2 ^ 32 * 4294967295 + 1),
+        2 ^ 64,
+        2 ^ 128,
     ),
     Field(
-        "Field64",
+        "Field64, u64",
         2 ^ 32 * 4294967295 + 1,
         pow(7, 4294967295, 2 ^ 32 * 4294967295 + 1),
+        2 ^ 64,
+        2 ^ 64,
     ),
     Field(
-        "Field128",
+        "Field128, u128",
         2 ^ 66 * 4611686018427387897 + 1,
         pow(7, 4611686018427387897, 2 ^ 66 * 4611686018427387897 + 1),
+        2 ^ 64,
+        2 ^ 128,
     ),
 ]
 for field in FIELDS:
diff --git a/src/field.rs b/src/field.rs
index 5333bff7b..0a9d78ba3 100644
--- a/src/field.rs
+++ b/src/field.rs
@@ -9,7 +9,8 @@
 
 use crate::{
     codec::{CodecError, Decode, Encode},
-    fp::{FP128, FP32, FP64},
+    fp::{FP128, FP32},
+    fp64::FP64,
     prng::{Prng, PrngError},
 };
 use rand::{
@@ -420,7 +421,7 @@ pub trait FftFriendlyFieldElement: FieldElementWithInteger {
 macro_rules! make_field {
     (
         $(#[$meta:meta])*
-        $elem:ident, $int:ident, $fp:ident, $encoding_size:literal,
+        $elem:ident, $int_internal:ident, $int_conversion:ident, $fp:ident, $encoding_size:literal,
     ) => {
         $(#[$meta])*
         ///
@@ -435,7 +436,7 @@ macro_rules! make_field {
         /// As an invariant, this integer representing the field element in the Montgomery domain
         /// must be less than the field modulus, `p`.
         #[derive(Clone, Copy, Default)]
-        pub struct $elem(u128);
+        pub struct $elem($int_internal);
 
         impl $elem {
             /// Attempts to instantiate an `$elem` from the first `Self::ENCODED_SIZE` bytes in the
@@ -452,14 +453,14 @@ macro_rules! make_field {
             /// We cannot use `u128::from_le_bytes` or `u128::from_be_bytes` because those functions
             /// expect inputs to be exactly 16 bytes long. Our encoding of most field elements is
             /// more compact.
-            fn try_from_bytes(bytes: &[u8], mask: u128) -> Result<Self, FieldError> {
+            fn try_from_bytes(bytes: &[u8], mask: $int_internal) -> Result<Self, FieldError> {
                 if Self::ENCODED_SIZE > bytes.len() {
                     return Err(FieldError::ShortRead);
                 }
 
                 let mut int = 0;
                 for i in 0..Self::ENCODED_SIZE {
-                    int |= (bytes[i] as u128) << (i << 3);
+                    int |= (bytes[i] as $int_internal) << (i << 3);
                 }
 
                 int &= mask;
@@ -495,7 +496,7 @@ macro_rules! make_field {
 
         impl ConditionallySelectable for $elem {
             fn conditional_select(a: &Self, b: &Self, choice: subtle::Choice) -> Self {
-                Self(u128::conditional_select(&a.0, &b.0, choice))
+                Self($int_internal::conditional_select(&a.0, &b.0, choice))
             }
         }
 
@@ -617,23 +618,23 @@ macro_rules! make_field {
             }
         }
 
-        impl From<$int> for $elem {
-            fn from(x: $int) -> Self {
+        impl From<$int_conversion> for $elem {
+            fn from(x: $int_conversion) -> Self {
                 // FieldParameters::montgomery() will return a value that has been fully reduced
                 // mod p, satisfying the invariant on Self.
-                Self($fp.montgomery(u128::try_from(x).unwrap()))
+                Self($fp.montgomery($int_internal::try_from(x).unwrap()))
             }
         }
 
-        impl From<$elem> for $int {
+        impl From<$elem> for $int_conversion {
             fn from(x: $elem) -> Self {
-                $int::try_from($fp.residue(x.0)).unwrap()
+                $int_conversion::try_from($fp.residue(x.0)).unwrap()
             }
         }
 
-        impl PartialEq<$int> for $elem {
-            fn eq(&self, rhs: &$int) -> bool {
-                $fp.residue(self.0) == u128::try_from(*rhs).unwrap()
+        impl PartialEq<$int_conversion> for $elem {
+            fn eq(&self, rhs: &$int_conversion) -> bool {
+                $fp.residue(self.0) == $int_internal::try_from(*rhs).unwrap()
             }
         }
 
@@ -641,7 +642,7 @@ macro_rules! make_field {
             type Error = FieldError;
 
             fn try_from(bytes: &[u8]) -> Result<Self, FieldError> {
-                Self::try_from_bytes(bytes, u128::MAX)
+                Self::try_from_bytes(bytes, $int_internal::MAX)
             }
         }
 
@@ -675,7 +676,7 @@ macro_rules! make_field {
         }
 
         // We provide custom [`serde::Serialize`] and [`serde::Deserialize`] implementations because
-        // the derived implementations would represent `FieldElement` values as the backing `u128`,
+        // the derived implementations would represent `FieldElement` values as the backing integer,
         // which is not what we want because (1) we can be more efficient in all cases and (2) in
         // some circumstances, [some serializers don't support `u128`](https://github.com/serde-rs/json/issues/625).
         impl Serialize for $elem {
@@ -707,7 +708,7 @@ macro_rules! make_field {
             fn decode(bytes: &mut Cursor<&[u8]>) -> Result<Self, CodecError> {
                 let mut value = [0u8; $elem::ENCODED_SIZE];
                 bytes.read_exact(&mut value)?;
-                $elem::try_from_bytes(&value, u128::MAX).map_err(|e| {
+                $elem::try_from_bytes(&value, $int_internal::MAX).map_err(|e| {
                     CodecError::Other(Box::new(e) as Box<dyn std::error::Error + 'static + Send + Sync>)
                 })
             }
@@ -735,16 +736,16 @@ macro_rules! make_field {
         }
 
         impl FieldElementWithInteger for $elem {
-            type Integer = $int;
+            type Integer = $int_conversion;
 
             fn pow(&self, exp: Self::Integer) -> Self {
                 // FieldParameters::pow() relies on mul(), and will always return a value less
                 // than p.
-                Self($fp.pow(self.0, u128::try_from(exp).unwrap()))
+                Self($fp.pow(self.0, $int_internal::try_from(exp).unwrap()))
             }
 
             fn modulus() -> Self::Integer {
-                $fp.p as $int
+                $fp.p as $int_conversion
             }
         }
 
@@ -816,6 +817,7 @@ impl Integer for u128 {
 make_field!(
     /// Same as Field32, but encoded in little endian for compatibility with Prio v2.
     FieldPrio2,
+    u128,
     u32,
     FP32,
     4,
@@ -825,6 +827,7 @@ make_field!(
     /// `GF(18446744069414584321)`, a 64-bit field.
     Field64,
     u64,
+    u64,
     FP64,
     8,
 );
@@ -833,6 +836,7 @@ make_field!(
     /// `GF(340282366920938462946865773367900766209)`, a 128-bit field.
     Field128,
     u128,
+    u128,
     FP128,
     16,
 );
diff --git a/src/fp.rs b/src/fp.rs
index d4c0dcdc2..a85f2ac38 100644
--- a/src/fp.rs
+++ b/src/fp.rs
@@ -2,9 +2,6 @@
 
 //! Finite field arithmetic for any field GF(p) for which p < 2^128.
 
-#[cfg(test)]
-use rand::{prelude::*, Rng};
-
 /// For each set of field parameters we pre-compute the 1st, 2nd, 4th, ..., 2^20-th principal roots
 /// of unity. The largest of these is used to run the FFT algorithm on an input of size 2^20. This
 /// is the largest input size we would ever need for the cryptographic applications in this crate.
@@ -55,15 +52,14 @@ impl FieldParameters {
     /// Subtraction. The result will be in [0, p), so long as both x and y are as well.
     #[inline(always)]
     pub fn sub(&self, x: u128, y: u128) -> u128 {
-        //     0, x
-        // -   0, y
+        //        x
+        // -      y
         // ========
-        // b1,z1,z0
+        //    b0,z0
         let (z0, b0) = x.overflowing_sub(y);
-        let (_z1, b1) = 0u128.overflowing_sub(b0 as u128);
-        let m = 0u128.wrapping_sub(b1 as u128);
-        //   z1,z0
-        // +  0, p
+        let m = 0u128.wrapping_sub(b0 as u128);
+        //      z0
+        // +     p
         // ========
         //   s1,s0
         z0.wrapping_add(m & self.p)
@@ -72,14 +68,14 @@ impl FieldParameters {
     }
 
     /// Multiplication of field elements in the Montgomery domain. This uses the REDC algorithm
-    /// described
-    /// [here](https://www.ams.org/journals/mcom/1985-44-170/S0025-5718-1985-0777282-X/S0025-5718-1985-0777282-X.pdf).
-    /// The result will be in [0, p).
+    /// described [here][montgomery]. The result will be in [0, p).
     ///
     /// # Example usage
     /// ```text
     /// assert_eq!(fp.residue(fp.mul(fp.montgomery(23), fp.montgomery(2))), 46);
     /// ```
+    ///
+    /// [montgomery]: https://www.ams.org/journals/mcom/1985-44-170/S0025-5718-1985-0777282-X/S0025-5718-1985-0777282-X.pdf
     #[inline(always)]
     pub fn mul(&self, x: u128, y: u128) -> u128 {
         let x = [lo64(x), hi64(x)];
@@ -200,7 +196,7 @@ impl FieldParameters {
         // Final subtraction
         // If z >= p, then z = z - p
 
-        //     0, z
+        //    cc, z
         // -   0, p
         // ========
         // b1,s1,s0
@@ -252,13 +248,6 @@ impl FieldParameters {
         modp(self.mul(x, self.r2), self.p)
     }
 
-    /// Returns a random field element mapped.
-    #[cfg(test)]
-    pub fn rand_elem<R: Rng + ?Sized>(&self, rng: &mut R) -> u128 {
-        let uniform = rand::distributions::Uniform::from(0..self.p);
-        self.montgomery(uniform.sample(rng))
-    }
-
     /// Maps a field element to its representation as an integer. The result will be in [0, p).
     ///
     /// #Example usage
@@ -271,67 +260,15 @@ impl FieldParameters {
     pub fn residue(&self, x: u128) -> u128 {
         modp(self.mul(x, 1), self.p)
     }
-
-    #[cfg(test)]
-    pub fn check(&self, p: u128, g: u128, order: u128) {
-        use modinverse::modinverse;
-        use num_bigint::{BigInt, ToBigInt};
-        use std::cmp::max;
-
-        assert_eq!(self.p, p, "p mismatch");
-
-        let mu = match modinverse((-(p as i128)).rem_euclid(1 << 64), 1 << 64) {
-            Some(mu) => mu as u64,
-            None => panic!("inverse of -p (mod 2^64) is undefined"),
-        };
-        assert_eq!(self.mu, mu, "mu mismatch");
-
-        let big_p = &p.to_bigint().unwrap();
-        let big_r: &BigInt = &(&(BigInt::from(1) << 128) % big_p);
-        let big_r2: &BigInt = &(&(big_r * big_r) % big_p);
-        let mut it = big_r2.iter_u64_digits();
-        let mut r2 = 0;
-        r2 |= it.next().unwrap() as u128;
-        if let Some(x) = it.next() {
-            r2 |= (x as u128) << 64;
-        }
-        assert_eq!(self.r2, r2, "r2 mismatch");
-
-        assert_eq!(self.g, self.montgomery(g), "g mismatch");
-        assert_eq!(
-            self.residue(self.pow(self.g, order)),
-            1,
-            "g order incorrect"
-        );
-
-        let num_roots = log2(order) as usize;
-        assert_eq!(order, 1 << num_roots, "order not a power of 2");
-        assert_eq!(self.num_roots, num_roots, "num_roots mismatch");
-
-        let mut roots = vec![0; max(num_roots, MAX_ROOTS) + 1];
-        roots[num_roots] = self.montgomery(g);
-        for i in (0..num_roots).rev() {
-            roots[i] = self.mul(roots[i + 1], roots[i + 1]);
-        }
-        assert_eq!(&self.roots, &roots[..MAX_ROOTS + 1], "roots mismatch");
-        assert_eq!(self.residue(self.roots[0]), 1, "first root is not one");
-
-        let bit_mask = (BigInt::from(1) << big_p.bits()) - BigInt::from(1);
-        assert_eq!(
-            self.bit_mask.to_bigint().unwrap(),
-            bit_mask,
-            "bit_mask mismatch"
-        );
-    }
 }
 
 #[inline(always)]
-fn lo64(x: u128) -> u128 {
+pub(crate) fn lo64(x: u128) -> u128 {
     x & ((1 << 64) - 1)
 }
 
 #[inline(always)]
-fn hi64(x: u128) -> u128 {
+pub(crate) fn hi64(x: u128) -> u128 {
     x >> 64
 }
 
@@ -356,38 +293,6 @@ pub(crate) const FP32: FieldParameters = FieldParameters {
     ],
 };
 
-pub(crate) const FP64: FieldParameters = FieldParameters {
-    p: 18446744069414584321, // 64-bit prime
-    mu: 18446744069414584319,
-    r2: 4294967295,
-    g: 959634606461954525,
-    num_roots: 32,
-    bit_mask: 18446744073709551615,
-    roots: [
-        18446744065119617025,
-        4294967296,
-        18446462594437939201,
-        72057594037927936,
-        1152921504338411520,
-        16384,
-        18446743519658770561,
-        18446735273187346433,
-        6519596376689022014,
-        9996039020351967275,
-        15452408553935940313,
-        15855629130643256449,
-        8619522106083987867,
-        13036116919365988132,
-        1033106119984023956,
-        16593078884869787648,
-        16980581328500004402,
-        12245796497946355434,
-        8709441440702798460,
-        8611358103550827629,
-        8120528636261052110,
-    ],
-};
-
 pub(crate) const FP128: FieldParameters = FieldParameters {
     p: 340282366920938462946865773367900766209, // 128-bit prime
     mu: 18446744073709551615,
@@ -427,9 +332,99 @@ pub(crate) fn log2(x: u128) -> u128 {
 }
 
 #[cfg(test)]
-mod tests {
+pub(crate) mod tests {
     use super::*;
-    use num_bigint::ToBigInt;
+    use modinverse::modinverse;
+    use num_bigint::{BigInt, ToBigInt};
+    use rand::{distributions::Distribution, thread_rng, Rng};
+    use std::cmp::max;
+
+    /// This trait abstracts over the details of [`FieldParameters`] and
+    /// [`FieldParameters64`](crate::fp64::FieldParameters64) to allow reuse of test code.
+    pub(crate) trait TestFieldParameters {
+        fn p(&self) -> u128;
+        fn g(&self) -> u128;
+        fn r2(&self) -> u128;
+        fn mu(&self) -> u64;
+        fn bit_mask(&self) -> u128;
+        fn num_roots(&self) -> usize;
+        fn roots(&self) -> Vec<u128>;
+        fn montgomery(&self, x: u128) -> u128;
+        fn residue(&self, x: u128) -> u128;
+        fn add(&self, x: u128, y: u128) -> u128;
+        fn sub(&self, x: u128, y: u128) -> u128;
+        fn neg(&self, x: u128) -> u128;
+        fn mul(&self, x: u128, y: u128) -> u128;
+        fn pow(&self, x: u128, exp: u128) -> u128;
+        fn inv(&self, x: u128) -> u128;
+        fn radix(&self) -> BigInt;
+    }
+
+    impl TestFieldParameters for FieldParameters {
+        fn p(&self) -> u128 {
+            self.p
+        }
+
+        fn g(&self) -> u128 {
+            self.g
+        }
+
+        fn r2(&self) -> u128 {
+            self.r2
+        }
+
+        fn mu(&self) -> u64 {
+            self.mu
+        }
+
+        fn bit_mask(&self) -> u128 {
+            self.bit_mask
+        }
+
+        fn num_roots(&self) -> usize {
+            self.num_roots
+        }
+
+        fn roots(&self) -> Vec<u128> {
+            self.roots.to_vec()
+        }
+
+        fn montgomery(&self, x: u128) -> u128 {
+            FieldParameters::montgomery(self, x)
+        }
+
+        fn residue(&self, x: u128) -> u128 {
+            FieldParameters::residue(self, x)
+        }
+
+        fn add(&self, x: u128, y: u128) -> u128 {
+            FieldParameters::add(self, x, y)
+        }
+
+        fn sub(&self, x: u128, y: u128) -> u128 {
+            FieldParameters::sub(self, x, y)
+        }
+
+        fn neg(&self, x: u128) -> u128 {
+            FieldParameters::neg(self, x)
+        }
+
+        fn mul(&self, x: u128, y: u128) -> u128 {
+            FieldParameters::mul(self, x, y)
+        }
+
+        fn pow(&self, x: u128, exp: u128) -> u128 {
+            FieldParameters::pow(self, x, exp)
+        }
+
+        fn inv(&self, x: u128) -> u128 {
+            FieldParameters::inv(self, x)
+        }
+
+        fn radix(&self) -> BigInt {
+            BigInt::from(1) << 128
+        }
+    }
 
     #[test]
     fn test_log2() {
@@ -445,58 +440,130 @@ mod tests {
         assert_eq!(log2((1 << 127) + 13), 128);
     }
 
-    struct TestFieldParametersData {
-        fp: FieldParameters,  // The paramters being tested
-        expected_p: u128,     // Expected fp.p
-        expected_g: u128,     // Expected fp.residue(fp.g)
-        expected_order: u128, // Expect fp.residue(fp.pow(fp.g, expected_order)) == 1
+    pub(crate) struct TestFieldParametersData {
+        /// The paramters being tested
+        pub fp: Box<dyn TestFieldParameters>,
+        /// Expected fp.p
+        pub expected_p: u128,
+        /// Expected fp.residue(fp.g)
+        pub expected_g: u128,
+        /// Expect fp.residue(fp.pow(fp.g, expected_order)) == 1
+        pub expected_order: u128,
     }
 
     #[test]
-    fn test_fp() {
-        let test_fps = vec![
-            TestFieldParametersData {
-                fp: FP32,
-                expected_p: 4293918721,
-                expected_g: 3925978153,
-                expected_order: 1 << 20,
-            },
-            TestFieldParametersData {
-                fp: FP64,
-                expected_p: 18446744069414584321,
-                expected_g: 1753635133440165772,
-                expected_order: 1 << 32,
-            },
-            TestFieldParametersData {
-                fp: FP128,
-                expected_p: 340282366920938462946865773367900766209,
-                expected_g: 145091266659756586618791329697897684742,
-                expected_order: 1 << 66,
-            },
-        ];
-
-        for t in test_fps.into_iter() {
-            //  Check that the field parameters have been constructed properly.
-            t.fp.check(t.expected_p, t.expected_g, t.expected_order);
-
-            // Check that the generator has the correct order.
-            assert_eq!(t.fp.residue(t.fp.pow(t.fp.g, t.expected_order)), 1);
-            assert_ne!(t.fp.residue(t.fp.pow(t.fp.g, t.expected_order / 2)), 1);
-
-            // Test arithmetic using the field parameters.
-            arithmetic_test(&t.fp);
+    fn test_fp32_u128() {
+        all_field_parameters_tests(TestFieldParametersData {
+            fp: Box::new(FP32),
+            expected_p: 4293918721,
+            expected_g: 3925978153,
+            expected_order: 1 << 20,
+        });
+    }
+
+    #[test]
+    fn test_fp128_u128() {
+        all_field_parameters_tests(TestFieldParametersData {
+            fp: Box::new(FP128),
+            expected_p: 340282366920938462946865773367900766209,
+            expected_g: 145091266659756586618791329697897684742,
+            expected_order: 1 << 66,
+        });
+    }
+
+    pub(crate) fn all_field_parameters_tests(t: TestFieldParametersData) {
+        // Check that the field parameters have been constructed properly.
+        check_consistency(t.fp.as_ref(), t.expected_p, t.expected_g, t.expected_order);
+
+        // Check that the generator has the correct order.
+        assert_eq!(t.fp.residue(t.fp.pow(t.fp.g(), t.expected_order)), 1);
+        assert_ne!(t.fp.residue(t.fp.pow(t.fp.g(), t.expected_order / 2)), 1);
+
+        // Test arithmetic using the field parameters.
+        arithmetic_test(t.fp.as_ref());
+    }
+
+    fn check_consistency(fp: &dyn TestFieldParameters, p: u128, g: u128, order: u128) {
+        assert_eq!(fp.p(), p, "p mismatch");
+
+        let mu = match modinverse((-(p as i128)).rem_euclid(1 << 64), 1 << 64) {
+            Some(mu) => mu as u64,
+            None => panic!("inverse of -p (mod 2^64) is undefined"),
+        };
+        assert_eq!(fp.mu(), mu, "mu mismatch");
+
+        let big_p = &p.to_bigint().unwrap();
+        let big_r: &BigInt = &(fp.radix() % big_p);
+        let big_r2: &BigInt = &(&(big_r * big_r) % big_p);
+        let mut it = big_r2.iter_u64_digits();
+        let mut r2 = 0;
+        r2 |= it.next().unwrap() as u128;
+        if let Some(x) = it.next() {
+            r2 |= (x as u128) << 64;
+        }
+        assert_eq!(fp.r2(), r2, "r2 mismatch");
+
+        assert_eq!(fp.g(), fp.montgomery(g), "g mismatch");
+        assert_eq!(fp.residue(fp.pow(fp.g(), order)), 1, "g order incorrect");
+
+        let num_roots = log2(order) as usize;
+        assert_eq!(order, 1 << num_roots, "order not a power of 2");
+        assert_eq!(fp.num_roots(), num_roots, "num_roots mismatch");
+
+        let mut roots = vec![0; max(num_roots, MAX_ROOTS) + 1];
+        roots[num_roots] = fp.montgomery(g);
+        for i in (0..num_roots).rev() {
+            roots[i] = fp.mul(roots[i + 1], roots[i + 1]);
         }
+        assert_eq!(fp.roots(), &roots[..MAX_ROOTS + 1], "roots mismatch");
+        assert_eq!(fp.residue(fp.roots()[0]), 1, "first root is not one");
+
+        let bit_mask = (BigInt::from(1) << big_p.bits()) - BigInt::from(1);
+        assert_eq!(
+            fp.bit_mask().to_bigint().unwrap(),
+            bit_mask,
+            "bit_mask mismatch"
+        );
     }
 
-    fn arithmetic_test(fp: &FieldParameters) {
-        let mut rng = rand::thread_rng();
-        let big_p = &fp.p.to_bigint().unwrap();
+    fn arithmetic_test(fp: &dyn TestFieldParameters) {
+        let big_p = &fp.p().to_bigint().unwrap();
+        let big_zero = &BigInt::from(0);
+        let uniform = rand::distributions::Uniform::from(0..fp.p());
+        let mut rng = thread_rng();
 
-        for _ in 0..100 {
-            let x = fp.rand_elem(&mut rng);
-            let y = fp.rand_elem(&mut rng);
-            let big_x = &fp.residue(x).to_bigint().unwrap();
-            let big_y = &fp.residue(y).to_bigint().unwrap();
+        let mut weird_ints = Vec::from([
+            0,
+            1,
+            fp.bit_mask() - fp.p(),
+            fp.bit_mask() - fp.p() + 1,
+            fp.p() - 1,
+        ]);
+        if fp.p() > u64::MAX as u128 {
+            weird_ints.extend_from_slice(&[
+                u64::MAX as u128,
+                1 << 64,
+                fp.p() & u64::MAX as u128,
+                fp.p() & !u64::MAX as u128,
+                fp.p() & !u64::MAX as u128 | 1,
+            ]);
+        }
+
+        let mut generate_random = || -> (u128, BigInt) {
+            // Add bias to random element generation, to explore "interesting" inputs.
+            let int = if rng.gen_ratio(1, 4) {
+                weird_ints[rng.gen_range(0..weird_ints.len())]
+            } else {
+                uniform.sample(&mut rng)
+            };
+            let bigint = int.to_bigint().unwrap();
+            let montgomery_domain = fp.montgomery(int);
+            (montgomery_domain, bigint)
+        };
+
+        for _ in 0..1000 {
+            let (x, ref big_x) = generate_random();
+            let (y, ref big_y) = generate_random();
 
             // Test addition.
             let got = fp.add(x, y);
@@ -521,7 +588,11 @@ mod tests {
             let got = fp.inv(x);
             let want = big_x.modpow(&(big_p - 2u128), big_p);
             assert_eq!(fp.residue(got).to_bigint().unwrap(), want);
-            assert_eq!(fp.residue(fp.mul(got, x)), 1);
+            if big_x == big_zero {
+                assert_eq!(fp.residue(fp.mul(got, x)), 0);
+            } else {
+                assert_eq!(fp.residue(fp.mul(got, x)), 1);
+            }
 
             // Test negation.
             let got = fp.neg(x);
diff --git a/src/fp64.rs b/src/fp64.rs
new file mode 100644
index 000000000..e2f2cf2b0
--- /dev/null
+++ b/src/fp64.rs
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: MPL-2.0
+
+//! Finite field arithmetic for any field GF(p) for which p < 2^64.
+
+use crate::fp::{hi64, lo64, MAX_ROOTS};
+
+/// This structure represents the parameters of a finite field GF(p) for which p < 2^64.
+///
+/// See also [`FieldParameters`](crate::fp::FieldParameters).
+#[derive(Debug, PartialEq, Eq)]
+pub(crate) struct FieldParameters64 {
+    /// The prime modulus `p`.
+    pub p: u64,
+    /// `mu = -p^(-1) mod 2^64`.
+    pub mu: u64,
+    /// `r2 = (2^64)^2 mod p`.
+    pub r2: u64,
+    /// The `2^num_roots`-th -principal root of unity. This element is used to generate the
+    /// elements of `roots`.
+    pub g: u64,
+    /// The number of principal roots of unity in `roots`.
+    pub num_roots: usize,
+    /// Equal to `2^b - 1`, where `b` is the length of `p` in bits.
+    pub bit_mask: u64,
+    /// `roots[l]` is the `2^l`-th principal root of unity, i.e., `roots[l]` has order `2^l` in the
+    /// multiplicative group. `roots[0]` is equal to one by definition.
+    pub roots: [u64; MAX_ROOTS + 1],
+}
+
+impl FieldParameters64 {
+    /// Addition. The result will be in [0, p), so long as both x and y are as well.
+    #[inline(always)]
+    pub fn add(&self, x: u64, y: u64) -> u64 {
+        //   0,x
+        // + 0,y
+        // =====
+        //   c,z
+        let (z, carry) = x.overflowing_add(y);
+        //     c, z
+        // -   0, p
+        // ========
+        // b1,s1,s0
+        let (s0, b0) = z.overflowing_sub(self.p);
+        let (_s1, b1) = (carry as u64).overflowing_sub(b0 as u64);
+        // if b1 == 1: return z
+        // else:       return s0
+        let m = 0u64.wrapping_sub(b1 as u64);
+        (z & m) | (s0 & !m)
+    }
+
+    /// Subtraction. The result will be in [0, p), so long as both x and y are as well.
+    #[inline(always)]
+    pub fn sub(&self, x: u64, y: u64) -> u64 {
+        //        x
+        // -      y
+        // ========
+        //    b0,z0
+        let (z0, b0) = x.overflowing_sub(y);
+        let m = 0u64.wrapping_sub(b0 as u64);
+        //      z0
+        // +     p
+        // ========
+        //   s1,s0
+        z0.wrapping_add(m & self.p)
+        // if b1 == 1: return s0
+        // else:       return z0
+    }
+
+    /// Multiplication of field elements in the Montgomery domain. This uses the REDC algorithm
+    /// described [here][montgomery]. The result will be in [0, p).
+    ///
+    /// # Example usage
+    /// ```text
+    /// assert_eq!(fp.residue(fp.mul(fp.montgomery(23), fp.montgomery(2))), 46);
+    /// ```
+    ///
+    /// [montgomery]: https://www.ams.org/journals/mcom/1985-44-170/S0025-5718-1985-0777282-X/S0025-5718-1985-0777282-X.pdf
+    #[inline(always)]
+    pub fn mul(&self, x: u64, y: u64) -> u64 {
+        let mut zz = [0; 2];
+
+        // Integer multiplication
+        // z = x * y
+
+        //     x
+        // *   y
+        // =====
+        // z1,z0
+        let result = (x as u128) * (y as u128);
+        zz[0] = lo64(result) as u64;
+        zz[1] = hi64(result) as u64;
+
+        // Montgomery Reduction
+        // z = z + p * mu*(z mod 2^64), where mu = (-p)^(-1) mod 2^64.
+
+        // z1,z0
+        // +   p
+        // *   w = mu*z0
+        // =====
+        // z1, 0
+        let w = self.mu.wrapping_mul(zz[0]);
+        let result = (self.p as u128) * (w as u128);
+        let hi = hi64(result);
+        let lo = lo64(result) as u64;
+        let (result, carry) = zz[0].overflowing_add(lo);
+        zz[0] = result;
+        let result = zz[1] as u128 + hi + carry as u128;
+        zz[1] = lo64(result) as u64;
+        let cc = hi64(result) as u64;
+
+        // z = (z1)
+        let prod = zz[1];
+
+        // Final subtraction
+        // If z >= p, then z = z - p
+
+        //    cc, z
+        // -   0, p
+        // ========
+        // b1,s1,s0
+        let (s0, b0) = prod.overflowing_sub(self.p);
+        let (_s1, b1) = cc.overflowing_sub(b0 as u64);
+        // if b1 == 1: return z
+        // else:       return s0
+        let mask = 0u64.wrapping_sub(b1 as u64);
+        (prod & mask) | (s0 & !mask)
+    }
+
+    /// Modular exponentiation, i.e., `x^exp (mod p)` where `p` is the modulus. Note that the
+    /// runtime of this algorithm is linear in the bit length of `exp`.
+    pub fn pow(&self, x: u64, exp: u64) -> u64 {
+        let mut t = self.montgomery(1);
+        for i in (0..64 - exp.leading_zeros()).rev() {
+            t = self.mul(t, t);
+            if (exp >> i) & 1 != 0 {
+                t = self.mul(t, x);
+            }
+        }
+        t
+    }
+
+    /// Modular inversion, i.e., x^-1 (mod p) where `p` is the modulus. Note that the runtime of
+    /// this algorithm is linear in the bit length of `p`.
+    #[inline(always)]
+    pub fn inv(&self, x: u64) -> u64 {
+        self.pow(x, self.p - 2)
+    }
+
+    /// Negation, i.e., `-x (mod p)` where `p` is the modulus.
+    #[inline(always)]
+    pub fn neg(&self, x: u64) -> u64 {
+        self.sub(0, x)
+    }
+
+    /// Maps an integer to its internal representation. Field elements are mapped to the Montgomery
+    /// domain in order to carry out field arithmetic. The result will be in [0, p).
+    ///
+    /// # Example usage
+    /// ```text
+    /// let integer = 1; // Standard integer representation
+    /// let elem = fp.montgomery(integer); // Internal representation in the Montgomery domain
+    /// assert_eq!(elem, 2564090464);
+    /// ```
+    #[inline(always)]
+    pub fn montgomery(&self, x: u64) -> u64 {
+        modp(self.mul(x, self.r2), self.p)
+    }
+
+    /// Maps a field element to its representation as an integer. The result will be in [0, p).
+    ///
+    /// #Example usage
+    /// ```text
+    /// let elem = 2564090464; // Internal representation in the Montgomery domain
+    /// let integer = fp.residue(elem); // Standard integer representation
+    /// assert_eq!(integer, 1);
+    /// ```
+    #[inline(always)]
+    pub fn residue(&self, x: u64) -> u64 {
+        modp(self.mul(x, 1), self.p)
+    }
+}
+
+#[inline(always)]
+fn modp(x: u64, p: u64) -> u64 {
+    let (z, carry) = x.overflowing_sub(p);
+    let m = 0u64.wrapping_sub(carry as u64);
+    z.wrapping_add(m & p)
+}
+
+pub(crate) const FP64: FieldParameters64 = FieldParameters64 {
+    p: 18446744069414584321, // 64-bit prime
+    mu: 18446744069414584319,
+    r2: 18446744065119617025,
+    g: 15733474329512464024,
+    num_roots: 32,
+    bit_mask: 18446744073709551615,
+    roots: [
+        4294967295,
+        18446744065119617026,
+        18446744069414518785,
+        18374686475393433601,
+        268435456,
+        18446673700670406657,
+        18446744069414584193,
+        576460752303421440,
+        16576810576923738718,
+        6647628942875889800,
+        10087739294013848503,
+        2135208489130820273,
+        10781050935026037169,
+        3878014442329970502,
+        1205735313231991947,
+        2523909884358325590,
+        13797134855221748930,
+        12267112747022536458,
+        430584883067102937,
+        10135969988448727187,
+        6815045114074884550,
+    ],
+};
+
+#[cfg(test)]
+mod tests {
+    use num_bigint::BigInt;
+
+    use crate::fp::tests::{
+        all_field_parameters_tests, TestFieldParameters, TestFieldParametersData,
+    };
+
+    use super::*;
+
+    impl TestFieldParameters for FieldParameters64 {
+        fn p(&self) -> u128 {
+            self.p.into()
+        }
+
+        fn g(&self) -> u128 {
+            self.g as u128
+        }
+
+        fn r2(&self) -> u128 {
+            self.r2 as u128
+        }
+
+        fn mu(&self) -> u64 {
+            self.mu
+        }
+
+        fn bit_mask(&self) -> u128 {
+            self.bit_mask as u128
+        }
+
+        fn num_roots(&self) -> usize {
+            self.num_roots
+        }
+
+        fn roots(&self) -> Vec<u128> {
+            self.roots.iter().map(|x| *x as u128).collect()
+        }
+
+        fn montgomery(&self, x: u128) -> u128 {
+            FieldParameters64::montgomery(self, x.try_into().unwrap()).into()
+        }
+
+        fn residue(&self, x: u128) -> u128 {
+            FieldParameters64::residue(self, x.try_into().unwrap()).into()
+        }
+
+        fn add(&self, x: u128, y: u128) -> u128 {
+            FieldParameters64::add(self, x.try_into().unwrap(), y.try_into().unwrap()).into()
+        }
+
+        fn sub(&self, x: u128, y: u128) -> u128 {
+            FieldParameters64::sub(self, x.try_into().unwrap(), y.try_into().unwrap()).into()
+        }
+
+        fn neg(&self, x: u128) -> u128 {
+            FieldParameters64::neg(self, x.try_into().unwrap()).into()
+        }
+
+        fn mul(&self, x: u128, y: u128) -> u128 {
+            FieldParameters64::mul(self, x.try_into().unwrap(), y.try_into().unwrap()).into()
+        }
+
+        fn pow(&self, x: u128, exp: u128) -> u128 {
+            FieldParameters64::pow(self, x.try_into().unwrap(), exp.try_into().unwrap()).into()
+        }
+
+        fn inv(&self, x: u128) -> u128 {
+            FieldParameters64::inv(self, x.try_into().unwrap()).into()
+        }
+
+        fn radix(&self) -> BigInt {
+            BigInt::from(1) << 64
+        }
+    }
+
+    #[test]
+    fn test_fp64_u64() {
+        all_field_parameters_tests(TestFieldParametersData {
+            fp: Box::new(FP64),
+            expected_p: 18446744069414584321,
+            expected_g: 1753635133440165772,
+            expected_order: 1 << 32,
+        })
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index d306926ec..e5280d505 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -25,6 +25,7 @@ mod fft;
 pub mod field;
 pub mod flp;
 mod fp;
+mod fp64;
 #[cfg(all(feature = "crypto-dependencies", feature = "experimental"))]
 #[cfg_attr(
     docsrs,