From 3681c90d99739d9f3edfb6fa34aa0a59d89be7fb Mon Sep 17 00:00:00 2001 From: Brian Smith Date: Wed, 18 Oct 2023 20:25:49 -0700 Subject: [PATCH 1/5] EC P-384 ECDSA verification: Implement basic double-and-add multiplier. The efficiency of this will be improved in future commits. --- src/ec/suite_b/ops.rs | 86 ++++++++++++++++---------- src/ec/suite_b/ops/p256.rs | 6 +- src/ec/suite_b/ops/p384.rs | 18 +++--- src/ec/suite_b/ops/vartime.rs | 111 ++++++++++++++++++++++++++++++++++ 4 files changed, 182 insertions(+), 39 deletions(-) create mode 100644 src/ec/suite_b/ops/vartime.rs diff --git a/src/ec/suite_b/ops.rs b/src/ec/suite_b/ops.rs index cf3c73455c..edeb9602e8 100644 --- a/src/ec/suite_b/ops.rs +++ b/src/ec/suite_b/ops.rs @@ -12,7 +12,12 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -use crate::{arithmetic::limbs_from_hex, arithmetic::montgomery::*, c, error, limb::*}; +use crate::{ + arithmetic::{limbs_from_hex, montgomery::*}, + bits::BitLength, + c, error, + limb::*, +}; use core::marker::PhantomData; pub use self::elem::*; @@ -33,6 +38,7 @@ pub type Scalar = elem::Elem; #[derive(Clone, Copy)] pub enum N {} +#[derive(Clone, Copy)] pub struct Point { // The coordinates are stored in a contiguous array, where the first // `ops.num_limbs` elements are the X coordinate, the next @@ -62,11 +68,15 @@ pub struct CommonOps { // In all cases, `r`, `a`, and `b` may all alias each other. 
elem_mul_mont: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), elem_sqr_mont: unsafe extern "C" fn(r: *mut Limb, a: *const Limb), - + point_double_jacobian_impl: unsafe extern "C" fn(r: *mut Limb, a: *const Limb), point_add_jacobian_impl: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), } impl CommonOps { + fn order_bits(&self) -> BitLength { + BitLength::from_usize_bits(self.num_limbs * LIMB_BITS) + } + #[inline] pub fn elem_add(&self, a: &mut Elem, b: &Elem) { let num_limbs = self.num_limbs; @@ -128,6 +138,36 @@ impl CommonOps { } } + fn point_new_affine(&self, x: &Elem, y: &Elem) -> Point { + // `z` is 1 in the Montgomery domain. + let z = { + let mut acc = Elem::zero(); + acc.limbs[0] = 1; + let mut rr = Elem::zero(); + rr.limbs[..self.num_limbs].copy_from_slice(&self.q.rr[..self.num_limbs]); + + self.elem_mul(&mut acc, &rr); + acc + }; + + let mut r = Point::new_at_infinity(); + r.xyz[..self.num_limbs].copy_from_slice(&x.limbs[..self.num_limbs]); + r.xyz[self.num_limbs..(2 * self.num_limbs)].copy_from_slice(&y.limbs[..self.num_limbs]); + r.xyz[(2 * self.num_limbs)..(3 * self.num_limbs)] + .copy_from_slice(&z.limbs[..self.num_limbs]); + r + } + + fn point_double_assign(&self, r: &mut Point) { + unsafe { (self.point_double_jacobian_impl)(r.xyz.as_mut_ptr(), r.xyz.as_ptr()) } + } + + fn point_add_assign(&self, r: &mut Point, a: &Point) { + unsafe { + (self.point_add_jacobian_impl)(r.xyz.as_mut_ptr(), r.xyz.as_ptr(), a.xyz.as_ptr()) + } + } + pub fn point_sum(&self, a: &Point, b: &Point) -> Point { let mut r = Point::new_at_infinity(); unsafe { @@ -303,6 +343,7 @@ pub struct PrivateScalarOps { // XXX: Inefficient and unnecessarily depends on `PrivateKeyOps`. TODO: implement interleaved wNAF // multiplication. 
+#[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))] fn twin_mul_inefficient( ops: &PrivateKeyOps, g_scalar: &Scalar, @@ -839,52 +880,28 @@ mod tests { #[test] fn p256_point_double_test() { - prefixed_extern! { - fn p256_point_double( - r: *mut Limb, // [p256::COMMON_OPS.num_limbs*3] - a: *const Limb, // [p256::COMMON_OPS.num_limbs*3] - ); - } point_double_test( &p256::PRIVATE_KEY_OPS, - p256_point_double, test_file!("ops/p256_point_double_tests.txt"), ); } #[test] fn p384_point_double_test() { - prefixed_extern! { - fn p384_point_double( - r: *mut Limb, // [p384::COMMON_OPS.num_limbs*3] - a: *const Limb, // [p384::COMMON_OPS.num_limbs*3] - ); - } point_double_test( &p384::PRIVATE_KEY_OPS, - p384_point_double, test_file!("ops/p384_point_double_tests.txt"), ); } - fn point_double_test( - ops: &PrivateKeyOps, - point_double: unsafe extern "C" fn( - r: *mut Limb, // [ops.num_limbs*3] - a: *const Limb, // [ops.num_limbs*3] - ), - test_file: test::File, - ) { + fn point_double_test(ops: &PrivateKeyOps, test_file: test::File) { test::run(test_file, |section, test_case| { assert_eq!(section, ""); - let a = consume_jacobian_point(ops, test_case, "a"); + let mut r_actual = consume_jacobian_point(ops, test_case, "a"); let r_expected = consume_point(ops, test_case, "r"); - let mut r_actual = Point::new_at_infinity(); - unsafe { - point_double(r_actual.xyz.as_mut_ptr(), a.xyz.as_ptr()); - } + ops.common.point_double_assign(&mut r_actual); assert_point_actual_equals_expected(ops, &r_actual, &r_expected); @@ -897,6 +914,7 @@ mod tests { point_mul_tests( &p256::PRIVATE_KEY_OPS, test_file!("ops/p256_point_mul_tests.txt"), + |s, p| p256::PRIVATE_KEY_OPS.point_mul(s, p), ); } @@ -905,10 +923,15 @@ mod tests { point_mul_tests( &p384::PRIVATE_KEY_OPS, test_file!("ops/p384_point_mul_tests.txt"), + |s, p| p384::PRIVATE_KEY_OPS.point_mul(s, p), ); } - fn point_mul_tests(ops: &PrivateKeyOps, test_file: test::File) { + pub(super) fn point_mul_tests( + ops: &PrivateKeyOps, + 
test_file: test::File, + point_mul: impl Fn(&Scalar, &(Elem, Elem)) -> Point, + ) { test::run(test_file, |section, test_case| { assert_eq!(section, ""); let p_scalar = consume_scalar(ops.common, test_case, "p_scalar"); @@ -919,7 +942,7 @@ mod tests { TestPoint::Affine(x, y) => (x, y), }; let expected_result = consume_point(ops, test_case, "r"); - let actual_result = ops.point_mul(&p_scalar, &(x, y)); + let actual_result = point_mul(&p_scalar, &(x, y)); assert_point_actual_equals_expected(ops, &actual_result, &expected_result); Ok(()) }) @@ -1184,3 +1207,4 @@ mod tests { mod elem; pub mod p256; pub mod p384; +mod vartime; diff --git a/src/ec/suite_b/ops/p256.rs b/src/ec/suite_b/ops/p256.rs index 70b9cbf9fc..2a9365ae47 100644 --- a/src/ec/suite_b/ops/p256.rs +++ b/src/ec/suite_b/ops/p256.rs @@ -30,7 +30,7 @@ pub static COMMON_OPS: CommonOps = CommonOps { elem_mul_mont: p256_mul_mont, elem_sqr_mont: p256_sqr_mont, - + point_double_jacobian_impl: p256_point_double, point_add_jacobian_impl: p256_point_add, }; @@ -293,6 +293,10 @@ prefixed_extern! 
{ a: *const Limb, // [3][COMMON_OPS.num_limbs] b: *const Limb, // [3][COMMON_OPS.num_limbs] ); + fn p256_point_double( + r: *mut Limb, // [p256::COMMON_OPS.num_limbs*3] + a: *const Limb, // [p256::COMMON_OPS.num_limbs*3] + ); fn p256_point_mul( r: *mut Limb, // [3][COMMON_OPS.num_limbs] p_scalar: *const Limb, // [COMMON_OPS.num_limbs] diff --git a/src/ec/suite_b/ops/p384.rs b/src/ec/suite_b/ops/p384.rs index 54ec00aa8f..408fd6a862 100644 --- a/src/ec/suite_b/ops/p384.rs +++ b/src/ec/suite_b/ops/p384.rs @@ -32,10 +32,15 @@ pub static COMMON_OPS: CommonOps = CommonOps { , elem_mul_mont: p384_elem_mul_mont, elem_sqr_mont: p384_elem_sqr_mont, - + point_double_jacobian_impl: p384_point_double, point_add_jacobian_impl: p384_point_add, }; +static GENERATOR: (Elem, Elem) = ( + Elem::from_hex("4d3aadc2299e1513812ff723614ede2b6454868459a30eff879c3afc541b4d6e20e378e2a0d6ce383dd0756649c0b528"), + Elem::from_hex("2b78abc25a15c5e9dd8002263969a840c6c3521968f4ffd98bade7562e83b050a1bfa8bf7bb4a9ac23043dad4b03a4fe"), +); + pub static PRIVATE_KEY_OPS: PrivateKeyOps = PrivateKeyOps { common: &COMMON_OPS, elem_inv_squared: p384_elem_inv_squared, @@ -101,11 +106,6 @@ fn p384_elem_inv_squared(a: &Elem) -> Elem { fn p384_point_mul_base_impl(a: &Scalar) -> Point { // XXX: Not efficient. TODO: Precompute multiples of the generator. 
- const GENERATOR: (Elem, Elem) = ( - Elem::from_hex("4d3aadc2299e1513812ff723614ede2b6454868459a30eff879c3afc541b4d6e20e378e2a0d6ce383dd0756649c0b528"), - Elem::from_hex("2b78abc25a15c5e9dd8002263969a840c6c3521968f4ffd98bade7562e83b050a1bfa8bf7bb4a9ac23043dad4b03a4fe"), - ); - PRIVATE_KEY_OPS.point_mul(a, &GENERATOR) } @@ -123,7 +123,7 @@ pub static PUBLIC_SCALAR_OPS: PublicScalarOps = PublicScalarOps { scalar_ops: &SCALAR_OPS, public_key_ops: &PUBLIC_KEY_OPS, twin_mul: |g_scalar, p_scalar, p_xy| { - twin_mul_inefficient(&PRIVATE_KEY_OPS, g_scalar, p_scalar, p_xy) + vartime::points_mul_vartime(&COMMON_OPS, g_scalar, &GENERATOR, p_scalar, p_xy) }, q_minus_n: Elem::from_hex("389cb27e0bc8d21fa7e5f24cb74f58851313e696333ad68c"), @@ -291,6 +291,10 @@ prefixed_extern! { a: *const Limb, // [3][COMMON_OPS.num_limbs] b: *const Limb, // [3][COMMON_OPS.num_limbs] ); + fn p384_point_double( + r: *mut Limb, // [p384::COMMON_OPS.num_limbs*3] + a: *const Limb, // [p384::COMMON_OPS.num_limbs*3] + ); fn p384_point_mul( r: *mut Limb, // [3][COMMON_OPS.num_limbs] p_scalar: *const Limb, // [COMMON_OPS.num_limbs] diff --git a/src/ec/suite_b/ops/vartime.rs b/src/ec/suite_b/ops/vartime.rs new file mode 100644 index 0000000000..bfecfe5acc --- /dev/null +++ b/src/ec/suite_b/ops/vartime.rs @@ -0,0 +1,111 @@ +// Copyright 2023 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{CommonOps, Elem, Point, Scalar}; +use crate::{ + arithmetic::montgomery::R, + limb::{Limb, LIMB_BITS}, +}; + +pub(super) fn points_mul_vartime( + ops: &'static CommonOps, + g_scalar: &Scalar, + g: &(Elem, Elem), + p_scalar: &Scalar, + p: &(Elem, Elem), +) -> Point { + let a_scaled = point_mul_vartime(ops, g_scalar, g); + let b_scaled = point_mul_vartime(ops, p_scalar, p); + ops.point_sum(&a_scaled, &b_scaled) +} + +fn point_mul_vartime(ops: &'static CommonOps, a: &Scalar, (x, y): &(Elem, Elem)) -> Point { + let p = ops.point_new_affine(x, y); + + let mut acc = PointVartime::new_at_infinity(ops); + + // Iterate from the highest bit to the lowest bit. + (0..ops.order_bits().as_usize_bits()).rev().for_each(|i| { + if is_bit_set(&a.limbs, i) { + acc.add_assign(&p); + } + if i > 0 { + acc.double_assign(); + } + }); + acc.value.unwrap_or_else(Point::new_at_infinity) +} + +/// A `Point` with operations optimized for the case where it is the point at +/// infinity. +struct PointVartime { + ops: &'static CommonOps, + + /// `None` means "definitely the point at infinity." `Some(p)` may or may + /// not be the point at infinity. Will be `None` until a nonzero bit of + /// the scalar is encountered. 
+ value: Option, +} + +impl PointVartime { + pub fn new_at_infinity(ops: &'static CommonOps) -> Self { + Self { ops, value: None } + } + + pub fn double_assign(&mut self) { + if let Some(p) = &mut self.value { + self.ops.point_double_assign(p); + } + } + + pub fn add_assign(&mut self, a: &Point) { + if let Some(value) = &mut self.value { + self.ops.point_add_assign(value, a); + } else { + self.value = Some(*a); + } + } +} + +fn is_bit_set(limbs: &[Limb], bit: usize) -> bool { + let limb = limbs[bit / LIMB_BITS]; + let shift = bit % LIMB_BITS; + let bit = (limb >> shift) & 1; + bit != 0 +} + +#[cfg(test)] +mod tests { + use super::{ + super::{p256, p384, tests::point_mul_tests}, + *, + }; + #[test] + fn p256_point_mul_test() { + point_mul_tests( + &p256::PRIVATE_KEY_OPS, + test_file!("p256_point_mul_tests.txt"), + |s, p| point_mul_vartime(&p256::COMMON_OPS, s, p), + ); + } + + #[test] + fn p384_point_mul_test() { + point_mul_tests( + &p384::PRIVATE_KEY_OPS, + test_file!("p384_point_mul_tests.txt"), + |s, p| point_mul_vartime(&p384::COMMON_OPS, s, p), + ); + } +} From 354bd288956a20ddb30064f49fa0df4e4066b173 Mon Sep 17 00:00:00 2001 From: Brian Smith Date: Thu, 19 Oct 2023 01:13:43 -0700 Subject: [PATCH 2/5] Implement a true twin_mul to cut the number of doublings in half. Previously we did N doublings for G + N doublings for P = 2N doublings. Now, we do N doublings. --- src/ec/suite_b/ops.rs | 42 ++++++++++++++++++++++++++++++++-- src/ec/suite_b/ops/p384.rs | 2 +- src/ec/suite_b/ops/vartime.rs | 43 +++++++---------------------------- 3 files changed, 49 insertions(+), 38 deletions(-) diff --git a/src/ec/suite_b/ops.rs b/src/ec/suite_b/ops.rs index edeb9602e8..8befdc211d 100644 --- a/src/ec/suite_b/ops.rs +++ b/src/ec/suite_b/ops.rs @@ -479,7 +479,7 @@ prefixed_extern! 
{ #[cfg(test)] mod tests { extern crate alloc; - use super::*; + use super::{vartime::points_mul_vartime, *}; use crate::test; use alloc::{format, vec, vec::Vec}; @@ -918,6 +918,19 @@ mod tests { ); } + #[test] + fn p256_point_mul_g_test() { + point_mul_tests( + &p256::PRIVATE_KEY_OPS, + test_file!("ops/p256_point_mul_tests.txt"), + |g_scalar, g| { + let p_scalar = Scalar::zero(); + let p = (Elem::zero(), Elem::zero()); + points_mul_vartime(&p256::COMMON_OPS, g_scalar, g, &p_scalar, &p) + }, + ); + } + #[test] fn p384_point_mul_test() { point_mul_tests( @@ -927,7 +940,32 @@ mod tests { ); } - pub(super) fn point_mul_tests( + #[test] + fn p384_point_mul_g_test() { + point_mul_tests( + &p384::PRIVATE_KEY_OPS, + test_file!("ops/p384_point_mul_tests.txt"), + |g_scalar, g| { + let p_scalar = Scalar::zero(); + let p = (Elem::zero(), Elem::zero()); + points_mul_vartime(&p384::COMMON_OPS, g_scalar, g, &p_scalar, &p) + }, + ); + } + + #[test] + fn p384_point_mul_p_test() { + point_mul_tests( + &p384::PRIVATE_KEY_OPS, + test_file!("ops/p384_point_mul_tests.txt"), + |s, p| { + let g_scalar = Scalar::zero(); + points_mul_vartime(&p384::COMMON_OPS, &g_scalar, &p384::GENERATOR, s, p) + }, + ); + } + + fn point_mul_tests( ops: &PrivateKeyOps, test_file: test::File, point_mul: impl Fn(&Scalar, &(Elem, Elem)) -> Point, diff --git a/src/ec/suite_b/ops/p384.rs b/src/ec/suite_b/ops/p384.rs index 408fd6a862..467bde68b1 100644 --- a/src/ec/suite_b/ops/p384.rs +++ b/src/ec/suite_b/ops/p384.rs @@ -36,7 +36,7 @@ pub static COMMON_OPS: CommonOps = CommonOps { point_add_jacobian_impl: p384_point_add, }; -static GENERATOR: (Elem, Elem) = ( +pub(super) static GENERATOR: (Elem, Elem) = ( Elem::from_hex("4d3aadc2299e1513812ff723614ede2b6454868459a30eff879c3afc541b4d6e20e378e2a0d6ce383dd0756649c0b528"), Elem::from_hex("2b78abc25a15c5e9dd8002263969a840c6c3521968f4ffd98bade7562e83b050a1bfa8bf7bb4a9ac23043dad4b03a4fe"), ); diff --git a/src/ec/suite_b/ops/vartime.rs b/src/ec/suite_b/ops/vartime.rs 
index bfecfe5acc..de895ed7d8 100644 --- a/src/ec/suite_b/ops/vartime.rs +++ b/src/ec/suite_b/ops/vartime.rs @@ -21,23 +21,21 @@ use crate::{ pub(super) fn points_mul_vartime( ops: &'static CommonOps, g_scalar: &Scalar, - g: &(Elem, Elem), + (gx, gy): &(Elem, Elem), p_scalar: &Scalar, - p: &(Elem, Elem), + (px, py): &(Elem, Elem), ) -> Point { - let a_scaled = point_mul_vartime(ops, g_scalar, g); - let b_scaled = point_mul_vartime(ops, p_scalar, p); - ops.point_sum(&a_scaled, &b_scaled) -} - -fn point_mul_vartime(ops: &'static CommonOps, a: &Scalar, (x, y): &(Elem, Elem)) -> Point { - let p = ops.point_new_affine(x, y); + let g = ops.point_new_affine(gx, gy); + let p = ops.point_new_affine(px, py); let mut acc = PointVartime::new_at_infinity(ops); // Iterate from the highest bit to the lowest bit. (0..ops.order_bits().as_usize_bits()).rev().for_each(|i| { - if is_bit_set(&a.limbs, i) { + if is_bit_set(&g_scalar.limbs, i) { + acc.add_assign(&g); + } + if is_bit_set(&p_scalar.limbs, i) { acc.add_assign(&p); } if i > 0 { @@ -84,28 +82,3 @@ fn is_bit_set(limbs: &[Limb], bit: usize) -> bool { let bit = (limb >> shift) & 1; bit != 0 } - -#[cfg(test)] -mod tests { - use super::{ - super::{p256, p384, tests::point_mul_tests}, - *, - }; - #[test] - fn p256_point_mul_test() { - point_mul_tests( - &p256::PRIVATE_KEY_OPS, - test_file!("p256_point_mul_tests.txt"), - |s, p| point_mul_vartime(&p256::COMMON_OPS, s, p), - ); - } - - #[test] - fn p384_point_mul_test() { - point_mul_tests( - &p384::PRIVATE_KEY_OPS, - test_file!("p384_point_mul_tests.txt"), - |s, p| point_mul_vartime(&p384::COMMON_OPS, s, p), - ); - } -} From 637c2ec1bec1e1398e5a788439052cc2896b1a89 Mon Sep 17 00:00:00 2001 From: Brian Smith Date: Tue, 17 Oct 2023 13:40:54 -0700 Subject: [PATCH 3/5] Import BoringSSL's `ec_compute_wNAF` unmodified. It won't build without modifications, so don't add it to the build yet. 
--- crypto/fipsmodule/ec/wnaf.c | 148 ++++++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 crypto/fipsmodule/ec/wnaf.c diff --git a/crypto/fipsmodule/ec/wnaf.c b/crypto/fipsmodule/ec/wnaf.c new file mode 100644 index 0000000000..56de6cfec5 --- /dev/null +++ b/crypto/fipsmodule/ec/wnaf.c @@ -0,0 +1,148 @@ +/* Originally written by Bodo Moeller for the OpenSSL project. + * ==================================================================== + * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. 
Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ +/* ==================================================================== + * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. + * + * Portions of the attached software ("Contribution") are developed by + * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project. + * + * The Contribution is licensed pursuant to the OpenSSL open source + * license provided above. + * + * The elliptic curve binary polynomial software is originally written by + * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems + * Laboratories. 
*/ + +#include + +#include +#include + +#include +#include +#include +#include + +#include "internal.h" +#include "../bn/internal.h" +#include "../../internal.h" + + +// This file implements the wNAF-based interleaving multi-exponentiation method +// at: +// http://link.springer.com/chapter/10.1007%2F3-540-45537-X_13 +// http://www.bmoeller.de/pdf/TI-01-08.multiexp.pdf + +void ec_compute_wNAF(const EC_GROUP *group, int8_t *out, + const EC_SCALAR *scalar, size_t bits, int w) { + // 'int8_t' can represent integers with absolute values less than 2^7. + assert(0 < w && w <= 7); + assert(bits != 0); + int bit = 1 << w; // 2^w, at most 128 + int next_bit = bit << 1; // 2^(w+1), at most 256 + int mask = next_bit - 1; // at most 255 + + int window_val = scalar->words[0] & mask; + for (size_t j = 0; j < bits + 1; j++) { + assert(0 <= window_val && window_val <= next_bit); + int digit = 0; + if (window_val & 1) { + assert(0 < window_val && window_val < next_bit); + if (window_val & bit) { + digit = window_val - next_bit; + // We know -next_bit < digit < 0 and window_val - digit = next_bit. + + // modified wNAF + if (j + w + 1 >= bits) { + // special case for generating modified wNAFs: + // no new bits will be added into window_val, + // so using a positive digit here will decrease + // the total length of the representation + + digit = window_val & (mask >> 1); + // We know 0 < digit < bit and window_val - digit = bit. + } + } else { + digit = window_val; + // We know 0 < digit < bit and window_val - digit = 0. + } + + window_val -= digit; + + // Now window_val is 0 or 2^(w+1) in standard wNAF generation. + // For modified window NAFs, it may also be 2^w. + // + // See the comments above for the derivation of each of these bounds. + assert(window_val == 0 || window_val == next_bit || window_val == bit); + assert(-bit < digit && digit < bit); + + // window_val was odd, so digit is also odd. + assert(digit & 1); + } + + out[j] = digit; + + // Incorporate the next bit. 
Previously, |window_val| <= |next_bit|, so if + // we shift and add at most one copy of |bit|, this will continue to hold + // afterwards. + window_val >>= 1; + window_val += bit * bn_is_bit_set_words(scalar->words, group->order.N.width, + j + w + 1); + assert(window_val <= next_bit); + } + + // bits + 1 entries should be sufficient to consume all bits. + assert(window_val == 0); +} From e0763a1522d5bb2650cf98d12809ff3b84c5190b Mon Sep 17 00:00:00 2001 From: Brian Smith Date: Mon, 23 Oct 2023 12:27:27 -0700 Subject: [PATCH 4/5] P-384 ECDSA: Use WNAF-based point multiplication. --- Cargo.toml | 2 + build.rs | 3 + crypto/fipsmodule/ec/wnaf.c | 50 +++++++--------- crypto/limbs/limbs.c | 5 ++ src/bits.rs | 2 +- src/ec/suite_b/ops.rs | 17 ++++++ src/ec/suite_b/ops/elem.rs | 4 +- src/ec/suite_b/ops/vartime.rs | 110 +++++++++++++++++++++++++--------- src/limb.rs | 14 +++++ 9 files changed, 149 insertions(+), 58 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 3378dc84bc..316161d9a3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -71,6 +71,7 @@ include = [ "crypto/fipsmodule/ec/ecp_nistz.h", "crypto/fipsmodule/ec/ecp_nistz384.h", "crypto/fipsmodule/ec/ecp_nistz384.inl", + "crypto/fipsmodule/ec/internal.h", "crypto/fipsmodule/ec/gfp_p256.c", "crypto/fipsmodule/ec/gfp_p384.c", "crypto/fipsmodule/ec/p256.c", @@ -80,6 +81,7 @@ include = [ "crypto/fipsmodule/ec/p256_shared.h", "crypto/fipsmodule/ec/p256_table.h", "crypto/fipsmodule/ec/util.h", + "crypto/fipsmodule/ec/wnaf.c", "crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt", "crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl", "crypto/fipsmodule/modes/asm/ghash-armv4.pl", diff --git a/build.rs b/build.rs index f7b94108b7..cb9a46922a 100644 --- a/build.rs +++ b/build.rs @@ -42,6 +42,7 @@ const RING_SRCS: &[(&[&str], &str)] = &[ (&[], "crypto/fipsmodule/ec/gfp_p256.c"), (&[], "crypto/fipsmodule/ec/gfp_p384.c"), (&[], "crypto/fipsmodule/ec/p256.c"), + (&[], "crypto/fipsmodule/ec/wnaf.c"), (&[], "crypto/limbs/limbs.c"), (&[], 
"crypto/mem.c"), (&[], "crypto/poly1305/poly1305.c"), @@ -903,6 +904,7 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String { "LIMBS_reduce_once", "LIMBS_select_512_32", "LIMBS_shl_mod", + "LIMBS_sub_from_assign", "LIMBS_sub_mod", "LIMBS_window5_split_window", "LIMBS_window5_unsplit_window", @@ -933,6 +935,7 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String { "bssl_constant_time_test_main", "chacha20_poly1305_open", "chacha20_poly1305_seal", + "ec_compute_wNAF", "fiat_curve25519_adx_mul", "fiat_curve25519_adx_square", "gcm_ghash_avx", diff --git a/crypto/fipsmodule/ec/wnaf.c b/crypto/fipsmodule/ec/wnaf.c index 56de6cfec5..d0d7299e00 100644 --- a/crypto/fipsmodule/ec/wnaf.c +++ b/crypto/fipsmodule/ec/wnaf.c @@ -65,47 +65,44 @@ * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems * Laboratories. */ -#include - -#include -#include - -#include -#include -#include -#include - -#include "internal.h" #include "../bn/internal.h" #include "../../internal.h" +#include "../../limbs/limbs.h" +static int is_bit_set(const Limb limbs[], size_t num_limbs, size_t bit) { + size_t i = bit / LIMB_BITS; + if (i >= num_limbs) { + return 0; + } + size_t shift = bit % LIMB_BITS; + return (limbs[i] >> shift) & 1; +} // This file implements the wNAF-based interleaving multi-exponentiation method // at: // http://link.springer.com/chapter/10.1007%2F3-540-45537-X_13 // http://www.bmoeller.de/pdf/TI-01-08.multiexp.pdf -void ec_compute_wNAF(const EC_GROUP *group, int8_t *out, - const EC_SCALAR *scalar, size_t bits, int w) { +void ec_compute_wNAF(int8_t *out, const Limb scalar[], size_t scalar_limbs, size_t bits, int w) { // 'int8_t' can represent integers with absolute values less than 2^7. 
- assert(0 < w && w <= 7); - assert(bits != 0); + debug_assert_nonsecret(0 < w && w <= 7); + debug_assert_nonsecret(bits != 0); int bit = 1 << w; // 2^w, at most 128 int next_bit = bit << 1; // 2^(w+1), at most 256 int mask = next_bit - 1; // at most 255 - int window_val = scalar->words[0] & mask; + int window_val = ((int)scalar[0]) & mask; for (size_t j = 0; j < bits + 1; j++) { - assert(0 <= window_val && window_val <= next_bit); + debug_assert_nonsecret(0 <= window_val && window_val <= next_bit); int digit = 0; if (window_val & 1) { - assert(0 < window_val && window_val < next_bit); + debug_assert_nonsecret(0 < window_val && window_val < next_bit); if (window_val & bit) { digit = window_val - next_bit; // We know -next_bit < digit < 0 and window_val - digit = next_bit. // modified wNAF - if (j + w + 1 >= bits) { + if (j + ((size_t)w) + 1 >= bits) { // special case for generating modified wNAFs: // no new bits will be added into window_val, // so using a positive digit here will decrease @@ -125,24 +122,23 @@ void ec_compute_wNAF(const EC_GROUP *group, int8_t *out, // For modified window NAFs, it may also be 2^w. // // See the comments above for the derivation of each of these bounds. - assert(window_val == 0 || window_val == next_bit || window_val == bit); - assert(-bit < digit && digit < bit); + debug_assert_nonsecret(window_val == 0 || window_val == next_bit || window_val == bit); + debug_assert_nonsecret(-bit < digit && digit < bit); // window_val was odd, so digit is also odd. - assert(digit & 1); + debug_assert_nonsecret(digit & 1); } - out[j] = digit; + out[j] = (int8_t)digit; // Incorporate the next bit. Previously, |window_val| <= |next_bit|, so if // we shift and add at most one copy of |bit|, this will continue to hold // afterwards. 
window_val >>= 1; - window_val += bit * bn_is_bit_set_words(scalar->words, group->order.N.width, - j + w + 1); - assert(window_val <= next_bit); + window_val += bit * is_bit_set(scalar, scalar_limbs, j + (size_t)w + 1); + debug_assert_nonsecret(window_val <= next_bit); } // bits + 1 entries should be sufficient to consume all bits. - assert(window_val == 0); + debug_assert_nonsecret(window_val == 0); } diff --git a/crypto/limbs/limbs.c b/crypto/limbs/limbs.c index df84f0767f..31a44bed98 100644 --- a/crypto/limbs/limbs.c +++ b/crypto/limbs/limbs.c @@ -122,6 +122,11 @@ void LIMBS_add_mod(Limb r[], const Limb a[], const Limb b[], const Limb m[], } } +// r := a - r. +void LIMBS_sub_from_assign(Limb r[], const Limb a[], size_t num_limbs) { + (void)limbs_sub(r, a, r, num_limbs); +} + void LIMBS_sub_mod(Limb r[], const Limb a[], const Limb b[], const Limb m[], size_t num_limbs) { Limb underflow = diff --git a/src/bits.rs b/src/bits.rs index 5851aaf593..9bc182ecdf 100644 --- a/src/bits.rs +++ b/src/bits.rs @@ -49,7 +49,7 @@ impl BitLength { /// The number of bits this bit length represents, as a `usize`. #[inline] - pub fn as_usize_bits(&self) -> usize { + pub const fn as_usize_bits(&self) -> usize { self.0 } diff --git a/src/ec/suite_b/ops.rs b/src/ec/suite_b/ops.rs index 8befdc211d..a2f948b75f 100644 --- a/src/ec/suite_b/ops.rs +++ b/src/ec/suite_b/ops.rs @@ -149,7 +149,10 @@ impl CommonOps { self.elem_mul(&mut acc, &rr); acc }; + self.point_new_jacobian(x, y, &z) + } + fn point_new_jacobian(&self, x: &Elem, y: &Elem, z: &Elem) -> Point { let mut r = Point::new_at_infinity(); r.xyz[..self.num_limbs].copy_from_slice(&x.limbs[..self.num_limbs]); r.xyz[self.num_limbs..(2 * self.num_limbs)].copy_from_slice(&y.limbs[..self.num_limbs]); @@ -176,6 +179,20 @@ impl CommonOps { r } + fn point_neg_vartime(&self, a: &Point) -> Point { + let mut r = *a; + let y = &mut r.xyz[self.num_limbs..(2 * self.num_limbs)]; + // Negate y. 
+ // TODO(perf): The way this is used, `y` is never zero; none of the + // curves we support have a point with y == 0, and the caller never + // calls this on the point at infinity. + let is_nonzero = !y.iter().all(|&limb| limb == 0); + if is_nonzero { + limbs_sub_from_assign(y, &self.q.p[..self.num_limbs]); + }; + r + } + pub fn point_x(&self, p: &Point) -> Elem { let mut r = Elem::zero(); r.limbs[..self.num_limbs].copy_from_slice(&p.xyz[0..self.num_limbs]); diff --git a/src/ec/suite_b/ops/elem.rs b/src/ec/suite_b/ops/elem.rs index d9c424fb28..aeaf6e7e67 100644 --- a/src/ec/suite_b/ops/elem.rs +++ b/src/ec/suite_b/ops/elem.rs @@ -17,6 +17,7 @@ use crate::{ limbs_from_hex, montgomery::{Encoding, ProductEncoding}, }, + bits::BitLength, limb::{Limb, LIMB_BITS}, }; use core::marker::PhantomData; @@ -128,4 +129,5 @@ pub fn unary_op_from_binary_op_assign( unsafe { f(a.limbs.as_mut_ptr(), a.limbs.as_ptr(), a.limbs.as_ptr()) } } -pub const MAX_LIMBS: usize = (384 + (LIMB_BITS - 1)) / LIMB_BITS; +pub const MAX_BITS: BitLength = BitLength::from_usize_bits(384); +pub const MAX_LIMBS: usize = (MAX_BITS.as_usize_bits() + (LIMB_BITS - 1)) / LIMB_BITS; diff --git a/src/ec/suite_b/ops/vartime.rs b/src/ec/suite_b/ops/vartime.rs index de895ed7d8..b649f8789b 100644 --- a/src/ec/suite_b/ops/vartime.rs +++ b/src/ec/suite_b/ops/vartime.rs @@ -12,39 +12,98 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
-use super::{CommonOps, Elem, Point, Scalar}; -use crate::{ - arithmetic::montgomery::R, - limb::{Limb, LIMB_BITS}, -}; +use super::{CommonOps, Elem, Point, Scalar, MAX_BITS}; +use crate::{arithmetic::montgomery::R, c, limb::Limb}; pub(super) fn points_mul_vartime( ops: &'static CommonOps, g_scalar: &Scalar, - (gx, gy): &(Elem, Elem), + g: &(Elem, Elem), p_scalar: &Scalar, - (px, py): &(Elem, Elem), + p: &(Elem, Elem), ) -> Point { - let g = ops.point_new_affine(gx, gy); - let p = ops.point_new_affine(px, py); + let mut g_wnaf: [i8; WNAF_MAX_LEN] = [0; WNAF_MAX_LEN]; + let (g_wnaf, g_precomp) = prepare(ops, g_scalar, g, &mut g_wnaf); - let mut acc = PointVartime::new_at_infinity(ops); + let mut p_wnaf: [i8; WNAF_MAX_LEN] = [0; WNAF_MAX_LEN]; + let (p_wnaf, p_precomp) = prepare(ops, p_scalar, p, &mut p_wnaf); - // Iterate from the highest bit to the lowest bit. - (0..ops.order_bits().as_usize_bits()).rev().for_each(|i| { - if is_bit_set(&g_scalar.limbs, i) { - acc.add_assign(&g); - } - if is_bit_set(&p_scalar.limbs, i) { - acc.add_assign(&p); - } - if i > 0 { - acc.double_assign(); - } - }); + let mut acc = PointVartime::new_at_infinity(ops); + // Iterate from the highest-order digit to the lowest-order digit. + g_wnaf + .iter() + .zip(p_wnaf) + .enumerate() + .rev() + .for_each(|(i, (&g_digit, &p_digit))| { + process_digit(ops, &mut acc, g_digit, &g_precomp); + process_digit(ops, &mut acc, p_digit, &p_precomp); + if i > 0 { + acc.double_assign(); + } + }); acc.value.unwrap_or_else(Point::new_at_infinity) } +const WINDOW_BITS: u32 = 4; +const WNAF_MAX_LEN: usize = MAX_BITS.as_usize_bits() + 1; +const PRECOMP_LEN: usize = 1 << (WINDOW_BITS - 1); + +fn prepare<'a>( + ops: &'static CommonOps, + a: &Scalar, + (x, y): &(Elem, Elem), + wnaf: &'a mut [i8; WNAF_MAX_LEN], +) -> (&'a [i8], [Point; PRECOMP_LEN]) { + let order_bits = ops.order_bits().as_usize_bits(); + let wnaf = &mut wnaf[..(order_bits + 1)]; + prefixed_extern!
{ + fn ec_compute_wNAF(out: *mut i8, scalar: *const Limb, scalar_limbs: c::size_t, + order_bits: c::size_t, w: c::int); + } + unsafe { + ec_compute_wNAF( + wnaf.as_mut_ptr(), + a.limbs.as_ptr(), + a.limbs.len(), + order_bits, + WINDOW_BITS as c::int, + ); + } + + let mut precomp = [Point::new_at_infinity(); PRECOMP_LEN]; + // Fill `precomp` with `p` and all odd multiples (1 * p, 3 * p, 5 * p, etc.). + precomp[0] = ops.point_new_affine(x, y); + let mut p2 = precomp[0]; + ops.point_double_assign(&mut p2); + for i in 1..precomp.len() { + precomp[i] = ops.point_sum(&p2, &precomp[i - 1]); + } + (wnaf, precomp) +} + +fn process_digit( + ops: &CommonOps, + acc: &mut PointVartime, + digit: i8, + precomp: &[Point; PRECOMP_LEN], +) { + if digit != 0 { + debug_assert_eq!(digit & 1, 1); + let neg = digit < 0; + let idx = usize::try_from(if neg { -digit } else { digit }).unwrap() >> 1; + let entry = &precomp[idx]; + let entry_neg; + let entry = if neg { + entry_neg = ops.point_neg_vartime(entry); + &entry_neg + } else { + entry + }; + acc.add_assign(entry); + } +} + /// A `Point` with operations optimized for the case where it is the point at /// infinity. struct PointVartime { @@ -75,10 +134,3 @@ impl PointVartime { } } } - -fn is_bit_set(limbs: &[Limb], bit: usize) -> bool { - let limb = limbs[bit / LIMB_BITS]; - let shift = bit % LIMB_BITS; - let bit = (limb >> shift) & 1; - bit != 0 -} diff --git a/src/limb.rs b/src/limb.rs index 5825101121..53212f120a 100644 --- a/src/limb.rs +++ b/src/limb.rs @@ -350,6 +350,20 @@ pub(crate) fn limbs_add_assign_mod(a: &mut [Limb], b: &[Limb], m: &[Limb]) { unsafe { LIMBS_add_mod(a.as_mut_ptr(), a.as_ptr(), b.as_ptr(), m.as_ptr(), m.len()) } } +/// r := a - r. `r` and `a` must have the same length. +pub(crate) fn limbs_sub_from_assign(r: &mut [Limb], a: &[Limb]) { + debug_assert_eq!(r.len(), a.len()); + prefixed_extern! { + // `r` and `a` may alias.
+ fn LIMBS_sub_from_assign( + r: *mut Limb, + a: *const Limb, + num_limbs: c::size_t, + ); + } + unsafe { LIMBS_sub_from_assign(r.as_mut_ptr(), a.as_ptr(), r.len()) } +} + prefixed_extern! { fn LIMBS_are_zero(a: *const Limb, num_limbs: c::size_t) -> LimbMask; fn LIMBS_less_than(a: *const Limb, b: *const Limb, num_limbs: c::size_t) -> LimbMask; From faf67e2c9adb8c25309c518b9b73f55128446625 Mon Sep 17 00:00:00 2001 From: Brian Smith Date: Thu, 19 Oct 2023 00:21:28 -0700 Subject: [PATCH 5/5] EC P-256: Have non-nistz256 ECDSA verification use WNAF-based multiplication. --- src/ec/suite_b/ops.rs | 25 +++++++++++-------------- src/ec/suite_b/ops/p256.rs | 9 ++++++++- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/ec/suite_b/ops.rs b/src/ec/suite_b/ops.rs index a2f948b75f..6a0ea099c1 100644 --- a/src/ec/suite_b/ops.rs +++ b/src/ec/suite_b/ops.rs @@ -358,20 +358,6 @@ pub struct PrivateScalarOps { pub oneRR_mod_n: Scalar, // 1 * R**2 (mod n). TOOD: Use One. } -// XXX: Inefficient and unnecessarily depends on `PrivateKeyOps`. TODO: implement interleaved wNAF -// multiplication. -#[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))] -fn twin_mul_inefficient( - ops: &PrivateKeyOps, - g_scalar: &Scalar, - p_scalar: &Scalar, - p_xy: &(Elem, Elem), -) -> Point { - let scaled_g = ops.point_mul_base(g_scalar); - let scaled_p = ops.point_mul(p_scalar, p_xy); - ops.common.point_sum(&scaled_g, &scaled_p) -} - // This assumes n < q < 2*n.
pub fn elem_reduced_to_scalar(ops: &CommonOps, elem: &Elem) -> Scalar { let num_limbs = ops.num_limbs; @@ -948,6 +934,17 @@ mod tests { ); } + #[test] + fn p256_point_mul_p_test() { + point_mul_tests( + &p256::PRIVATE_KEY_OPS, + test_file!("ops/p256_point_mul_tests.txt"), + |p_scalar, p| { + let g_scalar = Scalar::zero(); + points_mul_vartime(&p256::COMMON_OPS, &g_scalar, &p256::GENERATOR, p_scalar, p) + }, + ); + } #[test] fn p384_point_mul_test() { point_mul_tests( diff --git a/src/ec/suite_b/ops/p256.rs b/src/ec/suite_b/ops/p256.rs index 2a9365ae47..11a97591ca 100644 --- a/src/ec/suite_b/ops/p256.rs +++ b/src/ec/suite_b/ops/p256.rs @@ -34,6 +34,12 @@ pub static COMMON_OPS: CommonOps = CommonOps { point_add_jacobian_impl: p256_point_add, }; +#[cfg(any(test, not(any(target_arch = "aarch64", target_arch = "x86_64"))))] +pub(super) static GENERATOR: (Elem, Elem) = ( + Elem::from_hex("18905f76a53755c679fb732b7762251075ba95fc5fedb60179e730d418a9143c"), + Elem::from_hex("8571ff1825885d85d2e88688dd21f3258b4ab8e4ba19e45cddf25357ce95560a"), +); + pub static PRIVATE_KEY_OPS: PrivateKeyOps = PrivateKeyOps { common: &COMMON_OPS, elem_inv_squared: p256_elem_inv_squared, @@ -120,7 +126,8 @@ pub static PUBLIC_SCALAR_OPS: PublicScalarOps = PublicScalarOps { #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))] twin_mul: |g_scalar, p_scalar, p_xy| { - twin_mul_inefficient(&PRIVATE_KEY_OPS, g_scalar, p_scalar, p_xy) + // TODO: Make use of precomputed multiples of `g` that already exist. + vartime::points_mul_vartime(&COMMON_OPS, g_scalar, &GENERATOR, p_scalar, p_xy) }, q_minus_n: Elem::from_hex("4319055358e8617b0c46353d039cdaae"),