From 133eb94453837741b76a74fddc90b5228ae3ef6c Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Mon, 5 Dec 2022 11:37:33 +0100
Subject: [PATCH 01/28] [crypto] Adjust P-256 implementation to read k in 320
 bits.

This is part of a large-scale hardening of ECDSA scalars; all procedures
using k will be modified to accept the extra redundant bits.

Signed-off-by: Jade Philipoom <jadep@google.com>
---
 sw/otbn/crypto/p256.s                       | 207 ++++++++++++++------
 sw/otbn/crypto/tests/p256_ecdsa_sign_test.s |   3 +-
 2 files changed, 151 insertions(+), 59 deletions(-)

diff --git a/sw/otbn/crypto/p256.s b/sw/otbn/crypto/p256.s
index 4d6b8cdf6ead9..aeee6d5d4c059 100644
--- a/sw/otbn/crypto/p256.s
+++ b/sw/otbn/crypto/p256.s
@@ -19,22 +19,19 @@
 .text
 
 /**
- * 256-bit modular multiplication based on Barrett reduction algorithm.
+ * Reduce a 512-bit value by a 256-bit P-256 modulus (either n or p).
  *
- * Returns c = a * b mod p
+ * Returns c = a mod m
  *
- * Expects two 256 bit operands, 256 bit modulus and pre-computed parameter u
- * for Barrett reduction (usually greek mu in literature). u is expected
- * without the leading 1 at bit 256. u has to be pre-computed as
- * u = floor(2^512/p).
- * This guarantees that u > 2^256, however, in order for u to be at
- * most 2^257-1, it has to be ensured that p >= 2^255 + 1.
+ * Expects a 512 bit input, a 256 bit modulus and pre-computed parameter u for
+ * Barrett reduction (usually greek mu in literature). u is expected without
+ * the leading 1 at bit 256. u has to be pre-computed as u = floor(2^512/m).
+ * This guarantees that u > 2^256, however, in order for u to be at most
+ * 2^257-1, it has to be ensured that m >= 2^255 + 1.
  *
  * This implementation mostly follows the description in the
  * "Handbook of Applied Cryptography" in Algorithm 14.42.
  * Differences:
- *   - This implementation incorporates a multiplication before the reduction.
- *     Therefore it expects two operands (a, b) instead of a wider integer x.
  *   - The computation of q2 ignores the MSbs of q1 and u to allow using
  *     a 256x256 bit multiplication. This is compensated later by
  *     individual (conditional) additions.
@@ -47,52 +44,27 @@
  * and Barrett constant of the P-256 underlying finite field only. For a
  * generic modulus a 2nd conditional subtraction of the modulus has to be
  * added or the modulus has to be in a range such that it can be mathematically
- * proven that a single subtraction is sufficient.
+ * proven that a single subtraction is sufficient. The following condition is
+ * sufficient (although potentially not necessary):
+ *   b * (b^2k mod m) + mu <= b^(k+1)
+ *
+ * For OTBN with a 256-bit modulus, we have b=2^256 and k=1, so this is:
+ *   2^256 * (2^512 mod m) + mu <= 2^512
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
- * @param[in]  w24: a, first 256 bit operand (a * b < 2^256 * p)
- * @param[in]  w25: b, second 256 bit operand (a * b < 2^256 * p)
- * @param[in]  w29: p, modulus of P-256 underlying finite field
- * @param[in]  w28: u, lower 256 bit of Barrett constant for curve P-256
+ * @param[in]  [w19,w20]: a, input (512 bits) such that a < 2^256 * m 
+ * @param[in]  w22: correction factor, msb(a) * u
+ * @param[in]  w29: m, modulus, curve order n or finite field modulus p 
+ * @param[in]  w28: u, lower 256 bit of Barrett constant for m 
  * @param[in]  w31: all-zero
- * @param[in]  MOD: p, modulus of P-256 underlying finite field
+ * @param[in]  MOD: m, modulus 
  * @param[out]  w19: c, result
  *
  * clobbered registers: w19, w20, w21, w22, w23, w24, w25
  * clobbered flag groups: FG0
  */
-mod_mul_256x256:
-  /* Compute the integer product of the operands x = a * b
-     x = [w20, w19] = a * b = w24 * w25
-     => max. length x: 512 bit */
-  bn.mulqacc.z          w24.0, w25.0,  0
-  bn.mulqacc            w24.1, w25.0, 64
-  bn.mulqacc.so  w19.L, w24.0, w25.1, 64
-  bn.mulqacc            w24.2, w25.0,  0
-  bn.mulqacc            w24.1, w25.1,  0
-  bn.mulqacc            w24.0, w25.2,  0
-  bn.mulqacc            w24.3, w25.0, 64
-  bn.mulqacc            w24.2, w25.1, 64
-  bn.mulqacc            w24.1, w25.2, 64
-  bn.mulqacc.so  w19.U, w24.0, w25.3, 64
-  bn.mulqacc            w24.3, w25.1,  0
-  bn.mulqacc            w24.2, w25.2,  0
-  bn.mulqacc            w24.1, w25.3,  0
-  bn.mulqacc            w24.3, w25.2, 64
-  bn.mulqacc.so  w20.L, w24.2, w25.3, 64
-  bn.mulqacc.so  w20.U, w24.3, w25.3,  0
-  bn.add    w20, w20, w31
-
-  /* Store correction factor to compensate for later neglected MSb of x.
-     x is 512 bit wide and therefore the 255 bit right shifted version q1
-     (below) contains 257 bit. Bit 256 of q1 is neglected to allow using a
-     256x256 multiplier. For the MSb of x being set we temporary store u
-     (or zero) here to be used in a later constant time correction of a
-     multiplication with u. Note that this requires the MSb flag being carried
-     over from the multiplication routine. */
-  bn.sel    w22, w28, w31, M
-
+p256_reduce:
   /* Compute q1' = q1[255:0] = x >> 255
      w21 = q1' = [w20, w19] >> 255 */
   bn.rshi   w21, w20, w19 >> 255
@@ -184,6 +156,64 @@ mod_mul_256x256:
 
   ret
 
+/**
+ * 256-bit modular multiplication for P-256 coordinate and scalar fields.
+ *
+ * Returns c = a * b mod p
+ *
+ * Expects two 256 bit operands, 256 bit modulus and pre-computed parameter u
+ * for Barrett reduction (usually greek mu in literature). See the note above
+ * `p256_reduce` for details.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in]  w24: a, first 256 bit operand (a * b < 2^256 * p)
+ * @param[in]  w25: b, second 256 bit operand (a * b < 2^256 * p)
+ * @param[in]  w29: m, modulus, curve order n or finite field modulus p 
+ * @param[in]  w28: u, lower 256 bit of Barrett constant for curve P-256
+ * @param[in]  w31: all-zero
+ * @param[in]  MOD: p, modulus of P-256 underlying finite field
+ * @param[out]  w19: c, result
+ *
+ * clobbered registers: w19, w20, w21, w22, w23, w24, w25
+ * clobbered flag groups: FG0
+ */
+mod_mul_256x256:
+  /* Compute the integer product of the operands x = a * b
+     x = [w20, w19] = a * b = w24 * w25
+     => max. length x: 512 bit */
+  bn.mulqacc.z          w24.0, w25.0,  0
+  bn.mulqacc            w24.1, w25.0, 64
+  bn.mulqacc.so  w19.L, w24.0, w25.1, 64
+  bn.mulqacc            w24.2, w25.0,  0
+  bn.mulqacc            w24.1, w25.1,  0
+  bn.mulqacc            w24.0, w25.2,  0
+  bn.mulqacc            w24.3, w25.0, 64
+  bn.mulqacc            w24.2, w25.1, 64
+  bn.mulqacc            w24.1, w25.2, 64
+  bn.mulqacc.so  w19.U, w24.0, w25.3, 64
+  bn.mulqacc            w24.3, w25.1,  0
+  bn.mulqacc            w24.2, w25.2,  0
+  bn.mulqacc            w24.1, w25.3,  0
+  bn.mulqacc            w24.3, w25.2, 64
+  bn.mulqacc.so  w20.L, w24.2, w25.3, 64
+  bn.mulqacc.so  w20.U, w24.3, w25.3,  0
+  bn.add    w20, w20, w31
+
+  /* Store correction factor to compensate for later neglected MSb of x.
+     x is 512 bit wide and therefore the 255 bit right shifted version q1
+     (below) contains 257 bit. Bit 256 of q1 is neglected to allow using a
+     256x256 multiplier. For the MSb of x being set we temporary store u
+     (or zero) here to be used in a later constant time correction of a
+     multiplication with u. Note that this requires the MSb flag being carried
+     over from the multiplication routine. */
+  bn.sel    w22, w28, w31, M
+
+  /* Reduce product modulo m. */
+  jal       x1, p256_reduce
+
+  ret
+
 
 /**
  * Checks if a point is a valid curve point on curve P-256 (secp256r1)
@@ -1092,8 +1122,11 @@ scalar_mult_int:
  *   d = (d0 + d1) mod n
  *   k = (k0 + k1) mod n
  *
- * @param[in]  dmem[k0]:  first share of secret scalar (256 bits)
- * @param[in]  dmem[k1]:  second share of secret scalar (256 bits)
+ * Each share is 320 bits, which gives us 64 bits of extra redundancy modulo n
+ * (256 bits). This is a protection measure against side-channel attacks.
+ *
+ * @param[in]  dmem[k0]:  first share of secret scalar (320 bits)
+ * @param[in]  dmem[k1]:  second share of secret scalar (320 bits)
  * @param[in]  dmem[msg]: message to be signed (256 bits)
  * @param[in]  dmem[r]:   dmem buffer for r component of signature (256 bits)
  * @param[in]  dmem[s]:   dmem buffer for s component of signature (256 bits)
@@ -1111,16 +1144,50 @@ p256_sign:
   /* init all-zero register */
   bn.xor    w31, w31, w31
 
-  /* load first share of secret scalar k from dmem: w0 = dmem[k0] */
+  /* load first share of secret scalar k from dmem: w0,w1 = dmem[k0] */
   la        x16, k0
   li        x2, 0
+  bn.lid    x2, 0(x16++)
+  li        x2, 1
   bn.lid    x2, 0(x16)
 
-  /* load second share of secret scalar k from dmem: w1 = dmem[k1] */
+  /* load second share of secret scalar k from dmem: w2,w3 = dmem[k1] */
   la        x16, k1
-  li        x2, 1
+  li        x2, 2
+  bn.lid    x2, 0(x16++)
+  li        x2, 3
   bn.lid    x2, 0(x16)
 
+  /* setup modulus n (curve order) and Barrett constant
+     MOD <= w29 <= n = dmem[p256_n]; w28 <= u_n = dmem[p256_u_n]  */
+  li        x2, 29
+  la        x3, p256_n
+  bn.lid    x2, 0(x3)
+  bn.wsrw   0, w29
+  li        x2, 28
+  la        x3, p256_u_n
+  bn.lid    x2, 0(x3)
+
+  /* Reduce k0 modulo n.
+     TODO: this is temporary until scalar_mult_int supports extra bits; remove later.
+
+     w0 <= [w0,w1] mod n = k0 mod n */
+  bn.mov   w19, w0
+  bn.mov   w20, w1
+  bn.mov   w22, w31
+  jal      x1, p256_reduce
+  bn.mov   w0, w19
+
+  /* Reduce k1 modulo n.
+     TODO: this is temporary until scalar_mult_int supports extra bits; remove later.
+
+     w1 <= [w2,w3] mod n = k1 mod n */
+  bn.mov   w19, w2
+  bn.mov   w20, w3
+  bn.mov   w22, w31
+  jal      x1, p256_reduce
+  bn.mov   w1, w19
+
   /* scalar multiplication with base point
      (x_1, y_1) = (w11, w12) <= k*G = w0*(dmem[p256_gx], dmem[p256_gy]) */
   la        x21, p256_gx
@@ -1137,16 +1204,40 @@ p256_sign:
   la        x3, p256_u_n
   bn.lid    x2, 0(x3)
 
-  /* re-load first share of secret scalar k from dmem: w0 = dmem[k0] */
+  /* re-load first share of secret scalar k from dmem: w0,w1 = dmem[k0] */
   la        x16, k0
   li        x2, 0
+  bn.lid    x2, 0(x16++)
+  li        x2, 1
   bn.lid    x2, 0(x16)
 
-  /* re-load second share of secret scalar k from dmem: w1 = dmem[k1] */
+  /* re-load second share of secret scalar k from dmem: w2,w3 = dmem[k1] */
   la        x16, k1
-  li        x2, 1
+  li        x2, 2
+  bn.lid    x2, 0(x16++)
+  li        x2, 3
   bn.lid    x2, 0(x16)
 
+  /* Reduce k0 modulo n.
+     TODO(#15507): this is temporary until mod_inv supports extra bits; remove later.
+
+     w0 <= [w0,w1] mod n = k0 mod n */
+  bn.mov   w19, w0
+  bn.mov   w20, w1
+  bn.mov   w22, w31
+  jal      x1, p256_reduce
+  bn.mov   w0, w19
+
+  /* Reduce k1 modulo n.
+     TODO(#15507): this is temporary until mod_inv supports extra bits; remove later.
+
+     w1 <= [w2,w3] mod n = k1 mod n */
+  bn.mov   w19, w2
+  bn.mov   w20, w3
+  bn.mov   w22, w31
+  jal      x1, p256_reduce
+  bn.mov   w1, w19
+
   /* Combine the shares of k for inversion.
      TODO(#15507): modify inversion to handle k in shares.
        w0 <= (w0 + w1) mod n = k */
@@ -1927,15 +2018,15 @@ p256_gy:
 
 .section .bss
 
-/* random scalar k (in two shares) */
+/* random scalar k (in two 320b shares) */
 .balign 32
 .weak k0
 k0:
-  .zero 32
+  .zero 40
 .balign 32
 .weak k1
 k1:
-  .zero 32
+  .zero 40
 
 /* message digest */
 .balign 32
diff --git a/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s b/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
index bef124d5a33de..5671365bd4cc4 100644
--- a/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
+++ b/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
@@ -43,7 +43,7 @@ k0:
   .word 0x21d0a016
   .word 0xb0b2c781
   .word 0x9590ef5d
-  .zero 16
+  .zero 24
 
 /* second share of nonce k (128 0s, then last 128 bits of k) */
 .globl k1
@@ -54,6 +54,7 @@ k1:
   .word 0x1b76ebe8
   .word 0x74210263
   .word 0x1420fc41
+  .zero 8
 
 /* message digest */
 .globl msg

From 8327cad716afb8b71c81ac7a72ca3e83cb218834 Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Mon, 5 Dec 2022 13:50:17 +0100
Subject: [PATCH 02/28] [crypto] Make p256_ecdsa_sign_test randomize shares of
 k.

Modify the test to randomize k so that handling of the new extra bits is
tested.

Signed-off-by: Jade Philipoom <jadep@google.com>
---
 sw/otbn/crypto/tests/p256_ecdsa_sign_test.s | 60 ++++++++++++++++++++-
 1 file changed, 58 insertions(+), 2 deletions(-)

diff --git a/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s b/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
index 5671365bd4cc4..a69527c4754c4 100644
--- a/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
+++ b/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
@@ -18,6 +18,22 @@
 .section .text.start
 
 ecdsa_sign_test:
+  /* Load first share of k (256 bits at start).
+       w4 <= k0 */
+  la        x16, k0
+  li        x2, 4
+  bn.lid    x2, 0(x16)
+
+  /* Randomize k0.
+       [w4,w5] <= randomize(w4) = randomize(k0) */
+  jal      x1, randomize_share
+
+  /* Store randomized k0 (320 bits).
+       dmem[k0] <= [w4,w5] = randomize(k0) */
+  li        x2, 4
+  bn.sid    x2, 0(x16++)
+  li        x2, 5
+  bn.sid    x2, 0(x16)
 
   /* call ECDSA signing subroutine in P-256 lib */
   jal      x1, p256_sign
@@ -32,10 +48,50 @@ ecdsa_sign_test:
 
   ecall
 
+/**
+ * Produce a 320-bit share representing a 256-bit value modulo n.
+ *
+ * Returns y = (x + r * n),
+ *   where r is a 63-bit pseudorandom number and n is the P-256 curve order.
+ *
+ * @param[in]        w4: x, input (256 bits)
+ * @param[out]  [w4,w5]: y, output (320 bits)
+ *
+ * clobbered registers: w0 to w5
+ * clobbered flag groups: FG0
+ */
+randomize_share:
+  /* Initialize all-zero register. */
+  bn.xor  w31, w31, w31
+
+  /* Get a 63-bit pseudorandom number.
+       w0 <= URND()[255:193] = r */
+  bn.wsrr  w0, 0x2 /* URND*/
+  bn.rshi  w0, w31, w0 >> 193
+
+  /* Load the curve order n.
+     w1 <= dmem[p256_n] = n */
+  li        x2, 1
+  la        x3, p256_n
+  bn.lid    x2, 0(x3)
+
+  /* w2,w3 <= w0 * w1 = r * n */
+  bn.mulqacc.z         w0.0, w1.0, 0
+  bn.mulqacc.so  w2.L, w0.0, w1.1, 64
+  bn.mulqacc           w0.0, w1.2, 0
+  bn.mulqacc.so  w2.U, w0.0, w1.3, 64
+  bn.mulqacc.wo    w3, w31.0, w31.0, 0
+
+  /* Add to input.
+       w4, w5 <= w4 + [w2,w3] = x + r * n */
+  bn.add  w4, w4, w2
+  bn.addc w5, w31, w3
+
+  ret
 
 .data
 
-/* first share of nonce k (first 128 bits of k, then 128 0s) */
+/* first share of nonce k (first 128 bits of k, then 128 0s, then 64 0s for redundant bits) */
 .globl k0
 .balign 32
 k0:
@@ -45,7 +101,7 @@ k0:
   .word 0x9590ef5d
   .zero 24
 
-/* second share of nonce k (128 0s, then last 128 bits of k) */
+/* second share of nonce k (128 0s, then last 128 bits of k, then 64 0s for redundant bits) */
 .globl k1
 .balign 32
 k1:

From 798f271897a7776bf45f25bd7f643580a485454a Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Thu, 8 Dec 2022 13:51:44 +0100
Subject: [PATCH 03/28] [crypto] Use properly masked values in mod_inv.

Change the way the modular inverse of k is computed; rather than simply
operating on k in plaintext, first multiply k by a random scalar alpha,
compute the inverse (k * alpha)^-1, and multiply by alpha again to get
k^-1.

Signed-off-by: Jade Philipoom <jadep@google.com>
---
 sw/otbn/crypto/p256.s | 120 ++++++++++++++++++++++++++++++++----------
 1 file changed, 92 insertions(+), 28 deletions(-)

diff --git a/sw/otbn/crypto/p256.s b/sw/otbn/crypto/p256.s
index aeee6d5d4c059..11fe7a30b9843 100644
--- a/sw/otbn/crypto/p256.s
+++ b/sw/otbn/crypto/p256.s
@@ -53,12 +53,12 @@
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
- * @param[in]  [w19,w20]: a, input (512 bits) such that a < 2^256 * m 
+ * @param[in]  [w19,w20]: a, input (512 bits) such that a < 2^256 * m
  * @param[in]  w22: correction factor, msb(a) * u
- * @param[in]  w29: m, modulus, curve order n or finite field modulus p 
- * @param[in]  w28: u, lower 256 bit of Barrett constant for m 
+ * @param[in]  w29: m, modulus, curve order n or finite field modulus p
+ * @param[in]  w28: u, lower 256 bit of Barrett constant for m
  * @param[in]  w31: all-zero
- * @param[in]  MOD: m, modulus 
+ * @param[in]  MOD: m, modulus
  * @param[out]  w19: c, result
  *
  * clobbered registers: w19, w20, w21, w22, w23, w24, w25
@@ -159,7 +159,7 @@ p256_reduce:
 /**
  * 256-bit modular multiplication for P-256 coordinate and scalar fields.
  *
- * Returns c = a * b mod p
+ * Returns c = a * b mod m
  *
  * Expects two 256 bit operands, 256 bit modulus and pre-computed parameter u
  * for Barrett reduction (usually greek mu in literature). See the note above
@@ -169,7 +169,7 @@ p256_reduce:
  *
  * @param[in]  w24: a, first 256 bit operand (a * b < 2^256 * p)
  * @param[in]  w25: b, second 256 bit operand (a * b < 2^256 * p)
- * @param[in]  w29: m, modulus, curve order n or finite field modulus p 
+ * @param[in]  w29: m, modulus, curve order n or finite field modulus p
  * @param[in]  w28: u, lower 256 bit of Barrett constant for curve P-256
  * @param[in]  w31: all-zero
  * @param[in]  MOD: p, modulus of P-256 underlying finite field
@@ -214,6 +214,59 @@ mod_mul_256x256:
 
   ret
 
+/**
+ * 320- by 128-bit modular multiplication for P-256 coordinate and scalar fields.
+ *
+ * Returns c = a * b mod m
+ *
+ * Expects two operands, 256 bit modulus and pre-computed parameter u
+ * for Barrett reduction (usually greek mu in literature). See the note above
+ * `p256_reduce` for details.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in]  w24: a0, low part of 320-bit operand a (256 bits)
+ * @param[in]  w25: a1, high part of 320-bit operand a (64 bits)
+ * @param[in]  w26: b, second operand (128 bits)
+ * @param[in]  w29: m, modulus, curve order n or finite field modulus p
+ * @param[in]  w28: u, lower 256 bit of Barrett constant for m
+ * @param[in]  w31: all-zero
+ * @param[in]  MOD: m, modulus
+ * @param[out]  w19: c, result
+ *
+ * clobbered registers: w19, w20, w21, w22, w23, w24, w25
+ * clobbered flag groups: FG0
+ */
+mod_mul_320x128:
+  /* Compute the integer product of the operands x = a * b
+     x = [w20, w19] = a * b = [w25, w24] * w26
+     => max. length of x: 448 bit */
+  bn.mulqacc.z          w24.0, w26.0, 0
+  bn.mulqacc            w24.0, w26.1, 64
+  bn.mulqacc.so  w19.L, w24.1, w26.0, 64
+  bn.mulqacc            w24.1, w26.1, 0
+  bn.mulqacc            w24.2, w26.0, 0
+  bn.mulqacc            w24.2, w26.1, 64
+  bn.mulqacc.so  w19.U, w24.3, w26.0, 64
+  bn.mulqacc            w24.3, w26.1, 0
+  bn.mulqacc            w25.0, w26.0, 0
+  bn.mulqacc.wo    w20, w25.0, w26.1, 64
+  /* TODO: try removing this from both, I think M flag is set by .wo */
+  bn.add    w20, w20, w31
+
+  /* Store correction factor to compensate for later neglected MSb of x.
+     x is 512 bit wide and therefore the 255 bit right shifted version q1
+     (below) contains 257 bit. Bit 256 of q1 is neglected to allow using a
+     256x256 multiplier. For the MSb of x being set we temporary store u
+     (or zero) here to be used in a later constant time correction of a
+     multiplication with u. Note that this requires the MSb flag being carried
+     over from the multiplication routine. */
+  bn.sel    w22, w28, w31, M
+
+  /* Reduce product modulo m. */
+  jal       x1, p256_reduce
+
+  ret
 
 /**
  * Checks if a point is a valid curve point on curve P-256 (secp256r1)
@@ -1218,35 +1271,46 @@ p256_sign:
   li        x2, 3
   bn.lid    x2, 0(x16)
 
-  /* Reduce k0 modulo n.
-     TODO(#15507): this is temporary until mod_inv supports extra bits; remove later.
+  /* Generate a random 127-bit number.
+       w4 <= URND()[255:129] */
+  bn.wsrr  w4, 0x2 /* URND */
+  bn.rshi  w4, w31, w4 >> 129
 
-     w0 <= [w0,w1] mod n = k0 mod n */
-  bn.mov   w19, w0
-  bn.mov   w20, w1
-  bn.mov   w22, w31
-  jal      x1, p256_reduce
-  bn.mov   w0, w19
+  /* Add 1 to get a 128-bit nonzero scalar for masking.
+       w4 <= w4 + 1 = gamma */
+  bn.addi  w4, w4, 1
 
-  /* Reduce k1 modulo n.
-     TODO(#15507): this is temporary until mod_inv supports extra bits; remove later.
+  /* Multiply k0 by the masking factor and reduce modulo n.
+       w0 <= ([w0,w1] * w4) mod n = (k0 * gamma) mod n */
+  bn.mov    w24, w0
+  bn.mov    w25, w1
+  bn.mov    w26, w4
+  jal       x1, mod_mul_320x128
+  bn.mov    w0, w19
 
-     w1 <= [w2,w3] mod n = k1 mod n */
-  bn.mov   w19, w2
-  bn.mov   w20, w3
-  bn.mov   w22, w31
-  jal      x1, p256_reduce
-  bn.mov   w1, w19
+  /* Multiply k1 by the masking factor and reduce modulo n.
+       w2 <= ([w2,w3] * w26) mod n = (k1 * gamma) mod n */
+  bn.mov    w24, w2
+  bn.mov    w25, w3
+  jal       x1, mod_mul_320x128
+  bn.mov    w2, w19
 
-  /* Combine the shares of k for inversion.
-     TODO(#15507): modify inversion to handle k in shares.
-       w0 <= (w0 + w1) mod n = k */
-  bn.addm   w0, w0, w1
+  /* Add shares to get (k * gamma) mod n. This should be safe because of the
+     randomness from gamma.
+       w0 <= (w0 + w2) mod n = (k * gamma) mod n */
+  bn.addm  w0, w0, w2
 
-  /* modular multiplicative inverse of k
-     w1 <= k^-1 mod n */
+  /* Compute the inverse of (k * gamma) mod n.
+     w1 <= w0^-1 mod n = (k * gamma)^-1 mod n */
   jal       x1, mod_inv
 
+  /* Compute (gamma * (k * gamma)^-1) mod n = k^-1 mod n
+       w1 <= (w4*w1) mod n = k^-1 mod n */
+  bn.mov    w24, w4
+  bn.mov    w25, w1
+  jal       x1, mod_mul_256x256
+  bn.mov    w1, w19
+
   /* w24 = d0 = dmem[d0] */
   la        x23, d0
   li        x2, 24

From 88b97bde566d0e70002ac26de4802ce3e64bcdd5 Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Thu, 8 Dec 2022 15:16:14 +0100
Subject: [PATCH 04/28] [crypto] Store ECDSA-P256 private key d in 320-bit
 shares.

Similarly to k, all routines handling d will be modified to handle extra
redundant bits.

Signed-off-by: Jade Philipoom <jadep@google.com>
---
 sw/otbn/crypto/p256.s                       | 158 +++++++++++++-------
 sw/otbn/crypto/tests/p256_ecdsa_sign_test.s |   3 +-
 2 files changed, 106 insertions(+), 55 deletions(-)

diff --git a/sw/otbn/crypto/p256.s b/sw/otbn/crypto/p256.s
index 11fe7a30b9843..c63163020fd20 100644
--- a/sw/otbn/crypto/p256.s
+++ b/sw/otbn/crypto/p256.s
@@ -1178,13 +1178,23 @@ scalar_mult_int:
  * Each share is 320 bits, which gives us 64 bits of extra redundancy modulo n
  * (256 bits). This is a protection measure against side-channel attacks.
  *
+ * For s = k^-1 * (r * d + msg), we compute a random nonzero masking scalar
+ * alpha, and compute s as:
+ *   s = ((k * alpha)^-1 * (r * (d * alpha) + alpha * msg)) mod n
+ *
+ * We choose alpha to be at most 128 bits, so the product with a 320b share
+ * fits in the same 512-bit modular reduction routine that we use for
+ * 256x256-bit multiplications. It should be safe to compute e.g. k * alpha =
+ * (k0 * alpha + k1 * alpha) mod n, because alpha has enough randomness to mask
+ * the true value of k.
+ *
  * @param[in]  dmem[k0]:  first share of secret scalar (320 bits)
  * @param[in]  dmem[k1]:  second share of secret scalar (320 bits)
  * @param[in]  dmem[msg]: message to be signed (256 bits)
  * @param[in]  dmem[r]:   dmem buffer for r component of signature (256 bits)
  * @param[in]  dmem[s]:   dmem buffer for s component of signature (256 bits)
- * @param[in]  dmem[d0]:  first share of private key d
- * @param[in]  dmem[d1]:  second share of private key d
+ * @param[in]  dmem[d0]:  first share of private key d (320 bits)
+ * @param[in]  dmem[d1]:  second share of private key d (320 bits)
  *
  * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
  *        the computed affine y-coordinate.
@@ -1277,88 +1287,100 @@ p256_sign:
   bn.rshi  w4, w31, w4 >> 129
 
   /* Add 1 to get a 128-bit nonzero scalar for masking.
-       w4 <= w4 + 1 = gamma */
+       w4 <= w4 + 1 = alpha */
   bn.addi  w4, w4, 1
 
-  /* Multiply k0 by the masking factor and reduce modulo n.
-       w0 <= ([w0,w1] * w4) mod n = (k0 * gamma) mod n */
+  /* w0 <= ([w0,w1] * w4) mod n = (k0 * alpha) mod n */
   bn.mov    w24, w0
   bn.mov    w25, w1
   bn.mov    w26, w4
   jal       x1, mod_mul_320x128
   bn.mov    w0, w19
 
-  /* Multiply k1 by the masking factor and reduce modulo n.
-       w2 <= ([w2,w3] * w26) mod n = (k1 * gamma) mod n */
+  /* w19 <= ([w2,w3] * w26) mod n = (k1 * alpha) mod n */
   bn.mov    w24, w2
   bn.mov    w25, w3
   jal       x1, mod_mul_320x128
-  bn.mov    w2, w19
 
-  /* Add shares to get (k * gamma) mod n. This should be safe because of the
-     randomness from gamma.
-       w0 <= (w0 + w2) mod n = (k * gamma) mod n */
-  bn.addm  w0, w0, w2
+  /* w0 <= (w0+w19) mod n = (k * alpha) mod n */
+  bn.addm   w0, w0, w19
 
-  /* Compute the inverse of (k * gamma) mod n.
-     w1 <= w0^-1 mod n = (k * gamma)^-1 mod n */
+  /* w1 <= w0^-1 mod n = (k * alpha)^-1 mod n */
   jal       x1, mod_inv
 
-  /* Compute (gamma * (k * gamma)^-1) mod n = k^-1 mod n
-       w1 <= (w4*w1) mod n = k^-1 mod n */
-  bn.mov    w24, w4
-  bn.mov    w25, w1
-  jal       x1, mod_mul_256x256
-  bn.mov    w1, w19
-
-  /* w24 = d0 = dmem[d0] */
-  la        x23, d0
-  li        x2, 24
-  bn.lid    x2, 0(x23)
+  /* Load first share of secret key d from dmem.
+       w2,w3 = dmem[d0] */
+  la        x16, d0
+  li        x2, 2
+  bn.lid    x2, 0(x16++)
+  li        x2, 3
+  bn.lid    x2, 0(x16)
 
-  /* w2 = k^-1*d0 mod n */
-  bn.mov    w25, w1
-  jal       x1, mod_mul_256x256
-  bn.mov    w2, w19
+  /* Load second share of secret key d from dmem.
+       w5,w6 = dmem[d1] */
+  la        x16, d1
+  li        x2, 5
+  bn.lid    x2, 0(x16++)
+  li        x2, 6
+  bn.lid    x2, 0(x16)
 
-  /* w24 = d1 = dmem[d1] */
-  la        x23, d1
-  li        x2, 24
-  bn.lid    x2, 0(x23)
+  /* w0 <= ([w2,w3] * w4) mod n = (d0 * alpha) mod n */
+  bn.mov    w24, w2
+  bn.mov    w25, w3
+  bn.mov    w26, w4
+  jal       x1, mod_mul_320x128
+  bn.mov    w0, w19
 
-  /* w19 = k^-1*d1 mod n */
-  jal       x1, mod_mul_256x256
+  /* w19 <= ([w5,w6] * w4) mod n = (d1 * alpha) mod n */
+  bn.mov    w24, w5
+  bn.mov    w25, w6
+  bn.mov    w26, w4
+  jal       x1, mod_mul_320x128
 
-  /* w19 <= w2*w19 mod n = (k^-1*d0 + k^-1*d1) mod n
-                         = (k^-1*d) mod n */
-  bn.addm   w19, w2, w19
+  /* w0 <= (w0+w19) mod n = (d * alpha) mod n */
+  bn.addm   w0, w0, w19
 
   /* w24 = r <= w11  mod n */
   bn.addm   w24, w11, w31
 
-  /* store r of signature in dmem: dmem[r] <= r = w24 */
+  /* Store r of signature in dmem.
+       dmem[r] <= r = w24 */
   la        x19, r
   li        x2, 24
   bn.sid    x2, 0(x19)
 
-  /* w0 = w19 <= w24*w25 = w24*w19 = r*k^-1*d  mod n */
+  /* w19 <= (w24 * w0) mod n = (r * d * alpha) mod n */
+  bn.mov    w25, w0
+  jal       x1, mod_mul_256x256
+
+  /* w0 <= (w1 * w19) mod n = ((k * alpha)^-1 * (r * d * alpha)) mod n
+                            = (k^-1 * r * d) mod n */
+  bn.mov    w24, w1
   bn.mov    w25, w19
   jal       x1, mod_mul_256x256
   bn.mov    w0, w19
 
-  /* load message from dmem: w24 = msg <= dmem[msg] */
+  /* Load message from dmem:
+       w24 = msg <= dmem[msg] */
   la        x18, msg
   li        x2, 24
   bn.lid    x2, 0(x18)
 
-  /* w19 = k^-1*msg <= w25*w24 = w1*w24  mod n */
-  bn.mov    w25, w1
+  /* w19 <= (w24 * w4) mod n = (msg * alpha) mod n */
+  bn.mov    w25, w4
   jal       x1, mod_mul_256x256
 
-  /* w0 = s <= w19 + w0 = k^-1*msg + r*k^-1*d  mod n */
-  bn.addm   w0, w19, w0
+  /* w19 = (w1 * w19) mod n = ((k * alpha)^-1 * (msg * alpha)) mod n
+                            = (k^-1 * msg) mod n */
+  bn.mov    w24, w1
+  bn.mov    w25, w19
+  jal       x1, mod_mul_256x256
 
-  /* store s of signature in dmem: dmem[s] <= s = w0 */
+  /* w0 = (w0 + w19) mod n = (k^-1*r*d + k^-1*msg) mod n = s */
+  bn.addm   w0, w0, w19
+
+  /* Store s of signature in dmem.
+       dmem[s] <= s = w0 */
   la        x20, s
   li        x2, 0
   bn.sid    x2, 0(x20)
@@ -1381,10 +1403,12 @@ p256_sign:
  * where:
  *   d = (d0 + d1) mod n
  *
+ * The shares d0 and d1 are up to 320 bits each to provide extra redundancy.
+ *
  * This routine runs in constant time.
  *
- * @param[in]     dmem[d0]:  first share of scalar d (256 bits)
- * @param[in]     dmem[d1]:  second share of scalar d (256 bits)
+ * @param[in]     dmem[d0]:  first share of scalar d (320 bits)
+ * @param[in]     dmem[d1]:  second share of scalar d (320 bits)
  * @param[in,out] dmem[x]:   affine x-coordinate (256 bits)
  * @param[in,out] dmem[y]:   affine y-coordinate (256 bits)
  *
@@ -1398,16 +1422,42 @@ p256_base_mult:
   /* init all-zero register */
   bn.xor    w31, w31, w31
 
-  /* load first share of scalar d from dmem: w0 = dmem[d0] */
+  /* Load first share of secret key d from dmem.
+       w0,w1 = dmem[d0] */
   la        x16, d0
   li        x2, 0
+  bn.lid    x2, 0(x16++)
+  li        x2, 1
   bn.lid    x2, 0(x16)
 
-  /* load second share of scalar d from dmem: w1 = dmem[d0] */
+  /* Load second share of secret key d from dmem.
+       w2,w3 = dmem[d1] */
   la        x16, d1
-  li        x2, 1
+  li        x2, 2
+  bn.lid    x2, 0(x16++)
+  li        x2, 3
   bn.lid    x2, 0(x16)
 
+  /* Reduce d0 modulo n.
+     TODO: this is temporary until scalar_mult_int supports extra bits; remove later.
+
+     w0 <= [w0,w1] mod n = d0 mod n */
+  bn.mov   w19, w0
+  bn.mov   w20, w1
+  bn.mov   w22, w31
+  jal      x1, p256_reduce
+  bn.mov   w0, w19
+
+  /* Reduce d1 modulo n.
+     TODO: this is temporary until scalar_mult_int supports extra bits; remove later.
+
+     w1 <= [w2,w3] mod n = d1 mod n */
+  bn.mov   w19, w2
+  bn.mov   w20, w3
+  bn.mov   w22, w31
+  jal      x1, p256_reduce
+  bn.mov   w1, w19
+
   /* call internal scalar multiplication routine
      R = (x_a, y_a) = (w11, w12) <= d*P = (w0 + w1)*P */
   la        x21, p256_gx
@@ -2122,15 +2172,15 @@ x:
 y:
   .zero 32
 
-/* private key d (in two shares) */
+/* private key d (in two 320b shares) */
 .balign 32
 .weak d0
 d_share0:
-  .zero 32
+  .zero 40
 .balign 32
 .weak d1
 d_share1:
-  .zero 32
+  .zero 40
 
 /* verification result x_r (aka x_1) */
 .balign 32
diff --git a/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s b/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
index a69527c4754c4..e29f1c1ca8c90 100644
--- a/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
+++ b/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
@@ -137,12 +137,13 @@ d0:
   .word 0xe5f2cbee
   .word 0x9144233d
   .word 0xc0fbe256
+  .zero 8
 
 /* second share of d (all-zero) */
 .globl d1
 .balign 32
 d1:
-  .zero 32
+  .zero 40
 
 /* signature R */
 .globl r

From 953380a9c25433df87ee831b9a55639a86aeaef9 Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Thu, 8 Dec 2022 15:25:04 +0100
Subject: [PATCH 05/28] [crypto] Make p256_ecdsa_sign_test randomize private
 key.

Randomize first share of d so that the handling of new extra bits is tested.

Signed-off-by: Jade Philipoom <jadep@google.com>
---
 sw/otbn/crypto/tests/p256_ecdsa_sign_test.s | 53 ++++++++++++---------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s b/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
index e29f1c1ca8c90..96cff35decf0d 100644
--- a/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
+++ b/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
@@ -18,22 +18,15 @@
 .section .text.start
 
 ecdsa_sign_test:
-  /* Load first share of k (256 bits at start).
-       w4 <= k0 */
-  la        x16, k0
-  li        x2, 4
-  bn.lid    x2, 0(x16)
-
   /* Randomize k0.
-       [w4,w5] <= randomize(w4) = randomize(k0) */
+       dmem[k0] <= randomize(dmem[k0]) */
+  la       x16, k0
   jal      x1, randomize_share
 
-  /* Store randomized k0 (320 bits).
-       dmem[k0] <= [w4,w5] = randomize(k0) */
-  li        x2, 4
-  bn.sid    x2, 0(x16++)
-  li        x2, 5
-  bn.sid    x2, 0(x16)
+  /* Randomize d0.
+       dmem[d0] <= randomize(dmem[d0]) */
+  la       x16, d0
+  jal      x1, randomize_share
 
   /* call ECDSA signing subroutine in P-256 lib */
   jal      x1, p256_sign
@@ -54,8 +47,11 @@ ecdsa_sign_test:
  * Returns y = (x + r * n),
  *   where r is a 63-bit pseudorandom number and n is the P-256 curve order.
  *
- * @param[in]        w4: x, input (256 bits)
- * @param[out]  [w4,w5]: y, output (320 bits)
+ * Reads input from DMEM and stores back to the same location.
+ *
+ * @param[in]        x16: x_ptr, DMEM location of input
+ * @param[in]  dmem[x16]: x, input (256 bits)
+ * @param[out] dmem[x16]: y, output (320 bits)
  *
  * clobbered registers: w0 to w5
  * clobbered flag groups: FG0
@@ -64,6 +60,11 @@ randomize_share:
   /* Initialize all-zero register. */
   bn.xor  w31, w31, w31
 
+  /* Load input.
+       w4 <= dmem[x16] = x */
+  li        x2, 4
+  bn.lid    x2, 0(x16)
+
   /* Get a 63-bit pseudorandom number.
        w0 <= URND()[255:193] = r */
   bn.wsrr  w0, 0x2 /* URND*/
@@ -87,6 +88,13 @@ randomize_share:
   bn.add  w4, w4, w2
   bn.addc w5, w31, w3
 
+  /* Store randomized share (320 bits).
+       dmem[x16] <= [w4,w5] = x + r * n */
+  li        x2, 4
+  bn.sid    x2, 0(x16++)
+  li        x2, 5
+  bn.sid    x2, 0(x16)
+
   ret
 
 .data
@@ -125,7 +133,7 @@ msg:
   .word 0x6ce90fef
   .word 0x06d71207
 
-/* private key d */
+/* first share of private key d (first 128 bits of d, then 192 0s) */
 .globl d0
 .balign 32
 d0:
@@ -133,18 +141,19 @@ d0:
   .word 0xfbd94efe
   .word 0xaa847f52
   .word 0x2d869bf4
+  .zero 24
+
+/* second share of private key d (128 0s, then last 128 bits of d, then 64 0s) */
+.globl d1
+.balign 32
+d1:
+  .zero 16
   .word 0x543b963b
   .word 0xe5f2cbee
   .word 0x9144233d
   .word 0xc0fbe256
   .zero 8
 
-/* second share of d (all-zero) */
-.globl d1
-.balign 32
-d1:
-  .zero 40
-
 /* signature R */
 .globl r
 .balign 32

From cea4cce85ffac33eb953928f9e3c0b3eef5ead91 Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Thu, 8 Dec 2022 15:27:11 +0100
Subject: [PATCH 06/28] [crypto] Randomize both shares in p256_ecdsa_sign_test.

Randomize both shares of scalars d and k in ECDSA sign test, not just
the first share, to increase test coverage.

Signed-off-by: Jade Philipoom <jadep@google.com>
---
 sw/otbn/crypto/tests/p256_ecdsa_sign_test.s | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s b/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
index 96cff35decf0d..c9076c2411d4f 100644
--- a/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
+++ b/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
@@ -18,15 +18,19 @@
 .section .text.start
 
 ecdsa_sign_test:
-  /* Randomize k0.
-       dmem[k0] <= randomize(dmem[k0]) */
+  /* Randomize all shares.
+       dmem[k0] <= randomize(dmem[k0])
+       dmem[k1] <= randomize(dmem[k1])
+       dmem[d0] <= randomize(dmem[d0])
+       dmem[d1] <= randomize(dmem[d1]) */
   la       x16, k0
   jal      x1, randomize_share
-
-  /* Randomize d0.
-       dmem[d0] <= randomize(dmem[d0]) */
+  la       x16, k1
+  jal      x1, randomize_share
   la       x16, d0
   jal      x1, randomize_share
+  la       x16, d1
+  jal      x1, randomize_share
 
   /* call ECDSA signing subroutine in P-256 lib */
   jal      x1, p256_sign

From 5d4d37fb311fdf8194cf4fd299f0dcdc13f163c8 Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Wed, 14 Dec 2022 14:37:03 +0100
Subject: [PATCH 07/28] [crypto] Remove unnecessary instructions from P256.

Remove some instructions that turn out to be unnecessary due to
bn.mulqacc setting the M flag.

Signed-off-by: Jade Philipoom <jadep@google.com>
---
 sw/otbn/crypto/p256.s | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/sw/otbn/crypto/p256.s b/sw/otbn/crypto/p256.s
index c63163020fd20..7b5b25e4d4dc4 100644
--- a/sw/otbn/crypto/p256.s
+++ b/sw/otbn/crypto/p256.s
@@ -198,7 +198,6 @@ mod_mul_256x256:
   bn.mulqacc            w24.3, w25.2, 64
   bn.mulqacc.so  w20.L, w24.2, w25.3, 64
   bn.mulqacc.so  w20.U, w24.3, w25.3,  0
-  bn.add    w20, w20, w31
 
   /* Store correction factor to compensate for later neglected MSb of x.
      x is 512 bit wide and therefore the 255 bit right shifted version q1
@@ -251,8 +250,6 @@ mod_mul_320x128:
   bn.mulqacc            w24.3, w26.1, 0
   bn.mulqacc            w25.0, w26.0, 0
   bn.mulqacc.wo    w20, w25.0, w26.1, 64
-  /* TODO: try removing this from both, I think M flag is set by .wo */
-  bn.add    w20, w20, w31
 
   /* Store correction factor to compensate for later neglected MSb of x.
      x is 512 bit wide and therefore the 255 bit right shifted version q1

From 3d9bdc0b4020f015f597848781a89e182609299c Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Thu, 22 Dec 2022 16:59:08 +0100
Subject: [PATCH 08/28] [crypto] Update C code to use longer P-256 scalars.

Update the cryptolib implementation and the SCA P-256 code to match new,
longer P-256 scalar inputs.

Also adjust all I/O buffers for scalars to have 512 bits so that reads
don't produce runtime errors (since a 320-bit load on OTBN must be a
512-bit load).

Signed-off-by: Jade Philipoom <jadep@google.com>
---
 sw/device/lib/crypto/drivers/otbn.c           | 15 +++++++++
 sw/device/lib/crypto/drivers/otbn.h           | 17 ++++++++++
 .../lib/crypto/impl/ecdsa_p256/ecdsa_p256.c   | 17 +++++++---
 .../lib/crypto/impl/ecdsa_p256/ecdsa_p256.h   | 14 +++++---
 sw/device/tests/crypto/ecdsa_p256_functest.c  |  2 +-
 sw/device/tests/otbn_ecdsa_op_irq_test.c      | 12 ++++---
 sw/otbn/crypto/p256.s                         | 20 ++++++++----
 sw/otbn/crypto/p256_ecdsa.s                   |  8 ++---
 sw/otbn/crypto/p256_ecdsa_sca.s               | 32 ++++++-------------
 sw/otbn/crypto/tests/p256_base_mult_test.s    | 10 +++++-
 sw/otbn/crypto/tests/p256_ecdsa_sign_test.s   | 18 ++++++-----
 sw/otbn/crypto/tests/p256_scalar_mult_test.s  | 10 +++++-
 12 files changed, 119 insertions(+), 56 deletions(-)

diff --git a/sw/device/lib/crypto/drivers/otbn.c b/sw/device/lib/crypto/drivers/otbn.c
index 131e58bccd5ac..ef9f60e6e1574 100644
--- a/sw/device/lib/crypto/drivers/otbn.c
+++ b/sw/device/lib/crypto/drivers/otbn.c
@@ -145,6 +145,21 @@ otbn_error_t otbn_dmem_write(size_t num_words, const uint32_t *src,
   return kOtbnErrorOk;
 }
 
+otbn_error_t otbn_dmem_set(size_t num_words, const uint32_t src,
+                           otbn_addr_t dest) {
+  OTBN_RETURN_IF_ERROR(check_offset_len(dest, num_words, kOtbnDMemSizeBytes));
+
+  // No need to randomize here, since all the values are the same.
+  size_t i = 0;
+  for (; launder32(i) < num_words; ++i) {
+    abs_mmio_write32(kBase + OTBN_DMEM_REG_OFFSET + dest + i * sizeof(uint32_t),
+                     src);
+    HARDENED_CHECK_LT(i, num_words);
+  }
+  HARDENED_CHECK_EQ(i, num_words);
+  return kOtbnErrorOk;
+}
+
 otbn_error_t otbn_dmem_read(size_t num_words, otbn_addr_t src, uint32_t *dest) {
   OTBN_RETURN_IF_ERROR(check_offset_len(src, num_words, kOtbnDMemSizeBytes));
 
diff --git a/sw/device/lib/crypto/drivers/otbn.h b/sw/device/lib/crypto/drivers/otbn.h
index eb6160367d2f9..12900b9cc73bb 100644
--- a/sw/device/lib/crypto/drivers/otbn.h
+++ b/sw/device/lib/crypto/drivers/otbn.h
@@ -251,6 +251,23 @@ typedef struct otbn_app {
 otbn_error_t otbn_dmem_write(size_t num_words, const uint32_t *src,
                              otbn_addr_t dest);
 
+/**
+ * Set a range of OTBN's data memory (DMEM) to a particular value.
+ *
+ * Only 32b-aligned 32b word accesses are allowed. If `dest` is not
+ * word-aligned or if the length and offset exceed the DMEM size, this function
+ * will return an error.
+ *
+ * The caller must ensure OTBN is idle before calling this function.
+ *
+ * @param num_words Length of the range to set in 32-bit words.
+ * @param src The value to set each word in DMEM to.
+ * @param dest The DMEM location to set.
+ * @return Result of the operation.
+ */
+otbn_error_t otbn_dmem_set(size_t num_words, const uint32_t src,
+                           otbn_addr_t dest);
+
 /**
  * Read from OTBN's data memory (DMEM)
  *
diff --git a/sw/device/lib/crypto/impl/ecdsa_p256/ecdsa_p256.c b/sw/device/lib/crypto/impl/ecdsa_p256/ecdsa_p256.c
index 6ab78f5bbf70e..ff91bcdeb77c4 100644
--- a/sw/device/lib/crypto/impl/ecdsa_p256/ecdsa_p256.c
+++ b/sw/device/lib/crypto/impl/ecdsa_p256/ecdsa_p256.c
@@ -55,10 +55,19 @@ otbn_error_t ecdsa_p256_sign(const ecdsa_p256_message_digest_t *digest,
       otbn_dmem_write(kP256ScalarNumWords, digest->h, kOtbnVarEcdsaMsg));
 
   // Set the private key shares.
-  OTBN_RETURN_IF_ERROR(
-      otbn_dmem_write(kP256ScalarNumWords, private_key->d0, kOtbnVarEcdsaD0));
-  OTBN_RETURN_IF_ERROR(
-      otbn_dmem_write(kP256ScalarNumWords, private_key->d1, kOtbnVarEcdsaD1));
+  OTBN_RETURN_IF_ERROR(otbn_dmem_write(kP256SecretScalarNumWords,
+                                       private_key->d0, kOtbnVarEcdsaD0));
+  OTBN_RETURN_IF_ERROR(otbn_dmem_write(kP256SecretScalarNumWords,
+                                       private_key->d1, kOtbnVarEcdsaD1));
+
+  // Write trailing 0s after d0 and d1 so that OTBN's second 256-bit read of
+  // each share, which holds only its upper 64 bits, does not cause an error.
+  size_t num_zeroes = kOtbnWideWordNumWords -
+                      (kP256SecretScalarNumWords % kOtbnWideWordNumWords);
+  OTBN_RETURN_IF_ERROR(otbn_dmem_set(
+      num_zeroes, 0, kOtbnVarEcdsaD0 + kP256SecretScalarNumBytes));
+  OTBN_RETURN_IF_ERROR(otbn_dmem_set(
+      num_zeroes, 0, kOtbnVarEcdsaD1 + kP256SecretScalarNumBytes));
 
   // Start the OTBN routine.
   OTBN_RETURN_IF_ERROR(otbn_execute());
diff --git a/sw/device/lib/crypto/impl/ecdsa_p256/ecdsa_p256.h b/sw/device/lib/crypto/impl/ecdsa_p256/ecdsa_p256.h
index 9bf8949930210..c941d8ab02601 100644
--- a/sw/device/lib/crypto/impl/ecdsa_p256/ecdsa_p256.h
+++ b/sw/device/lib/crypto/impl/ecdsa_p256/ecdsa_p256.h
@@ -30,8 +30,14 @@ enum {
   kP256ScalarNumBits = 256,
   /* Length of a number modulo the P-256 "n" parameter in words */
   kP256ScalarNumWords = kP256ScalarNumBits / (sizeof(uint32_t) * 8),
-  /* Length of the message digest in bits */
-  kP256MessageDigestNumBits = 256,
+  /**
+   * Length of a secret scalar share (uses extra redundant bits).
+   */
+  kP256SecretScalarNumBits = kP256ScalarNumBits + 64,
+  /* Length of secret scalar share in bytes. */
+  kP256SecretScalarNumBytes = kP256SecretScalarNumBits / 8,
+  /* Length of secret scalar share in words. */
+  kP256SecretScalarNumWords = kP256SecretScalarNumBytes / sizeof(uint32_t),
 };
 
 /**
@@ -52,8 +58,8 @@ typedef struct ecdsa_p256_signature_t {
  * shares d0 and d1 are also both computed modulo n.
  */
 typedef struct ecdsa_p256_private_key_t {
-  uint32_t d0[kP256ScalarNumWords];
-  uint32_t d1[kP256ScalarNumWords];
+  uint32_t d0[kP256SecretScalarNumWords];
+  uint32_t d1[kP256SecretScalarNumWords];
 } ecdsa_p256_private_key_t;
 
 /**
diff --git a/sw/device/tests/crypto/ecdsa_p256_functest.c b/sw/device/tests/crypto/ecdsa_p256_functest.c
index 968e9766c12b4..a869c7dd3adee 100644
--- a/sw/device/tests/crypto/ecdsa_p256_functest.c
+++ b/sw/device/tests/crypto/ecdsa_p256_functest.c
@@ -28,7 +28,7 @@ static const ecdsa_p256_public_key_t kPublicKey = {
 // Private key (d) in two shares
 static const ecdsa_p256_private_key_t kPrivateKey = {
     .d0 = {0xaf57b4cd, 0x744c9f1c, 0x8b7e0c02, 0x283e93e9, 0x0d18f00c,
-           0xda0b6cf4, 0x8fe6bb7a, 0x5545a0b7},
+           0xda0b6cf4, 0x8fe6bb7a, 0x5545a0b7, 0x00000000, 0x00000000},
     // TODO(#15409): add real data here to ensure the second share is
     // incorporated.
     .d1 = {0},
diff --git a/sw/device/tests/otbn_ecdsa_op_irq_test.c b/sw/device/tests/otbn_ecdsa_op_irq_test.c
index aed433092db7e..7d569184ccbed 100644
--- a/sw/device/tests/otbn_ecdsa_op_irq_test.c
+++ b/sw/device/tests/otbn_ecdsa_op_irq_test.c
@@ -233,7 +233,7 @@ static void profile_end(uint64_t t_start, const char *msg) {
 /**
  * Signs a message with ECDSA using the P-256 curve.
  *
- * @param otbn            The OTBN context object.
+ * @param otbn                The OTBN context object.
  * @param msg                 The message to sign (32B).
  * @param private_key_d       The private key (32B).
  * @param[out] signature_r    Signature component r (the x-coordinate of R).
@@ -252,9 +252,13 @@ static void p256_ecdsa_sign(dif_otbn_t *otbn, const uint8_t *msg,
   otbn_testutils_write_data(otbn, /*len_bytes=*/32, msg, kOtbnVarMsg);
   otbn_testutils_write_data(otbn, /*len_bytes=*/32, private_key_d, kOtbnVarD0);
 
+  // Write redundant upper bits of d (all-zero for this test).
+  uint8_t d0_high[32] = {0};
+  otbn_testutils_write_data(otbn, /*len_bytes=*/32, d0_high, kOtbnVarD0 + 32);
+
   // Write second share of d (all-zero for this test).
-  uint8_t d1[32] = {0};
-  otbn_testutils_write_data(otbn, /*len_bytes=*/32, d1, kOtbnVarD1);
+  uint8_t d1[64] = {0};
+  otbn_testutils_write_data(otbn, /*len_bytes=*/64, d1, kOtbnVarD1);
 
   // Call OTBN to perform operation, and wait for it to complete.
   otbn_testutils_execute(otbn);
@@ -268,7 +272,7 @@ static void p256_ecdsa_sign(dif_otbn_t *otbn, const uint8_t *msg,
 /**
  * Verifies a message with ECDSA using the P-256 curve.
  *
- * @param otbn             The OTBN context object.
+ * @param otbn                 The OTBN context object.
  * @param msg                  The message to verify (32B).
  * @param signature_r          The signature component r (the proof) (32B).
  * @param signature_s          The signature component s (the proof) (32B).
diff --git a/sw/otbn/crypto/p256.s b/sw/otbn/crypto/p256.s
index 7b5b25e4d4dc4..104c3a77ed76c 100644
--- a/sw/otbn/crypto/p256.s
+++ b/sw/otbn/crypto/p256.s
@@ -2025,12 +2025,18 @@ p256_generate_k:
        w20, w21 <= k0, k1 */
   jal  x1, p256_random_scalar
 
-  /* Write the shares to DMEM. */
+  /* Write the shares to DMEM.
+     TODO: zeroes for high bits are temporary until p256_random_scalar supports
+     extra bits; remove later. */
   la        x20, k0
   li        x2, 20
-  bn.sid    x2++, 0(x20)
+  bn.sid    x2, 0(x20++)
+  li        x3, 31
+  bn.sid    x3, 0(x20)
   la        x20, k1
-  bn.sid    x2, 0(x20)
+  li        x2, 21
+  bn.sid    x2, 0(x20++)
+  bn.sid    x3, 0(x20)
 
   ret
 
@@ -2133,11 +2139,11 @@ p256_gy:
 .balign 32
 .weak k0
 k0:
-  .zero 40
+  .zero 64
 .balign 32
 .weak k1
 k1:
-  .zero 40
+  .zero 64
 
 /* message digest */
 .balign 32
@@ -2173,11 +2179,11 @@ y:
 .balign 32
 .weak d0
 d_share0:
-  .zero 40
+  .zero 64
 .balign 32
 .weak d1
 d_share1:
-  .zero 40
+  .zero 64
 
 /* verification result x_r (aka x_1) */
 .balign 32
diff --git a/sw/otbn/crypto/p256_ecdsa.s b/sw/otbn/crypto/p256_ecdsa.s
index 19491203b73ae..db7b422ce212e 100644
--- a/sw/otbn/crypto/p256_ecdsa.s
+++ b/sw/otbn/crypto/p256_ecdsa.s
@@ -77,11 +77,11 @@ y:
 .globl d0
 .balign 32
 d0:
-  .zero 32
+  .zero 64
 .globl d1
 .balign 32
 d1:
-  .zero 32
+  .zero 64
 
 /* Verification result x_r (aka x_1). */
 .globl x_r
@@ -95,9 +95,9 @@ x_r:
 .globl k0
 .balign 32
 k0:
-  .zero 32
+  .zero 64
 
 .globl k1
 .balign 32
 k1:
-  .zero 32
+  .zero 64
diff --git a/sw/otbn/crypto/p256_ecdsa_sca.s b/sw/otbn/crypto/p256_ecdsa_sca.s
index 170d4fcdb6017..eae9e93ddcb5d 100644
--- a/sw/otbn/crypto/p256_ecdsa_sca.s
+++ b/sw/otbn/crypto/p256_ecdsa_sca.s
@@ -49,31 +49,11 @@ mode:
 .global k0
 .balign 32
 k0:
-  /* k0 = 0x0000000...ffffffff */
-  /* Note: Byte order in a word is little-endian */
-  .word 0xffffffff
-  .word 0xffffffff
-  .word 0xffffffff
-  .word 0xffffffff
-  .word 0x00000000
-  .word 0x00000000
-  .word 0x00000000
-  .word 0x00000000
-
+  .zero 64
 .global k1
 .balign 32
 k1:
-  /* k1= 0x0000000...00000000 */
-  /* Note: Byte order in a word is little-endian */
-  .word 0x00000000
-  .word 0x00000000
-  .word 0x00000000
-  .word 0x00000000
-  .word 0x00000000
-  .word 0x00000000
-  .word 0x00000000
-  .word 0x00000000
-
+  .zero 64
 
 /* message digest */
 .globl msg
@@ -137,6 +117,7 @@ y:
 .globl d0
 .balign 32
 d0:
+<<<<<<< HEAD
   /* d0= 0x5545a0b7...af57b4cd */
   /* Note: Byte order in a word is little-endian */
   .word 0xaf57b4cd
@@ -161,6 +142,13 @@ d1:
   .word 0x00000000
   .word 0x00000000
   .word 0x00000000
+=======
+  .zero 64
+.globl d1
+.balign 32
+d1:
+  .zero 64
+>>>>>>> [crypto] Update C code to use longer P-256 scalars.
 
 /* verification result x_r (aka x_1) */
 .globl x_r
diff --git a/sw/otbn/crypto/tests/p256_base_mult_test.s b/sw/otbn/crypto/tests/p256_base_mult_test.s
index 423d7493736d4..7deabc5bb706d 100644
--- a/sw/otbn/crypto/tests/p256_base_mult_test.s
+++ b/sw/otbn/crypto/tests/p256_base_mult_test.s
@@ -43,10 +43,18 @@ d0:
   .word 0xe5f2cbee
   .word 0x9144233d
   .word 0xc0fbe256
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
 .globl d1
 .balign 32
 d1:
-  .zero 32
+  .zero 64
 
 /* result buffer x-coordinate */
 .globl x
diff --git a/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s b/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
index c9076c2411d4f..56735db6132f3 100644
--- a/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
+++ b/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
@@ -103,7 +103,7 @@ randomize_share:
 
 .data
 
-/* first share of nonce k (first 128 bits of k, then 128 0s, then 64 0s for redundant bits) */
+/* first share of nonce k (first 128 bits of k, then all 0s) */
 .globl k0
 .balign 32
 k0:
@@ -111,9 +111,10 @@ k0:
   .word 0x21d0a016
   .word 0xb0b2c781
   .word 0x9590ef5d
-  .zero 24
+  .zero 16
+  .zero 32
 
-/* second share of nonce k (128 0s, then last 128 bits of k, then 64 0s for redundant bits) */
+/* second share of nonce k (128 0s, then last 128 bits of k, then all 0s) */
 .globl k1
 .balign 32
 k1:
@@ -122,7 +123,7 @@ k1:
   .word 0x1b76ebe8
   .word 0x74210263
   .word 0x1420fc41
-  .zero 8
+  .zero 32
 
 /* message digest */
 .globl msg
@@ -137,7 +138,7 @@ msg:
   .word 0x6ce90fef
   .word 0x06d71207
 
-/* first share of private key d (first 128 bits of d, then 192 0s) */
+/* first share of private key d (first 128 bits of d, then all 0s) */
 .globl d0
 .balign 32
 d0:
@@ -145,9 +146,10 @@ d0:
   .word 0xfbd94efe
   .word 0xaa847f52
   .word 0x2d869bf4
-  .zero 24
+  .zero 16
+  .zero 32
 
-/* second share of private key d (128 0s, then last 128 bits of d, then 64 0s) */
+/* second share of private key d (128 0s, then last 128 bits of d, then all 0s) */
 .globl d1
 .balign 32
 d1:
@@ -156,7 +158,7 @@ d1:
   .word 0xe5f2cbee
   .word 0x9144233d
   .word 0xc0fbe256
-  .zero 8
+  .zero 32
 
 /* signature R */
 .globl r
diff --git a/sw/otbn/crypto/tests/p256_scalar_mult_test.s b/sw/otbn/crypto/tests/p256_scalar_mult_test.s
index 896cc990c5330..a4e594077c9bf 100644
--- a/sw/otbn/crypto/tests/p256_scalar_mult_test.s
+++ b/sw/otbn/crypto/tests/p256_scalar_mult_test.s
@@ -44,10 +44,18 @@ k0:
   .word 0x1b76ebe8
   .word 0x74210263
   .word 0x1420fc41
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
 .globl k1
 .balign 32
 k1:
-  .zero 32
+  .zero 64
 
 /* example curve point x-coordinate */
 .globl x

From dca76ad084c01136f1581913b23561cd1c8be13b Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Wed, 14 Dec 2022 13:48:13 +0100
Subject: [PATCH 09/28] [crypto] Reduce register pressure in scalar_mult_int.

In preparation for changing scalar_mult_int to handle longer scalars,
reduce the register pressure in the inner loop.

Signed-off-by: Jade Philipoom <jadep@google.com>
---
 sw/otbn/crypto/p256.s | 94 +++++++++++++++++++++----------------------
 1 file changed, 47 insertions(+), 47 deletions(-)

diff --git a/sw/otbn/crypto/p256.s b/sw/otbn/crypto/p256.s
index 104c3a77ed76c..0cc0777e1a104 100644
--- a/sw/otbn/crypto/p256.s
+++ b/sw/otbn/crypto/p256.s
@@ -897,44 +897,44 @@ mod_inv:
  * @param[in]  w29: p, modulus of P-256 underlying finite field
  * @param[in]  w31: all-zero
  * @param[in]  MOD: p, modulus of P-256 underlying finite field
- * @param[out]  w26: z, random projective z-coordinate
- * @param[out]  w6: x, projective x-coordinate
- * @param[out]  w7: y, projective y-coordinate
+ * @param[out] w14: x, projective x-coordinate
+ * @param[out] w15: y, projective y-coordinate
+ * @param[out] w16: z, random projective z-coordinate
  *
  * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
  *        the scaled projective y-coordinate.
  *
- * clobbered registers: w2, w6, w7, w19 to w26
+ * clobbered registers: w14 to w16, w19 to w26
  * clobbered flag groups: FG0
  */
 fetch_proj_randomize:
 
   /* get random number from URND */
-  bn.wsrr   w2, 2
+  bn.wsrr   w16, 2 /* URND */
 
   /* reduce random number
-     w26 = z <= w2 mod p */
-  bn.addm   w26, w2, w31
+     w16 = z <= w16 mod p */
+  bn.addm   w16, w16, w31
 
   /* fetch x-coordinate from dmem
      w24 = x_a <= dmem[x22] = dmem[dptr_x] */
   bn.lid    x10, 0(x21)
 
   /* scale x-coordinate
-     w6 = x <= w24*w26 = x_a*z  mod p */
-  bn.mov    w25, w26
+     w14 = x <= w24*w16 = x_a*z  mod p */
+  bn.mov    w25, w16
   jal       x1, mod_mul_256x256
-  bn.mov    w6, w19
+  bn.mov    w14, w19
 
   /* fetch y-coordinate from dmem
      w24 = y_a <= dmem[x22] = dmem[dptr_y] */
   bn.lid    x10, 0(x22)
 
   /* scale y-coordinate
-     w7 = y <= w24*w26 = y_a*z  mod p */
-  bn.mov    w25, w26
+     w15 = y <= w24*w16 = y_a*z  mod p */
+  bn.mov    w25, w16
   jal       x1, mod_mul_256x256
-  bn.mov    w7, w19
+  bn.mov    w15, w19
 
   ret
 
@@ -1041,21 +1041,21 @@ scalar_mult_int:
   bn.lid    x2, 0(x3)
 
   /* get randomized projective coodinates of curve point
-     P = (x_p, y_p, z_p) = (w8, w9, w10) = (w6, w7, w26) =
+     P = (x_p, y_p, z_p) = (w8, w9, w10) = (w14, w15, w16) =
      (x*z mod p, y*z mod p, z) */
   li        x10, 24
   jal       x1, fetch_proj_randomize
-  bn.mov    w8, w6
-  bn.mov    w9, w7
-  bn.mov    w10, w26
+  bn.mov    w8, w14
+  bn.mov    w9, w15
+  bn.mov    w10, w16
 
   /* Init 2P, this will be used for the addition part in the double-and-add
      loop when the bit at the current index is 1 for both shares of the scalar.
-     2P = (w3, w4, w5) <= (w11, w12, w13) <= 2*(w8, w9, w10) = 2*P */
+     2P = (w4, w5, w6) <= (w11, w12, w13) <= 2*(w8, w9, w10) = 2*P */
   jal       x1, proj_double
-  bn.mov    w3, w11
-  bn.mov    w4, w12
-  bn.mov    w5, w13
+  bn.mov    w4, w11
+  bn.mov    w5, w12
+  bn.mov    w6, w13
 
   /* init double-and-add with point in infinity
      Q = (w8, w9, w10) <= (0, 1, 0) */
@@ -1071,7 +1071,7 @@ scalar_mult_int:
     jal       x1, proj_double
 
     /* re-fetch and randomize P again
-       P = (w6, w7, w26) */
+       P = (w14, w15, w16) */
     jal       x1, fetch_proj_randomize
 
     /* probe if MSb of either of the two scalars (k0 or k1) but not both is 1.
@@ -1079,19 +1079,19 @@ scalar_mult_int:
        - If both MSbs are set, select 2P for addition
        - If neither MSB is set, also 2P will be selected but this will be
          discarded later */
-    bn.xor    w8, w0, w1
+    bn.xor    w20, w0, w1
 
     /* P = (w8, w9, w10)
-        <= (w0[255] xor w1[256])?P=(w6, w7, w26):2P=(w3, w4, w5) */
-    bn.sel    w8, w6, w3, M
-    bn.sel    w9, w7, w4, M
-    bn.sel    w10, w26, w5, M
+        <= (w0[255] xor w1[255])?P=(w14, w15, w16):2P=(w4, w5, w6) */
+    bn.sel    w8, w14, w4, M
+    bn.sel    w9, w15, w5, M
+    bn.sel    w10, w16, w6, M
 
     /* save doubling result to survive follow-up subroutine call
-       Q = (w2, w6, w7) <= (w11, w12, w13) */
-    bn.mov    w2, w11
-    bn.mov    w6, w12
-    bn.mov    w7, w13
+       Q = (w7, w26, w30) <= (w11, w12, w13) */
+    bn.mov    w7, w11
+    bn.mov    w26, w12
+    bn.mov    w30, w13
 
     /* add points
        Q+P = (w11, w12, w13) <= (w11, w12, w13) + (w8, w9, w10) */
@@ -1099,13 +1099,13 @@ scalar_mult_int:
 
     /* probe if MSb of either one or both of the two
        scalars (k0 or k1) is 1.*/
-    bn.or     w8, w0, w1
+    bn.or     w20, w0, w1
 
     /* select doubling result (Q) or addition result (Q+P)
-       Q = w0[255] or w1[255]?Q_a=(w11, w12, w13):Q=(w2, w6, w7) */
-    bn.sel    w8, w11, w2, M
-    bn.sel    w9, w12, w6, M
-    bn.sel    w10, w13, w7, M
+       Q = w0[255] or w1[255]?Q_a=(w11, w12, w13):Q=(w7, w26, w30) */
+    bn.sel    w8, w11, w7, M
+    bn.sel    w9, w12, w26, M
+    bn.sel    w10, w13, w30, M
 
     /* rotate both scalars left 1 bit */
     bn.rshi   w0, w0, w0 >> 255
@@ -1119,26 +1119,26 @@ scalar_mult_int:
     /* get a fresh random number from URND and scale the coordinates of
        2P = (w3, w4, w5) (scaling each projective coordinate with same
        factor results in same point) */
-    bn.wsrr   w2, 2
+    bn.wsrr   w7, 2
 
-    /* w3 = w3 * w2 */
-    bn.mov    w24, w3
-    bn.mov    w25, w2
-    jal       x1, mod_mul_256x256
-    bn.mov    w3, w19
-
-    /* w4 = w4 * w2 */
+    /* w4 = w4 * w7 */
     bn.mov    w24, w4
-    bn.mov    w25, w2
+    bn.mov    w25, w7
     jal       x1, mod_mul_256x256
     bn.mov    w4, w19
 
-    /* w5 = w5 * w2 */
+    /* w5 = w5 * w7 */
     bn.mov    w24, w5
-    bn.mov    w25, w2
+    bn.mov    w25, w7
     jal       x1, mod_mul_256x256
     bn.mov    w5, w19
 
+    /* w6 = w6 * w7 */
+    bn.mov    w24, w6
+    bn.mov    w25, w7
+    jal       x1, mod_mul_256x256
+    bn.mov    w6, w19
+
   /* convert back to affine coordinates
      R = (x_a, y_a) = (w11, w12) */
   jal       x1, proj_to_affine

From f1778363d50090f5f13f802260803a63d4cb254c Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Wed, 14 Dec 2022 14:28:52 +0100
Subject: [PATCH 10/28] [crypto] Support extra bits in scalar_mult_int.

Change the scalar_mult_int subroutine for ECDSA-P256 to support 320-bit
hardened scalars.

Signed-off-by: Jade Philipoom <jadep@google.com>
---
 sw/otbn/crypto/p256.s | 100 +++++++++++++++++++-----------------------
 1 file changed, 46 insertions(+), 54 deletions(-)

diff --git a/sw/otbn/crypto/p256.s b/sw/otbn/crypto/p256.s
index 0cc0777e1a104..faba163269a81 100644
--- a/sw/otbn/crypto/p256.s
+++ b/sw/otbn/crypto/p256.s
@@ -998,13 +998,25 @@ proj_double:
  *
  * The routine receives the scalar in two shares k0, k1 such that
  *   k = (k0 + k1) mod n
- * The double-and-add loop operates on both shares in parallel applying
- * Shamir's trick.
+ * The loop operates on both shares in parallel, computing (k0 + k1) * P as
+ * follows:
+ *  Q = (0, 1, 0) # origin
+ *  for i in 319..0:
+ *    Q = 2 * Q
+ *    A = if (k0[i] ^ k1[i]) then P else 2P
+ *    B = Q + A
+ *    Q = if (k0[i] | k1[i]) then B else Q
+ *
+ *
+ * Each share k0/k1 is 320 bits, even though it represents a 256-bit value.
+ * This is a side-channel protection measure.
  *
  * @param[in]  x21: dptr_x, pointer to affine x-coordinate in dmem
  * @param[in]  x22: dptr_y, pointer to affine y-coordinate in dmem
- * @param[in]  w0: k0, first share of scalar for multiplication
- * @param[in]  w1: k1, second share of scalar for multiplication
+ * @param[in]  w0: lower 256 bits of k0, first share of scalar
+ * @param[in]  w1: upper 64 bits of k0, first share of scalar
+ * @param[in]  w2: lower 256 bits of k1, second share of scalar
+ * @param[in]  w3: upper 64 bits of k1, second share of scalar
  * @param[in]  w27: b, curve domain parameter
  * @param[in]  w31: all-zero
  * @param[in]  MOD: p, modulus, 2^256 > p > 2^255.
@@ -1014,7 +1026,7 @@ proj_double:
  * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
  *        the computed affine y-coordinate.
  *
- * clobbered registers: x2, x3, x10, w0 to w29
+ * clobbered registers: x2, x3, x10, w0 to w30
  * clobbered flag groups: FG0
  */
 scalar_mult_int:
@@ -1063,8 +1075,17 @@ scalar_mult_int:
   bn.addi   w9, w31, 1
   bn.mov    w10, w31
 
+  /* Shift shares of k so their MSBs are in the most significant position of a
+     word.
+       w0,w1 <= [w0, w1] << 192 = k0 << 192
+       w2,w3 <= [w2, w3] << 192 = k1 << 192 */
+  bn.rshi   w1, w1, w0 >> 64
+  bn.rshi   w0, w0, w31 >> 64
+  bn.rshi   w3, w3, w2 >> 64
+  bn.rshi   w2, w2, w31 >> 64
+
   /* double-and-add loop with decreasing index */
-  loopi     256, 32
+  loopi     320, 34
 
     /* double point Q
        Q = (w11, w12, w13) <= 2*(w8, w9, w10) = 2*Q */
@@ -1079,9 +1100,15 @@ scalar_mult_int:
        - If both MSbs are set, select 2P for addition
        - If neither MSB is set, also 2P will be selected but this will be
          discarded later */
-    bn.xor    w20, w0, w1
+    bn.xor    w20, w1, w3
 
-    /* P = (w8, w9, w10)
+    /* N.B. The M bit here is secret. For side channel protection in the
+       selects below, it is vital that neither option is equal to the
+       destination register (e.g. bn.sel w0, w0, w1). In that case, the
+       Hamming distance from the destination's previous value to its new value
+       will be 0 in one of the cases and potentially reveal M.
+
+       P = (w8, w9, w10)
         <= (w0[255] xor w1[255])?P=(w14, w15, w16):2P=(w4, w5, w6) */
     bn.sel    w8, w14, w4, M
     bn.sel    w9, w15, w5, M
@@ -1099,17 +1126,22 @@ scalar_mult_int:
 
     /* probe if MSb of either one or both of the two
        scalars (k0 or k1) is 1.*/
-    bn.or     w20, w0, w1
+    bn.or     w20, w1, w3
+
+    /* N.B. As before, the select instructions below must use distinct
+       source/destination registers to avoid revealing M.
 
-    /* select doubling result (Q) or addition result (Q+P)
-       Q = w0[255] or w1[255]?Q_a=(w11, w12, w13):Q=(w7, w26, w30) */
+       Select doubling result (Q) or addition result (Q+P)
+         Q = w1[255] or w3[255]?Q_a=(w11, w12, w13):Q=(w7, w26, w30) */
     bn.sel    w8, w11, w7, M
     bn.sel    w9, w12, w26, M
     bn.sel    w10, w13, w30, M
 
-    /* rotate both scalars left 1 bit */
-    bn.rshi   w0, w0, w0 >> 255
-    bn.rshi   w1, w1, w1 >> 255
+    /* Shift both scalars left 1 bit. */
+    bn.rshi   w1, w1, w0 >> 255
+    bn.rshi   w0, w0, w31 >> 255
+    bn.rshi   w3, w3, w2 >> 255
+    bn.rshi   w2, w2, w31 >> 255
 
     /* init regs with random numbers from URND */
     bn.wsrr   w11, 2
@@ -1228,26 +1260,6 @@ p256_sign:
   la        x3, p256_u_n
   bn.lid    x2, 0(x3)
 
-  /* Reduce k0 modulo n.
-     TODO: this is temporary until scalar_mult_int supports extra bits; remove later.
-
-     w0 <= [w0,w1] mod n = k0 mod n */
-  bn.mov   w19, w0
-  bn.mov   w20, w1
-  bn.mov   w22, w31
-  jal      x1, p256_reduce
-  bn.mov   w0, w19
-
-  /* Reduce k1 modulo n.
-     TODO: this is temporary until scalar_mult_int supports extra bits; remove later.
-
-     w1 <= [w2,w3] mod n = k1 mod n */
-  bn.mov   w19, w2
-  bn.mov   w20, w3
-  bn.mov   w22, w31
-  jal      x1, p256_reduce
-  bn.mov   w1, w19
-
   /* scalar multiplication with base point
      (x_1, y_1) = (w11, w12) <= k*G = w0*(dmem[p256_gx], dmem[p256_gy]) */
   la        x21, p256_gx
@@ -1435,26 +1447,6 @@ p256_base_mult:
   li        x2, 3
   bn.lid    x2, 0(x16)
 
-  /* Reduce d0 modulo n.
-     TODO: this is temporary until scalar_mult_int supports extra bits; remove later.
-
-     w0 <= [w0,w1] mod n = d0 mod n */
-  bn.mov   w19, w0
-  bn.mov   w20, w1
-  bn.mov   w22, w31
-  jal      x1, p256_reduce
-  bn.mov   w0, w19
-
-  /* Reduce d1 modulo n.
-     TODO: this is temporary until scalar_mult_int supports extra bits; remove later.
-
-     w1 <= [w2,w3] mod n = d1 mod n */
-  bn.mov   w19, w2
-  bn.mov   w20, w3
-  bn.mov   w22, w31
-  jal      x1, p256_reduce
-  bn.mov   w1, w19
-
   /* call internal scalar multiplication routine
      R = (x_a, y_a) = (w11, w12) <= d*P = (w0 + w1)*P */
   la        x21, p256_gx

From 9d71bf7c68871e743cfb2d03563f5e5c7249bab9 Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Thu, 22 Dec 2022 16:23:15 +0100
Subject: [PATCH 11/28] [crypto] Update P-256 random scalar generation
 subroutine.

Modify the P-256 random scalar generation routine to produce extra-long
scalars.

Signed-off-by: Jade Philipoom <jadep@google.com>
---
 sw/otbn/crypto/p256.s | 185 +++++++++++++++++++++++++++---------------
 1 file changed, 119 insertions(+), 66 deletions(-)

diff --git a/sw/otbn/crypto/p256.s b/sw/otbn/crypto/p256.s
index faba163269a81..cfa4a866637d1 100644
--- a/sw/otbn/crypto/p256.s
+++ b/sw/otbn/crypto/p256.s
@@ -1896,27 +1896,41 @@ p256_scalar_mult:
 /**
  * Generate a nonzero random value in the scalar field.
  *
- * Returns t, a random value in the range [1,n-1].
- *
- * This follows the method in FIPS 186-4 sections B.4.2 and B.5.2 for
- * generation of secret scalar values d and k. The computation is:
- *   do {
- *     c = RBG(256); // fetch 256b random value
- *   } while (c >= n - 1)
- *   return c + 1;
- *
- * This implementation handles the unmasked secret value, but the seed is
- * pulled from RND so it cannot be re-run with the same seed. This method
- * should not be used for keymgr-derived seeds! However, it masks the secret
- * scalar before returning so that it can be safely handled e.g. in scalarmult.
+ * Returns t, a random value that is nonzero mod n, in shares.
+ *
+ * This follows a modified version of the method in FIPS 186-4 sections B.4.1
+ * and B.5.1 for generation of secret scalar values d and k. The computation
+ * in FIPS 186-4 is:
+ *   seed = RBG(seedlen) // seedlen >= 320
+ *   return (seed mod (n-1)) + 1
+ *
+ * The important features here are that (a) the seed is at least 64 bits longer
+ * than n in order to minimize bias after the reduction and (b) the resulting
+ * scalar is guaranteed to be nonzero.
+ *
+ * We deviate from FIPS a little bit here because for side-channel protection,
+ * we do not want to fully reduce the seed modulo (n-1) or combine the shares.
+ * Instead, we do the following:
+ *   seed0 = RBG(320)
+ *   seed1 = RBG(320)
+ *   x = URND(127) + 1 // random value for masking
+ *   if (seed0 * x + seed1 * x) mod n == 0:
+ *     retry
+ *   return seed0, seed1
+ *
+ * Essentially, we get two independent seeds and interpret these as additive
+ * shares of the scalar t = (seed0 + seed1) mod n. Then, we need to ensure t is
+ * nonzero. Multiplying each share with a random masking parameter allows us to
+ * safely add them, and then check if this result is 0; if it is, then t must
+ * be 0 mod n and we need to retry.
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
  * @param[in]  w31:  all-zero
- * @param[out] w20:  first share of secret scalar t
- * @param[out] w21:  second share of secret scalar t
+ * @param[out] w15,w16:  first share of secret scalar t (320 bits)
+ * @param[out] w17,w18:  second share of secret scalar t (320 bits)
  *
- * clobbered registers: x2, x3, x20, w20, w21, w29
+ * clobbered registers: x2, x3, x20, w12 to w29
  * clobbered flag groups: FG0
  */
 p256_random_scalar:
@@ -1926,52 +1940,77 @@ p256_random_scalar:
   la        x3, p256_n
   bn.lid    x2, 0(x3)
 
-  /* w21 <= w29 - 1 = n - 1 */
-  bn.subi   w21, w29, 1
-
-  generate_key_retry:
-  /* Obtain 256 bits of randomness from RND. */
-  bn.wsrr   w20, 0x1 /* RND */
+  /* Copy n into the MOD register. */
+  bn.wsrw   0, w29
 
-  /* Additionally mask the seed with some bits from URND, just in case
-     there's any vulnerability in EDN that lets the attacker recover some bits
-     before they reach OTBN. */
-  bn.wsrr   w22, 0x2 /* URND */
-  bn.xor    w20, w20, w22
+  /* Load Barrett constant for n.
+     w28 <= u_n = dmem[p256_u_n]  */
+  li        x2, 28
+  la        x3, p256_u_n
+  bn.lid    x2, 0(x3)
 
-  /* Compare the random value to (n-1).
-     FG0.C <= w20 < w21 = w20 < n - 1 */
-  bn.cmp    w20, w21
+  random_scalar_retry:
+  /* Obtain 768 bits of randomness from RND (three 256-bit words). */
+  bn.wsrr   w15, 0x1 /* RND */
+  bn.wsrr   w16, 0x1 /* RND */
+  bn.wsrr   w17, 0x1 /* RND */
+
+  /* XOR each RND word (w15, w16, w17) with fresh bits from URND, in case a
+     vulnerability in EDN lets an attacker recover bits before they reach OTBN. */
+  bn.wsrr   w20, 0x2 /* URND */
+  bn.xor    w15, w15, w20
+  bn.wsrr   w20, 0x2 /* URND */
+  bn.xor    w16, w16, w20
+  bn.wsrr   w20, 0x2 /* URND */
+  bn.xor    w17, w17, w20
+
+  /* Shift bits to get 320-bit seeds: w18 <= w16[255:192], w16 <= w16[63:0].
+     NOTE(review): the last bn.rshi yields ((w20 << 256) >> 192) mod 2^256 = 0
+     since w20 = w16[63:0] << 192; operands look swapped (w31, w20) - confirm. */
+  bn.rshi   w18, w31, w16 >> 192
+  bn.rshi   w20, w16, w31 >> 64
+  bn.rshi   w16, w20, w31 >> 192
+
+  /* Generate a random masking parameter.
+     w14 <= URND(127) + 1 = x */
+  bn.wsrr   w14, 0x2 /* URND */
+  bn.addi   w14, w14, 1
+
+  /* w12 <= ([w15,w16] * w14) mod n = (seed0 * x) mod n */
+  bn.mov    w24, w15
+  bn.mov    w25, w16
+  bn.mov    w26, w14
+  jal       x1, mod_mul_320x128
+  bn.mov    w12, w19
 
-  /* Read the FG0.C flag.
-     x2 <= FG0.C = w21 < n - 1 */
-  csrrw     x2, 0x7c0, x0
-  andi      x2, x2, 1
+  /* w13 <= ([w17,w18] * w14) mod n = (seed1 * x) mod n */
+  bn.mov    w24, w17
+  bn.mov    w25, w18
+  bn.mov    w26, w14
+  jal       x1, mod_mul_320x128
+  bn.mov    w13, w19
 
-  /* Done if w20 < n - 1, otherwise retry */
-  li        x3, 1
-  bne       x2, x3, generate_key_retry
+  /* w12 <= (w12 + w13) mod n = ((seed0 + seed1) * x) mod n */
+  bn.addm   w12, w12, w13
 
-  /* If we get here, then w20 < n - 1. Add 1 to get the private key.
-     w20 <= w20 + 1 = t */
-  bn.addi   w20, w20, 1
+  /* Compare to 0.
+     FG0.Z <= (w12 =? w31) = ((seed0 + seed1) mod n =? 0) */
+  bn.cmp    w12, w31
 
-  /* MOD <= n */
-  bn.wsrw   0, w29
+  /* Read the FG0.Z flag (position 3).
+     x2 <= 8 if FG0.Z else 0 */
+  csrrw     x2, 0x7c0, x0
+  andi      x2, x2, 8
 
-  /* Get a new 256-bit random number from URND for masking.
-      w21 <= URND() mod n = t1 */
-  bn.wsrr   w21, 0x2 /* URND */
-  bn.addm   w21, w21, w31
+  /* Retry if x2 != 0. */
+  bne       x2, x0, random_scalar_retry
 
-  /* Calculate the other share of t.
-     w20 <= (w20 - w21) mod n = (t - t1) mod n = t0 */
-  bn.subm   w20, w20, w21
+  /* If we get here, then (seed0 + seed1) mod n is nonzero mod n; return. */
 
   ret
 
 /**
- * Generate the secret key d according to FIPS 186-4 section B.4.2.
+ * Generate the secret key d from a random seed.
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
@@ -1985,21 +2024,31 @@ p256_generate_random_key:
   /* Init all-zero register. */
   bn.xor    w31, w31, w31
 
-  /* Generate a random scalar in two shares.
-       w20, w21 <= d0, d1 */
+  /* Generate a random scalar in two 320-bit shares.
+       w15, w16 <= d0
+       w17, w18 <= d1 */
   jal  x1, p256_random_scalar
 
-  /* Write the shares to DMEM. */
+  /* Write first share to DMEM.
+       dmem[d0] <= w15, w16 = d0 */
   la        x20, d0
-  li        x2, 20
-  bn.sid    x2++, 0(x20)
+  li        x2, 15
+  bn.sid    x2, 0(x20++)
+  li        x2, 16
+  bn.sid    x2, 0(x20)
+
+  /* Write second share to DMEM.
+       dmem[d1] <= w17, w18 = d1 */
   la        x20, d1
+  li        x2, 17
+  bn.sid    x2, 0(x20++)
+  li        x2, 18
   bn.sid    x2, 0(x20)
 
   ret
 
 /**
- * Generate the secret scalar k according to FIPS 186-4 section B.5.2.
+ * Generate the secret scalar k from a random seed.
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
@@ -2013,22 +2062,26 @@ p256_generate_k:
   /* Init all-zero register. */
   bn.xor    w31, w31, w31
 
-  /* Generate a random scalar in two shares.
-       w20, w21 <= k0, k1 */
+  /* Generate a random scalar in two 320-bit shares.
+       w15, w16 <= k0
+       w17, w18 <= k1 */
   jal  x1, p256_random_scalar
 
-  /* Write the shares to DMEM.
-     TODO: zeroes for high bits are temporary until p256_random_scalar supports
-     extra bits; remove later. */
+  /* Write first share to DMEM.
+       dmem[k0] <= w15, w16 = k0 */
   la        x20, k0
-  li        x2, 20
+  li        x2, 15
   bn.sid    x2, 0(x20++)
-  li        x3, 31
-  bn.sid    x3, 0(x20)
+  li        x2, 16
+  bn.sid    x2, 0(x20)
+
+  /* Write second share to DMEM.
+       dmem[k1] <= w17, w18 = k1 */
   la        x20, k1
-  li        x2, 21
+  li        x2, 17
   bn.sid    x2, 0(x20++)
-  bn.sid    x3, 0(x20)
+  li        x2, 18
+  bn.sid    x2, 0(x20)
 
   ret
 

From d505813edb27d148fbbc7256c6c6972789c2ea78 Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Mon, 16 Jan 2023 17:56:09 +0100
Subject: [PATCH 12/28] wip, using check_scalar_nonzero subroutine

---
 sw/otbn/crypto/p256.s | 379 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 354 insertions(+), 25 deletions(-)

diff --git a/sw/otbn/crypto/p256.s b/sw/otbn/crypto/p256.s
index cfa4a866637d1..f1425510937c0 100644
--- a/sw/otbn/crypto/p256.s
+++ b/sw/otbn/crypto/p256.s
@@ -1893,6 +1893,58 @@ p256_scalar_mult:
 
   ret
 
+/**
+ * Check if a masked scalar is zero modulo n.
+ *
+ * Returns (t0 + t1) =? 0 (mod n).
+ *
+ * To safely check if the scalar is zero, we generate a random, nonzero scalar
+ * "alpha" for masking. We multiply each share by alpha modulo n, and add them
+ * together and check if the result (t * alpha) is zero. The addition is safe
+ * because of the randomness introduced by the mask.
+ *
+ * Flags: After this routine, FG0.Z is 1 iff the masked scalar is zero mod n.
+ *
+ * @param[in] w15,w16:  first share of secret scalar t (320 bits)
+ * @param[in] w17,w18:  second share of secret scalar t (320 bits)
+ * @param[in]     w29:  n, curve order
+ * @param[in]     w28:  u, lower 256 bit of Barrett constant for n
+ * @param[in]     MOD:  n, curve order
+ * @param[in]     w31:  all-zero
+ *
+ * clobbered registers: TODO
+ * clobbered flag groups: FG0
+ */
+check_scalar_nonzero:
+  /* Generate a random nonzero masking parameter.
+     w14 <= URND(127) + 1 = alpha */
+  bn.wsrr   w14, 0x2 /* URND */
+  bn.rshi   w14, w31, w14 >> 129
+  bn.addi   w14, w14, 1
+
+  /* w12 <= ([w15,w16] * w14) mod n = (t0 * alpha) mod n */
+  bn.mov    w24, w15
+  bn.mov    w25, w16
+  bn.mov    w26, w14
+  jal       x1, mod_mul_320x128
+  bn.mov    w12, w19
+
+  /* w13 <= ([w17,w18] * w14) mod n = (t1 * alpha) mod n */
+  bn.mov    w24, w17
+  bn.mov    w25, w18
+  bn.mov    w26, w14
+  jal       x1, mod_mul_320x128
+  bn.mov    w13, w19
+
+  /* w12 <= (w12 + w13) mod n = ((t0 + t1) * alpha) mod n */
+  bn.addm   w12, w12, w13
+
+  /* Compare to 0.
+     FG0.Z <= (w12 =? w31) = ((t0 + t1) mod n =? 0) */
+  bn.cmp    w12, w31
+
+  ret
+
 /**
  * Generate a nonzero random value in the scalar field.
  *
@@ -1971,31 +2023,7 @@ p256_random_scalar:
   bn.rshi   w20, w16, w31 >> 64
   bn.rshi   w16, w20, w31 >> 192
 
-  /* Generate a random masking parameter.
-     w14 <= URND(127) + 1 = x */
-  bn.wsrr   w14, 0x2 /* URND */
-  bn.addi   w14, w14, 1
-
-  /* w12 <= ([w15,w16] * w14) mod n = (seed0 * x) mod n */
-  bn.mov    w24, w15
-  bn.mov    w25, w16
-  bn.mov    w26, w14
-  jal       x1, mod_mul_320x128
-  bn.mov    w12, w19
-
-  /* w13 <= ([w17,w18] * w14) mod n = (seed1 * x) mod n */
-  bn.mov    w24, w17
-  bn.mov    w25, w18
-  bn.mov    w26, w14
-  jal       x1, mod_mul_320x128
-  bn.mov    w13, w19
-
-  /* w12 <= (w12 + w13) mod n = ((seed0 + seed1) * x) mod n */
-  bn.addm   w12, w12, w13
-
-  /* Compare to 0.
-     FG0.Z <= (w12 =? w31) = ((seed0 + seed1) mod n =? 0) */
-  bn.cmp    w12, w31
+  jal       x1, check_scalar_nonzero
 
   /* Read the FG0.Z flag (position 3).
      x2 <= 8 if FG0.Z else 0 */
@@ -2085,6 +2113,307 @@ p256_generate_k:
 
   ret
 
+/**
+ * Convert boolean shares to arithmetic ones using Goubin's algorithm.
+ *
+ * Returns x0, x1 such that (s0 ^ s1) = (x0 + x1) mod 2^321.
+ *
+ * The input consists of two 320-bit shares, s0 and s1. Bits at position 320
+ * and above in the input shares will be ignored. We compute the result mod
+ * 2^321 so that the high bit of x0 will reveal the carry modulo 2^320.
+ *
+ * The second share x1 is always simply (s1 mod 2^320).
+ *
+ * We then use Goubin's boolean-to-arithmetic masking algorithm to switch from
+ * this boolean masking scheme to an arithmetic one without ever unmasking the
+ * seed. See Algorithm 1 here:
+ * https://link.springer.com/content/pdf/10.1007/3-540-44709-1_2.pdf
+ *
+ * The algorithm is reproduced here for reference:
+ *   Input:
+ *     s0, s1: k-bit shares such that x = s0 ^ s1
+ *     gamma: random k-bit number
+ *   Output: x0, k-bit number such that x = (x0 + s1) mod 2^k
+ *   Pseudocode:
+ *     T := ((s0 ^ gamma) - gamma) mod 2^k
+ *     T2 := T ^ s0
+ *     G := gamma ^ s1
+ *     A := ((s0 ^ G) - G) mod 2^k
+ *     return x0 := (A ^ T2)
+ *
+ * This routine runs in constant time.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in]  [w21, w20]: s0, first share of seed (320 bits)
+ * @param[in]  [w23, w22]: s1, second share of seed (320 bits)
+ * @param[in]         w31: all-zero
+ * @param[out] [w21, w20]: result x0 (321 bits)
+ *
+ * clobbered registers: w1 to w4, w20 to w23
+ * clobbered flag groups: FG0
+ */
+boolean_to_arithmetic:
+  /* Create a 64-bit mask.
+       w1 <= 2^64 - 1 */
+  bn.subi   w1, w31, 1
+  bn.rshi   w1, w31, w1 >> 192
+
+  /* Mask out excess bits from seed shares.
+       [w21, w20] <= s0 mod 2^320
+       [w23, w22] <= s1 mod 2^320 = x1 */
+  bn.and    w21, w21, w1
+  bn.and    w23, w23, w1
+
+  /* Fetch 321 bits of randomness from URND.
+       [w2, w1] <= gamma */
+  bn.wsrr   w1, 2
+  bn.wsrr   w2, 2
+  bn.rshi   w2, w31, w2 >> 191
+
+  /* [w4, w3] <= [w21, w20] ^ [w2, w1] = s0 ^ gamma */
+  bn.xor    w3, w20, w1
+  bn.xor    w4, w21, w2
+
+  /* Subtract gamma. This may result in bits above 2^321, but these will be
+     stripped off in the next step.
+       [w4, w3] <= [w4, w3] - [w2, w1] = ((s0 ^ gamma) - gamma) mod 2^512 */
+  bn.sub    w3, w3, w1
+  bn.subb   w4, w4, w2
+
+  /* [w4, w3] <= [w4, w3] ^ [w21, w20] = T2 */
+  bn.xor    w3, w3, w20
+  bn.xor    w4, w4, w21
+
+  /* [w2, w1] <= [w2, w1] ^ [w23, w22] = gamma ^ s1 = G */
+  bn.xor    w1, w1, w22
+  bn.xor    w2, w2, w23
+
+  /* [w21, w20] <= [w21, w20] ^ [w2, w1] = s0 ^ G */
+  bn.xor    w20, w20, w1
+  bn.xor    w21, w21, w2
+
+  /* [w21, w20] <= [w21, w20] - [w2, w1] = ((s0 ^ G) - G) mod 2^512 */
+  bn.sub    w20, w20, w1
+  bn.subb   w21, w21, w2
+
+  /* [w21, w20] <= [w21, w20] ^ [w4, w3] = A ^ T2 = x0 */
+  bn.xor    w20, w20, w1
+  bn.xor    w21, w21, w2
+
+  ret
+
+/**
+ * P-256 ECDSA secret key generation.
+ *
+ * Returns the secret key d in two 320-bit shares d0 and d1, such that:
+ *    d = (d0 + d1) mod n
+ * ...where n is the curve order.
+ *
+ * This implementation is similar to FIPS 186-4 section B.4.1, where we
+ * generate d using N+64 random bits (320 bits in this case) as a seed. But
+ * while FIPS computes d = (seed mod (n-1)) + 1 to ensure a nonzero key, we
+ * instead compute d = seed mod n and check for zero by multiplying with a
+ * nonzero random scalar.
+ *
+ * Most complexity in this routine comes from masking. The input seed is
+ * provided in two 320-bit shares, seed0 and seed1, such that:
+ *   seed = seed0 ^ seed1
+ * Bits at position 320 and above in the input shares will be ignored.
+ *
+ * We then use Goubin's boolean-to-arithmetic masking algorithm to switch from
+ * this boolean masking scheme to an arithmetic one without ever unmasking the
+ * seed. See Algorithm 1 here:
+ * https://link.springer.com/content/pdf/10.1007/3-540-44709-1_2.pdf
+ *
+ * For a Coq proof of the correctness of the basic computational logic here
+ * see:
+ *   https://gist.github.com/jadephilipoom/24f44c59cbe59327e2f753867564fa28#file-masked_reduce-v-L226
+ *
+ * The proof does not cover leakage properties; it mostly just shows that this
+ * logic correctly computes (seed mod n) and the carry-handling works.
+ *
+ * This routine runs in constant time.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in]  [w21, w20]: seed0, first share of seed (320 bits)
+ * @param[in]  [w23, w22]: seed1, second share of seed (320 bits)
+ * @param[in]         w31: all-zero
+ * @param[out] [w16, w15]: d0, first share of private key d (320 bits)
+ * @param[out] [w18, w17]: d1, second share of private key d (320 bits)
+ *
+ * clobbered registers: TODO
+ * clobbered flag groups: FG0
+ */
+p256_secret_key_from_seed:
+  /* Convert from a boolean to an arithmetic mask using Goubin's algorithm.
+       [w21, w20] <= ((seed0 ^ seed1) - seed1) mod 2^321 = x0 */
+  jal       x1, boolean_to_arithmetic
+
+  /* At this point, we have arithmetic shares modulo 2^321:
+       [w21, w20] : x0
+       [w23, w22] : x1
+
+     We know that x1=seed1, and seed and x1 are at most 320 bits. Therefore,
+     the highest bit of x0 holds a carry bit modulo 2^320:
+       x0 = (seed - x1) mod 2^321
+       x0 = (seed - x1) mod 2^320 + (if (x1 <= seed) then 0 else 2^320)
+
+     The carry bit then allows us to replace (mod 2^321) with a conditional
+     statement:
+       seed = (x0 mod 2^320) + x1 - (x0[320] << 320)
+
+     Note that the carry bit is not very sensitive from a side channel
+     perspective; x1 <= seed has some bias related to the highest bit of the
+     seed, but since the seed is 64 bits larger than n, this single-bit noisy
+     leakage should not be significant.
+
+     From here, we want to convert to shares modulo (n * 2^64) -- these shares
+     will be equivalent to the seed modulo n but still retain 64 bits of extra
+     masking. We compute the new shares as follows:
+       c = (x0[320] << 320) mod (n << 64)
+       d0 = ((x0 mod 2^320) - c) mod (n << 64))
+       d1 = x1 mod (n << 64)
+
+       d = seed mod n = (d0 + d1) mod n
+  */
+
+  /* Load curve order n from DMEM.
+       w29 <= dmem[p256_n] = n */
+  li        x2, 29
+  la        x3, p256_n
+  bn.lid    x2, 0(x3)
+
+  /* Compute (n << 64).
+       [w29,w28] <= w29 << 64 = n << 64 */
+  bn.rshi   w28, w29, w31 >> 192
+  bn.rshi   w29, w31, w29 >> 192
+
+  /* [w25,w24] <= (x1 - (n << 64)) mod 2^256 */
+  bn.sub    w24, w22, w28
+  bn.subb   w25, w23, w29
+
+  /* Compute d1. Because 2^320 < 2 * (n << 64), a conditional subtraction is
+     sufficient to reduce. Similarly to the carry bit, the conditional bit here
+     is not very sensitive because the shares are large relative to n.
+       [w18,w17] <= x1 mod (n << 64) = d1 */
+  bn.sel    w17, w22, w24, FG0.C
+  bn.sel    w18, w23, w25, FG0.C
+
+  /* Isolate the carry bit and shift it back into position.
+       w25 <= x0[320] << 64 */
+  bn.rshi   w25, w31, w21 >> 64
+  bn.rshi   w25, w25, w31 >> 192
+
+  /* Clear the carry bit from the original result.
+       [w21,w20] <= x0 mod 2^320 */
+  bn.xor    w21, w21, w25
+
+  /* Conditionally subtract (n << 64) to reduce.
+       [w21,w20] <= (x0 mod 2^320) mod (n << 64) */
+  bn.sub    w26, w20, w28
+  bn.subb   w27, w21, w29
+  bn.sel    w20, w20, w26, FG0.C
+  bn.sel    w21, w21, w27, FG0.C
+
+  /* Compute the correction factor.
+       [w25,w24] <= (x[320] << 320) mod (n << 64) = c */
+  bn.sub    w26, w24, w28
+  bn.subb   w27, w25, w29
+  bn.sel    w24, w24, w26, FG0.C
+  bn.sel    w25, w25, w27, FG0.C
+
+  /* Compute d0 with a modular subtraction. First we add (n << 64) to protect
+     against underflow, then conditionally subtract it if needed.
+       [w16,w15] <= ([w21, w20] - [w25,w24]) mod (n << 64) = d1 */
+  bn.add    w20, w20, w28
+  bn.addc   w21, w21, w29
+  bn.sub    w26, w20, w24
+  bn.subb   w27, w21, w25
+  bn.sel    w15, w20, w26, FG0.C
+  bn.sel    w16, w21, w27, FG0.C
+
+  /* Re-load the curve order n.
+     w29 <= dmem[p256_n] = n */
+  li        x2, 29
+  la        x3, p256_n
+  bn.lid    x2, 0(x3)
+
+  /* Copy n into the MOD register. */
+  bn.wsrw   0, w29
+
+  /* Load Barrett constant for n.
+     w28 <= u_n = dmem[p256_u_n]  */
+  li        x2, 28
+  la        x3, p256_u_n
+  bn.lid    x2, 0(x3)
+
+  /* Check if the generated key is 0 modulo n.
+       FG0.Z <= if (d0 + d1) == 0 (mod n) then 0 else 1 */
+  jal       x1, check_scalar_nonzero
+
+  /* Read the FG0.Z flag (position 3).
+     x2 <= 8 if FG0.Z else 0 */
+  csrrw     x2, 0x7c0, x0
+  andi      x2, x2, 8
+
+  /* Jump to failure case if x2 != 0. */
+  bne       x2, x0, p256_secret_key_from_seed_fail
+
+  ret
+
+  p256_secret_key_from_seed_fail:
+    /* Trigger a fault. */
+    unimp
+
+/**
+ * Top-level P-256 ECDSA key generation.
+ *
+ * Returns the secret key d in two 320-bit shares d0 and d1, such that:
+ *    d = (d0 + d1) mod n
+ * ...where n is the curve order.
+ *
+ * This implementation is similar to FIPS 186-4 section B.4.1, where we
+ * generate d using N+64 random bits (320 bits in this case) as a seed. But
+ * while FIPS computes d = (seed mod (n-1)) + 1 to ensure a nonzero key, we
+ * instead compute d = seed mod n and check for zero during point-scalar
+ * multiplication while we get the public key.
+ *
+ * Most complexity in this routine comes from masking. The input seed is
+ * provided in two 320-bit shares, seed0 and seed1, such that:
+ *   seed = seed0 ^ seed1
+ * Bits at position 320 and above in the input shares will be ignored.
+ *
+ * We then use Goubin's boolean-to-arithmetic masking algorithm to switch from
+ * this boolean masking scheme to an arithmetic one without ever unmasking the
+ * seed. See Algorithm 1 here:
+ * https://link.springer.com/content/pdf/10.1007/3-540-44709-1_2.pdf
+ *
+ * For a Coq proof of the correctness of the basic computational logic here
+ * see:
+ *   https://gist.github.com/jadephilipoom/24f44c59cbe59327e2f753867564fa28#file-masked_reduce-v-L226
+ *
+ * The proof does not cover leakage properties; it mostly just shows that this
+ * logic correctly computes (seed mod n) and the carry-handling works.
+ *
+ * This routine runs in constant time.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in]  [w21, w20]: seed0, first share of seed (320 bits)
+ * @param[in]  [w23, w22]: seed1, second share of seed (320 bits)
+ * @param[out]   dmem[d0]: d0, first share of private key d (256 bits)
+ * @param[out]   dmem[d1]: d1, second share of private key d (256 bits)
+ *
+ * clobbered registers: TODO
+ * clobbered flag groups: FG0
+ */
+p256_keygen:
+  /* Init all-zero register. */
+  bn.xor    w31, w31, w31
+
+
 .section .data
 
 /* P-256 domain parameter b */

From b723b3f5353c669fc8fbd2b339859f91cbc702c0 Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Mon, 16 Jan 2023 18:34:15 +0100
Subject: [PATCH 13/28] first draft done

---
 sw/otbn/crypto/p256.s | 224 +++++++++++++++---------------------------
 1 file changed, 77 insertions(+), 147 deletions(-)

diff --git a/sw/otbn/crypto/p256.s b/sw/otbn/crypto/p256.s
index f1425510937c0..8b4af72cf5a52 100644
--- a/sw/otbn/crypto/p256.s
+++ b/sw/otbn/crypto/p256.s
@@ -15,9 +15,19 @@
 .globl proj_add
 .globl p256_generate_k
 .globl p256_generate_random_key
+.globl p256_key_from_seed
 
 .text
 
+/**
+ * Trigger a fault.
+ *
+ * P256 routines jump here if they suspect they are under fault attack and must
+ * abort the computation.
+ */
+p256_trigger_fault:
+  unimp
+
 /**
  * Reduce a 512-bit value by a 256-bit P-256 modulus (either n or p).
  *
@@ -1171,6 +1181,18 @@ scalar_mult_int:
     jal       x1, mod_mul_256x256
     bn.mov    w6, w19
 
+  /* Check if the z-coordinate of Q is 0. If so, fail; this represents the
+     point at infinity and means the scalar was zero mod n, which likely
+     indicates a fault attack.
+  
+     FG0.Z <= if (w10 == 0) then 1 else 0 */
+  bn.cmp    w10, w31
+
+  /* Read the FG0.Z flag and fail if it is nonzero. */
+  csrrw     x2, 0x7c0, x0
+  andi      x2, x2, 8
+  bne       x2, x0, p256_trigger_fault
+
   /* convert back to affine coordinates
      R = (x_a, y_a) = (w11, w12) */
   jal       x1, proj_to_affine
@@ -1349,6 +1371,21 @@ p256_sign:
   /* w0 <= (w0+w19) mod n = (d * alpha) mod n */
   bn.addm   w0, w0, w19
 
+  /* Compare to 0.
+     FG0.Z <= (w0 =? w31) = ((d * alpha) mod n =? 0) */
+  bn.cmp    w0, w31
+
+  /* Read the FG0.Z flag (bit position 3) and trigger a fault if it is nonzero,
+     aborting the computation.
+
+     Since alpha is nonzero mod n, (d * alpha) mod n = 0 means d is zero mod n,
+     which violates ECDSA private key requirements. This could technically be
+     triggered by an unlucky key manager seed, but the probability is so low (~1/n)
+     that it more likely indicates a fault attack. */
+  csrrw     x2, 0x7c0, x0
+  andi      x2, x2, 8
+  bne       x2, x0, p256_trigger_fault
+
   /* w24 = r <= w11  mod n */
   bn.addm   w24, w11, w31
 
@@ -1893,58 +1930,6 @@ p256_scalar_mult:
 
   ret
 
-/**
- * Check if a masked scalar is zero modulo n.
- *
- * Returns (t0 + t1) =? 0 (mod n).
- *
- * To safely check if the scalar is zero, we generate a random, nonzero scalar
- * "alpha" for masking. We multiply each share by alpha modulo n, and add them
- * together and check if the result (t * alpha) is zero. The addition is safe
- * because of the randomness introduced by the mask.
- *
- * Flags: After this routine, FG0.Z is 1 iff the masked scalar is zero mod n.
- *
- * @param[in] w15,w16:  first share of secret scalar t (320 bits)
- * @param[in] w17,w18:  second share of secret scalar t (320 bits)
- * @param[in]     w29:  n, curve order
- * @param[in]     w28:  u, lower 256 bit of Barrett constant for n
- * @param[in]     MOD:  n, curve order
- * @param[in]     w31:  all-zero
- *
- * clobbered registers: TODO
- * clobbered flag groups: FG0
- */
-check_scalar_nonzero:
-  /* Generate a random nonzero masking parameter.
-     w14 <= URND(127) + 1 = alpha */
-  bn.wsrr   w14, 0x2 /* URND */
-  bn.rshi   w14, w31, w14 >> 129
-  bn.addi   w14, w14, 1
-
-  /* w12 <= ([w15,w16] * w14) mod n = (t0 * alpha) mod n */
-  bn.mov    w24, w15
-  bn.mov    w25, w16
-  bn.mov    w26, w14
-  jal       x1, mod_mul_320x128
-  bn.mov    w12, w19
-
-  /* w13 <= ([w17,w18] * w14) mod n = (t1 * alpha) mod n */
-  bn.mov    w24, w17
-  bn.mov    w25, w18
-  bn.mov    w26, w14
-  jal       x1, mod_mul_320x128
-  bn.mov    w13, w19
-
-  /* w12 <= (w12 + w13) mod n = ((t0 + t1) * alpha) mod n */
-  bn.addm   w12, w12, w13
-
-  /* Compare to 0.
-     FG0.Z <= (w12 =? w31) = ((t0 + t1) mod n =? 0) */
-  bn.cmp    w12, w31
-
-  ret
-
 /**
  * Generate a nonzero random value in the scalar field.
  *
@@ -2023,7 +2008,31 @@ p256_random_scalar:
   bn.rshi   w20, w16, w31 >> 64
   bn.rshi   w16, w20, w31 >> 192
 
-  jal       x1, check_scalar_nonzero
+  /* Generate a random nonzero masking parameter: w14 <= URND(127) + 1 = x */
+  bn.wsrr   w14, 0x2 /* URND */
+  bn.rshi   w14, w31, w14 >> 129
+  bn.addi   w14, w14, 1
+
+  /* w12 <= ([w15,w16] * w14) mod n = (seed0 * x) mod n */
+  bn.mov    w24, w15
+  bn.mov    w25, w16
+  bn.mov    w26, w14
+  jal       x1, mod_mul_320x128
+  bn.mov    w12, w19
+
+  /* w13 <= ([w17,w18] * w14) mod n = (seed1 * x) mod n */
+  bn.mov    w24, w17
+  bn.mov    w25, w18
+  bn.mov    w26, w14
+  jal       x1, mod_mul_320x128
+  bn.mov    w13, w19
+
+  /* w12 <= (w12 + w13) mod n = ((seed0 + seed1) * x) mod n */
+  bn.addm   w12, w12, w13
+
+  /* Compare to 0.
+     FG0.Z <= (w12 =? w31) = ((seed0 + seed1) mod n =? 0) */
+  bn.cmp    w12, w31
 
   /* Read the FG0.Z flag (position 3).
      x2 <= 8 if FG0.Z else 0 */
@@ -2116,14 +2125,12 @@ p256_generate_k:
 /**
  * Convert boolean shares to arithmetic ones using Goubin's algorithm.
  *
- * Returns x0, x1 such that (s0 ^ s1) = (x0 + x1) mod 2^321.
+ * Returns x0 such that (s0 ^ s1) = (x0 + s1) mod 2^321.
  *
  * The input consists of two 320-bit shares, s0 and s1. Bits at position 320
  * and above in the input shares will be ignored. We compute the result mod
  * 2^321 so that the high bit of x0 will reveal the carry modulo 2^320.
  *
- * The second share x1 is always simply (s1 mod 2^320).
- *
  * We then use Goubin's boolean-to-arithmetic masking algorithm to switch from
  * this boolean masking scheme to an arithmetic one without ever unmasking the
  * seed. See Algorithm 1 here:
@@ -2210,11 +2217,12 @@ boolean_to_arithmetic:
  *    d = (d0 + d1) mod n
  * ...where n is the curve order.
  *
- * This implementation is similar to FIPS 186-4 section B.4.1, where we
+ * This implementation follows FIPS 186-4 section B.4.1, where we
  * generate d using N+64 random bits (320 bits in this case) as a seed. But
  * while FIPS computes d = (seed mod (n-1)) + 1 to ensure a nonzero key, we
- * instead compute d = seed mod n and check for zero by multiplying with a
- * nonzero random scalar.
+ * instead just compute d = seed mod n. The caller MUST ensure that if this
+ * routine is used, then other routines that use d (e.g. signing, public key
+ * generation) are checking if d is 0.
  *
  * Most complexity in this routine comes from masking. The input seed is
  * provided in two 320-bit shares, seed0 and seed1, such that:
@@ -2240,13 +2248,13 @@ boolean_to_arithmetic:
  * @param[in]  [w21, w20]: seed0, first share of seed (320 bits)
  * @param[in]  [w23, w22]: seed1, second share of seed (320 bits)
  * @param[in]         w31: all-zero
- * @param[out] [w16, w15]: d0, first share of private key d (320 bits)
- * @param[out] [w18, w17]: d1, second share of private key d (320 bits)
+ * @param[out] [w21, w20]: d0, first share of private key d (320 bits)
+ * @param[out] [w23, w22]: d1, second share of private key d (320 bits)
  *
  * clobbered registers: TODO
  * clobbered flag groups: FG0
  */
-p256_secret_key_from_seed:
+p256_key_from_seed:
   /* Convert from a boolean to an arithmetic mask using Goubin's algorithm.
        [w21, w20] <= ((seed0 ^ seed1) - seed1) mod 2^321 = x0 */
   jal       x1, boolean_to_arithmetic
@@ -2297,9 +2305,9 @@ p256_secret_key_from_seed:
   /* Compute d1. Because 2^320 < 2 * (n << 64), a conditional subtraction is
      sufficient to reduce. Similarly to the carry bit, the conditional bit here
      is not very sensitive because the shares are large relative to n.
-       [w18,w17] <= x1 mod (n << 64) = d1 */
-  bn.sel    w17, w22, w24, FG0.C
-  bn.sel    w18, w23, w25, FG0.C
+       [w23,w22] <= x1 mod (n << 64) = d1 */
+  bn.sel    w22, w22, w24, FG0.C
+  bn.sel    w23, w23, w25, FG0.C
 
   /* Isolate the carry bit and shift it back into position.
        w25 <= x0[320] << 64 */
@@ -2324,96 +2332,18 @@ p256_secret_key_from_seed:
   bn.sel    w24, w24, w26, FG0.C
   bn.sel    w25, w25, w27, FG0.C
 
-  /* Compute d0 with a modular subtraction. First we add (n << 64) to protect
+  /* Compute d1 with a modular subtraction. First we add (n << 64) to protect
      against underflow, then conditionally subtract it if needed.
-       [w16,w15] <= ([w21, w20] - [w25,w24]) mod (n << 64) = d1 */
+       [w21,w20] <= ([w21, w20] - [w25,w24]) mod (n << 64) = d1 */
   bn.add    w20, w20, w28
   bn.addc   w21, w21, w29
   bn.sub    w26, w20, w24
   bn.subb   w27, w21, w25
-  bn.sel    w15, w20, w26, FG0.C
-  bn.sel    w16, w21, w27, FG0.C
-
-  /* Re-load the curve order n.
-     w29 <= dmem[p256_n] = n */
-  li        x2, 29
-  la        x3, p256_n
-  bn.lid    x2, 0(x3)
-
-  /* Copy n into the MOD register. */
-  bn.wsrw   0, w29
-
-  /* Load Barrett constant for n.
-     w28 <= u_n = dmem[p256_u_n]  */
-  li        x2, 28
-  la        x3, p256_u_n
-  bn.lid    x2, 0(x3)
-
-  /* Check if the generated key is 0 modulo n.
-       FG0.Z <= if (d0 + d1) == 0 (mod n) then 0 else 1 */
-  jal       x1, check_scalar_nonzero
-
-  /* Read the FG0.Z flag (position 3).
-     x2 <= 8 if FG0.Z else 0 */
-  csrrw     x2, 0x7c0, x0
-  andi      x2, x2, 8
-
-  /* Jump to failure case if x2 != 0. */
-  bne       x2, x0, p256_secret_key_from_seed_fail
+  bn.sel    w20, w20, w26, FG0.C
+  bn.sel    w21, w21, w27, FG0.C
 
   ret
 
-  p256_secret_key_from_seed_fail:
-    /* Trigger a fault. */
-    unimp
-
-/**
- * Top-level P-256 ECDSA key generation.
- *
- * Returns the secret key d in two 320-bit shares d0 and d1, such that:
- *    d = (d0 + d1) mod n
- * ...where n is the curve order.
- *
- * This implementation is similar to FIPS 186-4 section B.4.1, where we
- * generate d using N+64 random bits (320 bits in this case) as a seed. But
- * while FIPS computes d = (seed mod (n-1)) + 1 to ensure a nonzero key, we
- * instead compute d = seed mod n and check for zero during point-scalar
- * multiplication while we get the public key.
- *
- * Most complexity in this routine comes from masking. The input seed is
- * provided in two 320-bit shares, seed0 and seed1, such that:
- *   seed = seed0 ^ seed1
- * Bits at position 320 and above in the input shares will be ignored.
- *
- * We then use Goubin's boolean-to-arithmetic masking algorithm to switch from
- * this boolean masking scheme to an arithmetic one without ever unmasking the
- * seed. See Algorithm 1 here:
- * https://link.springer.com/content/pdf/10.1007/3-540-44709-1_2.pdf
- *
- * For a Coq proof of the correctness of the basic computational logic here
- * see:
- *   https://gist.github.com/jadephilipoom/24f44c59cbe59327e2f753867564fa28#file-masked_reduce-v-L226
- *
- * The proof does not cover leakage properties; it mostly just shows that this
- * logic correctly computes (seed mod n) and the carry-handling works.
- *
- * This routine runs in constant time.
- *
- * Flags: Flags have no meaning beyond the scope of this subroutine.
- *
- * @param[in]  [w21, w20]: seed0, first share of seed (320 bits)
- * @param[in]  [w23, w22]: seed1, second share of seed (320 bits)
- * @param[out]   dmem[d0]: d0, first share of private key d (256 bits)
- * @param[out]   dmem[d1]: d1, second share of private key d (256 bits)
- *
- * clobbered registers: TODO
- * clobbered flag groups: FG0
- */
-p256_keygen:
-  /* Init all-zero register. */
-  bn.xor    w31, w31, w31
-
-
 .section .data
 
 /* P-256 domain parameter b */

From 3fbd384e97d0ffb93c5c88787b3a6977ed56ee3f Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Tue, 17 Jan 2023 10:31:01 +0100
Subject: [PATCH 14/28] fix bugs after testing

---
 sw/otbn/crypto/p256.s      | 47 +++++++++++++++++++++++---------------
 sw/otbn/crypto/tests/BUILD | 19 +++++++++++++++
 2 files changed, 48 insertions(+), 18 deletions(-)

diff --git a/sw/otbn/crypto/p256.s b/sw/otbn/crypto/p256.s
index 8b4af72cf5a52..eabf3ee3e1080 100644
--- a/sw/otbn/crypto/p256.s
+++ b/sw/otbn/crypto/p256.s
@@ -2125,7 +2125,7 @@ p256_generate_k:
 /**
  * Convert boolean shares to arithmetic ones using Goubin's algorithm.
  *
- * Returns x0 such that (s0 ^ s1) = (x0 + s1) mod 2^321.
+ * Returns x0, x1 such that (s0 ^ s1) = (x0 + x1) mod 2^321.
  *
  * The input consists of two 320-bit shares, s0 and s1. Bits at position 320
  * and above in the input shares will be ignored. We compute the result mod
@@ -2148,6 +2148,8 @@ p256_generate_k:
  *     A := ((s0 ^ G) - G) mod 2^k
  *     return x0 := (A ^ T2)
  *
+ * The output x1 is always (s1 mod 2^320).
+ *
  * This routine runs in constant time.
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
@@ -2156,21 +2158,19 @@ p256_generate_k:
  * @param[in]  [w23, w22]: s1, second share of seed (320 bits)
  * @param[in]         w31: all-zero
  * @param[out] [w21, w20]: result x0 (321 bits)
+ * @param[out] [w23, w22]: result x1 (320 bits)
  *
- * clobbered registers: w1 to w4, w20 to w23
+ * clobbered registers: w1 to w5, w20 to w23
  * clobbered flag groups: FG0
  */
 boolean_to_arithmetic:
-  /* Create a 64-bit mask.
-       w1 <= 2^64 - 1 */
-  bn.subi   w1, w31, 1
-  bn.rshi   w1, w31, w1 >> 192
-
   /* Mask out excess bits from seed shares.
        [w21, w20] <= s0 mod 2^320
        [w23, w22] <= s1 mod 2^320 = x1 */
-  bn.and    w21, w21, w1
-  bn.and    w23, w23, w1
+  bn.rshi   w21, w21, w31 >> 64
+  bn.rshi   w21, w31, w21 >> 192
+  bn.rshi   w23, w23, w31 >> 64
+  bn.rshi   w23, w31, w23 >> 192
 
   /* Fetch 321 bits of randomness from URND.
        [w2, w1] <= gamma */
@@ -2188,6 +2188,11 @@ boolean_to_arithmetic:
   bn.sub    w3, w3, w1
   bn.subb   w4, w4, w2
 
+  /* Truncate subtraction result to 321 bits.
+       [w4, w3] <= [w4, w3] mod 2^321 = T */
+  bn.rshi   w4, w4, w31 >> 65
+  bn.rshi   w4, w31, w4 >> 191
+
   /* [w4, w3] <= [w4, w3] ^ [w21, w20] = T2 */
   bn.xor    w3, w3, w20
   bn.xor    w4, w4, w21
@@ -2204,9 +2209,13 @@ boolean_to_arithmetic:
   bn.sub    w20, w20, w1
   bn.subb   w21, w21, w2
 
+  /* [w21, w20] <= [w21, w20] mod 2^321 = A */
+  bn.rshi   w21, w21, w31 >> 65
+  bn.rshi   w21, w31, w21 >> 191
+
   /* [w21, w20] <= [w21, w20] ^ [w4, w3] = A ^ T2 = x0 */
-  bn.xor    w20, w20, w1
-  bn.xor    w21, w21, w2
+  bn.xor    w20, w20, w3
+  bn.xor    w21, w21, w4
 
   ret
 
@@ -2298,7 +2307,7 @@ p256_key_from_seed:
   bn.rshi   w28, w29, w31 >> 192
   bn.rshi   w29, w31, w29 >> 192
 
-  /* [w25,w24] <= (x1 - (n << 64)) mod 2^256 */
+  /* [w25,w24] <= (x1 - (n << 64)) mod 2^512 */
   bn.sub    w24, w22, w28
   bn.subb   w25, w23, w29
 
@@ -2327,18 +2336,20 @@ p256_key_from_seed:
 
   /* Compute the correction factor.
        [w25,w24] <= (x[320] << 320) mod (n << 64) = c */
-  bn.sub    w26, w24, w28
+  bn.sub    w26, w31, w28
   bn.subb   w27, w25, w29
-  bn.sel    w24, w24, w26, FG0.C
+  bn.sel    w24, w31, w26, FG0.C
   bn.sel    w25, w25, w27, FG0.C
 
-  /* Compute d1 with a modular subtraction. First we add (n << 64) to protect
-     against underflow, then conditionally subtract it if needed.
+  /* Compute d0 with a modular subtraction. First we add (n << 64) to protect
+     against underflow, then conditionally subtract it again if needed.
        [w21,w20] <= ([w21, w20] - [w25,w24]) mod (n << 64) = d1 */
   bn.add    w20, w20, w28
   bn.addc   w21, w21, w29
-  bn.sub    w26, w20, w24
-  bn.subb   w27, w21, w25
+  bn.sub    w20, w20, w24
+  bn.subb   w21, w21, w25
+  bn.sub    w26, w20, w28
+  bn.subb   w27, w21, w29
   bn.sel    w20, w20, w26, FG0.C
   bn.sel    w21, w21, w27, FG0.C
 
diff --git a/sw/otbn/crypto/tests/BUILD b/sw/otbn/crypto/tests/BUILD
index 9031ce67921b5..ff9bbd8da398d 100644
--- a/sw/otbn/crypto/tests/BUILD
+++ b/sw/otbn/crypto/tests/BUILD
@@ -99,6 +99,25 @@ otbn_consttime_test(
     ],
 )
 
+otbn_consttime_test(
+    name = "p256_key_from_seed_consttime",
+    subroutine = "p256_key_from_seed",
+    deps = [
+        "//sw/otbn/crypto:p256_ecdsa",
+    ],
+)
+
+otbn_sim_test(
+    name = "p256_key_from_seed_test",
+    srcs = [
+        "p256_key_from_seed_test.s",
+    ],
+    exp = "p256_key_from_seed_test.exp",
+    deps = [
+        "//sw/otbn/crypto:p256",
+    ],
+)
+
 otbn_consttime_test(
     name = "p256_proj_add_consttime",
     subroutine = "proj_add",

From 67591a39329a012071d48b54f398037ce233609f Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Tue, 17 Jan 2023 10:31:42 +0100
Subject: [PATCH 15/28] add test for p256 key derivation

---
 .../crypto/tests/p256_key_from_seed_test.exp  |  6 ++
 .../crypto/tests/p256_key_from_seed_test.s    | 90 +++++++++++++++++++
 2 files changed, 96 insertions(+)
 create mode 100644 sw/otbn/crypto/tests/p256_key_from_seed_test.exp
 create mode 100644 sw/otbn/crypto/tests/p256_key_from_seed_test.s

diff --git a/sw/otbn/crypto/tests/p256_key_from_seed_test.exp b/sw/otbn/crypto/tests/p256_key_from_seed_test.exp
new file mode 100644
index 0000000000000..ac787d0d7ea8e
--- /dev/null
+++ b/sw/otbn/crypto/tests/p256_key_from_seed_test.exp
@@ -0,0 +1,6 @@
+# [w21, w20]: d0
+w20 = 0x69f4dddfceb69450b5461eaecc5617f7a9b21a37d1b6b5d5e16682969aa68ff0
+w21 = 0x5b0421c0bbeb8814
+# [w23, w22]: d1
+w22 = 0x54cd40e4208329529bb79deec3927359daf2e50762a8c404debb1d2488dafa5c
+w23 = 0x2f2876129e3e8356
diff --git a/sw/otbn/crypto/tests/p256_key_from_seed_test.s b/sw/otbn/crypto/tests/p256_key_from_seed_test.s
new file mode 100644
index 0000000000000..03cabb9e5649a
--- /dev/null
+++ b/sw/otbn/crypto/tests/p256_key_from_seed_test.s
@@ -0,0 +1,90 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Standalone test for P256 secret key derivation.
+ */
+
+.section .text.start
+
+key_from_seed_test:
+  /* Init all-zero register. */
+  bn.xor    w31, w31, w31
+
+  /* Load shares of seed from DMEM.
+       [w21,w20] <= dmem[seed0]
+       [w23,w22] <= dmem[seed1] */
+  li        x2, 20
+  la        x3, seed0
+  bn.lid    x2, 0(x3++)
+  li        x2, 21
+  bn.lid    x2++, 0(x3)
+  la        x3, seed1
+  bn.lid    x2, 0(x3++)
+  li        x2, 23
+  bn.lid    x2, 0(x3)
+
+  /* Generate the derived secret key. */
+  jal       x1, p256_key_from_seed
+
+  ecall
+
+.data
+
+/* Full test data for reference (randomly generated):
+
+Seed shares:
+0xa504e1c1c414883cea0f5e27cfba94f1cb4a21734c7af8085e561a3856f7bdde1e9a829fab5b7010
+0x2f2876129e3e835654cd40e4208329529bb79deec3927359daf2e50762a8c404debb1d2488dafa5c
+
+Expected key shares:
+0x5b0421c0bbeb881469f4dddfceb69450b5461eaecc5617f7a9b21a37d1b6b5d5e16682969aa68ff0
+0x2f2876129e3e835654cd40e4208329529bb79deec3927359daf2e50762a8c404debb1d2488dafa5c
+
+Real seed value:
+0x8a2c97d35a2a0b6abec21ec3ef39bda350fdbc9d8fe88b5184a4ff3f345f79dac0219fbb23818a4c
+
+Real masked value of key (seed mod n):
+0x18ec2a2e0ae31a657534e99429990b0d42bc6e1b9ab120bc1218be813585edae
+*/
+
+/* First share of seed (320 bits). */
+.balign 32
+seed0:
+  .word 0xab5b7010
+  .word 0x1e9a829f
+  .word 0x56f7bdde
+  .word 0x5e561a38
+  .word 0x4c7af808
+  .word 0xcb4a2173
+  .word 0xcfba94f1
+  .word 0xea0f5e27
+  .word 0xc414883c
+  .word 0xa504e1c1
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+
+/* Second share of seed (320 bits) */
+.balign 32
+seed1:
+  .word 0x88dafa5c
+  .word 0xdebb1d24
+  .word 0x62a8c404
+  .word 0xdaf2e507
+  .word 0xc3927359
+  .word 0x9bb79dee
+  .word 0x20832952
+  .word 0x54cd40e4
+  .word 0x9e3e8356
+  .word 0x2f287612
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000

From 6b5893e40f6bb28dfe623ac4307af2024260c4af Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Tue, 17 Jan 2023 10:39:27 +0100
Subject: [PATCH 16/28] change to a test that has a carry bit

---
 .../crypto/tests/p256_key_from_seed_test.exp  |  9 +--
 .../crypto/tests/p256_key_from_seed_test.s    | 55 +++++++++----------
 2 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/sw/otbn/crypto/tests/p256_key_from_seed_test.exp b/sw/otbn/crypto/tests/p256_key_from_seed_test.exp
index ac787d0d7ea8e..0b6ff554a2321 100644
--- a/sw/otbn/crypto/tests/p256_key_from_seed_test.exp
+++ b/sw/otbn/crypto/tests/p256_key_from_seed_test.exp
@@ -1,6 +1,7 @@
 # [w21, w20]: d0
-w20 = 0x69f4dddfceb69450b5461eaecc5617f7a9b21a37d1b6b5d5e16682969aa68ff0
-w21 = 0x5b0421c0bbeb8814
+w20 = 0x9def3b61bc577b4b45c0f8b23ed867e3302b5143e9e71859e3ef3615df0ace13
+w21 = 0xe46bcaf84b3890e1
+
 # [w23, w22]: d1
-w22 = 0x54cd40e4208329529bb79deec3927359daf2e50762a8c404debb1d2488dafa5c
-w23 = 0x2f2876129e3e8356
+w22 = 0x17bcfeef551f77d199dd9f5af7d1a8736f2f939abeb67c9e2df4bec0225596d6
+w23 = 0x63e2e86d4e67f1f7
diff --git a/sw/otbn/crypto/tests/p256_key_from_seed_test.s b/sw/otbn/crypto/tests/p256_key_from_seed_test.s
index 03cabb9e5649a..69bbcbefa894e 100644
--- a/sw/otbn/crypto/tests/p256_key_from_seed_test.s
+++ b/sw/otbn/crypto/tests/p256_key_from_seed_test.s
@@ -33,35 +33,34 @@ key_from_seed_test:
 .data
 
 /* Full test data for reference (randomly generated):
-
 Seed shares:
-0xa504e1c1c414883cea0f5e27cfba94f1cb4a21734c7af8085e561a3856f7bdde1e9a829fab5b7010
-0x2f2876129e3e835654cd40e4208329529bb79deec3927359daf2e50762a8c404debb1d2488dafa5c
+0x2bac5b0bd7c77320a210c4be446984ccbb6a02057843d9a2c48e8981128c13393c174a162335f23f
+0x63e2e86d4e67f1f717bcfeef551f77d199dd9f5af7d1a8736f2f939abeb67c9e2df4bec0225596d6
 
 Expected key shares:
-0x5b0421c0bbeb881469f4dddfceb69450b5461eaecc5617f7a9b21a37d1b6b5d5e16682969aa68ff0
-0x2f2876129e3e835654cd40e4208329529bb79deec3927359daf2e50762a8c404debb1d2488dafa5c
+0xe46bcaf84b3890e19def3b61bc577b4b45c0f8b23ed867e3302b5143e9e71859e3ef3615df0ace13
+0x63e2e86d4e67f1f717bcfeef551f77d199dd9f5af7d1a8736f2f939abeb67c9e2df4bec0225596d6
 
-Real seed value:
-0x8a2c97d35a2a0b6abec21ec3ef39bda350fdbc9d8fe88b5184a4ff3f345f79dac0219fbb23818a4c
+Real masked seed value:
+0x484eb36699a082d7b5ac3a511176f31d22b79d5f8f9271d1aba11a1bac3a6fa711e3f4d6016064e9
 
 Real masked value of key (seed mod n):
-0x18ec2a2e0ae31a657534e99429990b0d42bc6e1b9ab120bc1218be813585edae
+0x4f4cbd282f87bcdf35ab4783cd934744c2865e67bd8a418324fc72bdefdf454b
 */
 
 /* First share of seed (320 bits). */
 .balign 32
 seed0:
-  .word 0xab5b7010
-  .word 0x1e9a829f
-  .word 0x56f7bdde
-  .word 0x5e561a38
-  .word 0x4c7af808
-  .word 0xcb4a2173
-  .word 0xcfba94f1
-  .word 0xea0f5e27
-  .word 0xc414883c
-  .word 0xa504e1c1
+  .word 0x2335f23f
+  .word 0x3c174a16
+  .word 0x128c1339
+  .word 0xc48e8981
+  .word 0x7843d9a2
+  .word 0xbb6a0205
+  .word 0x446984cc
+  .word 0xa210c4be
+  .word 0xd7c77320
+  .word 0x2bac5b0b
   .word 0x00000000
   .word 0x00000000
   .word 0x00000000
@@ -72,16 +71,16 @@ seed0:
 /* Second share of seed (320 bits) */
 .balign 32
 seed1:
-  .word 0x88dafa5c
-  .word 0xdebb1d24
-  .word 0x62a8c404
-  .word 0xdaf2e507
-  .word 0xc3927359
-  .word 0x9bb79dee
-  .word 0x20832952
-  .word 0x54cd40e4
-  .word 0x9e3e8356
-  .word 0x2f287612
+  .word 0x225596d6
+  .word 0x2df4bec0
+  .word 0xbeb67c9e
+  .word 0x6f2f939a
+  .word 0xf7d1a873
+  .word 0x99dd9f5a
+  .word 0x551f77d1
+  .word 0x17bcfeef
+  .word 0x4e67f1f7
+  .word 0x63e2e86d
   .word 0x00000000
   .word 0x00000000
   .word 0x00000000

From 2e97f72bc6d00f01aa1138c35bb8bcab01dedefa Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Tue, 17 Jan 2023 11:53:39 +0100
Subject: [PATCH 17/28] add SCA interface for P256 keygen

---
 sw/device/sca/BUILD                           |  24 +-
 sw/device/sca/ecc256_keygen_serial.c          | 220 ++++++++++++++++++
 .../{ecc_serial.c => ecc256_sign_serial.c}    |   0
 sw/otbn/crypto/BUILD                          |  10 +
 sw/otbn/crypto/p256_key_from_seed_sca.s       |  88 +++++++
 5 files changed, 340 insertions(+), 2 deletions(-)
 create mode 100644 sw/device/sca/ecc256_keygen_serial.c
 rename sw/device/sca/{ecc_serial.c => ecc256_sign_serial.c} (100%)
 create mode 100644 sw/otbn/crypto/p256_key_from_seed_sca.s

diff --git a/sw/device/sca/BUILD b/sw/device/sca/BUILD
index 8c192f39a0741..3f529e5eaf581 100644
--- a/sw/device/sca/BUILD
+++ b/sw/device/sca/BUILD
@@ -85,8 +85,28 @@ opentitan_flash_binary(
 )
 
 opentitan_flash_binary(
-    name = "ecc_serial",
-    srcs = ["ecc_serial.c"],
+    name = "ecc256_keygen_serial",
+    srcs = ["ecc256_keygen_serial.c"],
+    deps = [
+        "//hw/top_earlgrey/sw/autogen:top_earlgrey",
+        "//sw/device/lib/base:abs_mmio",
+        "//sw/device/lib/base:memory",
+        "//sw/device/lib/crypto/drivers:otbn",
+        "//sw/device/lib/runtime:ibex",
+        "//sw/device/lib/runtime:log",
+        "//sw/device/lib/testing:entropy_testutils",
+        "//sw/device/lib/testing/test_framework:ottf_ld_silicon_creator_slot_a",
+        "//sw/device/lib/testing/test_framework:ottf_main",
+        "//sw/device/sca/lib:prng",
+        "//sw/device/sca/lib:sca",
+        "//sw/device/sca/lib:simple_serial",
+        "//sw/otbn/crypto:p256_key_from_seed_sca",
+    ],
+)
+
+opentitan_flash_binary(
+    name = "ecc256_sign_serial",
+    srcs = ["ecc256_sign_serial.c"],
     deps = [
         "//hw/top_earlgrey/sw/autogen:top_earlgrey",
         "//sw/device/lib/base:abs_mmio",
diff --git a/sw/device/sca/ecc256_keygen_serial.c b/sw/device/sca/ecc256_keygen_serial.c
new file mode 100644
index 0000000000000..6612407f685e2
--- /dev/null
+++ b/sw/device/sca/ecc256_keygen_serial.c
@@ -0,0 +1,220 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "sw/device/lib/base/abs_mmio.h"
+#include "sw/device/lib/base/memory.h"
+#include "sw/device/lib/crypto/drivers/otbn.h"
+#include "sw/device/lib/runtime/ibex.h"
+#include "sw/device/lib/runtime/log.h"
+#include "sw/device/lib/testing/entropy_testutils.h"
+#include "sw/device/lib/testing/test_framework/ottf_main.h"
+#include "sw/device/lib/testing/test_framework/ottf_test_config.h"
+#include "sw/device/sca/lib/sca.h"
+#include "sw/device/sca/lib/simple_serial.h"
+
+#include "hw/top_earlgrey/sw/autogen/top_earlgrey.h"
+#include "otbn_regs.h"
+
+/**
+ * OpenTitan program for OTBN ECDSA-P256 side-channel analysis.
+ *
+ * This program implements the following simple serial commands:
+ *   - Set seed ('s')*,
+ *   - Keygen ('k')+,
+ * See https://wiki.newae.com/SimpleSerial for details on the protocol.
+ */
+
+OTTF_DEFINE_TEST_CONFIG();
+
+enum {
+  /**
+   * Number of bytes for ECDSA P-256 seeds and masked private keys.
+   */
+  kEcc256SeedNumBytes = (256 + 64) / 8,
+  /**
+   * Number of 32b words for ECDSA P-256 seeds and masked private keys.
+   */
+  kEcc256SeedNumWords = kEcc256SeedNumBytes / sizeof(uint32_t),
+};
+
+OTBN_DECLARE_APP_SYMBOLS(p256_key_from_seed_sca);
+
+OTBN_DECLARE_SYMBOL_ADDR(p256_key_from_seed_sca, mode);
+OTBN_DECLARE_SYMBOL_ADDR(p256_key_from_seed_sca, seed0);
+OTBN_DECLARE_SYMBOL_ADDR(p256_key_from_seed_sca, seed1);
+OTBN_DECLARE_SYMBOL_ADDR(p256_key_from_seed_sca, d0);
+OTBN_DECLARE_SYMBOL_ADDR(p256_key_from_seed_sca, d1);
+
+static const otbn_app_t kOtbnAppP256KeyFromSeed = OTBN_APP_T_INIT(p256_key_from_seed_sca);
+
+static const otbn_addr_t kOtbnVarMode = OTBN_ADDR_T_INIT(p256_key_from_seed_sca, mode);
+static const otbn_addr_t kOtbnVarSeed0 = OTBN_ADDR_T_INIT(p256_key_from_seed_sca, seed0);
+static const otbn_addr_t kOtbnVarSeed1 = OTBN_ADDR_T_INIT(p256_key_from_seed_sca, seed1);
+static const otbn_addr_t kOtbnVarD0 = OTBN_ADDR_T_INIT(p256_key_from_seed_sca, d0);
+static const otbn_addr_t kOtbnVarD1 = OTBN_ADDR_T_INIT(p256_key_from_seed_sca, d1);
+
+
+/**
+ * Seed value.
+ *
+ * The default value corresponds to the test data in
+ *   sw/otbn/crypto/tests/p256_key_from_seed_test.s
+ *
+ * This default value can be overwritten via the simpleserial command `s`
+ * (see ecc256_set_seed)
+ */
+uint32_t ecc256_seed[kEcc256SeedNumWords] = {
+  0x016064e9,
+  0x11e3f4d6,
+  0xac3a6fa7,
+  0xaba11a1b,
+  0x8f9271d1,
+  0x22b79d5f,
+  0x1176f31d,
+  0xb5ac3a51,
+  0x99a082d7,
+  0x484eb366,
+};
+
+/**
+ * Simple serial 's' (set seed) command handler.
+ *
+ * The seed must be `kEcc256SeedNumBytes` bytes long.
+ *
+ * @param seed Value for seed share.
+ * @param seed_len Length of seed share.
+ */
+static void ecc256_set_seed(const uint8_t *seed, size_t seed_len) {
+  SS_CHECK(seed_len == kEcc256SeedNumBytes);
+  memcpy(ecc256_seed, seed, seed_len);
+}
+
+/**
+ * Generates a private key from a masked seed.
+ *
+ * The seed shares must be `kEcc256SeedNumWords` words long, and the caller
+ * must provide pre-allocated buffers of the same length for the key shares.
+ *
+ * @param[in] seed  Seed for key generation.
+ * @param[in] mask  Mask for seed.
+ * @param[out] d0   First share of masked private key d. 
+ * @param[out] d1   Second share of masked private key d.
+ */
+static void p256_ecdsa_keygen(const uint32_t *seed, const uint32_t *mask,
+                              uint32_t *d0, uint32_t *d1) {
+
+  // Write mode.
+  uint32_t mode = 1;  // mode 1 => generate private key
+  SS_CHECK(otbn_dmem_write(/*num_words=*/1, &mode, kOtbnVarMode) ==
+           kOtbnErrorOk);
+
+  // Compute first share of seed (seed ^ mask).
+  uint32_t seed0[kEcc256SeedNumWords];
+  for (size_t i = 0; i < kEcc256SeedNumWords; i++) {
+    seed0[i] = seed[i] ^ mask[i];
+  }
+
+  // Write seed shares.
+  SS_CHECK(otbn_dmem_write(kEcc256SeedNumWords, seed0, kOtbnVarSeed0) == kOtbnErrorOk);
+  SS_CHECK(otbn_dmem_write(kEcc256SeedNumWords, mask, kOtbnVarSeed1) == kOtbnErrorOk);
+
+  // Set high bits of seed0, seed1 to all-zero. These bits are ignored in the
+  // implementation but must be written to avoid runtime errors.
+  size_t offset = kEcc256SeedNumWords % kOtbnWideWordNumWords;
+  size_t num_zeroes = (kOtbnWideWordNumWords - offset) % kOtbnWideWordNumWords;
+  SS_CHECK(otbn_dmem_set(num_zeroes, 0, kOtbnVarSeed0 + kEcc256SeedNumBytes) ==
+           kOtbnErrorOk);
+  SS_CHECK(otbn_dmem_set(num_zeroes, 0, kOtbnVarSeed1 + kEcc256SeedNumBytes) ==
+           kOtbnErrorOk);
+
+  LOG_INFO("Executing program...");
+  SS_CHECK(otbn_execute() == kOtbnErrorOk);
+  SS_CHECK(otbn_busy_wait_for_done() == kOtbnErrorOk);
+
+  LOG_INFO("Reading results...");
+  SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD0, d0) ==
+           kOtbnErrorOk);
+  SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD1, d1) ==
+           kOtbnErrorOk);
+  for (size_t i = 0; i < kEcc256SeedNumWords; i++) {
+    LOG_INFO("d0[%d]: 0x%08x", i, d0[i]);
+  }
+  for (size_t i = 0; i < kEcc256SeedNumWords; i++) {
+    LOG_INFO("d1[%d]: 0x%08x", i, d1[i]);
+  }
+}
+
+/**
+ * Simple serial 'k' (keygen) command handler.
+ *
+ * Takes the mask value from the simple serial UART and triggers an OTBN
+ * secret key generation operation. The mask must be `kEcc256SeedNumBytes`
+ * bytes long.
+ *
+ * Uses a fixed seed. To overwrite the seed, use the simpleserial command 's'.
+ *
+ * @param[in] mask The mask provided by the simpleserial UART.
+ * @param[in] mask_len Length of the mask.
+ */
+static void ecc256_ecdsa_keygen(const uint8_t *mask,
+                         size_t mask_len) {
+  if (mask_len != kEcc256SeedNumBytes) {
+    LOG_ERROR("Invalid mask length %hu", (uint8_t)mask_len);
+    return;
+  }
+
+  // Copy mask to an aligned buffer.
+  uint32_t ecc256_mask[kEcc256SeedNumWords];
+  memcpy(ecc256_mask, mask, kEcc256SeedNumBytes);
+
+  LOG_INFO("Loading app...");
+  SS_CHECK(otbn_load_app(kOtbnAppP256KeyFromSeed) == kOtbnErrorOk);
+
+  uint32_t ecc256_d0[kEcc256SeedNumWords];
+  uint32_t ecc256_d1[kEcc256SeedNumWords];
+
+  LOG_INFO("Running keygen...");
+  sca_set_trigger_high();
+  p256_ecdsa_keygen(ecc256_seed, ecc256_mask, ecc256_d0, ecc256_d1);
+  sca_set_trigger_low();
+
+  // TODO: Remove these if they are not necessary for the side-channel analysis.
+  simple_serial_send_packet('r', (unsigned char *)ecc256_d0, kEcc256SeedNumBytes);
+  simple_serial_send_packet('r', (unsigned char *)ecc256_d1, kEcc256SeedNumBytes);
+
+  LOG_INFO("Clearing OTBN memory.");
+  SS_CHECK(otbn_dmem_sec_wipe() == kOtbnErrorOk);
+  SS_CHECK(otbn_imem_sec_wipe() == kOtbnErrorOk);
+}
+
+/**
+ * Initializes peripherals and processes simple serial packets received over
+ * UART.
+ */
+static void simple_serial_main(void) {
+  entropy_testutils_auto_mode_init();
+
+  sca_init(kScaTriggerSourceOtbn, kScaPeripheralEntropy | kScaPeripheralIoDiv4 |
+                                      kScaPeripheralOtbn | kScaPeripheralCsrng |
+                                      kScaPeripheralEdn | kScaPeripheralHmac);
+
+  LOG_INFO("Running ECC serial");
+  LOG_INFO("Initializing simple serial interface to capture board.");
+
+  simple_serial_init(sca_get_uart());
+  SS_CHECK(simple_serial_register_handler('k', ecc256_ecdsa_keygen) ==
+           kSimpleSerialOk);
+  SS_CHECK(simple_serial_register_handler('s', ecc256_set_seed) ==
+           kSimpleSerialOk);
+
+  LOG_INFO("Starting simple serial packet handling.");
+  while (true) {
+    simple_serial_process_packet();
+  }
+}
+
+bool test_main(void) {
+  simple_serial_main();
+  return true;
+}
diff --git a/sw/device/sca/ecc_serial.c b/sw/device/sca/ecc256_sign_serial.c
similarity index 100%
rename from sw/device/sca/ecc_serial.c
rename to sw/device/sca/ecc256_sign_serial.c
diff --git a/sw/otbn/crypto/BUILD b/sw/otbn/crypto/BUILD
index f8074f32d026d..2d82ab13c6f0d 100644
--- a/sw/otbn/crypto/BUILD
+++ b/sw/otbn/crypto/BUILD
@@ -143,6 +143,16 @@ otbn_binary(
     ],
 )
 
+otbn_binary(
+    name = "p256_key_from_seed_sca",
+    srcs = [
+        "p256_key_from_seed_sca.s",
+    ],
+    deps = [
+        ":p256",
+    ],
+)
+
 otbn_binary(
     name = "p384_ecdsa_sca",
     srcs = [
diff --git a/sw/otbn/crypto/p256_key_from_seed_sca.s b/sw/otbn/crypto/p256_key_from_seed_sca.s
new file mode 100644
index 0000000000000..6972f3c30e3cd
--- /dev/null
+++ b/sw/otbn/crypto/p256_key_from_seed_sca.s
@@ -0,0 +1,88 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Wrapper specifically for SCA/formal analysis of p256 keygen.
+ *
+ * Normally, the key generation routines called here would be used with key
+ * manager seeds only. This wrapper uses software-provided seeds for analysis
+ * purposes and should not be used for production code.
+ */
+
+.section .text.start
+
+start:
+  /* Init all-zero register (p256_key_from_seed is called with w31 = 0). */
+  bn.xor    w31, w31, w31
+
+  /* Load shares of seed from DMEM (two 256-bit loads per 512-bit buffer).
+       [w21,w20] <= dmem[seed0]
+       [w23,w22] <= dmem[seed1] */
+  li        x2, 20
+  la        x3, seed0
+  bn.lid    x2, 0(x3++)
+  li        x2, 21
+  bn.lid    x2++, 0(x3)
+  la        x3, seed1
+  bn.lid    x2, 0(x3++)
+  li        x2, 23
+  bn.lid    x2, 0(x3)
+
+  /* Generate the derived secret key.
+       [w21,w20] <= d0
+       [w23,w22] <= d1 */
+  jal       x1, p256_key_from_seed
+
+  /* Write the results to DMEM; every access here must be a store (bn.sid).
+       dmem[d0] <= [w21, w20]
+       dmem[d1] <= [w23, w22] */
+  li        x2, 20
+  la        x3, d0
+  bn.sid    x2, 0(x3++)
+  li        x2, 21
+  bn.sid    x2++, 0(x3)
+  la        x3, d1
+  bn.sid    x2, 0(x3++)
+  li        x2, 23
+  bn.sid    x2, 0(x3)
+
+  ecall
+
+.bss
+
+/* Mode (1 = private key only) */
+.balign 4
+.globl mode
+mode:
+.zero 4
+
+/**
+ * Note: Software must write the full 512 bits of the seed values to avoid
+ * runtime errors when OTBN tries to load the data with two 256-bit loads. Bits
+ * above position 320 will be ignored.
+ */
+
+/* First share of seed (320 bits). */
+.balign 32
+.globl seed0
+seed0:
+.zero 64
+
+/* Second share of seed (320 bits). */
+.balign 32
+.globl seed1
+seed1:
+.zero 64
+
+/* First share of output key (320 bits). */
+.balign 32
+.globl d0
+d0:
+.zero 64
+
+/* Second share of output key (320 bits). */
+.balign 32
+.globl d1
+d1:
+.zero 64

From 727415f04c96b270f1dfcead200035f1ea626337 Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Tue, 17 Jan 2023 13:58:48 +0100
Subject: [PATCH 18/28] add keypair generation

---
 sw/device/sca/ecc256_keygen_serial.c    | 140 +++++++++++++++++++++---
 sw/otbn/crypto/p256.s                   |   8 +-
 sw/otbn/crypto/p256_key_from_seed_sca.s |  41 ++++++-
 3 files changed, 171 insertions(+), 18 deletions(-)

diff --git a/sw/device/sca/ecc256_keygen_serial.c b/sw/device/sca/ecc256_keygen_serial.c
index 6612407f685e2..0af6c02a581f1 100644
--- a/sw/device/sca/ecc256_keygen_serial.c
+++ b/sw/device/sca/ecc256_keygen_serial.c
@@ -21,7 +21,8 @@
  *
  * This program implements the following simple serial commands:
  *   - Set seed ('s')*,
- *   - Keygen ('k')+,
+ *   - Secret key generation ('k')+,
+ *   - Keypair generation ('p')+,
  * See https://wiki.newae.com/SimpleSerial for details on the protocol.
  */
 
@@ -31,11 +32,27 @@ enum {
   /**
    * Number of bytes for ECDSA P-256 seeds and masked private keys.
    */
-  kEcc256SeedNumBytes = (256 + 64) / 8,
+  kEcc256SeedNumBytes = 320 / 8,
   /**
    * Number of 32b words for ECDSA P-256 seeds and masked private keys.
    */
   kEcc256SeedNumWords = kEcc256SeedNumBytes / sizeof(uint32_t),
+  /**
+   * Number of bytes for ECDSA P-256 point coordinates.
+   */
+  kEcc256CoordNumBytes = 256 / 8,
+  /**
+   * Number of 32b words for ECDSA P-256 point coordinates.
+   */
+  kEcc256CoordNumWords = kEcc256CoordNumBytes / sizeof(uint32_t),
+  /**
+   * Mode option for the ECDSA keygen app (generates the private key only).
+   */
+  kEcc256ModePrivateKeyOnly = 1,
+  /**
+   * Mode option for the ECDSA keygen app (generates the full keypair).
+   */
+  kEcc256ModeKeypair = 2,
 };
 
 OTBN_DECLARE_APP_SYMBOLS(p256_key_from_seed_sca);
@@ -45,6 +62,8 @@ OTBN_DECLARE_SYMBOL_ADDR(p256_key_from_seed_sca, seed0);
 OTBN_DECLARE_SYMBOL_ADDR(p256_key_from_seed_sca, seed1);
 OTBN_DECLARE_SYMBOL_ADDR(p256_key_from_seed_sca, d0);
 OTBN_DECLARE_SYMBOL_ADDR(p256_key_from_seed_sca, d1);
+OTBN_DECLARE_SYMBOL_ADDR(p256_key_from_seed_sca, x);
+OTBN_DECLARE_SYMBOL_ADDR(p256_key_from_seed_sca, y);
 
 static const otbn_app_t kOtbnAppP256KeyFromSeed = OTBN_APP_T_INIT(p256_key_from_seed_sca);
 
@@ -53,6 +72,8 @@ static const otbn_addr_t kOtbnVarSeed0 = OTBN_ADDR_T_INIT(p256_key_from_seed_sca
 static const otbn_addr_t kOtbnVarSeed1 = OTBN_ADDR_T_INIT(p256_key_from_seed_sca, seed1);
 static const otbn_addr_t kOtbnVarD0 = OTBN_ADDR_T_INIT(p256_key_from_seed_sca, d0);
 static const otbn_addr_t kOtbnVarD1 = OTBN_ADDR_T_INIT(p256_key_from_seed_sca, d1);
+static const otbn_addr_t kOtbnVarX = OTBN_ADDR_T_INIT(p256_key_from_seed_sca, x);
+static const otbn_addr_t kOtbnVarY = OTBN_ADDR_T_INIT(p256_key_from_seed_sca, y);
 
 
 /**
@@ -101,11 +122,11 @@ static void ecc256_set_seed(const uint8_t *seed, size_t seed_len) {
  * @param[out] d0   First share of masked private key d. 
  * @param[out] d1   Second share of masked private key d.
  */
-static void p256_ecdsa_keygen(const uint32_t *seed, const uint32_t *mask,
+static void p256_ecdsa_gen_secret_key(const uint32_t *seed, const uint32_t *mask,
                               uint32_t *d0, uint32_t *d1) {
 
   // Write mode.
-  uint32_t mode = 1;  // mode 1 => generate private key
+  uint32_t mode = kEcc256ModePrivateKeyOnly;
   SS_CHECK(otbn_dmem_write(/*num_words=*/1, &mode, kOtbnVarMode) ==
            kOtbnErrorOk);
 
@@ -128,25 +149,114 @@ static void p256_ecdsa_keygen(const uint32_t *seed, const uint32_t *mask,
   SS_CHECK(otbn_dmem_set(num_zeroes, 0, kOtbnVarSeed1 + kEcc256SeedNumBytes) ==
            kOtbnErrorOk);
 
-  LOG_INFO("Executing program...");
   SS_CHECK(otbn_execute() == kOtbnErrorOk);
   SS_CHECK(otbn_busy_wait_for_done() == kOtbnErrorOk);
 
-  LOG_INFO("Reading results...");
   SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD0, d0) ==
            kOtbnErrorOk);
   SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD1, d1) ==
            kOtbnErrorOk);
+}
+
+/**
+ * Generates a keypair from a masked seed.
+ *
+ * The seed shares must be `kEcc256SeedNumWords` words long, and the caller
+ * must provide pre-allocated buffers of the same length for the private key
+ * shares and of length `kEcc256CoordNumWords` for the public key coordinates. 
+ *
+ * @param[in] seed  Seed for key generation.
+ * @param[in] mask  Mask for seed.
+ * @param[out] d0   First share of masked private key d. 
+ * @param[out] d1   Second share of masked private key d.
+ * @param[out] x    x-coordinate of public key Q.
+ * @param[out] y    y-coordinate of public key Q.
+ */
+static void p256_ecdsa_gen_keypair(const uint32_t *seed, const uint32_t *mask,
+                              uint32_t *d0, uint32_t *d1, uint32_t *x, uint32_t *y) {
+
+  // Write mode.
+  uint32_t mode = kEcc256ModeKeypair;
+  SS_CHECK(otbn_dmem_write(/*num_words=*/1, &mode, kOtbnVarMode) ==
+           kOtbnErrorOk);
+
+  // Compute first share of seed (seed ^ mask).
+  uint32_t seed0[kEcc256SeedNumWords];
   for (size_t i = 0; i < kEcc256SeedNumWords; i++) {
-    LOG_INFO("d0[%d]: 0x%08x", i, d0[i]);
+    seed0[i] = seed[i] ^ mask[i];
   }
-  for (size_t i = 0; i < kEcc256SeedNumWords; i++) {
-    LOG_INFO("d1[%d]: 0x%08x", i, d1[i]);
+
+  // Write seed shares.
+  SS_CHECK(otbn_dmem_write(kEcc256SeedNumWords, seed0, kOtbnVarSeed0) == kOtbnErrorOk);
+  SS_CHECK(otbn_dmem_write(kEcc256SeedNumWords, mask, kOtbnVarSeed1) == kOtbnErrorOk);
+
+  // Set high bits of seed0, seed1 to all-zero. These bits are ignored in the
+  // implementation but must be written to avoid runtime errors.
+  size_t offset = kEcc256SeedNumWords % kOtbnWideWordNumWords;
+  size_t num_zeroes = (kOtbnWideWordNumWords - offset) % kOtbnWideWordNumWords;
+  SS_CHECK(otbn_dmem_set(num_zeroes, 0, kOtbnVarSeed0 + kEcc256SeedNumBytes) ==
+           kOtbnErrorOk);
+  SS_CHECK(otbn_dmem_set(num_zeroes, 0, kOtbnVarSeed1 + kEcc256SeedNumBytes) ==
+           kOtbnErrorOk);
+
+  SS_CHECK(otbn_execute() == kOtbnErrorOk);
+  SS_CHECK(otbn_busy_wait_for_done() == kOtbnErrorOk);
+
+  SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD0, d0) ==
+           kOtbnErrorOk);
+  SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD1, d1) ==
+           kOtbnErrorOk);
+  SS_CHECK(otbn_dmem_read(kEcc256CoordNumWords, kOtbnVarX, x) ==
+           kOtbnErrorOk);
+  SS_CHECK(otbn_dmem_read(kEcc256CoordNumWords, kOtbnVarY, y) ==
+           kOtbnErrorOk);
+}
+
+/**
+ * Simple serial 'k' (secret keygen) command handler.
+ *
+ * Takes the mask value from the simple serial UART and triggers an OTBN
+ * secret key generation operation. The mask must be `kEcc256SeedNumBytes`
+ * bytes long.
+ *
+ * Uses a fixed seed. To overwrite the seed, use the simpleserial command 's'.
+ *
+ * @param[in] mask The mask provided by the simpleserial UART.
+ * @param[in] mask_len Length of the mask.
+ */
+static void ecc256_ecdsa_secret_keygen(const uint8_t *mask,
+                         size_t mask_len) {
+  if (mask_len != kEcc256SeedNumBytes) {
+    LOG_ERROR("Invalid mask length %hu", (uint8_t)mask_len);
+    return;
   }
+
+  // Copy mask to an aligned buffer.
+  uint32_t ecc256_mask[kEcc256SeedNumWords];
+  memcpy(ecc256_mask, mask, kEcc256SeedNumBytes);
+
+  LOG_INFO("Loading app...");
+  SS_CHECK(otbn_load_app(kOtbnAppP256KeyFromSeed) == kOtbnErrorOk);
+
+  uint32_t ecc256_d0[kEcc256SeedNumWords];
+  uint32_t ecc256_d1[kEcc256SeedNumWords];
+
+  LOG_INFO("Running keygen...");
+  sca_set_trigger_high();
+  p256_ecdsa_gen_secret_key(ecc256_seed, ecc256_mask, ecc256_d0, ecc256_d1);
+  sca_set_trigger_low();
+
+  // TODO: Remove these if they are not necessary for the side-channel analysis.
+  simple_serial_send_packet('r', (unsigned char *)ecc256_d0, kEcc256SeedNumBytes);
+  simple_serial_send_packet('r', (unsigned char *)ecc256_d1, kEcc256SeedNumBytes);
+
+  LOG_INFO("Clearing OTBN memory.");
+  SS_CHECK(otbn_dmem_sec_wipe() == kOtbnErrorOk);
+  SS_CHECK(otbn_imem_sec_wipe() == kOtbnErrorOk);
 }
 
 /**
- * Simple serial 'k' (keygen) command handler.
+ * Simple serial 'p' (keypair generation) command handler.
  *
  * Takes the mask value from the simple serial UART and triggers an OTBN
  * secret key generation operation. The mask must be `kEcc256SeedNumBytes`
@@ -157,7 +267,7 @@ static void p256_ecdsa_keygen(const uint32_t *seed, const uint32_t *mask,
  * @param[in] mask The mask provided by the simpleserial UART.
  * @param[in] mask_len Length of the mask.
  */
-static void ecc256_ecdsa_keygen(const uint8_t *mask,
+static void ecc256_ecdsa_gen_keypair(const uint8_t *mask,
                          size_t mask_len) {
   if (mask_len != kEcc256SeedNumBytes) {
     LOG_ERROR("Invalid mask length %hu", (uint8_t)mask_len);
@@ -173,10 +283,12 @@ static void ecc256_ecdsa_keygen(const uint8_t *mask,
 
   uint32_t ecc256_d0[kEcc256SeedNumWords];
   uint32_t ecc256_d1[kEcc256SeedNumWords];
+  uint32_t ecc256_x[kEcc256CoordNumWords];
+  uint32_t ecc256_y[kEcc256CoordNumWords];
 
   LOG_INFO("Running keygen...");
   sca_set_trigger_high();
-  p256_ecdsa_keygen(ecc256_seed, ecc256_mask, ecc256_d0, ecc256_d1);
+  p256_ecdsa_gen_keypair(ecc256_seed, ecc256_mask, ecc256_d0, ecc256_d1, ecc256_x, ecc256_y);
   sca_set_trigger_low();
 
   // TODO: Remove these if they are not necessary for the side-channel analysis.
@@ -203,7 +315,9 @@ static void simple_serial_main(void) {
   LOG_INFO("Initializing simple serial interface to capture board.");
 
   simple_serial_init(sca_get_uart());
-  SS_CHECK(simple_serial_register_handler('k', ecc256_ecdsa_keygen) ==
+  SS_CHECK(simple_serial_register_handler('k', ecc256_ecdsa_secret_keygen) ==
+           kSimpleSerialOk);
+  SS_CHECK(simple_serial_register_handler('p', ecc256_ecdsa_gen_keypair) ==
            kSimpleSerialOk);
   SS_CHECK(simple_serial_register_handler('s', ecc256_set_seed) ==
            kSimpleSerialOk);
diff --git a/sw/otbn/crypto/p256.s b/sw/otbn/crypto/p256.s
index eabf3ee3e1080..b6ea916769f5c 100644
--- a/sw/otbn/crypto/p256.s
+++ b/sw/otbn/crypto/p256.s
@@ -1453,10 +1453,10 @@ p256_sign:
  *
  * This routine runs in constant time.
  *
- * @param[in]     dmem[d0]:  first share of scalar d (320 bits)
- * @param[in]     dmem[d1]:  second share of scalar d (320 bits)
- * @param[in,out] dmem[x]:   affine x-coordinate (256 bits)
- * @param[in,out] dmem[y]:   affine y-coordinate (256 bits)
+ * @param[in]   dmem[d0]:  first share of scalar d (320 bits)
+ * @param[in]   dmem[d1]:  second share of scalar d (320 bits)
+ * @param[out]  dmem[x]:   affine x-coordinate (256 bits)
+ * @param[out]  dmem[y]:   affine y-coordinate (256 bits)
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
diff --git a/sw/otbn/crypto/p256_key_from_seed_sca.s b/sw/otbn/crypto/p256_key_from_seed_sca.s
index 6972f3c30e3cd..124833fecf0e2 100644
--- a/sw/otbn/crypto/p256_key_from_seed_sca.s
+++ b/sw/otbn/crypto/p256_key_from_seed_sca.s
@@ -13,6 +13,20 @@
 .section .text.start
 
 start:
+  /* Read mode, then tail-call either p256_gen_secret_key or p256_gen_keypair */
+  la    x2, mode
+  lw    x2, 0(x2)
+
+  li    x3, 1
+  beq   x2, x3, p256_gen_secret_key
+
+  li    x3, 2
+  beq   x2, x3, p256_gen_keypair
+
+  /* Invalid mode; fail. */
+  unimp
+
+p256_gen_secret_key:
   /* Init all-zero register. */
   bn.xor    w31, w31, w31
 
@@ -49,9 +63,22 @@ start:
 
   ecall
 
+p256_gen_keypair:
+  /* First, generate the masked secret key d (dmem[d0] <= d0, dmem[d1] <= d1).
+     NOTE(review): p256_gen_secret_key above ends in `ecall`, not `ret`, so
+     this call appears to halt OTBN before p256_base_mult runs -- confirm. */
+  jal       x1, p256_gen_secret_key
+
+  /* Generate the public key Q = d*G.
+       dmem[x] <= Q.x
+       dmem[y] <= Q.y */
+  jal       x1, p256_base_mult
+
+  ecall
+
 .bss
 
-/* Mode (1 = private key only) */
+/* Mode (1 = private key only, 2 = keypair) */
 .balign 4
 .globl mode
 mode:
@@ -86,3 +113,15 @@ d0:
 .globl d1
 d1:
 .zero 64
+
+/* x-coordinate of output public key (256 bits). */
+.balign 32
+.globl x
+x:
+.zero 32
+
+/* y-coordinate of output public key (256 bits). */
+.balign 32
+.globl y
+y:
+.zero 32

From 2698902655a8d0ffe13840de00c5e3d1f961dfc6 Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Tue, 17 Jan 2023 15:25:35 +0100
Subject: [PATCH 19/28] [sca] Factor shared keygen steps into p256_run_keygen

---
 sw/device/sca/ecc256_keygen_serial.c | 65 +++++++++++++---------------
 1 file changed, 31 insertions(+), 34 deletions(-)

diff --git a/sw/device/sca/ecc256_keygen_serial.c b/sw/device/sca/ecc256_keygen_serial.c
index 0af6c02a581f1..6822244445b36 100644
--- a/sw/device/sca/ecc256_keygen_serial.c
+++ b/sw/device/sca/ecc256_keygen_serial.c
@@ -112,21 +112,17 @@ static void ecc256_set_seed(const uint8_t *seed, size_t seed_len) {
 }
 
 /**
- * Generates a private key from a masked seed.
+ * Runs the OTBN key generation program.
  *
- * The seed shares must be `kEcc256SeedNumWords` words long, and the caller
- * must provide pre-allocated buffers of the same length for the key shares.
+ * The seed shares must be `kEcc256SeedNumWords` words long.
  *
+ * @param[in] mode  Mode parameter (private key only or full keypair).
  * @param[in] seed  Seed for key generation.
  * @param[in] mask  Mask for seed.
- * @param[out] d0   First share of masked private key d. 
- * @param[out] d1   Second share of masked private key d.
  */
-static void p256_ecdsa_gen_secret_key(const uint32_t *seed, const uint32_t *mask,
-                              uint32_t *d0, uint32_t *d1) {
+static void p256_run_keygen(uint32_t mode, const uint32_t *seed, const uint32_t *mask) {
 
   // Write mode.
-  uint32_t mode = kEcc256ModePrivateKeyOnly;
   SS_CHECK(otbn_dmem_write(/*num_words=*/1, &mode, kOtbnVarMode) ==
            kOtbnErrorOk);
 
@@ -149,9 +145,30 @@ static void p256_ecdsa_gen_secret_key(const uint32_t *seed, const uint32_t *mask
   SS_CHECK(otbn_dmem_set(num_zeroes, 0, kOtbnVarSeed1 + kEcc256SeedNumBytes) ==
            kOtbnErrorOk);
 
+  // Execute program.
   SS_CHECK(otbn_execute() == kOtbnErrorOk);
   SS_CHECK(otbn_busy_wait_for_done() == kOtbnErrorOk);
+}
+
+/**
+ * Generates a secret key from a masked seed.
+ *
+ * The seed shares must be `kEcc256SeedNumWords` words long, and the caller
+ * must provide pre-allocated buffers of the same length for the private key
+ * shares. 
+ *
+ * @param[in] seed  Seed for key generation.
+ * @param[in] mask  Mask for seed.
+ * @param[out] d0   First share of masked private key d. 
+ * @param[out] d1   Second share of masked private key d.
+ */
+static void p256_ecdsa_gen_secret_key(const uint32_t *seed, const uint32_t *mask,
+                              uint32_t *d0, uint32_t *d1) {
+
+  // Run the key generation program.
+  p256_run_keygen(kEcc256ModePrivateKeyOnly, seed, mask);
 
+  // Read results.
   SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD0, d0) ==
            kOtbnErrorOk);
   SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD1, d1) ==
@@ -175,33 +192,10 @@ static void p256_ecdsa_gen_secret_key(const uint32_t *seed, const uint32_t *mask
 static void p256_ecdsa_gen_keypair(const uint32_t *seed, const uint32_t *mask,
                               uint32_t *d0, uint32_t *d1, uint32_t *x, uint32_t *y) {
 
-  // Write mode.
-  uint32_t mode = kEcc256ModeKeypair;
-  SS_CHECK(otbn_dmem_write(/*num_words=*/1, &mode, kOtbnVarMode) ==
-           kOtbnErrorOk);
-
-  // Compute first share of seed (seed ^ mask).
-  uint32_t seed0[kEcc256SeedNumWords];
-  for (size_t i = 0; i < kEcc256SeedNumWords; i++) {
-    seed0[i] = seed[i] ^ mask[i];
-  }
-
-  // Write seed shares.
-  SS_CHECK(otbn_dmem_write(kEcc256SeedNumWords, seed0, kOtbnVarSeed0) == kOtbnErrorOk);
-  SS_CHECK(otbn_dmem_write(kEcc256SeedNumWords, mask, kOtbnVarSeed1) == kOtbnErrorOk);
-
-  // Set high bits of seed0, seed1 to all-zero. These bits are ignored in the
-  // implementation but must be written to avoid runtime errors.
-  size_t offset = kEcc256SeedNumWords % kOtbnWideWordNumWords;
-  size_t num_zeroes = (kOtbnWideWordNumWords - offset) % kOtbnWideWordNumWords;
-  SS_CHECK(otbn_dmem_set(num_zeroes, 0, kOtbnVarSeed0 + kEcc256SeedNumBytes) ==
-           kOtbnErrorOk);
-  SS_CHECK(otbn_dmem_set(num_zeroes, 0, kOtbnVarSeed1 + kEcc256SeedNumBytes) ==
-           kOtbnErrorOk);
-
-  SS_CHECK(otbn_execute() == kOtbnErrorOk);
-  SS_CHECK(otbn_busy_wait_for_done() == kOtbnErrorOk);
+  // Run the key generation program.
+  p256_run_keygen(kEcc256ModeKeypair, seed, mask);
 
+  // Read results.
   SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD0, d0) ==
            kOtbnErrorOk);
   SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD1, d1) ==
@@ -238,6 +232,7 @@ static void ecc256_ecdsa_secret_keygen(const uint8_t *mask,
   LOG_INFO("Loading app...");
   SS_CHECK(otbn_load_app(kOtbnAppP256KeyFromSeed) == kOtbnErrorOk);
 
+  uint32_t mode = kEcc256ModePrivateKeyOnly;
   uint32_t ecc256_d0[kEcc256SeedNumWords];
   uint32_t ecc256_d1[kEcc256SeedNumWords];
 
@@ -294,6 +289,8 @@ static void ecc256_ecdsa_gen_keypair(const uint8_t *mask,
   // TODO: Remove these if they are not necessary for the side-channel analysis.
   simple_serial_send_packet('r', (unsigned char *)ecc256_d0, kEcc256SeedNumBytes);
   simple_serial_send_packet('r', (unsigned char *)ecc256_d1, kEcc256SeedNumBytes);
+  simple_serial_send_packet('r', (unsigned char *)ecc256_x, kEcc256CoordNumBytes);
+  simple_serial_send_packet('r', (unsigned char *)ecc256_y, kEcc256CoordNumBytes);
 
   LOG_INFO("Clearing OTBN memory.");
   SS_CHECK(otbn_dmem_sec_wipe() == kOtbnErrorOk);

From 00e5bc9a4acfa23dc31a3a61f1103a328d829fd9 Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Tue, 17 Jan 2023 15:25:55 +0100
Subject: [PATCH 20/28] formatting

---
 sw/device/sca/ecc256_keygen_serial.c | 112 +++++++++++++--------------
 1 file changed, 56 insertions(+), 56 deletions(-)

diff --git a/sw/device/sca/ecc256_keygen_serial.c b/sw/device/sca/ecc256_keygen_serial.c
index 6822244445b36..043b1d9d5a838 100644
--- a/sw/device/sca/ecc256_keygen_serial.c
+++ b/sw/device/sca/ecc256_keygen_serial.c
@@ -65,16 +65,23 @@ OTBN_DECLARE_SYMBOL_ADDR(p256_key_from_seed_sca, d1);
 OTBN_DECLARE_SYMBOL_ADDR(p256_key_from_seed_sca, x);
 OTBN_DECLARE_SYMBOL_ADDR(p256_key_from_seed_sca, y);
 
-static const otbn_app_t kOtbnAppP256KeyFromSeed = OTBN_APP_T_INIT(p256_key_from_seed_sca);
-
-static const otbn_addr_t kOtbnVarMode = OTBN_ADDR_T_INIT(p256_key_from_seed_sca, mode);
-static const otbn_addr_t kOtbnVarSeed0 = OTBN_ADDR_T_INIT(p256_key_from_seed_sca, seed0);
-static const otbn_addr_t kOtbnVarSeed1 = OTBN_ADDR_T_INIT(p256_key_from_seed_sca, seed1);
-static const otbn_addr_t kOtbnVarD0 = OTBN_ADDR_T_INIT(p256_key_from_seed_sca, d0);
-static const otbn_addr_t kOtbnVarD1 = OTBN_ADDR_T_INIT(p256_key_from_seed_sca, d1);
-static const otbn_addr_t kOtbnVarX = OTBN_ADDR_T_INIT(p256_key_from_seed_sca, x);
-static const otbn_addr_t kOtbnVarY = OTBN_ADDR_T_INIT(p256_key_from_seed_sca, y);
-
+static const otbn_app_t kOtbnAppP256KeyFromSeed =
+    OTBN_APP_T_INIT(p256_key_from_seed_sca);
+
+static const otbn_addr_t kOtbnVarMode =
+    OTBN_ADDR_T_INIT(p256_key_from_seed_sca, mode);
+static const otbn_addr_t kOtbnVarSeed0 =
+    OTBN_ADDR_T_INIT(p256_key_from_seed_sca, seed0);
+static const otbn_addr_t kOtbnVarSeed1 =
+    OTBN_ADDR_T_INIT(p256_key_from_seed_sca, seed1);
+static const otbn_addr_t kOtbnVarD0 =
+    OTBN_ADDR_T_INIT(p256_key_from_seed_sca, d0);
+static const otbn_addr_t kOtbnVarD1 =
+    OTBN_ADDR_T_INIT(p256_key_from_seed_sca, d1);
+static const otbn_addr_t kOtbnVarX =
+    OTBN_ADDR_T_INIT(p256_key_from_seed_sca, x);
+static const otbn_addr_t kOtbnVarY =
+    OTBN_ADDR_T_INIT(p256_key_from_seed_sca, y);
 
 /**
  * Seed value.
@@ -86,16 +93,8 @@ static const otbn_addr_t kOtbnVarY = OTBN_ADDR_T_INIT(p256_key_from_seed_sca, y)
  * (see ecc256_set_seed)
  */
 uint32_t ecc256_seed[kEcc256SeedNumWords] = {
-  0x016064e9,
-  0x11e3f4d6,
-  0xac3a6fa7,
-  0xaba11a1b,
-  0x8f9271d1,
-  0x22b79d5f,
-  0x1176f31d,
-  0xb5ac3a51,
-  0x99a082d7,
-  0x484eb366,
+    0x016064e9, 0x11e3f4d6, 0xac3a6fa7, 0xaba11a1b, 0x8f9271d1,
+    0x22b79d5f, 0x1176f31d, 0xb5ac3a51, 0x99a082d7, 0x484eb366,
 };
 
 /**
@@ -120,8 +119,8 @@ static void ecc256_set_seed(const uint8_t *seed, size_t seed_len) {
  * @param[in] seed  Seed for key generation.
  * @param[in] mask  Mask for seed.
  */
-static void p256_run_keygen(uint32_t mode, const uint32_t *seed, const uint32_t *mask) {
-
+static void p256_run_keygen(uint32_t mode, const uint32_t *seed,
+                            const uint32_t *mask) {
   // Write mode.
   SS_CHECK(otbn_dmem_write(/*num_words=*/1, &mode, kOtbnVarMode) ==
            kOtbnErrorOk);
@@ -133,8 +132,10 @@ static void p256_run_keygen(uint32_t mode, const uint32_t *seed, const uint32_t
   }
 
   // Write seed shares.
-  SS_CHECK(otbn_dmem_write(kEcc256SeedNumWords, seed0, kOtbnVarSeed0) == kOtbnErrorOk);
-  SS_CHECK(otbn_dmem_write(kEcc256SeedNumWords, mask, kOtbnVarSeed1) == kOtbnErrorOk);
+  SS_CHECK(otbn_dmem_write(kEcc256SeedNumWords, seed0, kOtbnVarSeed0) ==
+           kOtbnErrorOk);
+  SS_CHECK(otbn_dmem_write(kEcc256SeedNumWords, mask, kOtbnVarSeed1) ==
+           kOtbnErrorOk);
 
   // Set high bits of seed0, seed1 to all-zero. These bits are ignored in the
   // implementation but must be written to avoid runtime errors.
@@ -155,24 +156,22 @@ static void p256_run_keygen(uint32_t mode, const uint32_t *seed, const uint32_t
  *
  * The seed shares must be `kEcc256SeedNumWords` words long, and the caller
  * must provide pre-allocated buffers of the same length for the private key
- * shares. 
+ * shares.
  *
  * @param[in] seed  Seed for key generation.
  * @param[in] mask  Mask for seed.
- * @param[out] d0   First share of masked private key d. 
+ * @param[out] d0   First share of masked private key d.
  * @param[out] d1   Second share of masked private key d.
  */
-static void p256_ecdsa_gen_secret_key(const uint32_t *seed, const uint32_t *mask,
-                              uint32_t *d0, uint32_t *d1) {
-
+static void p256_ecdsa_gen_secret_key(const uint32_t *seed,
+                                      const uint32_t *mask, uint32_t *d0,
+                                      uint32_t *d1) {
   // Run the key generation program.
   p256_run_keygen(kEcc256ModePrivateKeyOnly, seed, mask);
 
   // Read results.
-  SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD0, d0) ==
-           kOtbnErrorOk);
-  SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD1, d1) ==
-           kOtbnErrorOk);
+  SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD0, d0) == kOtbnErrorOk);
+  SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD1, d1) == kOtbnErrorOk);
 }
 
 /**
@@ -180,30 +179,26 @@ static void p256_ecdsa_gen_secret_key(const uint32_t *seed, const uint32_t *mask
  *
  * The seed shares must be `kEcc256SeedNumWords` words long, and the caller
  * must provide pre-allocated buffers of the same length for the private key
- * shares and of length `kEcc256CoordNumWords` for the public key coordinates. 
+ * shares and of length `kEcc256CoordNumWords` for the public key coordinates.
  *
  * @param[in] seed  Seed for key generation.
  * @param[in] mask  Mask for seed.
- * @param[out] d0   First share of masked private key d. 
+ * @param[out] d0   First share of masked private key d.
  * @param[out] d1   Second share of masked private key d.
  * @param[out] x    x-coordinate of public key Q.
  * @param[out] y    y-coordinate of public key Q.
  */
 static void p256_ecdsa_gen_keypair(const uint32_t *seed, const uint32_t *mask,
-                              uint32_t *d0, uint32_t *d1, uint32_t *x, uint32_t *y) {
-
+                                   uint32_t *d0, uint32_t *d1, uint32_t *x,
+                                   uint32_t *y) {
   // Run the key generation program.
   p256_run_keygen(kEcc256ModeKeypair, seed, mask);
 
   // Read results.
-  SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD0, d0) ==
-           kOtbnErrorOk);
-  SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD1, d1) ==
-           kOtbnErrorOk);
-  SS_CHECK(otbn_dmem_read(kEcc256CoordNumWords, kOtbnVarX, x) ==
-           kOtbnErrorOk);
-  SS_CHECK(otbn_dmem_read(kEcc256CoordNumWords, kOtbnVarY, y) ==
-           kOtbnErrorOk);
+  SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD0, d0) == kOtbnErrorOk);
+  SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD1, d1) == kOtbnErrorOk);
+  SS_CHECK(otbn_dmem_read(kEcc256CoordNumWords, kOtbnVarX, x) == kOtbnErrorOk);
+  SS_CHECK(otbn_dmem_read(kEcc256CoordNumWords, kOtbnVarY, y) == kOtbnErrorOk);
 }
 
 /**
@@ -218,8 +213,7 @@ static void p256_ecdsa_gen_keypair(const uint32_t *seed, const uint32_t *mask,
  * @param[in] mask The mask provided by the simpleserial UART.
  * @param[in] mask_len Length of the mask.
  */
-static void ecc256_ecdsa_secret_keygen(const uint8_t *mask,
-                         size_t mask_len) {
+static void ecc256_ecdsa_secret_keygen(const uint8_t *mask, size_t mask_len) {
   if (mask_len != kEcc256SeedNumBytes) {
     LOG_ERROR("Invalid mask length %hu", (uint8_t)mask_len);
     return;
@@ -242,8 +236,10 @@ static void ecc256_ecdsa_secret_keygen(const uint8_t *mask,
   sca_set_trigger_low();
 
   // TODO: Remove these if they are not necessary for the side-channel analysis.
-  simple_serial_send_packet('r', (unsigned char *)ecc256_d0, kEcc256SeedNumBytes);
-  simple_serial_send_packet('r', (unsigned char *)ecc256_d1, kEcc256SeedNumBytes);
+  simple_serial_send_packet('r', (unsigned char *)ecc256_d0,
+                            kEcc256SeedNumBytes);
+  simple_serial_send_packet('r', (unsigned char *)ecc256_d1,
+                            kEcc256SeedNumBytes);
 
   LOG_INFO("Clearing OTBN memory.");
   SS_CHECK(otbn_dmem_sec_wipe() == kOtbnErrorOk);
@@ -262,8 +258,7 @@ static void ecc256_ecdsa_secret_keygen(const uint8_t *mask,
  * @param[in] mask The mask provided by the simpleserial UART.
  * @param[in] mask_len Length of the mask.
  */
-static void ecc256_ecdsa_gen_keypair(const uint8_t *mask,
-                         size_t mask_len) {
+static void ecc256_ecdsa_gen_keypair(const uint8_t *mask, size_t mask_len) {
   if (mask_len != kEcc256SeedNumBytes) {
     LOG_ERROR("Invalid mask length %hu", (uint8_t)mask_len);
     return;
@@ -283,14 +278,19 @@ static void ecc256_ecdsa_gen_keypair(const uint8_t *mask,
 
   LOG_INFO("Running keygen...");
   sca_set_trigger_high();
-  p256_ecdsa_gen_keypair(ecc256_seed, ecc256_mask, ecc256_d0, ecc256_d1, ecc256_x, ecc256_y);
+  p256_ecdsa_gen_keypair(ecc256_seed, ecc256_mask, ecc256_d0, ecc256_d1,
+                         ecc256_x, ecc256_y);
   sca_set_trigger_low();
 
   // TODO: Remove these if they are not necessary for the side-channel analysis.
-  simple_serial_send_packet('r', (unsigned char *)ecc256_d0, kEcc256SeedNumBytes);
-  simple_serial_send_packet('r', (unsigned char *)ecc256_d1, kEcc256SeedNumBytes);
-  simple_serial_send_packet('r', (unsigned char *)ecc256_x, kEcc256CoordNumBytes);
-  simple_serial_send_packet('r', (unsigned char *)ecc256_y, kEcc256CoordNumBytes);
+  simple_serial_send_packet('r', (unsigned char *)ecc256_d0,
+                            kEcc256SeedNumBytes);
+  simple_serial_send_packet('r', (unsigned char *)ecc256_d1,
+                            kEcc256SeedNumBytes);
+  simple_serial_send_packet('r', (unsigned char *)ecc256_x,
+                            kEcc256CoordNumBytes);
+  simple_serial_send_packet('r', (unsigned char *)ecc256_y,
+                            kEcc256CoordNumBytes);
 
   LOG_INFO("Clearing OTBN memory.");
   SS_CHECK(otbn_dmem_sec_wipe() == kOtbnErrorOk);

From 9330d350a0a8ebd03c5556c516dd5e1022223353 Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Tue, 17 Jan 2023 15:27:13 +0100
Subject: [PATCH 21/28] add simple script to generate test data

---
 p256_key_from_seed.py | 82 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100755 p256_key_from_seed.py

diff --git a/p256_key_from_seed.py b/p256_key_from_seed.py
new file mode 100755
index 0000000000000..5f9f2db3f0841
--- /dev/null
+++ b/p256_key_from_seed.py
@@ -0,0 +1,82 @@
+#! /usr/bin/env python3
+
+import argparse
+import random
+
+CURVE = 'ECDSA-P256'
+KEY_SIZE = 256
+SEED_SIZE = 320
+CURVE_ORDER = 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551
+
+def key_from_seed(seed0, seed1):
+    '''Mimics secret-key derivation: maps two seed shares to key shares.'''
+    # Both shares are reduced modulo the curve order (n) shifted left by the
+    # number of extra bits the seed has compared to the key.
+    extra_bits = SEED_SIZE - KEY_SIZE
+    modulus = CURVE_ORDER << extra_bits
+    # The masked implementation computes d0 in a more involved way; here the
+    # shares are simply recombined into the real seed before subtracting.
+    real_seed = seed0 ^ seed1
+    return (real_seed - seed1) % modulus, seed1 % modulus
+
+
+def seed_size_int_from_arg(seed):
+    '''Parses the input argument into a valid SEED_SIZE-bit number.
+
+    If `seed` is None, generates a random number. If `seed` is negative or too
+    large for SEED_SIZE bits, prints a warning and reduces modulo 2^SEED_SIZE.
+    '''
+    if seed is None:
+        seed = random.randrange(1 << SEED_SIZE)
+    elif not 0 <= seed < (1 << SEED_SIZE):
+        seed = seed & ((1 << SEED_SIZE) - 1)
+        print(f'WARNING: seed is out of range. Reducing to {SEED_SIZE} bits.')
+    return seed
+
+def print_int(name, value, nbits):
+    '''Prints `name=value`, with `value` as zero-padded `nbits`-bit hex.'''
+    # The printed width is 2 characters for the `0x` prefix plus one character
+    # per nibble (nbits / 4, rounded up).
+    nibbles = (nbits + 3) // 4
+    width = nibbles + 2
+    print(f'{name}={value:#0{width}x}')
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+            description=('Mimics the behavior of an OTBN routine '
+                         'that derives a private key from a seed '
+                         f'for {CURVE}.'))
+    parser.add_argument('-s',
+                        '--seed',
+                        type=int,
+                        required=False,
+                        help=f'Real seed value ({SEED_SIZE} bits).')
+    parser.add_argument('-m',
+                        '--mask',
+                        type=int,
+                        required=False,
+                        help=f'Mask value for seed ({SEED_SIZE} bits).')
+    args = parser.parse_args()
+
+    # Resolve the inputs: arguments that were not supplied are replaced by
+    # random values, and out-of-range ones are reduced to SEED_SIZE bits (see
+    # seed_size_int_from_arg).
+    seed = seed_size_int_from_arg(args.seed)
+    mask = seed_size_int_from_arg(args.mask)
+    print_int('seed', seed, SEED_SIZE)
+    print_int('mask', mask, SEED_SIZE)
+
+    # Generate the two shares for the seed.
+    seed0 = seed ^ mask
+    seed1 = mask
+    print_int('seed0', seed0, SEED_SIZE)
+    print_int('seed1', seed1, SEED_SIZE)
+
+    # Compute the shares for the resulting key.
+    d0, d1 = key_from_seed(seed0, seed1)
+    print_int('d0', d0, SEED_SIZE)
+    print_int('d1', d1, SEED_SIZE)
+
+    # Compute the real key.
+    d = (d0 + d1) % CURVE_ORDER
+    print_int('d', d, KEY_SIZE)

From 767a69a73cf97ee35c6703d50ed0024bdfd91793 Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Tue, 17 Jan 2023 20:59:05 +0100
Subject: [PATCH 22/28] fix so it works with capture

---
 sw/device/lib/crypto/drivers/otbn.h     |  2 +-
 sw/device/sca/ecc256_keygen_serial.c    | 44 ++++++++++++-------------
 sw/device/sca/lib/simple_serial.c       |  4 +--
 sw/device/sca/lib/simple_serial.h       |  4 +--
 sw/otbn/crypto/p256_key_from_seed_sca.s | 13 ++++----
 5 files changed, 32 insertions(+), 35 deletions(-)

diff --git a/sw/device/lib/crypto/drivers/otbn.h b/sw/device/lib/crypto/drivers/otbn.h
index 12900b9cc73bb..1762e4e406b8d 100644
--- a/sw/device/lib/crypto/drivers/otbn.h
+++ b/sw/device/lib/crypto/drivers/otbn.h
@@ -60,7 +60,7 @@ typedef enum otbn_err_bits {
   /** A BUS_INTG_VIOLATION error was observed. */
   kOtbnErrBitsBusIntgViolation = (1 << 19),
   /** A BAD_INTERNAL_STATE error was observed. */
-  kDifOtbnErrBitsBadInternalState = (1 << 20),
+  kOtbnErrBitsBadInternalState = (1 << 20),
   /** An ILLEGAL_BUS_ACCESS error was observed. */
   kOtbnErrBitsIllegalBusAccess = (1 << 21),
   /** A LIFECYCLE_ESCALATION error was observed. */
diff --git a/sw/device/sca/ecc256_keygen_serial.c b/sw/device/sca/ecc256_keygen_serial.c
index 043b1d9d5a838..1f1c32d7feb2a 100644
--- a/sw/device/sca/ecc256_keygen_serial.c
+++ b/sw/device/sca/ecc256_keygen_serial.c
@@ -20,9 +20,11 @@
  * OpenTitan program for OTBN ECDSA-P256 side-channel analysis.
  *
  * This program implements the following simple serial commands:
- *   - Set seed ('s')*,
- *   - Secret key generation ('k')+,
- *   - Keypair generation ('p')+,
+ *   - Set seed ('x'),
+ *   - Secret key generation ('k'),
+ *   - Keypair generation ('p'),
+ *   - Get version ('v') (implemented in simpleserial library),
+ *   - Seed PRNG ('s') (implemented in simpleserial library),
  * See https://wiki.newae.com/SimpleSerial for details on the protocol.
  */
 
@@ -98,7 +100,7 @@ uint32_t ecc256_seed[kEcc256SeedNumWords] = {
 };
 
 /**
- * Simple serial 's' (set seed) command handler.
+ * Simple serial 'x' (set seed) command handler.
  *
  * The key must be `kEcc256SeedNumBytes` bytes long.
  *
@@ -137,18 +139,17 @@ static void p256_run_keygen(uint32_t mode, const uint32_t *seed,
   SS_CHECK(otbn_dmem_write(kEcc256SeedNumWords, mask, kOtbnVarSeed1) ==
            kOtbnErrorOk);
 
-  // Set high bits of seed0, seed1 to all-zero. These bits are ignored in the
-  // implementation but must be written to avoid runtime errors.
-  size_t offset = kEcc256SeedNumWords % kOtbnWideWordNumWords;
-  size_t num_zeroes = (kOtbnWideWordNumWords - offset) % kOtbnWideWordNumWords;
-  SS_CHECK(otbn_dmem_set(num_zeroes, 0, kOtbnVarSeed0 + kEcc256SeedNumBytes) ==
-           kOtbnErrorOk);
-  SS_CHECK(otbn_dmem_set(num_zeroes, 0, kOtbnVarSeed1 + kEcc256SeedNumBytes) ==
-           kOtbnErrorOk);
-
   // Execute program.
   SS_CHECK(otbn_execute() == kOtbnErrorOk);
   SS_CHECK(otbn_busy_wait_for_done() == kOtbnErrorOk);
+  /*
+  if (otbn_busy_wait_for_done() != kOtbnErrorOk) {
+    simple_serial_send_status(otbn_instruction_count_get());
+    otbn_err_bits_t err_bits;
+    otbn_err_bits_get(&err_bits);
+    simple_serial_send_status(err_bits);
+  }
+  */
 }
 
 /**
@@ -167,7 +168,9 @@ static void p256_ecdsa_gen_secret_key(const uint32_t *seed,
                                       const uint32_t *mask, uint32_t *d0,
                                       uint32_t *d1) {
   // Run the key generation program.
+  sca_set_trigger_high();
   p256_run_keygen(kEcc256ModePrivateKeyOnly, seed, mask);
+  sca_set_trigger_low();
 
   // Read results.
   SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD0, d0) == kOtbnErrorOk);
@@ -192,7 +195,9 @@ static void p256_ecdsa_gen_keypair(const uint32_t *seed, const uint32_t *mask,
                                    uint32_t *d0, uint32_t *d1, uint32_t *x,
                                    uint32_t *y) {
   // Run the key generation program.
+  sca_set_trigger_high();
   p256_run_keygen(kEcc256ModeKeypair, seed, mask);
+  sca_set_trigger_low();
 
   // Read results.
   SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD0, d0) == kOtbnErrorOk);
@@ -231,15 +236,10 @@ static void ecc256_ecdsa_secret_keygen(const uint8_t *mask, size_t mask_len) {
   uint32_t ecc256_d1[kEcc256SeedNumWords];
 
   LOG_INFO("Running keygen...");
-  sca_set_trigger_high();
   p256_ecdsa_gen_secret_key(ecc256_seed, ecc256_mask, ecc256_d0, ecc256_d1);
-  sca_set_trigger_low();
 
-  // TODO: Remove these if they are not necessary for the side-channel analysis.
-  simple_serial_send_packet('r', (unsigned char *)ecc256_d0,
-                            kEcc256SeedNumBytes);
-  simple_serial_send_packet('r', (unsigned char *)ecc256_d1,
-                            kEcc256SeedNumBytes);
+  simple_serial_send_packet('r', (unsigned char *) ecc256_d0, kEcc256SeedNumBytes);
+  simple_serial_send_packet('r', (unsigned char *) ecc256_d1, kEcc256SeedNumBytes);
 
   LOG_INFO("Clearing OTBN memory.");
   SS_CHECK(otbn_dmem_sec_wipe() == kOtbnErrorOk);
@@ -277,10 +277,8 @@ static void ecc256_ecdsa_gen_keypair(const uint8_t *mask, size_t mask_len) {
   uint32_t ecc256_y[kEcc256CoordNumWords];
 
   LOG_INFO("Running keygen...");
-  sca_set_trigger_high();
   p256_ecdsa_gen_keypair(ecc256_seed, ecc256_mask, ecc256_d0, ecc256_d1,
                          ecc256_x, ecc256_y);
-  sca_set_trigger_low();
 
   // TODO: Remove these if they are not necessary for the side-channel analysis.
   simple_serial_send_packet('r', (unsigned char *)ecc256_d0,
@@ -316,7 +314,7 @@ static void simple_serial_main(void) {
            kSimpleSerialOk);
   SS_CHECK(simple_serial_register_handler('p', ecc256_ecdsa_gen_keypair) ==
            kSimpleSerialOk);
-  SS_CHECK(simple_serial_register_handler('s', ecc256_set_seed) ==
+  SS_CHECK(simple_serial_register_handler('x', ecc256_set_seed) ==
            kSimpleSerialOk);
 
   LOG_INFO("Starting simple serial packet handling.");
diff --git a/sw/device/sca/lib/simple_serial.c b/sw/device/sca/lib/simple_serial.c
index ca8a8af01419c..07fe76286ee4a 100644
--- a/sw/device/sca/lib/simple_serial.c
+++ b/sw/device/sca/lib/simple_serial.c
@@ -215,8 +215,8 @@ void simple_serial_send_packet(const uint8_t cmd, const uint8_t *data,
   IGNORE_RESULT(dif_uart_byte_send_polled(uart, buf));
 }
 
-void simple_serial_send_status(uint8_t res) {
-  simple_serial_send_packet('z', (uint8_t[1]){res}, 1);
+void simple_serial_send_status(uint32_t res) {
+  simple_serial_send_packet('z', (unsigned char *) &res, sizeof(res));
 }
 
 void simple_serial_print_hex(const uint8_t *data, size_t data_len) {
diff --git a/sw/device/sca/lib/simple_serial.h b/sw/device/sca/lib/simple_serial.h
index ffe8f75726306..51c482fde561b 100644
--- a/sw/device/sca/lib/simple_serial.h
+++ b/sw/device/sca/lib/simple_serial.h
@@ -28,7 +28,7 @@
 #define SS_CHECK(condition)                          \
   do {                                               \
     if (!(condition)) {                              \
-      simple_serial_send_status(kSimpleSerialError); \
+      simple_serial_send_status(__LINE__); \
       return;                                        \
     }                                                \
   } while (false)
@@ -100,7 +100,7 @@ void simple_serial_send_packet(const uint8_t cmd, const uint8_t *data,
  *
  * @param res Status code.
  */
-void simple_serial_send_status(uint8_t res);
+void simple_serial_send_status(uint32_t res);
 
 /**
  * Sends a buffer over UART as a hex encoded string.
diff --git a/sw/otbn/crypto/p256_key_from_seed_sca.s b/sw/otbn/crypto/p256_key_from_seed_sca.s
index 124833fecf0e2..04b3ad492e4f4 100644
--- a/sw/otbn/crypto/p256_key_from_seed_sca.s
+++ b/sw/otbn/crypto/p256_key_from_seed_sca.s
@@ -76,7 +76,12 @@ p256_gen_keypair:
 
   ecall
 
-.bss
+
+/**
+ * Note: Technically this could be a .bss section, but it is convenient for
+ * software to have zeroes already set on the high bits of the seeds.
+ */
+.data
 
 /* Mode (1 = private key only, 2 = keypair) */
 .balign 4
@@ -84,12 +89,6 @@ p256_gen_keypair:
 mode:
 .zero 4
 
-/**
- * Note: Software must write the full 512 bits of the seed values to avoid
- * runtime errors when OTBN tries to load the data with two 256-bit loads. Bits
- * above position 320 will be ignored.
- */
-
 /* First share of seed (320 bits). */
 .balign 32
 .globl seed0

From 80babb966d9ca9b7ee3aef052722ba3e4d529673 Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Wed, 18 Jan 2023 15:26:53 +0100
Subject: [PATCH 23/28] [sca] Load app in p256_run_keygen; fix bn.lid/bn.sid typos

---
 sw/device/sca/ecc256_keygen_serial.c    | 8 ++------
 sw/otbn/crypto/p256_key_from_seed_sca.s | 6 +++---
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/sw/device/sca/ecc256_keygen_serial.c b/sw/device/sca/ecc256_keygen_serial.c
index 1f1c32d7feb2a..804424858c9ad 100644
--- a/sw/device/sca/ecc256_keygen_serial.c
+++ b/sw/device/sca/ecc256_keygen_serial.c
@@ -123,6 +123,8 @@ static void ecc256_set_seed(const uint8_t *seed, size_t seed_len) {
  */
 static void p256_run_keygen(uint32_t mode, const uint32_t *seed,
                             const uint32_t *mask) {
+  SS_CHECK(otbn_load_app(kOtbnAppP256KeyFromSeed) == kOtbnErrorOk);
+
   // Write mode.
   SS_CHECK(otbn_dmem_write(/*num_words=*/1, &mode, kOtbnVarMode) ==
            kOtbnErrorOk);
@@ -228,9 +230,6 @@ static void ecc256_ecdsa_secret_keygen(const uint8_t *mask, size_t mask_len) {
   uint32_t ecc256_mask[kEcc256SeedNumWords];
   memcpy(ecc256_mask, mask, kEcc256SeedNumBytes);
 
-  LOG_INFO("Loading app...");
-  SS_CHECK(otbn_load_app(kOtbnAppP256KeyFromSeed) == kOtbnErrorOk);
-
   uint32_t mode = kEcc256ModePrivateKeyOnly;
   uint32_t ecc256_d0[kEcc256SeedNumWords];
   uint32_t ecc256_d1[kEcc256SeedNumWords];
@@ -268,9 +267,6 @@ static void ecc256_ecdsa_gen_keypair(const uint8_t *mask, size_t mask_len) {
   uint32_t ecc256_mask[kEcc256SeedNumWords];
   memcpy(ecc256_mask, mask, kEcc256SeedNumBytes);
 
-  LOG_INFO("Loading app...");
-  SS_CHECK(otbn_load_app(kOtbnAppP256KeyFromSeed) == kOtbnErrorOk);
-
   uint32_t ecc256_d0[kEcc256SeedNumWords];
   uint32_t ecc256_d1[kEcc256SeedNumWords];
   uint32_t ecc256_x[kEcc256CoordNumWords];
diff --git a/sw/otbn/crypto/p256_key_from_seed_sca.s b/sw/otbn/crypto/p256_key_from_seed_sca.s
index 04b3ad492e4f4..bc2a61dc5c447 100644
--- a/sw/otbn/crypto/p256_key_from_seed_sca.s
+++ b/sw/otbn/crypto/p256_key_from_seed_sca.s
@@ -55,11 +55,11 @@ p256_gen_secret_key:
   la        x3, d0
   bn.sid    x2, 0(x3++)
   li        x2, 21
-  bn.lid    x2++, 0(x3)
+  bn.sid    x2++, 0(x3)
   la        x3, d1
-  bn.lid    x2, 0(x3++)
+  bn.sid    x2, 0(x3++)
   li        x2, 23
-  bn.lid    x2, 0(x3)
+  bn.sid    x2, 0(x3)
 
   ecall
 

From 31dfb43f24a44651ce081bf51cad5966c77d829b Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Wed, 18 Jan 2023 16:56:48 +0100
Subject: [PATCH 24/28] add default data to sca asm

---
 sw/device/sca/ecc256_keygen_serial.c    | 10 +++----
 sw/otbn/crypto/p256_key_from_seed_sca.s | 36 ++++++++++++++++++++++---
 2 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/sw/device/sca/ecc256_keygen_serial.c b/sw/device/sca/ecc256_keygen_serial.c
index 804424858c9ad..1a5711ec74c16 100644
--- a/sw/device/sca/ecc256_keygen_serial.c
+++ b/sw/device/sca/ecc256_keygen_serial.c
@@ -230,13 +230,13 @@ static void ecc256_ecdsa_secret_keygen(const uint8_t *mask, size_t mask_len) {
   uint32_t ecc256_mask[kEcc256SeedNumWords];
   memcpy(ecc256_mask, mask, kEcc256SeedNumBytes);
 
-  uint32_t mode = kEcc256ModePrivateKeyOnly;
+  LOG_INFO("Running keygen...");
   uint32_t ecc256_d0[kEcc256SeedNumWords];
   uint32_t ecc256_d1[kEcc256SeedNumWords];
-
-  LOG_INFO("Running keygen...");
   p256_ecdsa_gen_secret_key(ecc256_seed, ecc256_mask, ecc256_d0, ecc256_d1);
 
+  simple_serial_send_packet('r', (unsigned char *) &kOtbnVarD0, sizeof(otbn_addr_t));
+  simple_serial_send_packet('r', (unsigned char *) &kOtbnVarD1, sizeof(otbn_addr_t));
   simple_serial_send_packet('r', (unsigned char *) ecc256_d0, kEcc256SeedNumBytes);
   simple_serial_send_packet('r', (unsigned char *) ecc256_d1, kEcc256SeedNumBytes);
 
@@ -267,16 +267,14 @@ static void ecc256_ecdsa_gen_keypair(const uint8_t *mask, size_t mask_len) {
   uint32_t ecc256_mask[kEcc256SeedNumWords];
   memcpy(ecc256_mask, mask, kEcc256SeedNumBytes);
 
+  LOG_INFO("Running keygen...");
   uint32_t ecc256_d0[kEcc256SeedNumWords];
   uint32_t ecc256_d1[kEcc256SeedNumWords];
   uint32_t ecc256_x[kEcc256CoordNumWords];
   uint32_t ecc256_y[kEcc256CoordNumWords];
-
-  LOG_INFO("Running keygen...");
   p256_ecdsa_gen_keypair(ecc256_seed, ecc256_mask, ecc256_d0, ecc256_d1,
                          ecc256_x, ecc256_y);
 
-  // TODO: Remove these if they are not necessary for the side-channel analysis.
   simple_serial_send_packet('r', (unsigned char *)ecc256_d0,
                             kEcc256SeedNumBytes);
   simple_serial_send_packet('r', (unsigned char *)ecc256_d1,
diff --git a/sw/otbn/crypto/p256_key_from_seed_sca.s b/sw/otbn/crypto/p256_key_from_seed_sca.s
index bc2a61dc5c447..2077d039ea05c 100644
--- a/sw/otbn/crypto/p256_key_from_seed_sca.s
+++ b/sw/otbn/crypto/p256_key_from_seed_sca.s
@@ -87,19 +87,49 @@ p256_gen_keypair:
 .balign 4
 .globl mode
 mode:
-.zero 4
+  .word 0x00000001
 
 /* First share of seed (320 bits). */
 .balign 32
 .globl seed0
 seed0:
-.zero 64
+  .word 0x2335f23f
+  .word 0x3c174a16
+  .word 0x128c1339
+  .word 0xc48e8981
+  .word 0x7843d9a2
+  .word 0xbb6a0205
+  .word 0x446984cc
+  .word 0xa210c4be
+  .word 0xd7c77320
+  .word 0x2bac5b0b
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
 
 /* Second share of seed (320 bits). */
 .balign 32
 .globl seed1
 seed1:
-.zero 64
+  .word 0x225596d6
+  .word 0x2df4bec0
+  .word 0xbeb67c9e
+  .word 0x6f2f939a
+  .word 0xf7d1a873
+  .word 0x99dd9f5a
+  .word 0x551f77d1
+  .word 0x17bcfeef
+  .word 0x4e67f1f7
+  .word 0x63e2e86d
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
 
 /* First share of output key (320 bits). */
 .balign 32

From 3f161665e9494a7e5cde0783100e10655afdc75a Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Wed, 18 Jan 2023 18:55:54 +0100
Subject: [PATCH 25/28] cleanup serial file

---
 sw/device/sca/ecc256_keygen_serial.c | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/sw/device/sca/ecc256_keygen_serial.c b/sw/device/sca/ecc256_keygen_serial.c
index 1a5711ec74c16..ae25e3863ff87 100644
--- a/sw/device/sca/ecc256_keygen_serial.c
+++ b/sw/device/sca/ecc256_keygen_serial.c
@@ -144,14 +144,6 @@ static void p256_run_keygen(uint32_t mode, const uint32_t *seed,
   // Execute program.
   SS_CHECK(otbn_execute() == kOtbnErrorOk);
   SS_CHECK(otbn_busy_wait_for_done() == kOtbnErrorOk);
-  /*
-  if (otbn_busy_wait_for_done() != kOtbnErrorOk) {
-    simple_serial_send_status(otbn_instruction_count_get());
-    otbn_err_bits_t err_bits;
-    otbn_err_bits_get(&err_bits);
-    simple_serial_send_status(err_bits);
-  }
-  */
 }
 
 /**
@@ -230,17 +222,13 @@ static void ecc256_ecdsa_secret_keygen(const uint8_t *mask, size_t mask_len) {
   uint32_t ecc256_mask[kEcc256SeedNumWords];
   memcpy(ecc256_mask, mask, kEcc256SeedNumBytes);
 
-  LOG_INFO("Running keygen...");
   uint32_t ecc256_d0[kEcc256SeedNumWords];
   uint32_t ecc256_d1[kEcc256SeedNumWords];
   p256_ecdsa_gen_secret_key(ecc256_seed, ecc256_mask, ecc256_d0, ecc256_d1);
 
-  simple_serial_send_packet('r', (unsigned char *) &kOtbnVarD0, sizeof(otbn_addr_t));
-  simple_serial_send_packet('r', (unsigned char *) &kOtbnVarD1, sizeof(otbn_addr_t));
   simple_serial_send_packet('r', (unsigned char *) ecc256_d0, kEcc256SeedNumBytes);
   simple_serial_send_packet('r', (unsigned char *) ecc256_d1, kEcc256SeedNumBytes);
 
-  LOG_INFO("Clearing OTBN memory.");
   SS_CHECK(otbn_dmem_sec_wipe() == kOtbnErrorOk);
   SS_CHECK(otbn_imem_sec_wipe() == kOtbnErrorOk);
 }
@@ -267,7 +255,6 @@ static void ecc256_ecdsa_gen_keypair(const uint8_t *mask, size_t mask_len) {
   uint32_t ecc256_mask[kEcc256SeedNumWords];
   memcpy(ecc256_mask, mask, kEcc256SeedNumBytes);
 
-  LOG_INFO("Running keygen...");
   uint32_t ecc256_d0[kEcc256SeedNumWords];
   uint32_t ecc256_d1[kEcc256SeedNumWords];
   uint32_t ecc256_x[kEcc256CoordNumWords];
@@ -284,7 +271,6 @@ static void ecc256_ecdsa_gen_keypair(const uint8_t *mask, size_t mask_len) {
   simple_serial_send_packet('r', (unsigned char *)ecc256_y,
                             kEcc256CoordNumBytes);
 
-  LOG_INFO("Clearing OTBN memory.");
   SS_CHECK(otbn_dmem_sec_wipe() == kOtbnErrorOk);
   SS_CHECK(otbn_imem_sec_wipe() == kOtbnErrorOk);
 }

From fee0a0edd0b368304022be1be44b338ac4885363 Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Thu, 19 Jan 2023 10:20:26 +0100
Subject: [PATCH 26/28] change error back to byte

---
 sw/device/sca/lib/simple_serial.c | 4 ++--
 sw/device/sca/lib/simple_serial.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/sw/device/sca/lib/simple_serial.c b/sw/device/sca/lib/simple_serial.c
index 07fe76286ee4a..ca8a8af01419c 100644
--- a/sw/device/sca/lib/simple_serial.c
+++ b/sw/device/sca/lib/simple_serial.c
@@ -215,8 +215,8 @@ void simple_serial_send_packet(const uint8_t cmd, const uint8_t *data,
   IGNORE_RESULT(dif_uart_byte_send_polled(uart, buf));
 }
 
-void simple_serial_send_status(uint32_t res) {
-  simple_serial_send_packet('z', (unsigned char *) &res, sizeof(res));
+void simple_serial_send_status(uint8_t res) {
+  simple_serial_send_packet('z', (uint8_t[1]){res}, 1);
 }
 
 void simple_serial_print_hex(const uint8_t *data, size_t data_len) {
diff --git a/sw/device/sca/lib/simple_serial.h b/sw/device/sca/lib/simple_serial.h
index 51c482fde561b..ffe8f75726306 100644
--- a/sw/device/sca/lib/simple_serial.h
+++ b/sw/device/sca/lib/simple_serial.h
@@ -28,7 +28,7 @@
 #define SS_CHECK(condition)                          \
   do {                                               \
     if (!(condition)) {                              \
-      simple_serial_send_status(__LINE__); \
+      simple_serial_send_status(kSimpleSerialError); \
       return;                                        \
     }                                                \
   } while (false)
@@ -100,7 +100,7 @@ void simple_serial_send_packet(const uint8_t cmd, const uint8_t *data,
  *
  * @param res Status code.
  */
-void simple_serial_send_status(uint32_t res);
+void simple_serial_send_status(uint8_t res);
 
 /**
  * Sends a buffer over UART as a hex encoded string.

From 54c8df2b6df0791de743a8a7746d59271236aabd Mon Sep 17 00:00:00 2001
From: Jade Philipoom <jadep@google.com>
Date: Thu, 19 Jan 2023 11:50:14 +0100
Subject: [PATCH 27/28] move trigger

---
 sw/device/sca/ecc256_keygen_serial.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/sw/device/sca/ecc256_keygen_serial.c b/sw/device/sca/ecc256_keygen_serial.c
index ae25e3863ff87..3862373465e3b 100644
--- a/sw/device/sca/ecc256_keygen_serial.c
+++ b/sw/device/sca/ecc256_keygen_serial.c
@@ -142,8 +142,10 @@ static void p256_run_keygen(uint32_t mode, const uint32_t *seed,
            kOtbnErrorOk);
 
   // Execute program.
+  sca_set_trigger_high();
   SS_CHECK(otbn_execute() == kOtbnErrorOk);
   SS_CHECK(otbn_busy_wait_for_done() == kOtbnErrorOk);
+  sca_set_trigger_low();
 }
 
 /**
@@ -162,9 +164,7 @@ static void p256_ecdsa_gen_secret_key(const uint32_t *seed,
                                       const uint32_t *mask, uint32_t *d0,
                                       uint32_t *d1) {
   // Run the key generation program.
-  sca_set_trigger_high();
   p256_run_keygen(kEcc256ModePrivateKeyOnly, seed, mask);
-  sca_set_trigger_low();
 
   // Read results.
   SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD0, d0) == kOtbnErrorOk);
@@ -189,9 +189,7 @@ static void p256_ecdsa_gen_keypair(const uint32_t *seed, const uint32_t *mask,
                                    uint32_t *d0, uint32_t *d1, uint32_t *x,
                                    uint32_t *y) {
   // Run the key generation program.
-  sca_set_trigger_high();
   p256_run_keygen(kEcc256ModeKeypair, seed, mask);
-  sca_set_trigger_low();
 
   // Read results.
   SS_CHECK(otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD0, d0) == kOtbnErrorOk);

From 347a2777536d987d1ad441a268156aede4bb62a7 Mon Sep 17 00:00:00 2001
From: Moritz Wettermann <moritz.wettermann@gi-de.com>
Date: Fri, 10 Feb 2023 15:15:57 +0100
Subject: [PATCH 28/28] [sca/otbn] Split loading of OTBN IMEM and DMEM

Split loading of OTBN IMEM and DMEM from Ibex side. Previously the OTBN
app was loaded for every trace, although only DMEM needs to be written
per trace. Loading the app once at startup speeds up trace capture.

Signed-off-by: Moritz Wettermann <moritz.wettermann@gi-de.com>
---
 sw/device/sca/ecc256_keygen_serial.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/sw/device/sca/ecc256_keygen_serial.c b/sw/device/sca/ecc256_keygen_serial.c
index 3862373465e3b..049967f0855ea 100644
--- a/sw/device/sca/ecc256_keygen_serial.c
+++ b/sw/device/sca/ecc256_keygen_serial.c
@@ -123,8 +123,6 @@ static void ecc256_set_seed(const uint8_t *seed, size_t seed_len) {
  */
 static void p256_run_keygen(uint32_t mode, const uint32_t *seed,
                             const uint32_t *mask) {
-  SS_CHECK(otbn_load_app(kOtbnAppP256KeyFromSeed) == kOtbnErrorOk);
-
   // Write mode.
   SS_CHECK(otbn_dmem_write(/*num_words=*/1, &mode, kOtbnVarMode) ==
            kOtbnErrorOk);
@@ -224,11 +222,10 @@ static void ecc256_ecdsa_secret_keygen(const uint8_t *mask, size_t mask_len) {
   uint32_t ecc256_d1[kEcc256SeedNumWords];
   p256_ecdsa_gen_secret_key(ecc256_seed, ecc256_mask, ecc256_d0, ecc256_d1);
 
-  simple_serial_send_packet('r', (unsigned char *) ecc256_d0, kEcc256SeedNumBytes);
-  simple_serial_send_packet('r', (unsigned char *) ecc256_d1, kEcc256SeedNumBytes);
-
-  SS_CHECK(otbn_dmem_sec_wipe() == kOtbnErrorOk);
-  SS_CHECK(otbn_imem_sec_wipe() == kOtbnErrorOk);
+  simple_serial_send_packet('r', (unsigned char *)ecc256_d0,
+                            kEcc256SeedNumBytes);
+  simple_serial_send_packet('r', (unsigned char *)ecc256_d1,
+                            kEcc256SeedNumBytes);
 }
 
 /**
@@ -268,9 +265,6 @@ static void ecc256_ecdsa_gen_keypair(const uint8_t *mask, size_t mask_len) {
                             kEcc256CoordNumBytes);
   simple_serial_send_packet('r', (unsigned char *)ecc256_y,
                             kEcc256CoordNumBytes);
-
-  SS_CHECK(otbn_dmem_sec_wipe() == kOtbnErrorOk);
-  SS_CHECK(otbn_imem_sec_wipe() == kOtbnErrorOk);
 }
 
 /**
@@ -295,6 +289,9 @@ static void simple_serial_main(void) {
   SS_CHECK(simple_serial_register_handler('x', ecc256_set_seed) ==
            kSimpleSerialOk);
 
+  LOG_INFO("Load p256 keygen from seed app into OTBN");
+  SS_CHECK(otbn_load_app(kOtbnAppP256KeyFromSeed) == kOtbnErrorOk);
+
   LOG_INFO("Starting simple serial packet handling.");
   while (true) {
     simple_serial_process_packet();