-
Notifications
You must be signed in to change notification settings - Fork 122
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AVX-512 support for RSA Signing #1273
Changes from 10 commits
b9088fc
e6269ff
6d2ece9
e0ad9da
024a9ec
cd2a3d1
d4d89fc
a0f3737
8e55af5
7d1ea20
407df8d
e67bbda
b33709e
0e7c607
b2d1327
14fefe0
5e1c7ee
73d389d
087bf5c
c439bf0
abe1124
37b4a4a
e06d8d0
bf9fc29
e626c2c
92b9e3f
1055b42
58af762
56d8fd6
f925e7c
2473469
ef26ced
73b7b8f
506dced
0dd53a1
f3715bb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
name: MinGW | ||
on: | ||
pull_request: | ||
branches: [ '*' ] | ||
push: | ||
branches: [ '*' ] | ||
|
||
concurrency: | ||
group: ${{ github.workflow }}-${{ github.event.pull_request.number }} | ||
cancel-in-progress: true | ||
jobs: | ||
mingw: | ||
if: github.repository == 'aws/aws-lc' | ||
runs-on: windows-latest | ||
steps: | ||
- name: Install NASM | ||
uses: ilammy/[email protected] | ||
- name: Checkout | ||
uses: actions/checkout@v4 | ||
- name: Setup MinGW | ||
uses: egor-tensin/[email protected] | ||
id: setup_mingw | ||
with: | ||
static: 0 | ||
- name: Setup CMake | ||
uses: threeal/[email protected] | ||
with: | ||
generator: Ninja | ||
c-compiler: ${{ steps.setup_mingw.outputs.gcc }} | ||
cxx-compiler: ${{ steps.setup_mingw.outputs.gxx }} | ||
options: | | ||
CMAKE_SYSTEM_NAME=Windows \ | ||
CMAKE_SYSTEM_PROCESSOR=x86_64 \ | ||
CMAKE_BUILD_TOOL=C:/ProgramData/chocolatey/lib/mingw/tools/install/mingw64/bin/ninja.exe \ | ||
CMAKE_FIND_ROOT_PATH=C:/ProgramData/chocolatey/lib/mingw/tools/install/mingw64 \ | ||
CMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ | ||
CMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ | ||
CMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY \ | ||
- name: Build Project | ||
run: cmake --build ./build --target all | ||
- name: Run tests | ||
run: cmake --build ./build --target run_tests |
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -75,54 +75,20 @@ | |||
*STDOUT=*OUT; | ||||
|
||||
if ($avx512ifma>0) {{{ | ||||
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); | ||||
|
||||
$code.=<<___; | ||||
.text | ||||
.extern OPENSSL_ia32cap_P | ||||
.globl ossl_rsaz_avx512ifma_eligible | ||||
.type ossl_rsaz_avx512ifma_eligible,\@abi-omnipotent | ||||
.align 32 | ||||
ossl_rsaz_avx512ifma_eligible: | ||||
leaq OPENSSL_ia32cap_P(%rip),%r11 | ||||
mov 8(%r11),%r11d | ||||
xor %eax,%eax | ||||
and \$`1<<31|1<<21|1<<17|1<<16`, %r11d # avx512vl + avx512ifma + avx512dq + avx512f | ||||
cmp \$`1<<31|1<<21|1<<17|1<<16`, %r11d | ||||
cmove %r11d,%eax | ||||
ret | ||||
.size ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible | ||||
___ | ||||
@_6_args_universal_ABI = $win64 ? | ||||
("%rcx","%rdx","%r8","%r9","%r10","%r11") : | ||||
("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); | ||||
|
||||
############################################################################### | ||||
# Almost Montgomery Multiplication (AMM) for 20-digit number in radix 2^52. | ||||
# | ||||
# AMM is defined as presented in the paper [1]. | ||||
# | ||||
# The input and output are presented in 2^52 radix domain, i.e. | ||||
# |res|, |a|, |b|, |m| are arrays of 20 64-bit qwords with 12 high bits zeroed. | ||||
# |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64 | ||||
# | ||||
# NB: the AMM implementation does not perform "conditional" subtraction step | ||||
# specified in the original algorithm as according to the Lemma 1 from the paper | ||||
# [2], the result will be always < 2*m and can be used as a direct input to | ||||
# the next AMM iteration. This post-condition is true, provided the correct | ||||
# parameter |s| (notion of the Lemma 1 from [2]) is chosen, i.e. s >= n + 2 * k, | ||||
# which matches our case: 1040 > 1024 + 2 * 1. | ||||
# | ||||
# [1] Gueron, S. Efficient software implementations of modular exponentiation. | ||||
# DOI: 10.1007/s13389-012-0031-5 | ||||
# [2] Gueron, S. Enhanced Montgomery Multiplication. | ||||
# DOI: 10.1007/3-540-36400-5_5 | ||||
# | ||||
# void ossl_rsaz_amm52x20_x1_ifma256(BN_ULONG *res, | ||||
# void rsaz_amm52x20_x1_ifma256(BN_ULONG *res, | ||||
# const BN_ULONG *a, | ||||
# const BN_ULONG *b, | ||||
# const BN_ULONG *m, | ||||
# BN_ULONG k0); | ||||
############################################################################### | ||||
{ | ||||
# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8") | ||||
# input parameters | ||||
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI; | ||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's not clear to me if this takes win64 into account.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you're right, I don't think it is either! I've updated this with a ternary check. |
||||
|
||||
my $mask52 = "%rax"; | ||||
|
@@ -325,10 +291,10 @@ sub amm52x20_x1_norm { | |||
$code.=<<___; | ||||
.text | ||||
|
||||
.globl ossl_rsaz_amm52x20_x1_ifma256 | ||||
.type ossl_rsaz_amm52x20_x1_ifma256,\@function,5 | ||||
.globl rsaz_amm52x20_x1_ifma256 | ||||
.type rsaz_amm52x20_x1_ifma256,\@function,5 | ||||
.align 32 | ||||
ossl_rsaz_amm52x20_x1_ifma256: | ||||
rsaz_amm52x20_x1_ifma256: | ||||
.cfi_startproc | ||||
endbranch | ||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just to confirm my understanding, this is effectively the same as aws-lc/include/openssl/asm_base.h Line 63 in 518df30
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When CET is available, yes. Here is its definition in GCC, where |
||||
push %rbx | ||||
|
@@ -343,7 +309,7 @@ sub amm52x20_x1_norm { | |||
.cfi_push %r14 | ||||
push %r15 | ||||
.cfi_push %r15 | ||||
.Lossl_rsaz_amm52x20_x1_ifma256_body: | ||||
.Lrsaz_amm52x20_x1_ifma256_body: | ||||
|
||||
# Zeroing accumulators | ||||
vpxord $zero, $zero, $zero | ||||
|
@@ -396,10 +362,10 @@ sub amm52x20_x1_norm { | |||
.cfi_restore %rbx | ||||
lea 48(%rsp),%rsp | ||||
.cfi_adjust_cfa_offset -48 | ||||
.Lossl_rsaz_amm52x20_x1_ifma256_epilogue: | ||||
.Lrsaz_amm52x20_x1_ifma256_epilogue: | ||||
ret | ||||
.cfi_endproc | ||||
.size ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256 | ||||
.size rsaz_amm52x20_x1_ifma256, .-rsaz_amm52x20_x1_ifma256 | ||||
___ | ||||
|
||||
$code.=<<___; | ||||
|
@@ -414,27 +380,20 @@ sub amm52x20_x1_norm { | |||
___ | ||||
|
||||
############################################################################### | ||||
# Dual Almost Montgomery Multiplication for 20-digit number in radix 2^52 | ||||
# | ||||
# See description of ossl_rsaz_amm52x20_x1_ifma256() above for details about Almost | ||||
# Montgomery Multiplication algorithm and function input parameters description. | ||||
# | ||||
# This function does two AMMs for two independent inputs, hence dual. | ||||
# | ||||
# void ossl_rsaz_amm52x20_x2_ifma256(BN_ULONG out[2][20], | ||||
# const BN_ULONG a[2][20], | ||||
# const BN_ULONG b[2][20], | ||||
# const BN_ULONG m[2][20], | ||||
# const BN_ULONG k0[2]); | ||||
# void rsaz_amm52x20_x2_ifma256(BN_ULONG out[2][20], | ||||
# const BN_ULONG a[2][20], | ||||
# const BN_ULONG b[2][20], | ||||
# const BN_ULONG m[2][20], | ||||
# const BN_ULONG k0[2]); | ||||
############################################################################### | ||||
|
||||
$code.=<<___; | ||||
.text | ||||
|
||||
.globl ossl_rsaz_amm52x20_x2_ifma256 | ||||
.type ossl_rsaz_amm52x20_x2_ifma256,\@function,5 | ||||
.globl rsaz_amm52x20_x2_ifma256 | ||||
.type rsaz_amm52x20_x2_ifma256,\@function,5 | ||||
.align 32 | ||||
ossl_rsaz_amm52x20_x2_ifma256: | ||||
rsaz_amm52x20_x2_ifma256: | ||||
.cfi_startproc | ||||
endbranch | ||||
push %rbx | ||||
|
@@ -449,7 +408,7 @@ sub amm52x20_x1_norm { | |||
.cfi_push %r14 | ||||
push %r15 | ||||
.cfi_push %r15 | ||||
.Lossl_rsaz_amm52x20_x2_ifma256_body: | ||||
.Lrsaz_amm52x20_x2_ifma256_body: | ||||
|
||||
# Zeroing accumulators | ||||
vpxord $zero, $zero, $zero | ||||
|
@@ -514,27 +473,18 @@ sub amm52x20_x1_norm { | |||
.cfi_restore %rbx | ||||
lea 48(%rsp),%rsp | ||||
.cfi_adjust_cfa_offset -48 | ||||
.Lossl_rsaz_amm52x20_x2_ifma256_epilogue: | ||||
.Lrsaz_amm52x20_x2_ifma256_epilogue: | ||||
ret | ||||
.cfi_endproc | ||||
.size ossl_rsaz_amm52x20_x2_ifma256, .-ossl_rsaz_amm52x20_x2_ifma256 | ||||
.size rsaz_amm52x20_x2_ifma256, .-rsaz_amm52x20_x2_ifma256 | ||||
___ | ||||
} | ||||
|
||||
############################################################################### | ||||
# Constant time extraction from the precomputed table of powers base^i, where | ||||
# i = 0..2^EXP_WIN_SIZE-1 | ||||
# | ||||
# The input |red_table| contains precomputations for two independent base values. | ||||
# |red_table_idx1| and |red_table_idx2| are corresponding power indexes. | ||||
# | ||||
# Extracted value (output) is 2 20 digit numbers in 2^52 radix. | ||||
# | ||||
# void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y, | ||||
# void extract_multiplier_2x20_win5(BN_ULONG *red_Y, | ||||
# const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][20], | ||||
# int red_table_idx1, int red_table_idx2); | ||||
# | ||||
# EXP_WIN_SIZE = 5 | ||||
############################################################################### | ||||
{ | ||||
# input parameters | ||||
|
@@ -553,9 +503,9 @@ sub amm52x20_x1_norm { | |||
.text | ||||
|
||||
.align 32 | ||||
.globl ossl_extract_multiplier_2x20_win5 | ||||
.type ossl_extract_multiplier_2x20_win5,\@abi-omnipotent | ||||
ossl_extract_multiplier_2x20_win5: | ||||
.globl extract_multiplier_2x20_win5 | ||||
.type extract_multiplier_2x20_win5,\@abi-omnipotent | ||||
extract_multiplier_2x20_win5: | ||||
.cfi_startproc | ||||
endbranch | ||||
vmovdqa64 .Lones(%rip), $ones # broadcast ones | ||||
|
@@ -597,7 +547,7 @@ sub amm52x20_x1_norm { | |||
$code.=<<___; | ||||
ret | ||||
.cfi_endproc | ||||
.size ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5 | ||||
.size extract_multiplier_2x20_win5, .-extract_multiplier_2x20_win5 | ||||
___ | ||||
$code.=<<___; | ||||
.section .rodata | ||||
|
@@ -707,47 +657,39 @@ sub amm52x20_x1_norm { | |||
|
||||
.section .pdata | ||||
.align 4 | ||||
.rva .LSEH_begin_ossl_rsaz_amm52x20_x1_ifma256 | ||||
.rva .LSEH_end_ossl_rsaz_amm52x20_x1_ifma256 | ||||
.rva .LSEH_info_ossl_rsaz_amm52x20_x1_ifma256 | ||||
.rva .LSEH_begin_rsaz_amm52x20_x1_ifma256 | ||||
.rva .LSEH_end_rsaz_amm52x20_x1_ifma256 | ||||
.rva .LSEH_info_rsaz_amm52x20_x1_ifma256 | ||||
|
||||
.rva .LSEH_begin_ossl_rsaz_amm52x20_x2_ifma256 | ||||
.rva .LSEH_end_ossl_rsaz_amm52x20_x2_ifma256 | ||||
.rva .LSEH_info_ossl_rsaz_amm52x20_x2_ifma256 | ||||
.rva .LSEH_begin_rsaz_amm52x20_x2_ifma256 | ||||
.rva .LSEH_end_rsaz_amm52x20_x2_ifma256 | ||||
.rva .LSEH_info_rsaz_amm52x20_x2_ifma256 | ||||
|
||||
.section .xdata | ||||
.align 8 | ||||
.LSEH_info_ossl_rsaz_amm52x20_x1_ifma256: | ||||
.LSEH_info_rsaz_amm52x20_x1_ifma256: | ||||
.byte 9,0,0,0 | ||||
.rva rsaz_def_handler | ||||
.rva .Lossl_rsaz_amm52x20_x1_ifma256_body,.Lossl_rsaz_amm52x20_x1_ifma256_epilogue | ||||
.LSEH_info_ossl_rsaz_amm52x20_x2_ifma256: | ||||
.rva .Lrsaz_amm52x20_x1_ifma256_body,.Lrsaz_amm52x20_x1_ifma256_epilogue | ||||
.LSEH_info_rsaz_amm52x20_x2_ifma256: | ||||
.byte 9,0,0,0 | ||||
.rva rsaz_def_handler | ||||
.rva .Lossl_rsaz_amm52x20_x2_ifma256_body,.Lossl_rsaz_amm52x20_x2_ifma256_epilogue | ||||
.rva .Lrsaz_amm52x20_x2_ifma256_body,.Lrsaz_amm52x20_x2_ifma256_epilogue | ||||
___ | ||||
} | ||||
}}} else {{{ # fallback for old assembler | ||||
$code.=<<___; | ||||
.text | ||||
|
||||
.globl ossl_rsaz_avx512ifma_eligible | ||||
.type ossl_rsaz_avx512ifma_eligible,\@abi-omnipotent | ||||
ossl_rsaz_avx512ifma_eligible: | ||||
xor %eax,%eax | ||||
ret | ||||
.size ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible | ||||
|
||||
.globl ossl_rsaz_amm52x20_x1_ifma256 | ||||
.globl ossl_rsaz_amm52x20_x2_ifma256 | ||||
.globl ossl_extract_multiplier_2x20_win5 | ||||
.type ossl_rsaz_amm52x20_x1_ifma256,\@abi-omnipotent | ||||
ossl_rsaz_amm52x20_x1_ifma256: | ||||
ossl_rsaz_amm52x20_x2_ifma256: | ||||
ossl_extract_multiplier_2x20_win5: | ||||
.globl rsaz_amm52x20_x1_ifma256 | ||||
.globl rsaz_amm52x20_x2_ifma256 | ||||
.globl extract_multiplier_2x20_win5 | ||||
.type rsaz_amm52x20_x1_ifma256,\@abi-omnipotent | ||||
rsaz_amm52x20_x1_ifma256: | ||||
rsaz_amm52x20_x2_ifma256: | ||||
extract_multiplier_2x20_win5: | ||||
.byte 0x0f,0x0b # ud2 | ||||
ret | ||||
.size ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256 | ||||
.size rsaz_amm52x20_x1_ifma256, .-rsaz_amm52x20_x1_ifma256 | ||||
___ | ||||
}}} | ||||
|
||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need this? Is this not covered by our existing Intel SDE tests?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure how this file got into this PR. This appears to be a duplicate of CI tests we already have: https://github.com/aws/aws-lc/blob/main/.github/workflows/windows-alt.yml#L11
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It must have come in with an intermediate merge somewhere along the way. I will remove it.