Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Experiment: backport ff_derive x86_64 assembly from fff crate #58

Draft
wants to merge 11 commits into
base: main
Choose a base branch
from
5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,16 @@ edition = "2018"
bitvec = { version = "0.22", default-features = false, optional = true }
byteorder = { version = "1", default-features = false, optional = true }
ff_derive = { version = "0.8", path = "ff_derive", optional = true }
lazy_static = { version = "1.4.0", optional = true }
rand_core = { version = "0.6", default-features = false }
subtle = { version = "2.2.1", default-features = false, features = ["i128"] }

[target.'cfg(target_arch = "x86_64")'.build-dependencies]
cc = "1.0.50"

[features]
default = ["bits", "std"]
asm = ["lazy_static", "std"]
bits = ["bitvec"]
derive = ["byteorder", "ff_derive"]
std = []
Expand Down
203 changes: 203 additions & 0 deletions asm/mul_4.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
// mul_256 a b
//
// A*B
// Schoolbook multiplication of four 64b limbs
// result in r8 - r15
//
// Arguments:
//   a, b - address operands such as (%rdi). The limb offset is prepended
//          textually, so 0x08\a expands to 0x08(%rdi). Limbs are read least
//          significant first (offset 0x00 feeds the lowest result limb r8).
//
// Output:   512-bit product, r8 = lowest limb ... r15 = highest limb.
// Clobbers: rax (left at zero), rdx (current limb of a), rbx and rbp
//           (scratch), and the CF/OF flags. Because of this, \a and \b must
//           not be addressed through any of rax, rdx, rbx, rbp or r8-r15.
// Requires: mulx (BMI2) and adcx/adox (ADX).
.macro mul_256 a b
// Row 0: a[0] * b[0..3] -> r8..r12.
// xor zeroes rax and clears both CF and OF before the carry chain starts.
xor %rax, %rax
mov 0x00\a, %rdx
// mulx src, lo, hi: rdx * src, low half into lo, high half into hi; it does
// not touch the flags, so it can be interleaved with the carry chains.
mulx 0x00\b, %r8, %r9
mulx 0x08\b, %rbx, %r10
adcx %rbx, %r9
mulx 0x10\b, %rbx, %r11
adcx %rbx, %r10
mulx 0x18\b, %rbx, %r12
adcx %rbx, %r11
// Fold the final carry of row 0 into the top limb.
adcx %rax, %r12
// Row 1: a[1] * b[0..3], accumulated into r9..r13.
// Low halves (rbp) ride the CF chain via adcx, high halves (rbx) ride the
// OF chain via adox, so the two carry chains stay independent.
xor %rax, %rax
mov 0x08\a, %rdx
mulx 0x00\b, %rbp, %rbx
adcx %rbp, %r9
adox %rbx, %r10
mulx 0x08\b, %rbp, %rbx
adcx %rbp, %r10
adox %rbx, %r11
mulx 0x10\b, %rbp, %rbx
adcx %rbp, %r11
adox %rbx, %r12
// The last high half goes straight into the new top limb r13 ...
mulx 0x18\b, %rbp, %r13
adcx %rbp, %r12
// ... which then absorbs the pending carry of each chain (rax is zero).
adox %rax, %r13
adcx %rax, %r13
// Row 2: a[2] * b[0..3], accumulated into r10..r14 (same pattern as row 1).
xor %rax, %rax
mov 0x10\a, %rdx
mulx 0x00\b, %rbp, %rbx
adcx %rbp, %r10
adox %rbx, %r11
mulx 0x08\b, %rbp, %rbx
adcx %rbp, %r11
adox %rbx, %r12
mulx 0x10\b, %rbp, %rbx
adcx %rbp, %r12
adox %rbx, %r13
mulx 0x18\b, %rbp, %r14
adcx %rbp, %r13
adox %rax, %r14
adcx %rax, %r14
// Row 3: a[3] * b[0..3], accumulated into r11..r15 (same pattern as row 1).
xor %rax, %rax
mov 0x18\a, %rdx
mulx 0x00\b, %rbp, %rbx
adcx %rbp, %r11
adox %rbx, %r12
mulx 0x08\b, %rbp, %rbx
adcx %rbp, %r12
adox %rbx, %r13
mulx 0x10\b, %rbp, %rbx
adcx %rbp, %r13
adox %rbx, %r14
mulx 0x18\b, %rbp, %r15
adcx %rbp, %r14
adox %rax, %r15
adcx %rax, %r15
.endm

// red_256 res name
//
// Montgomery reduction
// expects multiplication result in r8 - r15
// See algo 14.32 from Handbook of Applied Cryptography
//
// Arguments:
//   res  - address operand (e.g. (%rcx)) of the four-limb destination; the
//          limb offset is prepended textually (0x08\res). It must not be
//          based on rsi or rsp: rsi is repointed at .LM and rsp is moved by
//          the push for the duration of the macro.
//   name - token appended to the local label .Lred_256 so that every
//          expansion of the macro gets its own label.
//
// The table at .LM supplies the modulus limbs at offsets 0x00..0x18 and the
// per-round multiplier constant at offset 0x20.
//
// Clobbers: rax (zero), rdx, rbx, rbp, r8-r15 and the flags. rsi is saved
// and restored around the body.
.macro red_256 res name
push %rsi
lea .LM(%rip), %rsi
// Zero rax and clear CF/OF; rax stays zero and is used to flush carries.
xor %rax, %rax
// Round 1: u = low64(r8 * constant); add u * modulus into r8..r12.
// The high half written to rbp is discarded (rbp is overwritten next).
mov 0x20(%rsi), %rdx
mulx %r8, %rdx, %rbp
// Low halves are added on the OF chain (adox), high halves on the CF chain
// (adcx). The addition into r8 is what is meant to cancel the lowest limb.
mulx 0x00(%rsi), %rbp, %rbx
adox %rbp, %r8
adcx %rbx, %r9
mulx 0x08(%rsi), %rbp, %rbx
adox %rbp, %r9
adcx %rbx, %r10
mulx 0x10(%rsi), %rbp, %rbx
adox %rbp, %r10
adcx %rbx, %r11
mulx 0x18(%rsi), %rbp, %rbx
adox %rbp, %r11
adcx %rbx, %r12
// Ripple the outstanding carries of both chains up through r15.
adox %rax, %r12
adcx %rax, %r13
adox %rax, %r13
adcx %rax, %r14
adox %rax, %r14
adcx %rax, %r15
adox %rax, %r15
// Round 2: same step driven by r9, accumulating into r9..r13.
mov 0x20(%rsi), %rdx
mulx %r9, %rdx, %rbp
mulx 0x00(%rsi), %rbp, %rbx
adox %rbp, %r9
adcx %rbx, %r10
mulx 0x08(%rsi), %rbp, %rbx
adox %rbp, %r10
adcx %rbx, %r11
mulx 0x10(%rsi), %rbp, %rbx
adox %rbp, %r11
adcx %rbx, %r12
mulx 0x18(%rsi), %rbp, %rbx
adox %rbp, %r12
adcx %rbx, %r13
adox %rax, %r13
adcx %rax, %r14
adox %rax, %r14
adcx %rax, %r15
adox %rax, %r15
// Round 3: same step driven by r10, accumulating into r10..r14.
mov 0x20(%rsi), %rdx
mulx %r10, %rdx, %rbp
mulx 0x00(%rsi), %rbp, %rbx
adox %rbp, %r10
adcx %rbx, %r11
mulx 0x08(%rsi), %rbp, %rbx
adox %rbp, %r11
adcx %rbx, %r12
mulx 0x10(%rsi), %rbp, %rbx
adox %rbp, %r12
adcx %rbx, %r13
mulx 0x18(%rsi), %rbp, %rbx
adox %rbp, %r13
adcx %rbx, %r14
adox %rax, %r14
adcx %rax, %r15
adox %rax, %r15
// Round 4: same step driven by r11, accumulating into r11..r15.
// r8, r9, r10 and finally r11 are no longer needed as accumulators at the
// point where each is reloaded, so they are reused to hold the modulus
// limbs for the final comparison below.
mov 0x20(%rsi), %rdx
mulx %r11, %rdx, %rbp
mov 0x00(%rsi), %r8
mulx %r8, %rbp, %rbx
adox %rbp, %r11
adcx %rbx, %r12
mov 0x08(%rsi), %r9
mulx %r9, %rbp, %rbx
adox %rbp, %r12
adcx %rbx, %r13
mov 0x10(%rsi), %r10
mulx %r10, %rbp, %rbx
adox %rbp, %r13
adcx %rbx, %r14
mov 0x18(%rsi), %r11
mulx %r11, %rbp, %rbx
adox %rbp, %r14
adcx %rbx, %r15
adox %rax, %r15
// Store the unreduced candidate r12..r15 first ...
mov %r12, 0x00\res
mov %r13, 0x08\res
mov %r14, 0x10\res
mov %r15, 0x18\res
// ... then try subtracting the modulus (held in r8..r11).
sub %r8, %r12
sbb %r9, %r13
sbb %r10, %r14
sbb %r11, %r15
// Borrow means the candidate was already below the modulus: keep the value
// stored above. Otherwise overwrite it with the subtracted value.
// NOTE(review): this is a branch on the result value, so execution time
// depends on the data - confirm that is acceptable for callers.
jb .Lred_256\name
mov %r12, 0x00\res
mov %r13, 0x08\res
mov %r14, 0x10\res
mov %r15, 0x18\res
.Lred_256\name:
pop %rsi
.endm

// mod_mul_256 a b res name
//
// Multiplies the four-limb operands at \a and \b with mul_256 and passes the
// 512-bit product (left in r8-r15) to red_256, which writes the four-limb
// reduced result to \res.
//
// a, b, res are address operands such as (%rdi); name is forwarded to
// red_256 to make its local label unique per expansion. All registers
// clobbered by mul_256 and red_256 are clobbered here as well.
.macro mod_mul_256 a b res name
mul_256 \a, \b
red_256 \res, \name
.endm

// Constant table read by red_256 through rsi.
//
// Offsets 0x00-0x18: BLS12-381 G1 order r used as modulus, stored as four
// 64-bit limbs with the least significant limb first.
// Offset 0x20: Montgomery constant -m^-1 mod b, where b is the limb radix
// (the reduction works on 64-bit limbs, so b = 2^64).
.LM:
.quad 0xffffffff00000001
.quad 0x53bda402fffe5bfe
.quad 0x3339d80809a1d805
.quad 0x73eda753299d7d48
.quad 0xfffffffeffffffff

// mod_mul_4w(x, y, result)
//
// Montgomery multiplication of two four-limb (4 x 64-bit) values modulo the
// modulus stored at .LM; the four-limb result is written to `result`.
//
// Arguments arrive in rdi, rsi and rdx, i.e. the first three integer
// argument registers of the System V AMD64 calling convention.
// NOTE(review): Win64 passes arguments in rcx, rdx, r8 and treats rdi/rsi as
// callee-saved, so this entry point looks SysV-only - confirm whether
// Windows targets need to be excluded by the build script.
//
// Mach-O prefixes C symbols with an underscore, hence the two label variants.
#ifdef __APPLE__
.global _mod_mul_4w
_mod_mul_4w:
#else
.global mod_mul_4w
mod_mul_4w:
#endif
// x = rdi
// y = rsi
// result = rdx
// Save the callee-saved registers that mul_256 / red_256 clobber.
push %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
// rdx is overwritten by mul_256 (it feeds mulx), so keep the destination
// pointer in rcx, which neither macro touches.
mov %rdx, %rcx // rcx = result

// x * y
mod_mul_256 (%rdi), (%rsi), (%rcx), mm

// Restore in reverse order of the pushes above.
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
ret

#if defined(__linux__) && defined(__ELF__)
// Declare that this object does not need an executable stack. Without this
// note an ELF linker may fall back to assuming an executable stack (or warn
// about the missing section) for anything that links this object.
.section .note.GNU-stack,"",@progbits
#endif
14 changes: 14 additions & 0 deletions build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/// Build script entry point for x86_64 build hosts.
///
/// Assembles `asm/mul_4.S` into a static library via the `cc` crate so the
/// crate can link against the hand-written multiplication routine.
///
/// The `cfg` attribute on this function is evaluated for the machine that
/// runs the build script, not for the compilation target, so the body also
/// checks `CARGO_CFG_TARGET_ARCH` and skips the assembly when the target is
/// not x86_64.
#[cfg(target_arch = "x86_64")]
fn main() {
    const ASM_SOURCE: &str = "./asm/mul_4.S";

    // Track the assembly file explicitly. Cargo's "rerun when any package
    // file changes" default is disabled as soon as any rerun-if directive is
    // emitted (which dependencies of the build script may do), and edits to
    // the assembly would then no longer trigger a rebuild.
    println!("cargo:rerun-if-changed={}", ASM_SOURCE);

    let target_arch = std::env::var("CARGO_CFG_TARGET_ARCH")
        .expect("Cargo sets CARGO_CFG_TARGET_ARCH when running build scripts");

    if target_arch == "x86_64" {
        // `cc` already compiles each source to an object file, so no extra
        // compiler flag is needed here.
        cc::Build::new()
            .file(ASM_SOURCE)
            .compile("libff-derive-crypto.a");
    }
}

/// Build script entry point for every other build host: the `cc`
/// build-dependency is only declared for x86_64, so nothing is assembled.
#[cfg(not(target_arch = "x86_64"))]
fn main() {}
Loading