Rewrite field assembly to match the C version

pull/11871/head
Pieter Wuille 10 years ago
parent 4d879a3a66
commit f048615970
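
A note for readers comparing this diff against the C 5x52 field code it is meant to match: the constants that recur below are M = 0fffffffffffffh (the 52-bit limb mask) and R = 01000003D10h. Five 52-bit limbs span 260 bits, and since p = 2^256 - 0x1000003D1, any carry past the top limb folds back in multiplied by 2^260 mod p = 0x1000003D10; the u0 term is pre-shifted by 4 bits, which is why the smaller constant 01000003D1h (R >> 4) also appears. A minimal C sketch of that fold, assuming a compiler with unsigned __int128 (fold_step is a hypothetical helper, not code from this commit):

#include <stdint.h>

#define M52 0xFFFFFFFFFFFFFULL  /* 52-bit limb mask, 0fffffffffffffh in the asm */
#define R52 0x1000003D10ULL     /* 2^260 mod p, 01000003D10h in the asm */

/* One "c += (d & M) * R; d >>= 52" step, the pattern the new code repeats. */
static void fold_step(unsigned __int128 *c, unsigned __int128 *d) {
    *c += (unsigned __int128)(uint64_t)(*d & M52) * R52;
    *d >>= 52;
}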

@@ -1,4 +1,11 @@
-;; Added by Diederik Huys, March 2013
+;; Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille
+;; Distributed under the MIT software license, see the accompanying
+;; file COPYING or http://www.opensource.org/licenses/mit-license.php.
+;;
+;; Changelog:
+;; * March 2013, Diederik Huys:    Original version
+;; * November 2014, Pieter Wuille: Updated to use Peter Dettman's parallel
+;;                                 multiplication algorithm
 ;;
 ;; Provided public procedures:
 ;;     secp256k1_fe_mul_inner
@@ -24,14 +31,12 @@
 ;;
 ;; INTERNAL: rdx:rax  = multiplication accumulator
 ;;           r9:r8    = c
-;;           r10-r13  = t0-t3
-;;           r14      = b.n[0] / t4
-;;           r15      = b.n[1] / t5
-;;           rbx      = b.n[2] / t6
-;;           rcx      = b.n[3] / t7
-;;           rbp      = Constant 0FFFFFFFFFFFFFh / t8
-;;           rsi      = b.n / b.n[4] / t9
+;;           r10:r14  = a0-a4
+;;           rcx:rbx  = d
+;;           rbp      = R
+;;           rdi      = t?
+;;           r15      = b->n
+;;           rsi      = r->n
 GLOBAL SYM(secp256k1_fe_mul_inner)
 ALIGN 32
 SYM(secp256k1_fe_mul_inner):
@@ -41,263 +46,256 @@ SYM(secp256k1_fe_mul_inner):
     push r13
     push r14
     push r15
-    push rdx
-    mov r14,[rsi+8*0]           ; preload b.n[0]. This will be the case until
-                                ; b.n[0] is no longer needed, then we reassign
-                                ; r14 to t4
-    ;; c=a.n[0] * b.n[0]
-    mov rax,[rdi+0*8]           ; load a.n[0]
-    mov rbp,0FFFFFFFFFFFFFh
-    mul r14                     ; rdx:rax=a.n[0]*b.n[0]
-    mov r15,[rsi+1*8]
-    mov r10,rbp                 ; load modulus into target register for t0
-    mov r8,rax
-    and r10,rax                 ; only need lower qword of c
-    shrd r8,rdx,52
-    xor r9,r9                   ; c < 2^64, so we ditch the HO part
-    ;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0]
-    mov rax,[rdi+0*8]
-    mul r15
-    add r8,rax
-    adc r9,rdx
-    mov rax,[rdi+1*8]
-    mul r14
-    mov r11,rbp
-    mov rbx,[rsi+2*8]
-    add r8,rax
-    adc r9,rdx
-    and r11,r8
-    shrd r8,r9,52
-    xor r9,r9
-    ;; c+=a.n[0 1 2] * b.n[2 1 0]
-    mov rax,[rdi+0*8]
-    mul rbx
-    add r8,rax
-    adc r9,rdx
-    mov rax,[rdi+1*8]
-    mul r15
-    add r8,rax
-    adc r9,rdx
-    mov rax,[rdi+2*8]
-    mul r14
-    mov r12,rbp
-    mov rcx,[rsi+3*8]
-    add r8,rax
-    adc r9,rdx
-    and r12,r8
-    shrd r8,r9,52
-    xor r9,r9
-    ;; c+=a.n[0 1 2 3] * b.n[3 2 1 0]
-    mov rax,[rdi+0*8]
-    mul rcx
-    add r8,rax
-    adc r9,rdx
-    mov rax,[rdi+1*8]
-    mul rbx
-    add r8,rax
-    adc r9,rdx
-    mov rax,[rdi+2*8]
-    mul r15
-    add r8,rax
-    adc r9,rdx
-    mov rax,[rdi+3*8]
-    mul r14
-    mov r13,rbp
-    mov rsi,[rsi+4*8]           ; load b.n[4] and destroy pointer
-    add r8,rax
-    adc r9,rdx
-    and r13,r8
-    shrd r8,r9,52
-    xor r9,r9
-    ;; c+=a.n[0 1 2 3 4] * b.n[4 3 2 1 0]
-    mov rax,[rdi+0*8]
-    mul rsi
-    add r8,rax
-    adc r9,rdx
-    mov rax,[rdi+1*8]
-    mul rcx
-    add r8,rax
-    adc r9,rdx
-    mov rax,[rdi+2*8]
-    mul rbx
-    add r8,rax
-    adc r9,rdx
-    mov rax,[rdi+3*8]
-    mul r15
-    add r8,rax
-    adc r9,rdx
-    mov rax,[rdi+4*8]
-    mul r14
-    mov r14,rbp                 ; load modulus into t4 and destroy a.n[0]
-    add r8,rax
-    adc r9,rdx
-    and r14,r8
-    shrd r8,r9,52
-    xor r9,r9
-    ;; c+=a.n[1 2 3 4] * b.n[4 3 2 1]
-    mov rax,[rdi+1*8]
-    mul rsi
-    add r8,rax
-    adc r9,rdx
-    mov rax,[rdi+2*8]
-    mul rcx
-    add r8,rax
-    adc r9,rdx
-    mov rax,[rdi+3*8]
-    mul rbx
-    add r8,rax
-    adc r9,rdx
-    mov rax,[rdi+4*8]
-    mul r15
-    mov r15,rbp
-    add r8,rax
-    adc r9,rdx
-    and r15,r8
-    shrd r8,r9,52
-    xor r9,r9
-    ;; c+=a.n[2 3 4] * b.n[4 3 2]
-    mov rax,[rdi+2*8]
-    mul rsi
-    add r8,rax
-    adc r9,rdx
-    mov rax,[rdi+3*8]
-    mul rcx
-    add r8,rax
-    adc r9,rdx
-    mov rax,[rdi+4*8]
-    mul rbx
-    mov rbx,rbp
-    add r8,rax
-    adc r9,rdx
-    and rbx,r8
-    shrd r8,r9,52
-    xor r9,r9
-    ;; c+=a.n[3 4] * b.n[4 3]
-    mov rax,[rdi+3*8]
-    mul rsi
-    add r8,rax
-    adc r9,rdx
-    mov rax,[rdi+4*8]
-    mul rcx
-    mov rcx,rbp
-    add r8,rax
-    adc r9,rdx
-    and rcx,r8
-    shrd r8,r9,52
-    xor r9,r9
-    ;; c+=a.n[4] * b.n[4]
-    mov rax,[rdi+4*8]
-    mul rsi
-    ;; mov rbp,rbp              ; modulus already there!
-    add r8,rax
-    adc r9,rdx
-    and rbp,r8
-    shrd r8,r9,52
-    xor r9,r9
-    mov rsi,r8                  ; load c into t9 and destroy b.n[4]
-    ;; *******************************************************
-common_exit_norm:
-    mov rdi,01000003D10h        ; load constant
-    mov rax,r15                 ; get t5
-    mul rdi
-    add rax,r10                 ; +t0
-    adc rdx,0
-    mov r10,0FFFFFFFFFFFFFh     ; modulus. Sadly, we ran out of registers!
-    mov r8,rax                  ; +c
-    and r10,rax
-    shrd r8,rdx,52
-    xor r9,r9
-    mov rax,rbx                 ; get t6
-    mul rdi
-    add rax,r11                 ; +t1
-    adc rdx,0
-    mov r11,0FFFFFFFFFFFFFh     ; modulus
-    add r8,rax                  ; +c
-    adc r9,rdx
-    and r11,r8
-    shrd r8,r9,52
-    xor r9,r9
-    mov rax,rcx                 ; get t7
-    mul rdi
-    add rax,r12                 ; +t2
-    adc rdx,0
-    pop rbx                     ; retrieve pointer to this.n
-    mov r12,0FFFFFFFFFFFFFh     ; modulus
-    add r8,rax                  ; +c
-    adc r9,rdx
-    and r12,r8
-    mov [rbx+2*8],r12           ; mov into this.n[2]
-    shrd r8,r9,52
-    xor r9,r9
-    mov rax,rbp                 ; get t8
-    mul rdi
-    add rax,r13                 ; +t3
-    adc rdx,0
-    mov r13,0FFFFFFFFFFFFFh     ; modulus
-    add r8,rax                  ; +c
-    adc r9,rdx
-    and r13,r8
-    mov [rbx+3*8],r13           ; -> this.n[3]
-    shrd r8,r9,52
-    xor r9,r9
-    mov rax,rsi                 ; get t9
-    mul rdi
-    add rax,r14                 ; +t4
-    adc rdx,0
-    mov r14,0FFFFFFFFFFFFh      ; !!!
-    add r8,rax                  ; +c
-    adc r9,rdx
-    and r14,r8
-    mov [rbx+4*8],r14           ; -> this.n[4]
-    shrd r8,r9,48               ; !!!
-    xor r9,r9
-    mov rax,01000003D1h
-    mul r8
-    add rax,r10
-    adc rdx,0
-    mov r10,0FFFFFFFFFFFFFh     ; modulus
-    mov r8,rax
-    and rax,r10
-    shrd r8,rdx,52
-    mov [rbx+0*8],rax           ; -> this.n[0]
-    add r8,r11
-    mov [rbx+1*8],r8            ; -> this.n[1]
+    mov r10,[rdi+0*8]
+    mov r11,[rdi+1*8]
+    mov r12,[rdi+2*8]
+    mov r13,[rdi+3*8]
+    mov r14,[rdi+4*8]
+    mov rbp,01000003D10h
+    mov r15,rsi
+    mov rsi,rdx
+    ;; d += a3 * b0
+    mov rax,[r15+0*8]
+    mul r13
+    mov rbx,rax
+    mov rcx,rdx
+    ;; d += a2 * b1
+    mov rax,[r15+1*8]
+    mul r12
+    add rbx,rax
+    adc rcx,rdx
+    ;; d += a1 * b2
+    mov rax,[r15+2*8]
+    mul r11
+    add rbx,rax
+    adc rcx,rdx
+    ;; d += a0 * b3
+    mov rax,[r15+3*8]
+    mul r10
+    add rbx,rax
+    adc rcx,rdx
+    ;; c = a4 * b4
+    mov rax,[r15+4*8]
+    mul r14
+    mov r8,rax
+    mov r9,rdx
+    ;; d += (c & M) * R
+    mov rdx,0fffffffffffffh
+    and rax,rdx
+    mul rbp
+    add rbx,rax
+    adc rcx,rdx
+    ;; c >>= 52 (r8 only)
+    shrd r8,r9,52
+    ;; t3 (stack) = d & M
+    mov rdi,rbx
+    mov rdx,0fffffffffffffh
+    and rdi,rdx
+    push rdi
+    ;; d >>= 52
+    shrd rbx,rcx,52
+    mov rcx,0
+    ;; d += a4 * b0
+    mov rax,[r15+0*8]
+    mul r14
+    add rbx,rax
+    adc rcx,rdx
+    ;; d += a3 * b1
+    mov rax,[r15+1*8]
+    mul r13
+    add rbx,rax
+    adc rcx,rdx
+    ;; d += a2 * b2
+    mov rax,[r15+2*8]
+    mul r12
+    add rbx,rax
+    adc rcx,rdx
+    ;; d += a1 * b3
+    mov rax,[r15+3*8]
+    mul r11
+    add rbx,rax
+    adc rcx,rdx
+    ;; d += a0 * b4
+    mov rax,[r15+4*8]
+    mul r10
+    add rbx,rax
+    adc rcx,rdx
+    ;; d += c * R
+    mov rax,r8
+    mul rbp
+    add rbx,rax
+    adc rcx,rdx
+    ;; t4 = d & M (rdi)
+    mov rdi,rbx
+    mov rdx,0fffffffffffffh
+    and rdi,rdx
+    ;; d >>= 52
+    shrd rbx,rcx,52
+    mov rcx,0
+    ;; tx = t4 >> 48 (rbp, overwrites R)
+    mov rbp,rdi
+    shr rbp,48
+    ;; t4 &= (M >> 4) (stack)
+    mov rax,0ffffffffffffh
+    and rdi,rax
+    push rdi
+    ;; c = a0 * b0
+    mov rax,[r15+0*8]
+    mul r10
+    mov r8,rax
+    mov r9,rdx
+    ;; d += a4 * b1
+    mov rax,[r15+1*8]
+    mul r14
+    add rbx,rax
+    adc rcx,rdx
+    ;; d += a3 * b2
+    mov rax,[r15+2*8]
+    mul r13
+    add rbx,rax
+    adc rcx,rdx
+    ;; d += a2 * b3
+    mov rax,[r15+3*8]
+    mul r12
+    add rbx,rax
+    adc rcx,rdx
+    ;; d += a1 * b4
+    mov rax,[r15+4*8]
+    mul r11
+    add rbx,rax
+    adc rcx,rdx
+    ;; u0 = d & M (rdi)
+    mov rdi,rbx
+    mov rdx,0fffffffffffffh
+    and rdi,rdx
+    ;; d >>= 52
+    shrd rbx,rcx,52
+    mov rcx,0
+    ;; u0 = (u0 << 4) | tx (rdi)
+    shl rdi,4
+    or rdi,rbp
+    ;; c += u0 * (R >> 4)
+    mov rax,01000003D1h
+    mul rdi
+    add r8,rax
+    adc r9,rdx
+    ;; r[0] = c & M
+    mov rax,r8
+    mov rdx,0fffffffffffffh
+    and rax,rdx
+    mov [rsi+0*8],rax
+    ;; c >>= 52
+    shrd r8,r9,52
+    mov r9,0
+    ;; c += a1 * b0
+    mov rax,[r15+0*8]
+    mul r11
+    add r8,rax
+    adc r9,rdx
+    ;; c += a0 * b1
+    mov rax,[r15+1*8]
+    mul r10
+    add r8,rax
+    adc r9,rdx
+    ;; d += a4 * b2
+    mov rax,[r15+2*8]
+    mul r14
+    add rbx,rax
+    adc rcx,rdx
+    ;; d += a3 * b3
+    mov rax,[r15+3*8]
+    mul r13
+    add rbx,rax
+    adc rcx,rdx
+    ;; d += a2 * b4
+    mov rax,[r15+4*8]
+    mul r12
+    add rbx,rax
+    adc rcx,rdx
+    ;; restore rbp = R
+    mov rbp,01000003D10h
+    ;; c += (d & M) * R
+    mov rax,rbx
+    mov rdx,0fffffffffffffh
+    and rax,rdx
+    mul rbp
+    add r8,rax
+    adc r9,rdx
+    ;; d >>= 52
+    shrd rbx,rcx,52
+    mov rcx,0
+    ;; r[1] = c & M
+    mov rax,r8
+    mov rdx,0fffffffffffffh
+    and rax,rdx
+    mov [rsi+8*1],rax
+    ;; c >>= 52
+    shrd r8,r9,52
+    mov r9,0
+    ;; c += a2 * b0
+    mov rax,[r15+0*8]
+    mul r12
+    add r8,rax
+    adc r9,rdx
+    ;; c += a1 * b1
+    mov rax,[r15+1*8]
+    mul r11
+    add r8,rax
+    adc r9,rdx
+    ;; c += a0 * b2 (last use of r10 = a0)
+    mov rax,[r15+2*8]
+    mul r10
+    add r8,rax
+    adc r9,rdx
+    ;; fetch t3 (r10, overwrites a0), t4 (rdi)
+    pop rdi
+    pop r10
+    ;; d += a4 * b3
+    mov rax,[r15+3*8]
+    mul r14
+    add rbx,rax
+    adc rcx,rdx
+    ;; d += a3 * b4
+    mov rax,[r15+4*8]
+    mul r13
+    add rbx,rax
+    adc rcx,rdx
+    ;; c += (d & M) * R
+    mov rax,rbx
+    mov rdx,0fffffffffffffh
+    and rax,rdx
+    mul rbp
+    add r8,rax
+    adc r9,rdx
+    ;; d >>= 52 (rbx only)
+    shrd rbx,rcx,52
+    ;; r[2] = c & M
+    mov rax,r8
+    mov rdx,0fffffffffffffh
+    and rax,rdx
+    mov [rsi+2*8],rax
+    ;; c >>= 52
+    shrd r8,r9,52
+    mov r9,0
+    ;; c += t3
+    add r8,r10
+    ;; c += d * R
+    mov rax,rbx
+    mul rbp
+    add r8,rax
+    adc r9,rdx
+    ;; r[3] = c & M
+    mov rax,r8
+    mov rdx,0fffffffffffffh
+    and rax,rdx
+    mov [rsi+3*8],rax
+    ;; c >>= 52 (r8 only)
+    shrd r8,r9,52
+    ;; c += t4 (r8 only)
+    add r8,rdi
+    ;; r[4] = c
+    mov [rsi+4*8],r8
     pop r15
     pop r14
@@ -311,16 +309,14 @@ common_exit_norm:
 ;; PROC ExSetSquare
 ;; Register Layout:
 ;; INPUT:    rdi = a.n
-;;           rsi = this.a
+;;           rsi = r.n
 ;; INTERNAL: rdx:rax  = multiplication accumulator
 ;;           r9:r8    = c
-;;           r10-r13  = t0-t3
-;;           r14      = a.n[0] / t4
-;;           r15      = a.n[1] / t5
-;;           rbx      = a.n[2] / t6
-;;           rcx      = a.n[3] / t7
-;;           rbp      = 0FFFFFFFFFFFFFh / t8
-;;           rsi      = a.n[4] / t9
+;;           r10:r14  = a0-a4
+;;           rcx:rbx  = d
+;;           rbp      = R
+;;           rdi      = t?
+;;           r15      = M
 GLOBAL SYM(secp256k1_fe_sqr_inner)
 ALIGN 32
 SYM(secp256k1_fe_sqr_inner):
@@ -330,140 +326,204 @@ SYM(secp256k1_fe_sqr_inner):
     push r13
    push r14
     push r15
-    push rsi
-    mov rbp,0FFFFFFFFFFFFFh
-    ;; c=a.n[0] * a.n[0]
-    mov r14,[rdi+0*8]           ; r14=a.n[0]
-    mov r10,rbp                 ; modulus
-    mov rax,r14
-    mul rax
-    mov r15,[rdi+1*8]           ; a.n[1]
-    add r14,r14                 ; r14=2*a.n[0]
-    mov r8,rax
-    and r10,rax                 ; only need lower qword
-    shrd r8,rdx,52
-    xor r9,r9
-    ;; c+=2*a.n[0] * a.n[1]
-    mov rax,r14                 ; r14=2*a.n[0]
-    mul r15
-    mov rbx,[rdi+2*8]           ; rbx=a.n[2]
-    mov r11,rbp                 ; modulus
-    add r8,rax
-    adc r9,rdx
-    and r11,r8
-    shrd r8,r9,52
-    xor r9,r9
-    ;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1]
-    mov rax,r14
-    mul rbx
-    add r8,rax
-    adc r9,rdx
-    mov rax,r15
-    mov r12,rbp                 ; modulus
-    mul rax
-    mov rcx,[rdi+3*8]           ; rcx=a.n[3]
-    add r15,r15                 ; r15=a.n[1]*2
-    add r8,rax
-    adc r9,rdx
-    and r12,r8                  ; only need lower dword
-    shrd r8,r9,52
-    xor r9,r9
-    ;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2]
-    mov rax,r14
-    mul rcx
-    add r8,rax
-    adc r9,rdx
-    mov rax,r15                 ; rax=2*a.n[1]
-    mov r13,rbp                 ; modulus
-    mul rbx
-    mov rsi,[rdi+4*8]           ; rsi=a.n[4]
-    add r8,rax
-    adc r9,rdx
-    and r13,r8
-    shrd r8,r9,52
-    xor r9,r9
-    ;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2]
-    mov rax,r14                 ; last time we need 2*a.n[0]
-    mul rsi
-    add r8,rax
-    adc r9,rdx
-    mov rax,r15
-    mul rcx
-    mov r14,rbp                 ; modulus
-    add r8,rax
-    adc r9,rdx
-    mov rax,rbx
-    mul rax
-    add rbx,rbx                 ; rcx=2*a.n[2]
-    add r8,rax
-    adc r9,rdx
-    and r14,r8
-    shrd r8,r9,52
-    xor r9,r9
-    ;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3]
-    mov rax,r15                 ; last time we need 2*a.n[1]
-    mul rsi
-    add r8,rax
-    adc r9,rdx
-    mov rax,rbx
-    mul rcx
-    mov r15,rbp                 ; modulus
-    add r8,rax
-    adc r9,rdx
-    and r15,r8
-    shrd r8,r9,52
-    xor r9,r9
-    ;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3]
-    mov rax,rbx                 ; last time we need 2*a.n[2]
-    mul rsi
-    add r8,rax
-    adc r9,rdx
-    mov rax,rcx                 ; a.n[3]
-    mul rax
-    mov rbx,rbp                 ; modulus
-    add r8,rax
-    adc r9,rdx
-    and rbx,r8                  ; only need lower dword
-    lea rax,[2*rcx]
-    shrd r8,r9,52
-    xor r9,r9
-    ;; c+=2*a.n[3]*a.n[4]
-    mul rsi
-    mov rcx,rbp                 ; modulus
-    add r8,rax
-    adc r9,rdx
-    and rcx,r8                  ; only need lower dword
-    shrd r8,r9,52
-    xor r9,r9
-    ;; c+=a.n[4]*a.n[4]
-    mov rax,rsi
-    mul rax
-    ;; mov rbp,rbp              ; modulus is already there!
-    add r8,rax
-    adc r9,rdx
-    and rbp,r8
-    shrd r8,r9,52
-    xor r9,r9
-    mov rsi,r8
-    ;; *******************************************************
-    jmp common_exit_norm
-end
+    mov r10,[rdi+0*8]
+    mov r11,[rdi+1*8]
+    mov r12,[rdi+2*8]
+    mov r13,[rdi+3*8]
+    mov r14,[rdi+4*8]
+    mov rbp,01000003D10h
+    mov r15,0fffffffffffffh
+    ;; d = (a0*2) * a3
+    lea rax,[r10*2]
+    mul r13
+    mov rbx,rax
+    mov rcx,rdx
+    ;; d += (a1*2) * a2
+    lea rax,[r11*2]
+    mul r12
+    add rbx,rax
+    adc rcx,rdx
+    ;; c = a4 * a4
+    mov rax,r14
+    mul r14
+    mov r8,rax
+    mov r9,rdx
+    ;; d += (c & M) * R
+    and rax,r15
+    mul rbp
+    add rbx,rax
+    adc rcx,rdx
+    ;; c >>= 52 (r8 only)
+    shrd r8,r9,52
+    ;; t3 (stack) = d & M
+    mov rdi,rbx
+    and rdi,r15
+    push rdi
+    ;; d >>= 52
+    shrd rbx,rcx,52
+    mov rcx,0
+    ;; a4 *= 2
+    add r14,r14
+    ;; d += a0 * a4
+    mov rax,r10
+    mul r14
+    add rbx,rax
+    adc rcx,rdx
+    ;; d += (a1*2) * a3
+    lea rax,[r11*2]
+    mul r13
+    add rbx,rax
+    adc rcx,rdx
+    ;; d += a2 * a2
+    mov rax,r12
+    mul r12
+    add rbx,rax
+    adc rcx,rdx
+    ;; d += c * R
+    mov rax,r8
+    mul rbp
+    add rbx,rax
+    adc rcx,rdx
+    ;; t4 = d & M (rdi)
+    mov rdi,rbx
+    and rdi,r15
+    ;; d >>= 52
+    shrd rbx,rcx,52
+    mov rcx,0
+    ;; tx = t4 >> 48 (rbp, overwrites constant)
+    mov rbp,rdi
+    shr rbp,48
+    ;; t4 &= (M >> 4) (stack)
+    mov rax,0ffffffffffffh
+    and rdi,rax
+    push rdi
+    ;; c = a0 * a0
+    mov rax,r10
+    mul r10
+    mov r8,rax
+    mov r9,rdx
+    ;; d += a1 * a4
+    mov rax,r11
+    mul r14
+    add rbx,rax
+    adc rcx,rdx
+    ;; d += (a2*2) * a3
+    lea rax,[r12*2]
+    mul r13
+    add rbx,rax
+    adc rcx,rdx
+    ;; u0 = d & M (rdi)
+    mov rdi,rbx
+    and rdi,r15
+    ;; d >>= 52
+    shrd rbx,rcx,52
+    mov rcx,0
+    ;; u0 = (u0 << 4) | tx (rdi)
+    shl rdi,4
+    or rdi,rbp
+    ;; c += u0 * (R >> 4)
+    mov rax,01000003D1h
+    mul rdi
+    add r8,rax
+    adc r9,rdx
+    ;; r[0] = c & M
+    mov rax,r8
+    and rax,r15
+    mov [rsi+0*8],rax
+    ;; c >>= 52
+    shrd r8,r9,52
+    mov r9,0
+    ;; a0 *= 2
+    add r10,r10
+    ;; c += a0 * a1
+    mov rax,r10
+    mul r11
+    add r8,rax
+    adc r9,rdx
+    ;; d += a2 * a4
+    mov rax,r12
+    mul r14
+    add rbx,rax
+    adc rcx,rdx
+    ;; d += a3 * a3
+    mov rax,r13
+    mul r13
+    add rbx,rax
+    adc rcx,rdx
+    ;; load R in rbp
+    mov rbp,01000003D10h
+    ;; c += (d & M) * R
+    mov rax,rbx
+    and rax,r15
+    mul rbp
+    add r8,rax
+    adc r9,rdx
+    ;; d >>= 52
+    shrd rbx,rcx,52
+    mov rcx,0
+    ;; r[1] = c & M
+    mov rax,r8
+    and rax,r15
+    mov [rsi+8*1],rax
+    ;; c >>= 52
+    shrd r8,r9,52
+    mov r9,0
+    ;; c += a0 * a2 (last use of r10)
+    mov rax,r10
+    mul r12
+    add r8,rax
+    adc r9,rdx
+    ;; fetch t3 (r10, overwrites a0), t4 (rdi)
+    pop rdi
+    pop r10
+    ;; c += a1 * a1
+    mov rax,r11
+    mul r11
+    add r8,rax
+    adc r9,rdx
+    ;; d += a3 * a4
+    mov rax,r13
+    mul r14
+    add rbx,rax
+    adc rcx,rdx
+    ;; c += (d & M) * R
+    mov rax,rbx
+    and rax,r15
+    mul rbp
+    add r8,rax
+    adc r9,rdx
+    ;; d >>= 52 (rbx only)
+    shrd rbx,rcx,52
+    ;; r[2] = c & M
+    mov rax,r8
+    and rax,r15
+    mov [rsi+2*8],rax
+    ;; c >>= 52
+    shrd r8,r9,52
+    mov r9,0
+    ;; c += t3
+    add r8,r10
+    ;; c += d * R
+    mov rax,rbx
+    mul rbp
+    add r8,rax
+    adc r9,rdx
+    ;; r[3] = c & M
+    mov rax,r8
+    and rax,r15
+    mov [rsi+3*8],rax
+    ;; c >>= 52 (r8 only)
+    shrd r8,r9,52
+    ;; c += t4 (r8 only)
+    add r8,rdi
+    ;; r[4] = c
+    mov [rsi+4*8],r8
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rbx
+    pop rbp
+    ret
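
The squaring routine needs fewer 64x64 multiplies than the general multiply because a*a is symmetric: every off-diagonal product a_i*a_j appears twice, so one factor is pre-doubled (the lea rax,[r10*2], add r14,r14 and add r10,r10 lines above) and each pair costs a single mul. A small C sketch of one such column under the same 52-bit-limb assumption (sqr_col3 is a hypothetical name, not code from this commit):

#include <stdint.h>

typedef unsigned __int128 u128;

/* Weight-3 column of a*a: a0*a3 + a1*a2 + a2*a1 + a3*a0
 * collapses to (a0*2)*a3 + (a1*2)*a2 -- two multiplies instead of four.
 * Limbs are below 2^52, so doubling them still fits comfortably in 64 bits. */
static u128 sqr_col3(const uint64_t a[5]) {
    return (u128)(a[0] * 2) * a[3] + (u128)(a[1] * 2) * a[2];
}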
