|
|
@ -1,41 +1,60 @@
|
|
|
|
.x64
|
|
|
|
;; Added by Diederik Huys, March 2013
|
|
|
|
|
|
|
|
;;
|
|
|
|
|
|
|
|
;; Provided public procedures:
|
|
|
|
|
|
|
|
;; ExSetMult
|
|
|
|
|
|
|
|
;; ExSetSquare
|
|
|
|
|
|
|
|
;;
|
|
|
|
|
|
|
|
;; Needed tools: JWASM (http://www.japheth.de/JWasm.html)
|
|
|
|
|
|
|
|
;;
|
|
|
|
|
|
|
|
;; !!! WARNING !!! !!! WARNING !!! !!! WARNING !!!
|
|
|
|
|
|
|
|
;;
|
|
|
|
|
|
|
|
;; Please note that recompiling this binary (jwasm) under a 64-bit OS
|
|
|
|
|
|
|
|
;; may yield unexpected results and create a corrupted ELF64 header.
|
|
|
|
|
|
|
|
;;
|
|
|
|
|
|
|
|
;;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.x64
|
|
|
|
QTEST EQU 1
|
|
|
|
QTEST EQU 1
|
|
|
|
.code
|
|
|
|
.code
|
|
|
|
|
|
|
|
|
|
|
|
;; Register Layout:
|
|
|
|
;; Register Layout:
|
|
|
|
;; INPUT: rdi = a.n
|
|
|
|
;; INPUT: rdi = a.n
|
|
|
|
;; rsi = b.n
|
|
|
|
;; rsi = b.n
|
|
|
|
;; rdx = this.a
|
|
|
|
;; rdx = this.a
|
|
|
|
;; OUTPUT: [rbx]
|
|
|
|
;;
|
|
|
|
;; INTERNAL: rdx:rax = multiplication accumulator
|
|
|
|
;; INTERNAL: rdx:rax = multiplication accumulator
|
|
|
|
;; rsi = b.n / t9
|
|
|
|
;; r9:r8 = c
|
|
|
|
;; r8:r9 = c
|
|
|
|
;; r10-r13 = t0-t3
|
|
|
|
;; r10-r15 = t0-t5
|
|
|
|
;; r14 = b.n[0] / t4
|
|
|
|
;; rbx = t6
|
|
|
|
;; r15 = b.n[1] / t5
|
|
|
|
;; rcx = t7
|
|
|
|
;; rbx = b.n[2] / t6
|
|
|
|
;; rbp = t8
|
|
|
|
;; rcx = b.n[3] / t7
|
|
|
|
|
|
|
|
;; rbp = Constant 0FFFFFFFFFFFFFh / t8
|
|
|
|
|
|
|
|
;; rsi = b.n / b.n[4] / t9
|
|
|
|
ExSetMult PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
|
|
|
|
ExSetMult PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
|
|
|
|
push rdx
|
|
|
|
push rdx
|
|
|
|
mov r14,[rsi+8*0]
|
|
|
|
mov r14,[rsi+8*0] ; preload b.n[0]. This will be the case until
|
|
|
|
|
|
|
|
; b.n[0] is no longer needed, then we reassign
|
|
|
|
|
|
|
|
; r14 to t4
|
|
|
|
;; c=a.n[0] * b.n[0]
|
|
|
|
;; c=a.n[0] * b.n[0]
|
|
|
|
mov rax,[rdi+0*8]
|
|
|
|
mov rax,[rdi+0*8] ; load a.n[0]
|
|
|
|
mov rbp,0FFFFFFFFFFFFFh
|
|
|
|
mov rbp,0FFFFFFFFFFFFFh
|
|
|
|
mul r14 ; rsi=b.n[0]
|
|
|
|
mul r14 ; rdx:rax=a.n[0]*b.n[0]
|
|
|
|
mov r15,[rsi+1*8]
|
|
|
|
mov r15,[rsi+1*8]
|
|
|
|
mov r10,rbp
|
|
|
|
mov r10,rbp ; load modulus into target register for t0
|
|
|
|
mov r8,rax
|
|
|
|
mov r8,rax
|
|
|
|
and r10,rax ; only need lower qword
|
|
|
|
and r10,rax ; only need lower qword of c
|
|
|
|
shrd r8,rdx,52
|
|
|
|
shrd r8,rdx,52
|
|
|
|
xor r9,r9
|
|
|
|
xor r9,r9 ; c < 2^64, so we ditch the HO part
|
|
|
|
|
|
|
|
|
|
|
|
;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0]
|
|
|
|
;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0]
|
|
|
|
mov rax,[rdi+0*8]
|
|
|
|
mov rax,[rdi+0*8]
|
|
|
|
mul r15 ; b.n[1]
|
|
|
|
mul r15
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
|
|
|
|
|
|
|
|
mov rax,[rdi+1*8]
|
|
|
|
mov rax,[rdi+1*8]
|
|
|
|
mul r14 ; b.n[0]
|
|
|
|
mul r14
|
|
|
|
mov r11,rbp
|
|
|
|
mov r11,rbp
|
|
|
|
mov rbx,[rsi+2*8]
|
|
|
|
mov rbx,[rsi+2*8]
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
@ -46,44 +65,44 @@ ExSetMult PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
|
|
|
|
|
|
|
|
|
|
|
|
;; c+=a.n[0 1 2] * b.n[2 1 0]
|
|
|
|
;; c+=a.n[0 1 2] * b.n[2 1 0]
|
|
|
|
mov rax,[rdi+0*8]
|
|
|
|
mov rax,[rdi+0*8]
|
|
|
|
mul rbx ; b.n[2]
|
|
|
|
mul rbx
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
|
|
|
|
|
|
|
|
mov rax,[rdi+1*8]
|
|
|
|
mov rax,[rdi+1*8]
|
|
|
|
mul r15 ; b.n[1]
|
|
|
|
mul r15
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
|
|
|
|
|
|
|
|
mov rax,[rdi+2*8]
|
|
|
|
mov rax,[rdi+2*8]
|
|
|
|
mul r14
|
|
|
|
mul r14
|
|
|
|
mov r12,rbp ; modulus
|
|
|
|
mov r12,rbp
|
|
|
|
mov rcx,[rsi+3*8]
|
|
|
|
mov rcx,[rsi+3*8]
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
and r12,r8 ; only need lower dword
|
|
|
|
and r12,r8
|
|
|
|
shrd r8,r9,52
|
|
|
|
shrd r8,r9,52
|
|
|
|
xor r9,r9
|
|
|
|
xor r9,r9
|
|
|
|
|
|
|
|
|
|
|
|
;; c+=a.n[0 1 2 3] * b.n[3 2 1 0]
|
|
|
|
;; c+=a.n[0 1 2 3] * b.n[3 2 1 0]
|
|
|
|
mov rax,[rdi+0*8]
|
|
|
|
mov rax,[rdi+0*8]
|
|
|
|
mul rcx ; b.n[3]
|
|
|
|
mul rcx
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
|
|
|
|
|
|
|
|
mov rax,[rdi+1*8]
|
|
|
|
mov rax,[rdi+1*8]
|
|
|
|
mul rbx ; b.n[2]
|
|
|
|
mul rbx
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
|
|
|
|
|
|
|
|
mov rax,[rdi+2*8]
|
|
|
|
mov rax,[rdi+2*8]
|
|
|
|
mul r15 ; b.n[1]
|
|
|
|
mul r15
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
|
|
|
|
|
|
|
|
mov rax,[rdi+3*8]
|
|
|
|
mov rax,[rdi+3*8]
|
|
|
|
mul r14 ; b.n[0]
|
|
|
|
mul r14
|
|
|
|
mov r13,rbp ; modulus
|
|
|
|
mov r13,rbp
|
|
|
|
mov rsi,[rsi+4*8] ; load b.n[4] and destroy pointer
|
|
|
|
mov rsi,[rsi+4*8] ; load b.n[4] and destroy pointer
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
@ -105,18 +124,18 @@ ExSetMult PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
|
|
|
|
|
|
|
|
mov rax,[rdi+2*8]
|
|
|
|
mov rax,[rdi+2*8]
|
|
|
|
mul rbx ; b.n[2]
|
|
|
|
mul rbx
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
|
|
|
|
|
|
|
|
mov rax,[rdi+3*8]
|
|
|
|
mov rax,[rdi+3*8]
|
|
|
|
mul r15 ; b.n[1]
|
|
|
|
mul r15
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
|
|
|
|
|
|
|
|
mov rax,[rdi+4*8]
|
|
|
|
mov rax,[rdi+4*8]
|
|
|
|
mul r14 ; b.n[0]
|
|
|
|
mul r14
|
|
|
|
mov r14,rbp ; modulus
|
|
|
|
mov r14,rbp ; load modulus into t4 and destroy b.n[0]
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
and r14,r8
|
|
|
|
and r14,r8
|
|
|
@ -141,7 +160,7 @@ ExSetMult PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
|
|
|
|
|
|
|
|
|
|
|
|
mov rax,[rdi+4*8]
|
|
|
|
mov rax,[rdi+4*8]
|
|
|
|
mul r15
|
|
|
|
mul r15
|
|
|
|
mov r15,rbp ; modulus
|
|
|
|
mov r15,rbp
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
|
|
|
|
|
|
|
@ -162,11 +181,11 @@ ExSetMult PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
|
|
|
|
|
|
|
|
|
|
|
|
mov rax,[rdi+4*8]
|
|
|
|
mov rax,[rdi+4*8]
|
|
|
|
mul rbx
|
|
|
|
mul rbx
|
|
|
|
mov rbx,rbp ; modulus
|
|
|
|
mov rbx,rbp
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
|
|
|
|
|
|
|
|
and rbx,r8 ; only need lower dword
|
|
|
|
and rbx,r8
|
|
|
|
shrd r8,r9,52
|
|
|
|
shrd r8,r9,52
|
|
|
|
xor r9,r9
|
|
|
|
xor r9,r9
|
|
|
|
|
|
|
|
|
|
|
@ -178,10 +197,10 @@ ExSetMult PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
|
|
|
|
|
|
|
|
|
|
|
|
mov rax,[rdi+4*8]
|
|
|
|
mov rax,[rdi+4*8]
|
|
|
|
mul rcx
|
|
|
|
mul rcx
|
|
|
|
mov rcx,rbp ; modulus
|
|
|
|
mov rcx,rbp
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
and rcx,r8 ; only need lower dword
|
|
|
|
and rcx,r8
|
|
|
|
shrd r8,r9,52
|
|
|
|
shrd r8,r9,52
|
|
|
|
xor r9,r9
|
|
|
|
xor r9,r9
|
|
|
|
|
|
|
|
|
|
|
@ -195,17 +214,17 @@ ExSetMult PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
|
|
|
|
shrd r8,r9,52
|
|
|
|
shrd r8,r9,52
|
|
|
|
xor r9,r9
|
|
|
|
xor r9,r9
|
|
|
|
|
|
|
|
|
|
|
|
mov rsi,r8
|
|
|
|
mov rsi,r8 ; load c into t9 and destroy b.n[4]
|
|
|
|
|
|
|
|
|
|
|
|
;; *******************************************************
|
|
|
|
;; *******************************************************
|
|
|
|
common_exit_norm::
|
|
|
|
common_exit_norm::
|
|
|
|
mov rdi,01000003D10h
|
|
|
|
mov rdi,01000003D10h ; load constant
|
|
|
|
|
|
|
|
|
|
|
|
mov rax,r15 ; get t5
|
|
|
|
mov rax,r15 ; get t5
|
|
|
|
mul rdi
|
|
|
|
mul rdi
|
|
|
|
add rax,r10 ; +t0
|
|
|
|
add rax,r10 ; +t0
|
|
|
|
adc rdx,0
|
|
|
|
adc rdx,0
|
|
|
|
mov r10,0FFFFFFFFFFFFFh ; modulus
|
|
|
|
mov r10,0FFFFFFFFFFFFFh ; modulus. Sadly, we ran out of registers!
|
|
|
|
mov r8,rax ; +c
|
|
|
|
mov r8,rax ; +c
|
|
|
|
and r10,rax
|
|
|
|
and r10,rax
|
|
|
|
shrd r8,rdx,52
|
|
|
|
shrd r8,rdx,52
|
|
|
@ -226,12 +245,12 @@ common_exit_norm::
|
|
|
|
mul rdi
|
|
|
|
mul rdi
|
|
|
|
add rax,r12 ; +t2
|
|
|
|
add rax,r12 ; +t2
|
|
|
|
adc rdx,0
|
|
|
|
adc rdx,0
|
|
|
|
pop rbx ; retrieve pointer to this.a.n
|
|
|
|
pop rbx ; retrieve pointer to this.n
|
|
|
|
mov r12,0FFFFFFFFFFFFFh ; modulus
|
|
|
|
mov r12,0FFFFFFFFFFFFFh ; modulus
|
|
|
|
add r8,rax ; +c
|
|
|
|
add r8,rax ; +c
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
and r12,r8
|
|
|
|
and r12,r8
|
|
|
|
mov [rbx+2*8],r12
|
|
|
|
mov [rbx+2*8],r12 ; mov into this.n[2]
|
|
|
|
shrd r8,r9,52
|
|
|
|
shrd r8,r9,52
|
|
|
|
xor r9,r9
|
|
|
|
xor r9,r9
|
|
|
|
|
|
|
|
|
|
|
@ -243,7 +262,7 @@ common_exit_norm::
|
|
|
|
add r8,rax ; +c
|
|
|
|
add r8,rax ; +c
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
and r13,r8
|
|
|
|
and r13,r8
|
|
|
|
mov [rbx+3*8],r13
|
|
|
|
mov [rbx+3*8],r13 ; -> this.n[3]
|
|
|
|
shrd r8,r9,52
|
|
|
|
shrd r8,r9,52
|
|
|
|
xor r9,r9
|
|
|
|
xor r9,r9
|
|
|
|
|
|
|
|
|
|
|
@ -255,11 +274,11 @@ common_exit_norm::
|
|
|
|
add r8,rax ; +c
|
|
|
|
add r8,rax ; +c
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
and r14,r8
|
|
|
|
and r14,r8
|
|
|
|
mov [rbx+4*8],r14
|
|
|
|
mov [rbx+4*8],r14 ; -> this.n[4]
|
|
|
|
shrd r8,r9,48
|
|
|
|
shrd r8,r9,48 ; !!!
|
|
|
|
xor r9,r9
|
|
|
|
xor r9,r9
|
|
|
|
|
|
|
|
|
|
|
|
mov rax,01000003D1h
|
|
|
|
mov rax,01000003D1h
|
|
|
|
mul r8
|
|
|
|
mul r8
|
|
|
|
add rax,r10
|
|
|
|
add rax,r10
|
|
|
|
adc rdx,0
|
|
|
|
adc rdx,0
|
|
|
@ -267,50 +286,46 @@ common_exit_norm::
|
|
|
|
mov r8,rax
|
|
|
|
mov r8,rax
|
|
|
|
and rax,r10
|
|
|
|
and rax,r10
|
|
|
|
shrd r8,rdx,52
|
|
|
|
shrd r8,rdx,52
|
|
|
|
mov [rbx+0*8],rax
|
|
|
|
mov [rbx+0*8],rax ; -> this.n[0]
|
|
|
|
add r8,r11
|
|
|
|
add r8,r11
|
|
|
|
mov [rbx+1*8],r8
|
|
|
|
mov [rbx+1*8],r8 ; -> this.n[1]
|
|
|
|
ret
|
|
|
|
ret
|
|
|
|
ExSetMult ENDP
|
|
|
|
ExSetMult ENDP
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;; Register Layout:
|
|
|
|
;; Register Layout:
|
|
|
|
;; INPUT: rdi = a.n
|
|
|
|
;; INPUT: rdi = a.n
|
|
|
|
;; rsi = this.a
|
|
|
|
;; rsi = this.a
|
|
|
|
;; OUTPUT: [rsi]
|
|
|
|
|
|
|
|
;; INTERNAL: rdx:rax = multiplication accumulator
|
|
|
|
;; INTERNAL: rdx:rax = multiplication accumulator
|
|
|
|
;; r8:r9 = c
|
|
|
|
;; r9:r8 = c
|
|
|
|
;; r10-r14 = t0-t4
|
|
|
|
;; r10-r13 = t0-t3
|
|
|
|
;; r15 = a.n[0]*2 / t5
|
|
|
|
;; r14 = a.n[0] / t4
|
|
|
|
;; rbx = a.n[1]*2 / t6
|
|
|
|
;; r15 = a.n[1] / t5
|
|
|
|
;; rcx = a.n[2]*2 / t7
|
|
|
|
;; rbx = a.n[2] / t6
|
|
|
|
;; rbp = a.n[3]*2 / t8
|
|
|
|
;; rcx = a.n[3] / t7
|
|
|
|
;; rsi = a.n[4] / t9
|
|
|
|
;; rbp = 0FFFFFFFFFFFFFh / t8
|
|
|
|
|
|
|
|
;; rsi = this.a / a.n[4] / t9
|
|
|
|
ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
|
|
|
|
ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
|
|
|
|
push rsi
|
|
|
|
push rsi
|
|
|
|
mov rsi,0FFFFFFFFFFFFFh
|
|
|
|
mov rbp,0FFFFFFFFFFFFFh
|
|
|
|
|
|
|
|
|
|
|
|
;; c=a.n[0] * a.n[0]
|
|
|
|
;; c=a.n[0] * a.n[0]
|
|
|
|
mov r15,[rdi+0*8]
|
|
|
|
mov r14,[rdi+0*8] ; r14=a.n[0]
|
|
|
|
mov r10,rsi ; modulus
|
|
|
|
mov r10,rbp ; modulus
|
|
|
|
mov rax,r15
|
|
|
|
mov rax,r14
|
|
|
|
mul rax ; rsi=b.n[0]
|
|
|
|
mul rax
|
|
|
|
mov rbx,[rdi+1*8] ; a.n[1]
|
|
|
|
mov r15,[rdi+1*8] ; a.n[1]
|
|
|
|
add r15,r15 ; r15=2*a.n[0]
|
|
|
|
add r14,r14 ; r14=2*a.n[0]
|
|
|
|
mov r8,rax
|
|
|
|
mov r8,rax
|
|
|
|
and r10,rax ; only need lower qword
|
|
|
|
and r10,rax ; only need lower qword
|
|
|
|
shrd r8,rdx,52
|
|
|
|
shrd r8,rdx,52
|
|
|
|
xor r9,r9
|
|
|
|
xor r9,r9
|
|
|
|
|
|
|
|
|
|
|
|
;; c+=2*a.n[0] * a.n[1]
|
|
|
|
;; c+=2*a.n[0] * a.n[1]
|
|
|
|
mov rax,r15
|
|
|
|
mov rax,r14 ; r14=2*a.n[0]
|
|
|
|
mul rbx
|
|
|
|
mul r15
|
|
|
|
mov rcx,[rdi+2*8] ; rcx=a.n[2]
|
|
|
|
mov rbx,[rdi+2*8] ; rbx=a.n[2]
|
|
|
|
mov r11,rsi ; modulus
|
|
|
|
mov r11,rbp ; modulus
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
and r11,r8
|
|
|
|
and r11,r8
|
|
|
@ -318,33 +333,32 @@ ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
|
|
|
|
xor r9,r9
|
|
|
|
xor r9,r9
|
|
|
|
|
|
|
|
|
|
|
|
;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1]
|
|
|
|
;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1]
|
|
|
|
mov rax,r15
|
|
|
|
mov rax,r14
|
|
|
|
mul rcx
|
|
|
|
mul rbx
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
|
|
|
|
|
|
|
|
mov rax,rbx
|
|
|
|
mov rax,r15
|
|
|
|
mov r12,rsi ; modulus
|
|
|
|
mov r12,rbp ; modulus
|
|
|
|
mul rax
|
|
|
|
mul rax
|
|
|
|
mov rbp,[rdi+3*8] ; rbp=a.n[3]
|
|
|
|
mov rcx,[rdi+3*8] ; rcx=a.n[3]
|
|
|
|
add rbx,rbx ; rbx=a.n[1]*2
|
|
|
|
add r15,r15 ; r15=a.n[1]*2
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
|
|
|
|
|
|
|
|
and r12,r8 ; only need lower dword
|
|
|
|
and r12,r8 ; only need lower qword
|
|
|
|
shrd r8,r9,52
|
|
|
|
shrd r8,r9,52
|
|
|
|
xor r9,r9
|
|
|
|
xor r9,r9
|
|
|
|
|
|
|
|
|
|
|
|
;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2]
|
|
|
|
;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2]
|
|
|
|
mov rax,r15
|
|
|
|
mov rax,r14
|
|
|
|
mul rbp
|
|
|
|
mul rcx
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
|
|
|
|
|
|
|
|
mov rax,rbx ; rax=2*a.n[1]
|
|
|
|
mov rax,r15 ; rax=2*a.n[1]
|
|
|
|
mov r13,rsi ; modulus
|
|
|
|
mov r13,rbp ; modulus
|
|
|
|
mul rcx
|
|
|
|
mul rbx
|
|
|
|
mov rsi,[rdi+4*8] ; rsi=a.n[4] / destroy constant
|
|
|
|
mov rsi,[rdi+4*8] ; rsi=a.n[4]
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
and r13,r8
|
|
|
|
and r13,r8
|
|
|
@ -352,20 +366,20 @@ ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
|
|
|
|
xor r9,r9
|
|
|
|
xor r9,r9
|
|
|
|
|
|
|
|
|
|
|
|
;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2]
|
|
|
|
;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2]
|
|
|
|
mov rax,r15 ; last time we need 2*a.n[0]
|
|
|
|
mov rax,r14 ; last time we need 2*a.n[0]
|
|
|
|
mul rsi
|
|
|
|
mul rsi
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
|
|
|
|
|
|
|
|
mov rax,rbx
|
|
|
|
mov rax,r15
|
|
|
|
mul rbp
|
|
|
|
mul rcx
|
|
|
|
mov r14,0FFFFFFFFFFFFFh ; modulus
|
|
|
|
mov r14,rbp ; modulus
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
|
|
|
|
|
|
|
|
mov rax,rcx
|
|
|
|
mov rax,rbx
|
|
|
|
mul rax
|
|
|
|
mul rax
|
|
|
|
add rcx,rcx ; rcx=2*a.n[2]
|
|
|
|
add rbx,rbx ; rbx=2*a.n[2]
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
and r14,r8
|
|
|
|
and r14,r8
|
|
|
@ -373,14 +387,14 @@ ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
|
|
|
|
xor r9,r9
|
|
|
|
xor r9,r9
|
|
|
|
|
|
|
|
|
|
|
|
;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3]
|
|
|
|
;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3]
|
|
|
|
mov rax,rbx
|
|
|
|
mov rax,r15 ; last time we need 2*a.n[1]
|
|
|
|
mul rsi
|
|
|
|
mul rsi
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
|
|
|
|
|
|
|
|
mov rax,rcx
|
|
|
|
mov rax,rbx
|
|
|
|
mul rbp
|
|
|
|
mul rcx
|
|
|
|
mov r15,0FFFFFFFFFFFFFh ; modulus
|
|
|
|
mov r15,rbp ; modulus
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
and r15,r8
|
|
|
|
and r15,r8
|
|
|
@ -388,24 +402,24 @@ ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
|
|
|
|
xor r9,r9
|
|
|
|
xor r9,r9
|
|
|
|
|
|
|
|
|
|
|
|
;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3]
|
|
|
|
;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3]
|
|
|
|
mov rax,rcx ; 2*a.n[2]
|
|
|
|
mov rax,rbx ; last time we need 2*a.n[2]
|
|
|
|
mul rsi
|
|
|
|
mul rsi
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
|
|
|
|
|
|
|
|
mov rax,rbp ; a.n[3]
|
|
|
|
mov rax,rcx ; a.n[3]
|
|
|
|
mul rax
|
|
|
|
mul rax
|
|
|
|
mov rbx,0FFFFFFFFFFFFFh ; modulus
|
|
|
|
mov rbx,rbp ; modulus
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
and rbx,r8 ; only need lower dword
|
|
|
|
and rbx,r8 ; only need lower qword
|
|
|
|
lea rax,[2*rbp]
|
|
|
|
lea rax,[2*rcx]
|
|
|
|
shrd r8,r9,52
|
|
|
|
shrd r8,r9,52
|
|
|
|
xor r9,r9
|
|
|
|
xor r9,r9
|
|
|
|
|
|
|
|
|
|
|
|
;; c+=2*a.n[3]*a.n[4]
|
|
|
|
;; c+=2*a.n[3]*a.n[4]
|
|
|
|
mul rsi
|
|
|
|
mul rsi
|
|
|
|
mov rcx,0FFFFFFFFFFFFFh ; modulus
|
|
|
|
mov rcx,rbp ; modulus
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
and rcx,r8 ; only need lower dword
|
|
|
|
and rcx,r8 ; only need lower qword
|
|
|
@ -415,7 +429,7 @@ ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
|
|
|
|
;; c+=a.n[4]*a.n[4]
|
|
|
|
;; c+=a.n[4]*a.n[4]
|
|
|
|
mov rax,rsi
|
|
|
|
mov rax,rsi
|
|
|
|
mul rax
|
|
|
|
mul rax
|
|
|
|
mov rbp,0FFFFFFFFFFFFFh ; modulus
|
|
|
|
;; mov rbp,rbp ; modulus is already there!
|
|
|
|
add r8,rax
|
|
|
|
add r8,rax
|
|
|
|
adc r9,rdx
|
|
|
|
adc r9,rdx
|
|
|
|
and rbp,r8
|
|
|
|
and rbp,r8
|
|
|
|