#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
.data
.balign 4
#
# is_sse selects the code path taken by the entry points in this file:
#   value -1: not probed yet; the entry point calls s_mpi_is_sse2 and
#             stores its return value here
#   value  0: use the plain x86 instructions
#   value  1: use the SSE2 instructions
#
.type is_sse,@object
.size is_sse,4
is_sse: .long -1
#
# GET var,reg   loads the 32-bit variable var into reg.
# PUT reg,var   stores reg into the 32-bit variable var.
#
# Two flavours exist because -fPIC and non-PIC builds address data
# differently.  The PIC flavour is the default, since this file seems to
# be used exclusively on Linux right now (Solaris uses mpi_i86pc.s and
# Windows uses mpi_x86_asm.c).  It addresses var as var@GOTOFF(%ebx), so
# %ebx is used as the base register at every expansion and is expected
# to hold the GOT address there.  Defining NO_PIC selects plain absolute
# addressing instead.
#
.ifdef NO_PIC
.macro GET var,reg
    movl    \var,\reg
.endm
.macro PUT reg,var
    movl    \reg,\var
.endm
.else
.macro GET var,reg
    movl    \var@GOTOFF(%ebx),\reg
.endm
.macro PUT reg,var
    movl    \reg,\var@GOTOFF(%ebx)
.endm
.endif
.text
#
# s_mpv_mul_d(a, a_len, b, c)
#
# Multiplies the a_len 32-bit words at a by the 32-bit word b.  For each
# word, c[i] receives the low 32 bits of (a[i] * b + carry) and the high
# 32 bits become the next carry.  The final carry is stored in c[a_len];
# that store happens even when a_len is 0.
#
# Arguments are read from the stack.  Once the frame is set up:
#   ebp + 20: c argument
#   ebp + 16: b argument
#   ebp + 12: a_len argument
#   ebp +  8: a argument
#   ebp +  4: return address
#   ebp +  0: saved ebp
#   ebp -  4: saved edi
#   ebp -  8: saved esi
#   ebp - 12: saved ebx (x86 path only)
#
# Registers, x86 path:  eax/edx product, ebx carry, ecx words left,
#                       esi a ptr, edi c ptr
# Registers, SSE2 path: mm0 product, mm1 b, mm2 carry, ecx words left,
#                       esi a ptr, edi c ptr
#
# In PIC builds GET/PUT address is_sse relative to %ebx, and this entry
# point does not set %ebx itself.
#
.globl s_mpv_mul_d
.type s_mpv_mul_d,@function
s_mpv_mul_d:
    GET     is_sse,%eax
    test    %eax,%eax
    jz      s_mpv_mul_d_x86             # is_sse == 0: plain x86
    jg      s_mpv_mul_d_sse2            # is_sse  > 0: SSE2
    # is_sse < 0: not probed yet.  Pad the stack so that %esp is 16-byte
    # aligned at the call, assuming the caller followed the i386 psABI
    # ((%esp + 4) is a multiple of 16 on entry).
    sub     $12,%esp
    call    s_mpi_is_sse2
    add     $12,%esp
    PUT     %eax,is_sse                 # remember the answer
    test    %eax,%eax
    jg      s_mpv_mul_d_sse2
    # otherwise fall through to the x86 version
s_mpv_mul_d_x86:
    push    %ebp
    mov     %esp,%ebp
    push    %edi
    push    %esi
    push    %ebx
    xor     %ebx,%ebx                   # carry = 0
    mov     12(%ebp),%ecx               # ecx = a_len
    mov     20(%ebp),%edi               # edi = c
    test    %ecx,%ecx
    jz      .Lmul_d_x86_done            # a_len == 0: only store the carry
    mov     8(%ebp),%esi                # esi = a
    cld                                 # lodsl/stosl step upwards
.Lmul_d_x86_loop:
    lodsl                               # eax = *a++
    mov     16(%ebp),%edx               # edx = b
    mull    %edx                        # edx:eax = a_i * b
    add     %ebx,%eax                   # add the incoming carry
    adc     $0,%edx
    mov     %edx,%ebx                   # high half becomes the next carry
    stosl                               # *c++ = eax
    dec     %ecx                        # --a_len
    jnz     .Lmul_d_x86_loop
.Lmul_d_x86_done:
    mov     %ebx,0(%edi)                # *c = carry
    pop     %ebx
    pop     %esi
    pop     %edi
    leave
    ret
    nop
s_mpv_mul_d_sse2:
    push    %ebp
    mov     %esp,%ebp
    push    %edi
    push    %esi
    pxor    %mm2,%mm2                   # carry = 0
    mov     12(%ebp),%ecx               # ecx = a_len
    movd    16(%ebp),%mm1               # mm1 = b
    mov     20(%ebp),%edi               # edi = c
    test    %ecx,%ecx
    jz      .Lmul_d_sse2_done           # a_len == 0: only store the carry
    mov     8(%ebp),%esi                # esi = a
    cld                                 # direction flag cleared, as on the x86 path
.Lmul_d_sse2_loop:
    movd    0(%esi),%mm0                # mm0 = *a
    add     $4,%esi                     # a++
    pmuludq %mm1,%mm0                   # mm0 = a_i * b (64-bit product)
    paddq   %mm0,%mm2                   # mm2 = product + carry
    movd    %mm2,0(%edi)                # store the low 32 bits
    add     $4,%edi                     # c++
    psrlq   $32,%mm2                    # high 32 bits become the next carry
    dec     %ecx                        # --a_len
    jnz     .Lmul_d_sse2_loop
.Lmul_d_sse2_done:
    movd    %mm2,0(%edi)                # *c = carry
    emms                                # leave MMX state before returning
    pop     %esi
    pop     %edi
    leave
    ret
    nop
.size s_mpv_mul_d, .-s_mpv_mul_d
#
# s_mpv_mul_d_add(a, a_len, b, c)
#
# Multiply-accumulate.  For each of the a_len 32-bit words at a, c[i]
# receives the low 32 bits of (a[i] * b + c[i] + carry) and the high
# 32 bits become the next carry.  The final carry is stored (not added)
# in c[a_len]; that store happens even when a_len is 0.
#
# Arguments are read from the stack.  Once the frame is set up:
#   ebp + 20: c argument
#   ebp + 16: b argument
#   ebp + 12: a_len argument
#   ebp +  8: a argument
#   ebp +  4: return address
#   ebp +  0: saved ebp
#   ebp -  4: saved edi
#   ebp -  8: saved esi
#   ebp - 12: saved ebx (x86 path only)
#
# Registers, x86 path:  eax/edx product, ebx carry, ecx words left,
#                       esi a ptr, edi c ptr
# Registers, SSE2 path: mm0 product / c word, mm1 b, mm2 carry,
#                       ecx words left, esi a ptr, edi c ptr
#
# In PIC builds GET/PUT address is_sse relative to %ebx, and this entry
# point does not set %ebx itself.
#
.globl s_mpv_mul_d_add
.type s_mpv_mul_d_add,@function
s_mpv_mul_d_add:
    GET     is_sse,%eax
    test    %eax,%eax
    jz      s_mpv_mul_d_add_x86         # is_sse == 0: plain x86
    jg      s_mpv_mul_d_add_sse2        # is_sse  > 0: SSE2
    # is_sse < 0: not probed yet.  Pad the stack so that %esp is 16-byte
    # aligned at the call, assuming the caller followed the i386 psABI
    # ((%esp + 4) is a multiple of 16 on entry).
    sub     $12,%esp
    call    s_mpi_is_sse2
    add     $12,%esp
    PUT     %eax,is_sse                 # remember the answer
    test    %eax,%eax
    jg      s_mpv_mul_d_add_sse2
    # otherwise fall through to the x86 version
s_mpv_mul_d_add_x86:
    push    %ebp
    mov     %esp,%ebp
    push    %edi
    push    %esi
    push    %ebx
    xor     %ebx,%ebx                   # carry = 0
    mov     12(%ebp),%ecx               # ecx = a_len
    mov     20(%ebp),%edi               # edi = c
    test    %ecx,%ecx
    jz      .Lmul_d_add_x86_done        # a_len == 0: only store the carry
    mov     8(%ebp),%esi                # esi = a
    cld                                 # lodsl/stosl step upwards
.Lmul_d_add_x86_loop:
    lodsl                               # eax = *a++
    mov     16(%ebp),%edx               # edx = b
    mull    %edx                        # edx:eax = a_i * b
    add     %ebx,%eax                   # add the incoming carry
    adc     $0,%edx
    add     0(%edi),%eax                # add the current word of c
    adc     $0,%edx
    mov     %edx,%ebx                   # high half becomes the next carry
    stosl                               # *c++ = eax
    dec     %ecx                        # --a_len
    jnz     .Lmul_d_add_x86_loop
.Lmul_d_add_x86_done:
    mov     %ebx,0(%edi)                # *c = carry
    pop     %ebx
    pop     %esi
    pop     %edi
    leave
    ret
    nop
s_mpv_mul_d_add_sse2:
    push    %ebp
    mov     %esp,%ebp
    push    %edi
    push    %esi
    pxor    %mm2,%mm2                   # carry = 0
    mov     12(%ebp),%ecx               # ecx = a_len
    movd    16(%ebp),%mm1               # mm1 = b
    mov     20(%ebp),%edi               # edi = c
    test    %ecx,%ecx
    jz      .Lmul_d_add_sse2_done       # a_len == 0: only store the carry
    mov     8(%ebp),%esi                # esi = a
    cld                                 # direction flag cleared, as on the x86 path
.Lmul_d_add_sse2_loop:
    movd    0(%esi),%mm0                # mm0 = *a
    add     $4,%esi                     # a++
    pmuludq %mm1,%mm0                   # mm0 = a_i * b (64-bit product)
    paddq   %mm0,%mm2                   # mm2 = product + carry
    movd    0(%edi),%mm0                # mm0 = current word of c
    paddq   %mm0,%mm2                   # mm2 += *c
    movd    %mm2,0(%edi)                # store the low 32 bits
    add     $4,%edi                     # c++
    psrlq   $32,%mm2                    # high 32 bits become the next carry
    dec     %ecx                        # --a_len
    jnz     .Lmul_d_add_sse2_loop
.Lmul_d_add_sse2_done:
    movd    %mm2,0(%edi)                # *c = carry
    emms                                # leave MMX state before returning
    pop     %esi
    pop     %edi
    leave
    ret
    nop
.size s_mpv_mul_d_add, .-s_mpv_mul_d_add
#
# s_mpv_mul_d_add_prop(a, a_len, b, c)
#
# Multiply-accumulate with carry propagation.  For each of the a_len
# 32-bit words at a, c[i] receives the low 32 bits of
# (a[i] * b + c[i] + carry) and the high 32 bits become the next carry.
# A non-zero final carry is then added into c[a_len], and any carry out
# of that addition keeps rippling into the following words of c until an
# addition produces no carry.  The ripple loop has no length check.
#
# Arguments are read from the stack.  Once the frame is set up:
#   ebp + 20: c argument
#   ebp + 16: b argument
#   ebp + 12: a_len argument
#   ebp +  8: a argument
#   ebp +  4: return address
#   ebp +  0: saved ebp
#   ebp -  4: saved edi
#   ebp -  8: saved esi
#   ebp - 12: saved ebx
#
# Registers, x86 path:  eax/edx product, ebx carry, ecx words left,
#                       esi a ptr, edi c ptr
# Registers, SSE2 path: mm0 product, mm1 b, mm2 carry, mm3 c word,
#                       ebx final carry, ecx words left, esi a ptr,
#                       edi c ptr
#
# In PIC builds GET/PUT address is_sse relative to %ebx, and this entry
# point does not set %ebx itself.
#
.globl s_mpv_mul_d_add_prop
.type s_mpv_mul_d_add_prop,@function
s_mpv_mul_d_add_prop:
    GET     is_sse,%eax
    test    %eax,%eax
    jz      s_mpv_mul_d_add_prop_x86    # is_sse == 0: plain x86
    jg      s_mpv_mul_d_add_prop_sse2   # is_sse  > 0: SSE2
    # is_sse < 0: not probed yet.  Pad the stack so that %esp is 16-byte
    # aligned at the call, assuming the caller followed the i386 psABI
    # ((%esp + 4) is a multiple of 16 on entry).
    sub     $12,%esp
    call    s_mpi_is_sse2
    add     $12,%esp
    PUT     %eax,is_sse                 # remember the answer
    test    %eax,%eax
    jg      s_mpv_mul_d_add_prop_sse2
    # otherwise fall through to the x86 version
s_mpv_mul_d_add_prop_x86:
    push    %ebp
    mov     %esp,%ebp
    push    %edi
    push    %esi
    push    %ebx
    xor     %ebx,%ebx                   # carry = 0
    mov     12(%ebp),%ecx               # ecx = a_len
    mov     20(%ebp),%edi               # edi = c
    test    %ecx,%ecx
    jz      .Lmul_d_add_prop_x86_carry  # a_len == 0: nothing to multiply
    cld                                 # lodsl/stosl step upwards
    mov     8(%ebp),%esi                # esi = a
.Lmul_d_add_prop_x86_loop:
    lodsl                               # eax = *a++
    mov     16(%ebp),%edx               # edx = b
    mull    %edx                        # edx:eax = a_i * b
    add     %ebx,%eax                   # add the incoming carry
    adc     $0,%edx
    add     0(%edi),%eax                # add the current word of c
    adc     $0,%edx
    mov     %edx,%ebx                   # high half becomes the next carry
    stosl                               # *c++ = eax
    dec     %ecx                        # --a_len
    jnz     .Lmul_d_add_prop_x86_loop
.Lmul_d_add_prop_x86_carry:
    test    %ebx,%ebx                   # is the final carry zero?
    jz      .Lmul_d_add_prop_x86_done
    mov     0(%edi),%eax
    add     %ebx,%eax                   # *c += carry
    stosl                               # store and advance; flags are untouched
    jnc     .Lmul_d_add_prop_x86_done
.Lmul_d_add_prop_x86_ripple:
    # CF is set here; keep adding it into successive words of c.
    mov     0(%edi),%eax
    adc     $0,%eax
    stosl                               # store and advance; flags are untouched
    jc      .Lmul_d_add_prop_x86_ripple
.Lmul_d_add_prop_x86_done:
    pop     %ebx
    pop     %esi
    pop     %edi
    leave
    ret
    nop
s_mpv_mul_d_add_prop_sse2:
    push    %ebp
    mov     %esp,%ebp
    push    %edi
    push    %esi
    push    %ebx
    pxor    %mm2,%mm2                   # carry = 0
    mov     12(%ebp),%ecx               # ecx = a_len
    movd    16(%ebp),%mm1               # mm1 = b
    mov     20(%ebp),%edi               # edi = c
    test    %ecx,%ecx
    jz      .Lmul_d_add_prop_sse2_carry # a_len == 0: nothing to multiply
    mov     8(%ebp),%esi                # esi = a
    cld                                 # the stosl instructions below step upwards
.Lmul_d_add_prop_sse2_loop:
    movd    0(%esi),%mm0                # mm0 = *a
    movd    0(%edi),%mm3                # mm3 = current word of c
    add     $4,%esi                     # a++
    pmuludq %mm1,%mm0                   # mm0 = a_i * b (64-bit product)
    paddq   %mm0,%mm2                   # mm2 = product + carry
    paddq   %mm3,%mm2                   # mm2 += *c
    movd    %mm2,0(%edi)                # store the low 32 bits
    add     $4,%edi                     # c++
    psrlq   $32,%mm2                    # high 32 bits become the next carry
    dec     %ecx                        # --a_len
    jnz     .Lmul_d_add_prop_sse2_loop
.Lmul_d_add_prop_sse2_carry:
    movd    %mm2,%ebx                   # ebx = final carry
    test    %ebx,%ebx                   # is the final carry zero?
    jz      .Lmul_d_add_prop_sse2_done
    mov     0(%edi),%eax
    add     %ebx,%eax                   # *c += carry
    stosl                               # store and advance; flags are untouched
    jnc     .Lmul_d_add_prop_sse2_done
.Lmul_d_add_prop_sse2_ripple:
    # CF is set here; keep adding it into successive words of c.
    mov     0(%edi),%eax
    adc     $0,%eax
    stosl                               # store and advance; flags are untouched
    jc      .Lmul_d_add_prop_sse2_ripple
.Lmul_d_add_prop_sse2_done:
    emms                                # leave MMX state before returning
    pop     %ebx
    pop     %esi
    pop     %edi
    leave
    ret
    nop
.size s_mpv_mul_d_add_prop, .-s_mpv_mul_d_add_prop
#
# s_mpv_sqr_add_prop(pa, a_len, ps)
#
# For each of the a_len 32-bit words at pa, adds the 64-bit square of
# pa[i] plus the running carry into the two words ps[2i] and ps[2i+1];
# the carry out of that pair feeds the next iteration.  A non-zero final
# carry is then added into the next word of ps, and any carry out of
# that addition keeps rippling into the following words until an
# addition produces no carry.  The ripple loop has no length check.
#
# Arguments are read from the stack.  Once the frame is set up:
#   ebp + 16: ps argument
#   ebp + 12: a_len argument
#   ebp +  8: pa argument
#   ebp +  4: return address
#   ebp +  0: saved ebp
#   ebp -  4: saved edi
#   ebp -  8: saved esi
#   ebp - 12: saved ebx
#
# Registers, x86 path:  eax/edx square, ebx carry and scratch,
#                       ecx words left, esi pa ptr, edi ps ptr
# Registers, SSE2 path: mm0 square, mm2 carry, mm3 ps word,
#                       ebx final carry, ecx words left, esi pa ptr,
#                       edi ps ptr
#
# In PIC builds GET/PUT address is_sse relative to %ebx, and this entry
# point does not set %ebx itself.
#
.globl s_mpv_sqr_add_prop
.type s_mpv_sqr_add_prop,@function
s_mpv_sqr_add_prop:
    GET     is_sse,%eax
    test    %eax,%eax
    jz      s_mpv_sqr_add_prop_x86      # is_sse == 0: plain x86
    jg      s_mpv_sqr_add_prop_sse2     # is_sse  > 0: SSE2
    # is_sse < 0: not probed yet.  Pad the stack so that %esp is 16-byte
    # aligned at the call, assuming the caller followed the i386 psABI
    # ((%esp + 4) is a multiple of 16 on entry).
    sub     $12,%esp
    call    s_mpi_is_sse2
    add     $12,%esp
    PUT     %eax,is_sse                 # remember the answer
    test    %eax,%eax
    jg      s_mpv_sqr_add_prop_sse2
    # otherwise fall through to the x86 version
s_mpv_sqr_add_prop_x86:
    push    %ebp
    mov     %esp,%ebp
    push    %edi
    push    %esi
    push    %ebx
    xor     %ebx,%ebx                   # carry = 0
    mov     12(%ebp),%ecx               # ecx = a_len
    mov     16(%ebp),%edi               # edi = ps
    test    %ecx,%ecx
    jz      .Lsqr_add_prop_x86_carry    # a_len == 0: nothing to square
    cld                                 # lodsl/stosl step upwards
    mov     8(%ebp),%esi                # esi = pa
.Lsqr_add_prop_x86_loop:
    lodsl                               # eax = *pa++
    mull    %eax                        # edx:eax = a_i * a_i
    add     %ebx,%eax                   # add the incoming carry
    adc     $0,%edx
    add     0(%edi),%eax                # add the low word already in ps
    mov     4(%edi),%ebx                # fetch the high word before edi moves
    stosl                               # store the low word; CF is preserved
    adc     %ebx,%edx                   # add the high word plus CF
    movl    $0,%ebx                     # mov, not xor: CF must survive
    mov     %edx,%eax
    adc     $0,%ebx                     # ebx = carry out of this pair
    stosl                               # store the high word
    dec     %ecx                        # --a_len
    jnz     .Lsqr_add_prop_x86_loop
.Lsqr_add_prop_x86_carry:
    test    %ebx,%ebx                   # is the final carry zero?
    jz      .Lsqr_add_prop_x86_done
    mov     0(%edi),%eax
    add     %ebx,%eax                   # *ps += carry
    stosl                               # store and advance; flags are untouched
    jnc     .Lsqr_add_prop_x86_done
.Lsqr_add_prop_x86_ripple:
    # CF is set here; keep adding it into successive words of ps.
    mov     0(%edi),%eax
    adc     $0,%eax
    stosl                               # store and advance; flags are untouched
    jc      .Lsqr_add_prop_x86_ripple
.Lsqr_add_prop_x86_done:
    pop     %ebx
    pop     %esi
    pop     %edi
    leave
    ret
    nop
s_mpv_sqr_add_prop_sse2:
    push    %ebp
    mov     %esp,%ebp
    push    %edi
    push    %esi
    push    %ebx
    pxor    %mm2,%mm2                   # carry = 0
    mov     12(%ebp),%ecx               # ecx = a_len
    mov     16(%ebp),%edi               # edi = ps
    test    %ecx,%ecx
    jz      .Lsqr_add_prop_sse2_carry   # a_len == 0: nothing to square
    mov     8(%ebp),%esi                # esi = pa
    cld                                 # the stosl instructions below step upwards
.Lsqr_add_prop_sse2_loop:
    movd    0(%esi),%mm0                # mm0 = *pa
    movd    0(%edi),%mm3                # mm3 = low word already in ps
    add     $4,%esi                     # pa++
    pmuludq %mm0,%mm0                   # mm0 = a_i * a_i (64-bit)
    paddq   %mm0,%mm2                   # mm2 = square + carry
    paddq   %mm3,%mm2                   # mm2 += low word
    movd    4(%edi),%mm3                # mm3 = high word already in ps
    movd    %mm2,0(%edi)                # store the low 32 bits
    psrlq   $32,%mm2
    paddq   %mm3,%mm2                   # mm2 += high word
    movd    %mm2,4(%edi)                # store the next 32 bits
    psrlq   $32,%mm2                    # what remains is the next carry
    add     $8,%edi                     # ps += 2 words
    dec     %ecx                        # --a_len
    jnz     .Lsqr_add_prop_sse2_loop
.Lsqr_add_prop_sse2_carry:
    movd    %mm2,%ebx                   # ebx = final carry
    test    %ebx,%ebx                   # is the final carry zero?
    jz      .Lsqr_add_prop_sse2_done
    mov     0(%edi),%eax
    add     %ebx,%eax                   # *ps += carry
    stosl                               # store and advance; flags are untouched
    jnc     .Lsqr_add_prop_sse2_done
.Lsqr_add_prop_sse2_ripple:
    # CF is set here; keep adding it into successive words of ps.
    mov     0(%edi),%eax
    adc     $0,%eax
    stosl                               # store and advance; flags are untouched
    jc      .Lsqr_add_prop_sse2_ripple
.Lsqr_add_prop_sse2_done:
    emms                                # leave MMX state before returning
    pop     %ebx
    pop     %esi
    pop     %edi
    leave
    ret
    nop
.size s_mpv_sqr_add_prop, .-s_mpv_sqr_add_prop
#
# Divide the 64-bit value (Nhi,Nlo) by a 32-bit divisor, which must be
# normalized so that its high bit is set.  This code is from NSPR.
#
# mp_err s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
#                        mp_digit *qp, mp_digit *rp)
#
# Stores the quotient through qp and the remainder through rp, and
# always returns zero.  The hardware divide faults when the quotient
# does not fit in 32 bits, that is, when Nhi >= divisor.
#
# Stack after the push of ebx:
#   esp + 24: rp argument
#   esp + 20: qp argument
#   esp + 16: divisor argument
#   esp + 12: Nlo argument
#   esp +  8: Nhi argument
#   esp +  4: return address
#   esp +  0: saved ebx
#
# Registers:
#   edx:eax  dividend on input, remainder:quotient after the divide
#   ebx      scratch pointer (qp, then rp); saved and restored
#
.globl s_mpv_div_2dx1d
.type s_mpv_div_2dx1d,@function
s_mpv_div_2dx1d:
    push    %ebx
    mov     12(%esp),%eax               # eax = Nlo
    mov     8(%esp),%edx                # edx = Nhi
    divl    16(%esp)                    # eax = quotient, edx = remainder
    mov     20(%esp),%ebx               # ebx = qp
    mov     %eax,0(%ebx)                # *qp = quotient
    mov     24(%esp),%ebx               # ebx = rp
    mov     %edx,0(%ebx)                # *rp = remainder
    xor     %eax,%eax                   # return zero
    pop     %ebx
    ret
    nop
# An empty .note.GNU-stack section marks this object as not needing an
# executable stack; .previous then switches back to the prior section.
.section .note.GNU-stack,"",@progbits
.previous