dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl * the GNU Lesser General Public License as published by the Free
dnl Software Foundation; either version 3 of the License, or (at your
dnl option) any later version.
dnl
dnl or
dnl
dnl * the GNU General Public License as published by the Free Software
dnl Foundation; either version 2 of the License, or (at your option) any
dnl later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
dnl see https://www.gnu.org/licenses/.
C TODO
C * The o1...o8 loops for special dn counts were naively hand-optimised by
C folding the generic loops. They can probably be tuned. The speculative
C quotient limb generation might not be in the optimal spot.
C * Perhaps avoid late-in-loop jumps, e.g., lo0.
C * Improve regalloc wrt dn_param/dn and un_param/un to save some moves.
C Entry set-up and dispatch.
C
C Forms the outer loop count, parks dinv on the stack, computes the first
C quotient limb into %rdx, negates dn, and then branches on the two low bits
C of dn_param.  Instruction order is unchanged; the only repair is that each
C instruction is back on its own line, so that a `C' comment no longer
C swallows the instructions that used to follow it on the same line.
C
C NOTE(review): the register defines (up, un, dp, dn, dinv, i, ...) and the
C PROLOGUE are not visible in this excerpt -- confirm against the full file.
	sub	dn_param, un_param		C outer loop count
	mov	dn_param, dn			C FIXME: Suppress by reg re-alloc
	push	dinv				C keep dinv on stack
	mov	un_param, un			C FIXME: Suppress by reg re-alloc
	xor	R32(%rbp), R32(%rbp)		C cleared; later fed to adc as carry-in
	lea	(dp_param,dn_param,8), dp
	mov	(up), %rdx
	imul	dinv, %rdx			C first quotient limb
	neg	dn				C later code compares dn with -4, -2
	lea	-32(up,dn_param,8), up
	test	$1, R8(dn_param)
	jnz	L(cx1)				C bit 0 of dn_param set
L(cx0):	test	$2, R8(dn_param)
	jnz	L(b2)				C bit 1 set; otherwise fall into L(b0)
C =============================================================================
L(b0):	cmp	$-4, dn
	jnz	L(gt4)
C Each entry point below sets i, issues three hand-encoded mulx
C instructions (emitted as raw .byte sequences; the intended instruction is
C given in the trailing comment), clears the carry flag and jumps into a
C loop body.
C NOTE(review): L(gt4), L(lo0) and L(lo1) are not visible in this excerpt.
L(out0):mov	dn, i
	.byte	0xc4,0x22,0xb3,0xf6,0x04,0xf6		C mulx (dp,dn,8),%r9,%r8
	.byte	0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08	C mulx 8(dp,dn,8),%r11,%r10
	.byte	0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10	C mulx 16(dp,dn,8),%r13,%r12
	clc					C enter the loop with CF = 0
	jmp	L(lo0)
L(out1):lea	1(dn), i			C i = dn + 1
	.byte	0xc4,0xa2,0xe3,0xf6,0x04,0xf6		C mulx (dp,dn,8),%rbx,%rax
	.byte	0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08	C mulx 8(dp,dn,8),%r9,%r8
	.byte	0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10	C mulx 16(dp,dn,8),%r11,%r10
	clc					C enter the loop with CF = 0
	jmp	L(lo1)
C Tail of an outer-loop iteration: reload dinv from the stack, form the
C next quotient limb with a hand-encoded mulx, add the pending limbs into
C up[], fold the carry kept in %rbp, advance up by one limb and repeat
C while un is non-zero.
C NOTE(review): this fragment follows an unconditional jmp and carries no
C label of its own; its entry point is presumably in lines missing from
C this excerpt -- confirm against the full file.
	mov	(%rsp), %rdx			C dinv
	.byte	0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28	C mulx 40(up,dn,8), %rdx, %r12
	add	%r9, (up)
	adc	%r11, 8(up)
	adc	%r13, 16(up)
	adc	%rbx, 24(up)
	adc	%rbp, %rax			C add carry saved by the previous pass
	setc	R8(%rbp)			C save the new carry
	add	%rax, 32(up)
	adc	$0, R32(%rbp)			C merge carry-out of the final add
	lea	8(up), up			C advance one limb; lea keeps flags
	dec	un
	jne	L(out1)
	jmp	L(ret)
C =============================================================================
L(b2):	cmp	$-2, dn
	jnz	L(gt2)
C Special-case loop, reached when dn equals -2 at this point.  The two
C limbs at 16(up) and 24(up) are held in %r10 and %r9 across iterations and
C stored back only after the loop ends.  The numbers in the trailing
C comments are kept from the original annotations.
	mov	16(up), %r10
	mov	24(up), %r9
L(o2):	mulx(	-16,(dp), %r13, %r12)
	mulx(	-8,(dp), %rbx, %rax)
	add	%r12, %rbx
	adc	$0, %rax
	add	%r10, %r13			C 0 add just to produce carry
	mov	%r9, %r10			C 1
	adc	%rbx, %r10			C 1
	mov	%r8, %rdx
	mulx(	%r10, %rdx, %r12)		C next quotient
	adc	%rbp, %rax			C 2
	setc	R8(%rbp)			C 3
	mov	32(up), %r9			C 2
	add	%rax, %r9			C 2
	adc	$0, R32(%rbp)			C 3
	lea	8(up), up
	dec	un
	jne	L(o2)
	mov	%r10, 16(up)			C write back the two cached limbs
	mov	%r9, 24(up)
	jmp	L(ret)
C The information on this web page has been compiled carefully and to the
C best of our knowledge.  However, neither the completeness, nor the
C correctness, nor the quality of the information provided is guaranteed.
C Note:
C The coloured syntax highlighting and the measurement are still experimental.