Quelle sbpi1_bdiv_r.asm

Sprache: Masm
dnl  AMD64 mpn_sbpi1_bdiv_r optimised for AMD Zen

dnl  Copyright 2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl

dnl  The GNU MP Library is free software; you can redistribute it and/or modify

dnl  it under the terms of either:

dnl

dnl    * the GNU Lesser General Public License as published by the Free

dnl      Software Foundation; either version 3 of the License, or (at your

dnl      option) any later version.

dnl

dnl  or

dnl

dnl    * the GNU General Public License as published by the Free Software

dnl      Foundation; either version 2 of the License, or (at your option) any

dnl      later version.

dnl

dnl  or both in parallel, as here.

dnl

dnl  The GNU MP Library is distributed in the hope that it will be useful, but

dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY

dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License

dnl  for more details.

dnl

dnl  You should have received copies of the GNU General Public License and the

dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,

dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C Incoming arguments.  These five names map onto %rdi, %rsi, %rdx, %rcx and
C %r8, i.e. the first five integer argument registers of the System V AMD64
C calling convention, in order.
define(`up',       `%rdi')

define(`un_param', `%rsi')

define(`dp_param', `%rdx')

define(`dn_param', `%rcx')

define(`dinv',     `%r8')

C Inner-loop limb index.  It shares %rcx with dn_param, so dn_param is dead
C once a generic loop has loaded i.
define(`i',        `%rcx')

C Working copy of dn_param; the code negates it right after the copy.
define(`dn',       `%r14')

C Divisor pointer used by the loops.  It shares %rsi with un_param, so
C un_param has to be copied to un before dp is written.
define(`dp',       `%rsi')

C Outer loop counter (un_param minus dn_param).
define(`un',       `%r15')

C TODO

C  * The o1...o8  loops for special dn counts were naively hand-optimised by

C    folding the generic loops.  They can probably be tuned.  The speculative

C    quotient limb generation might not be in the optimal spot.

C  * Perhaps avoid late-in-loop jumps, e.g., lo0.

C  * Improve regalloc wrt dn_param/dn and un_param/un to save some moves.

C ABI_SUPPORT(DOS64)

C ABI_SUPPORT(STD64)

ASM_START()

 TEXT

 ALIGN(16)

C mpn_sbpi1_bdiv_r (up, un_param, dp_param, dn_param, dinv)
C
C NOTE(review): going by the name this is presumably the schoolbook Hensel
C (2-adic) division that leaves a remainder in the up area; confirm against
C the generic C implementation before relying on that description.
C
C What the code visibly does: it runs un_param - dn_param outer iterations.
C Each iteration multiplies the dn_param limbs at dp by one quotient limb held
C in %rdx, adds the products into the limbs addressed through up, adds the
C top product limb plus the carry kept in %rbp into the limb at index
C dn_param, and then advances up by one limb.  The next quotient limb is the
C low half of dinv times the limb that becomes the lowest one.
C
C Result: %rax receives the final value of the %rbp carry.
PROLOGUE(mpn_sbpi1_bdiv_r)

 FUNC_ENTRY(4)

C DOS64 builds only: the fifth argument is fetched from the stack.
IFDOS(` mov 56(%rsp), dinv ')

C Save the callee-saved registers that the loops below overwrite.
 push %r15

 push %r14

 push %r13

 push %r12

 push %rbp

 push %rbx

 sub dn_param, un_param  C outer loop count

 mov dn_param, dn  C FIXME: Suppress by reg re-alloc

 push dinv    C keep dinv on stack

 mov un_param, un  C FIXME: Suppress by reg re-alloc

C %rbp carries the top-limb carry from one outer iteration to the next and is
C what the routine finally returns; start it at zero.
 xor R32(%rbp), R32(%rbp)

C dp = dp_param + 8*dn_param, one limb past the end of the divisor.  With dn
C negated below, (dp,dn,8) addresses the first divisor limb and the fixed
C negative offsets used by the unrolled loops address limbs from the end.
 lea (dp_param,dn_param,8), dp

 mov (up), %rdx

 imul dinv, %rdx   C first quotient limb

 neg dn

C Bias up by 8*dn_param - 32 bytes, so that 32(up) is the limb at index
C dn_param and (up)..24(up) are the four limbs just below it.
 lea -32(up,dn_param,8), up

C Dispatch on dn_param mod 4: bit 0 set goes to cx1 (b1 or b3), otherwise
C bit 1 selects b2, and falling through reaches b0.
 test $1, R8(dn_param)

 jnz L(cx1)

L(cx0): test $2, R8(dn_param)

 jnz L(b2)

C =============================================================================

C dn_param is a multiple of 4 here (both low bits were clear).  dn holds
C -dn_param, so the compares against -4 and -8 pick out dn_param == 4 and
C dn_param == 8, which get fully unrolled loops; anything else uses out0.
L(b0): cmp $-4, dn

 jnz L(gt4)

C dn_param == 4.  On entry to every iteration %rdx is the quotient limb.  The
C four mulx results are chained (high half added to the next low half), then
C added to (up)..24(up).  The sum with the lowest limb is formed only for its
C carry; %r9 is never stored.  dinv is read from %r8, which this loop does
C not overwrite.  %r14 (dn) is used as scratch since dn is not needed here.
L(o4): mulx( -32,(dp), %r9, %r14)

 mulx( -24,(dp), %r11, %r10)

 mulx( -16,(dp), %r13, %r12)

 mulx( -8,(dp), %rbx, %rax)

 add %r14, %r11

 adc %r10, %r13

 adc %r12, %rbx

 adc $0, %rax

 add (up), %r9

 adc 8(up), %r11

 mov %r8, %rdx   C dinv

 mov %r11, 8(up)

C mulx leaves the flags alone, so the adc chain continues across it.
 mulx( %r11, %rdx, %r12)  C next quotient

 adc %r13, 16(up)

 adc %rbx, 24(up)

C Fold the top product limb, the carry from the previous iteration (%rbp) and
C the carry of the chain into 32(up); %rbp collects the new carry.
 adc %rbp, %rax

 setc R8(%rbp)

 add %rax, 32(up)

 adc $0, R32(%rbp)

 lea 8(up), up

 dec un

 jne L(o4)

 jmp L(ret)

L(gt4): cmp $-8, dn

 jnz L(out0)

C dn_param == 8.  up is biased by +32 bytes in this case, so offsets -32..-8
C reach the four lowest limbs and 0..24 the four above them.  The second
C group of mulx is interleaved with the additions of the first group; %r9 is
C overwritten right after its add because only the carry is wanted.
L(o8): mulx( -64,(dp), %r9, %r14)

 mulx( -56,(dp), %rcx, %r10)

 mulx( -48,(dp), %r13, %r12)

 mulx( -40,(dp), %rbx, %rax)

 add %r14, %rcx

 adc %r10, %r13

 adc %r12, %rbx

 adc $0, %rax

 add -32(up), %r9

 mulx( -32,(dp), %r9, %r14)

 adc -24(up), %rcx

 mov %rcx, -24(up)

 mulx( -24,(dp), %r11, %r10)

 adc %r13, -16(up)

 mulx( -16,(dp), %r13, %r12)

 adc %rbx, -8(up)

 adc %rax, %r9

 mulx( -8,(dp), %rbx, %rax)

 adc %r14, %r11

 adc %r10, %r13

 adc %r12, %rbx

 adc $0, %rax

 mov %r8, %rdx   C dinv

C %rcx still holds the updated second-lowest limb stored at -24(up) above.
 mulx( %rcx, %rdx, %r12)  C next quotient

 add %r9, (up)

 adc %r11, 8(up)

 adc %r13, 16(up)

 adc %rbx, 24(up)

 adc %rbp, %rax

 setc R8(%rbp)

 add %rax, 32(up)

 adc $0, R32(%rbp)

 lea 8(up), up

 dec un

 jne L(o8)

 jmp L(ret)

C Generic outer loop for the remaining multiples of 4.  i restarts at dn
C (negative) each iteration.  The mulx instructions are emitted as raw bytes;
C the comment on each line gives the instruction they encode.  The first
C three products are formed here, the carry flag is cleared, and the inner
C loop is entered at lo0.
L(out0):mov dn, i

 .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6  C mulx (dp,dn,8),%r9,%r8

 .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 8(dp,dn,8),%r11,%r10

 .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 16(dp,dn,8),%r13,%r12

 clc

 jmp L(lo0)

 ALIGN(16)

C Inner loop: each pass adds four finished product limbs into memory while
C the next four products are being formed.  i advances by 4 and the loop
C repeats while i is still negative.
L(top0):add %r9, (up,i,8)

 .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce  C mulx (dp,i,8), %r9, %r8

 adc %r11, 8(up,i,8)

 .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10

 adc %r13, 16(up,i,8)

 .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12

 adc %rbx, 24(up,i,8)

 adc %rax, %r9

L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax

 adc %r8, %r11

 adc %r10, %r13

 adc %r12, %rbx

 adc $0, %rax

 add $4, i

 js L(top0)

C The inner loop overwrote %r8, so dinv is reloaded from the stack slot pushed
C in the prologue.  40(up,dn,8) is the second-lowest limb of the current
C window (the up bias of 8*dn_param - 32 cancels against 8*dn + 40, leaving
C +8), which becomes the lowest limb of the next iteration.
 mov (%rsp), %rdx   C dinv

 .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(%rdi,%r14,8),%rdx,%r12

C Wind-down: add the last four product limbs, then fold %rax and the carries
C into 32(up) and collect the new carry in %rbp.
 add %r9, (up)

 adc %r11, 8(up)

 adc %r13, 16(up)

 adc %rbx, 24(up)

 adc %rbp, %rax

 setc R8(%rbp)

 add %rax, 32(up)

 adc $0, R32(%rbp)

 lea 8(up), up

 dec un

 jne L(out0)

 jmp L(ret)

C Bit 0 of dn_param is set; bit 1 separates dn_param mod 4 == 3 (b3) from
C dn_param mod 4 == 1 (b1).
L(cx1): test $2, R8(dn_param)

 jnz L(b3)

C =============================================================================

C dn holds -dn_param, so the compares against -1 and -5 pick out
C dn_param == 1 and dn_param == 5; anything else uses out1.
L(b1): cmp $-1, dn

 jnz L(gt1)

C dn_param == 1.  With the up bias of 8*dn_param - 32 = -24 bytes, 24(up) is
C the lowest limb and 32(up) the one above it.  The lowest limb lives in %r9
C for the whole loop and is written back only once, after the loop.
 mov 24(up), %r9

C %rbx = low half, %rdx = high half of quotient * divisor limb.  The add into
C %rbx is done only for its carry.  The new lowest limb is high half + old
C carry + 32(up); %rbp takes the carry of the last add.
L(o1): mulx( -8,(dp), %rbx, %rdx)

 add %r9, %rbx

 adc %rbp, %rdx

 add 32(up), %rdx

 setc R8(%rbp)

 mov %rdx, %r9

C %rdx = low half of (new lowest limb * dinv); dinv is still in %r8.
 mulx( %r8, %rdx, %r12)  C next quotient

 lea 8(up), up

 dec un

 jne L(o1)

 mov %r9, 24(up)

 jmp L(ret)

L(gt1): cmp $-5, dn

 jnz L(out1)

C dn_param == 5.  up is biased by +8 bytes here, so -8(up) is the lowest limb
C and (up)..24(up) the four above it.  The add into %rbx is done only for its
C carry; %rbx and %rax are then reused for the fifth product.  %r14 (dn) is
C used as scratch since dn is not needed in this loop.
L(o5): mulx( -40,(dp), %rbx, %rax)

 mulx( -32,(dp), %r9, %r14)

 mulx( -24,(dp), %r11, %r10)

 mulx( -16,(dp), %r13, %r12)

 add -8(up), %rbx

 adc %rax, %r9

 mulx( -8,(dp), %rbx, %rax)

 adc %r14, %r11

 adc %r10, %r13

 adc %r12, %rbx

 adc $0, %rax

 add (up), %r9

 mov %r9, (up)

 mov %r8, %rdx   C dinv

C mulx leaves the flags alone, so the adc chain continues across it.
 mulx( %r9, %rdx, %r12)  C next quotient

 adc %r11, 8(up)

 adc %r13, 16(up)

 adc %rbx, 24(up)

C Fold the top product limb and the carries into 32(up); %rbp collects the
C new carry.
 adc %rbp, %rax

 setc R8(%rbp)

 add %rax, 32(up)

 adc $0, R32(%rbp)

 lea 8(up), up

 dec un

 jne L(o5)

 jmp L(ret)

C Generic outer loop for the remaining dn_param with dn_param mod 4 == 1.
C i restarts at dn + 1 each iteration.  The mulx instructions are emitted as
C raw bytes; the comment on each line gives the instruction they encode.
C Three products are formed here, the carry flag is cleared, and the inner
C loop is entered at lo1.
L(out1):lea 1(dn), i

 .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6  C mulx (dp,dn,8),%rbx,%rax

 .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(dp,dn,8),%r9,%r8

 .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(dp,dn,8),%r11,%r10

 clc

 jmp L(lo1)

 ALIGN(16)

C Inner loop: four limbs per pass; repeats while i is still negative after
C being advanced by 4.
L(top1):add %r9, (up,i,8)

 .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce  C mulx (dp,i,8), %r9, %r8

 adc %r11, 8(up,i,8)

 .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10

 adc %r13, 16(up,i,8)

L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12

 adc %rbx, 24(up,i,8)

 adc %rax, %r9

 .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax

 adc %r8, %r11

 adc %r10, %r13

 adc %r12, %rbx

 adc $0, %rax

 add $4, i

 js L(top1)

C The inner loop overwrote %r8, so dinv is reloaded from the stack slot pushed
C in the prologue.  40(up,dn,8) is the second-lowest limb of the current
C window, which becomes the lowest limb of the next iteration.
 mov (%rsp), %rdx   C dinv

 .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(up,dn,8), %rdx, %r12

C Wind-down: add the last four product limbs, then fold %rax and the carries
C into 32(up) and collect the new carry in %rbp.
 add %r9, (up)

 adc %r11, 8(up)

 adc %r13, 16(up)

 adc %rbx, 24(up)

 adc %rbp, %rax

 setc R8(%rbp)

 add %rax, 32(up)

 adc $0, R32(%rbp)

 lea 8(up), up

 dec un

 jne L(out1)

 jmp L(ret)

C =============================================================================

C dn_param mod 4 == 2 here.  dn holds -dn_param, so the compares against -2
C and -6 pick out dn_param == 2 and dn_param == 6; anything else uses out2.
L(b2): cmp $-2, dn

 jnz L(gt2)

C dn_param == 2.  With the up bias of 8*dn_param - 32 = -16 bytes, 16(up) and
C 24(up) are the two limbs of the window and 32(up) the limb above them.
C Both window limbs live in registers (%r10 lowest, %r9 next) for the whole
C loop and are written back only once, after the loop.
 mov 16(up), %r10

 mov 24(up), %r9

L(o2): mulx( -16,(dp), %r13, %r12)

 mulx( -8,(dp), %rbx, %rax)

 add %r12, %rbx

 adc $0, %rax

 add %r10, %r13  C 0  add just to produce carry

 mov %r9, %r10  C 1

 adc %rbx, %r10  C 1

 mov %r8, %rdx

C %r10 is now the limb that will be lowest in the next iteration; mulx leaves
C the flags alone, so the adc chain continues across it.
 mulx( %r10, %rdx, %r12) C next quotient

 adc %rbp, %rax  C 2

 setc R8(%rbp)  C 3

 mov 32(up), %r9  C 2

 add %rax, %r9  C 2

 adc $0, R32(%rbp)  C 3

 lea 8(up), up

 dec un

 jne L(o2)

 mov %r10, 16(up)

 mov %r9, 24(up)

 jmp L(ret)

L(gt2): cmp $-6, dn

 jnz L(out2)

C dn_param == 6.  up is biased by +16 bytes here, so -16(up) and -8(up) are
C the two lowest limbs and (up)..24(up) the four above them.  The add into
C %r13 is done only for its carry; %r13 is overwritten by the next mulx.
C %r14 (dn) is used as scratch since dn is not needed in this loop.
L(o6): mulx( -48,(dp), %r13, %r12)

 mulx( -40,(dp), %rcx, %rax)

 add %r12, %rcx

 adc $0, %rax

 mulx( -32,(dp), %r9, %r14)

 mulx( -24,(dp), %r11, %r10)

 add -16(up), %r13

 mulx( -16,(dp), %r13, %r12)

 adc -8(up), %rcx

 mov %rcx, -8(up)

 adc %rax, %r9

 mulx( -8,(dp), %rbx, %rax)

 adc %r14, %r11

 adc %r10, %r13

 adc %r12, %rbx

 adc $0, %rax

 mov %r8, %rdx   C dinv

C %rcx still holds the updated second-lowest limb stored at -8(up) above.
 mulx( %rcx, %rdx, %r12)  C next quotient

 add %r9, (up)

 adc %r11, 8(up)

 adc %r13, 16(up)

 adc %rbx, 24(up)

C Fold the top product limb and the carries into 32(up); %rbp collects the
C new carry.
 adc %rbp, %rax

 setc R8(%rbp)

 add %rax, 32(up)

 adc $0, R32(%rbp)

 lea 8(up), up

 dec un

 jne L(o6)

 jmp L(ret)

C Generic outer loop for the remaining dn_param with dn_param mod 4 == 2.
C i restarts at dn + 2 each iteration.  The mulx instructions are emitted as
C raw bytes; the comment on each line gives the instruction they encode.
C The first two products are chained here and the inner loop is entered at
C lo2.  NOTE(review): there is no clc on this path; the first adc in lo2 uses
C whatever carry the adc $0 below leaves behind.
L(out2):lea 2(dn), i

 .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6  C mulx (dp,dn,8),%r13,%r12

 .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(dp,dn,8),%rbx,%rax

 add %r12, %rbx

 adc $0, %rax

 .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(dp,dn,8),%r9,%r8

 jmp L(lo2)

 ALIGN(16)

C Inner loop: four limbs per pass; repeats while i is still negative after
C being advanced by 4.
L(top2):add %r9, (up,i,8)

 .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce  C mulx (dp,i,8), %r9, %r8

 adc %r11, 8(up,i,8)

L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10

 adc %r13, 16(up,i,8)

 .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12

 adc %rbx, 24(up,i,8)

 adc %rax, %r9

 .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax

 adc %r8, %r11

 adc %r10, %r13

 adc %r12, %rbx

 adc $0, %rax

 add $4, i

 js L(top2)

C The inner loop overwrote %r8, so dinv is reloaded from the stack slot pushed
C in the prologue.  40(up,dn,8) is the second-lowest limb of the current
C window, which becomes the lowest limb of the next iteration.
 mov (%rsp), %rdx   C dinv

 .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(up,dn,8), %rdx, %r12

C Wind-down: add the last four product limbs, then fold %rax and the carries
C into 32(up) and collect the new carry in %rbp.
 add %r9, (up)

 adc %r11, 8(up)

 adc %r13, 16(up)

 adc %rbx, 24(up)

 adc %rbp, %rax

 setc R8(%rbp)

 add %rax, 32(up)

 adc $0, R32(%rbp)

 lea 8(up), up

 dec un

 jne L(out2)

 jmp L(ret)

C =============================================================================

C dn_param mod 4 == 3 here.  dn holds -dn_param, so the compares against -3
C and -7 pick out dn_param == 3 and dn_param == 7; anything else uses out3.
L(b3): cmp $-3, dn

 jnz L(gt3)

C dn_param == 3.  With the up bias of 8*dn_param - 32 = -8 bytes, 8(up),
C 16(up) and 24(up) are the three limbs of the window and 32(up) the limb
C above them.  All three live in registers (%r14 lowest, then %r9, then %rcx)
C for the whole loop and are written back only once, after the loop.  %r14 is
C also dn, which is no longer needed on this path.
 mov 8(up), %r14

 mov 16(up), %r9

 mov 24(up), %rcx

L(o3): mulx( -24,(dp), %r11, %r10)

 mulx( -16,(dp), %r13, %r12)

 mulx( -8,(dp), %rbx, %rax)

 add %r10, %r13

 adc %r12, %rbx

 adc $0, %rax

C This add is done only for its carry; %r11 is not used afterwards.  The
C register window then shifts down by one limb as the sums are formed.
 add %r14, %r11

 mov %r9, %r14

 adc %r13, %r14

 mov %rcx, %r9

 mov %r8, %rdx   C dinv

C %r14 is now the limb that will be lowest in the next iteration; mulx leaves
C the flags alone, so the adc chain continues across it.
 mulx( %r14, %rdx, %r12)  C next quotient

 adc %rbx, %r9

 adc %rbp, %rax

 setc R8(%rbp)

 mov 32(up), %rcx

 add %rax, %rcx

 adc $0, R32(%rbp)

 lea 8(up), up

 dec un

 jne L(o3)

 mov %r14, 8(up)

 mov %r9, 16(up)

 mov %rcx, 24(up)

 jmp L(ret)

L(gt3): cmp $-7, dn

 jnz L(out3)

C dn_param == 7.  up is biased by +24 bytes here, so -24(up)..-8(up) are the
C three lowest limbs and (up)..24(up) the four above them.  The add into %r11
C is done only for its carry; %r11 is overwritten by the next mulx.  %r14
C (dn) is used as scratch since dn is not needed in this loop.
L(o7): mulx( -56,(dp), %r11, %r10)

 mulx( -48,(dp), %rcx, %r12)

 mulx( -40,(dp), %rbx, %rax)

 add %r10, %rcx

 adc %r12, %rbx

 adc $0, %rax

 mulx( -32,(dp), %r9, %r14)

 add -24(up), %r11

 mulx( -24,(dp), %r11, %r10)

 adc -16(up), %rcx

 mov %rcx, -16(up)

 mulx( -16,(dp), %r13, %r12)

 adc %rbx, -8(up)

 adc %rax, %r9

 mulx( -8,(dp), %rbx, %rax)

 adc %r14, %r11

 adc %r10, %r13

 adc %r12, %rbx

 adc $0, %rax

 mov %r8, %rdx   C dinv

C %rcx still holds the updated second-lowest limb stored at -16(up) above.
 mulx( %rcx, %rdx, %r12)  C next quotient

 add %r9, (up)

 adc %r11, 8(up)

 adc %r13, 16(up)

 adc %rbx, 24(up)

C Fold the top product limb and the carries into 32(up); %rbp collects the
C new carry.
 adc %rbp, %rax

 setc R8(%rbp)

 add %rax, 32(up)

 adc $0, R32(%rbp)

 lea 8(up), up

 dec un

 jne L(o7)

 jmp L(ret)

C Generic outer loop for the remaining dn_param with dn_param mod 4 == 3.
C i restarts at dn + 3 each iteration.  The mulx instructions are emitted as
C raw bytes; the comment on each line gives the instruction they encode.
C The first three products are chained here and the inner loop is entered at
C lo3.  NOTE(review): there is no clc on this path; the first adc in lo3 uses
C whatever carry the adc $0 below leaves behind.
L(out3):lea 3(dn), i

 .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6  C mulx (dp,dn,8),%r11,%r10

 .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(dp,dn,8),%r13,%r12

 .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(dp,dn,8),%rbx,%rax

 add %r10, %r13

 adc %r12, %rbx

 adc $0, %rax

 jmp L(lo3)

 ALIGN(16)

C Inner loop: four limbs per pass; repeats while i is still negative after
C being advanced by 4.
L(top3):add %r9, (up,i,8)

L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce  C mulx (dp,i,8), %r9, %r8

 adc %r11, 8(up,i,8)

 .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10

 adc %r13, 16(up,i,8)

 .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12

 adc %rbx, 24(up,i,8)

 adc %rax, %r9

 .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax

 adc %r8, %r11

 adc %r10, %r13

 adc %r12, %rbx

 adc $0, %rax

 add $4, i

 js L(top3)

C The inner loop overwrote %r8, so dinv is reloaded from the stack slot pushed
C in the prologue.  40(up,dn,8) is the second-lowest limb of the current
C window, which becomes the lowest limb of the next iteration.
 mov (%rsp), %rdx   C dinv

 .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(up,dn,8), %rdx, %r12

C Wind-down: add the last four product limbs, then fold %rax and the carries
C into 32(up) and collect the new carry in %rbp.
 add %r9, (up)

 adc %r11, 8(up)

 adc %r13, 16(up)

 adc %rbx, 24(up)

 adc %rbp, %rax

 setc R8(%rbp)

 add %rax, 32(up)

 adc $0, R32(%rbp)

 lea 8(up), up

 dec un

C When the count runs out, execution falls through into the common exit.
 jne L(out3)

C Common exit, reached from every loop.  Returns the carry accumulated in
C %rbp and unwinds the stack in the reverse order of the prologue pushes.
L(ret): mov %rbp, %rax

C Discard the stack slot that held dinv; the value popped is not used.
 pop %rsi   C dummy dealloc

 pop %rbx

 pop %rbp

 pop %r12

 pop %r13

 pop %r14

 pop %r15

C Balance the FUNC_ENTRY(4) at the top of the routine.  The IFDOS load of
C dinv from 56(%rsp) at entry shows that the DOS64 entry sequence leaves two
C extra quadwords on the stack; without the matching exit macro the ret
C below would not find the return address on DOS64 builds.
C NOTE(review): FUNC_EXIT is expected to expand to nothing for the standard
C ABI, like FUNC_ENTRY; confirm in the m4 definitions.
 FUNC_EXIT()

 ret

EPILOGUE()
Messung V0.5 in Prozent
¤ Dauer der Verarbeitung: 0.10 Sekunden (vorverarbeitet am 2026-06-04) ¤

Wurzel
Suchen
Beweissystem der NASA
Beweissystem Isabelle
NIST Cobol Testsuite
Cephes Mathematical Library
Wiener Entwicklungsmethode
Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.