/ This Source Code Form is subject to the terms of the Mozilla Public
/ License, v. 2.0. If a copy of the MPL was not distributed with this
/ file, You can obtain one at http://mozilla.org/MPL/2.0/.
/ ------------------------------------------------------------------------
/
/ Implementation of s_mpv_mul_set_vec which exploits
/ the 64X64->128 bit unsigned multiply instruction.
/
/ ------------------------------------------------------------------------
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ Register usage (arguments arrive in the System V AMD64 order):
/   %rdi  r      destination vector; advanced by 64 bytes per unrolled pass
/   %rsi  a      source vector; advanced in step with r
/   %edx  len    digit count; moved to %r8 because mulq overwrites %rdx
/   %rcx  digit  multiplier, unchanged throughout
/   %r9   cy     carry between digits; returned in %rax
/   %r11         next digit of a, loaded ahead of the multiply that uses it
/ Modifies %rax, %rdx, %rsi, %rdi, %r8, %r9, %r11 and the flags; no stack use.
	.text
	.align	16
	.globl	s_mpv_mul_set_vec64
	.type	s_mpv_mul_set_vec64, @function
s_mpv_mul_set_vec64:
	/ cy must be cleared before the len == 0 exit: .L17 returns cy, so
	/ clearing it only after the branch would return a stale %r9.
	xorq	%r9, %r9		/ cy = 0
	/ len is declared int, so only %edx is defined; the 32-bit move
	/ zero-extends it into %r8 instead of trusting the upper half of %rdx.
	movl	%edx, %r8d		/ r8 = len
	testq	%r8, %r8
	jz	.L17			/ if (len == 0) return (0)

.L15:					/ r8 = digits still to process, > 0 here
	cmpq	$8, %r8
	jb	.L16			/ fewer than 8 left: finish in the tail

	movq	0(%rsi), %rax		/ rax = a[0]
	movq	8(%rsi), %r11		/ load a[1] early
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ load a[2] early
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ load a[3] early
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ load a[4] early
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ load a[5] early
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ load a[6] early
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ load a[7] early
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8 digits
	addq	$64, %rdi		/ r += 8 digits
	subq	$8, %r8			/ len -= 8
	jz	.L17
	jmp	.L15

.L16:					/ tail: 1 <= r8 <= 7 digits remain
	movq	0(%rsi), %rax
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	8(%rsi), %rax
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	16(%rsi), %rax
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	24(%rsi), %rax
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	32(%rsi), %rax
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	40(%rsi), %rax
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	/ At most seven digits reach the tail, so a seventh digit is always
	/ the last one: no further count check is needed before .L17.
	movq	48(%rsi), %rax
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

.L17:
	movq	%r9, %rax		/ return (cy)
	ret
	.size	s_mpv_mul_set_vec64, .-s_mpv_mul_set_vec64
/ ------------------------------------------------------------------------
/
/ Implementation of s_mpv_mul_add_vec which exploits
/ the 64X64->128 bit unsigned multiply instruction.
/
/ ------------------------------------------------------------------------
/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ Register usage (arguments arrive in the System V AMD64 order):
/   %rdi  r      accumulator vector, read and written; advanced by 64 bytes
/                per unrolled pass
/   %rsi  a      source vector; advanced in step with r
/   %edx  len    digit count; moved to %r8 because mulq overwrites %rdx
/   %rcx  digit  multiplier, unchanged throughout
/   %r9   cy     carry between digits; returned in %rax
/   %r10         current digit of r, loaded ahead of the add that uses it
/   %r11         next digit of a, loaded ahead of the multiply that uses it
/ Modifies %rax, %rdx, %rsi, %rdi, %r8-%r11 and the flags; no stack use.
	.text
	.align	16
	.globl	s_mpv_mul_add_vec64
	.type	s_mpv_mul_add_vec64, @function
s_mpv_mul_add_vec64:
	/ cy must be cleared before the len == 0 exit: .L27 returns cy, so
	/ clearing it only after the branch would return a stale %r9.
	xorq	%r9, %r9		/ cy = 0
	/ len is declared int, so only %edx is defined; the 32-bit move
	/ zero-extends it into %r8 instead of trusting the upper half of %rdx.
	movl	%edx, %r8d		/ r8 = len
	testq	%r8, %r8
	jz	.L27			/ if (len == 0) return (0)

.L25:					/ r8 = digits still to process, > 0 here
	cmpq	$8, %r8
	jb	.L26			/ fewer than 8 left: finish in the tail

	movq	0(%rsi), %rax		/ rax = a[0]
	movq	0(%rdi), %r10		/ r10 = r[0]
	movq	8(%rsi), %r11		/ load a[1] early
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	movq	8(%rdi), %r10		/ load r[1] early
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ load a[2] early
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	movq	16(%rdi), %r10		/ load r[2] early
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ load a[3] early
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	movq	24(%rdi), %r10		/ load r[3] early
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ load a[4] early
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	movq	32(%rdi), %r10		/ load r[4] early
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ load a[5] early
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	movq	40(%rdi), %r10		/ load r[5] early
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ load a[6] early
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	movq	48(%rdi), %r10		/ load r[6] early
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ load a[7] early
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	movq	56(%rdi), %r10		/ load r[7] early
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8 digits
	addq	$64, %rdi		/ r += 8 digits
	subq	$8, %r8			/ len -= 8
	jz	.L27
	jmp	.L25

.L26:					/ tail: 1 <= r8 <= 7 digits remain
	movq	0(%rsi), %rax
	movq	0(%rdi), %r10
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	8(%rsi), %rax
	movq	8(%rdi), %r10
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	16(%rsi), %rax
	movq	16(%rdi), %r10
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	24(%rsi), %rax
	movq	24(%rdi), %r10
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	32(%rsi), %rax
	movq	32(%rdi), %r10
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	40(%rsi), %rax
	movq	40(%rdi), %r10
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	/ At most seven digits reach the tail, so a seventh digit is always
	/ the last one: no further count check is needed before .L27.
	movq	48(%rsi), %rax
	movq	48(%rdi), %r10
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

.L27:
	movq	%r9, %rax		/ return (cy)
	ret
	.size	s_mpv_mul_add_vec64, .-s_mpv_mul_add_vec64
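/ NOTE(review): the lines below appear to be a trailer appended by a code
/ hosting/analysis site rather than part of the assembly source - confirm.
/ Measurement V0.5 in percent: C=100 H=100 G=100
/ Processing time: 0.3 seconds
/ (c) Formatika GbR, Germany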