dnl ARM64 Neon mpn_hamdist -- mpn bit hamming distance.
dnl Copyright 2013, 2014 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl * the GNU Lesser General Public License as published by the Free
dnl Software Foundation; either version 3 of the License, or (at your
dnl option) any later version.
dnl
dnl or
dnl
dnl * the GNU General Public License as published by the Free Software
dnl Foundation; either version 2 of the License, or (at your option) any
dnl later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
dnl see https://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C Cortex-A53 4.5
C Cortex-A57 1.9
C X-Gene 4.36
C TODO
C * Consider greater unrolling.
C * Arrange to align the pointer, if that helps performance. Use the same
C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry
C valgrind!)
C * Explore if explicit align directives, e.g., "[ptr:128]" help.
C * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
changecom(blah)
C INPUT PARAMETERS
define(`ap', x0)
define(`bp', x1)
define(`n',  x2)

C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
C up with 8 16-bit counters.  Therefore, we can sum to 8(2^16-1) bits, or
C (8*2^16-1)/64 = 0x1fff limbs.  We use a chunksize close to that, but which
C allows the huge count code to jump deep into the code (at L(chu)).

define(`maxsize',  0x1fff)		C largest count one pass may accumulate
define(`chunksize',0x1ff0)		C multiple of 8 below maxsize, as L(chu) requires

C mpn_hamdist (ap, bp, n)
C
C In:    x0 = ap, x1 = bp (limb pointers, read with post-increment)
C        x2 = n  (number of 64-bit limbs)
C Out:   x0 = number of bit positions in which {ap,n} and {bp,n} differ
C Uses:  x1, x2, x4-x11, v0-v7, v16-v19, condition flags; no stack.
C        The large-count path keeps the return address in x8 while it
C        calls into the small-count code with bl.
C
C Structure: the low three bits of n are peeled off first (1, 2 and 4 limb
C steps), then an 8-limb software-pipelined loop runs.  In the loop the
C cnt results of one group (v6,v7) are accumulated while the next group is
C being loaded and xor-ed.

ASM_START()
PROLOGUE(mpn_hamdist)

	mov	x11, #maxsize			C x11 stays live for L(gt8k)
	cmp	n, x11
	b.hi	L(gt8k)

L(lt8k):
	movi	v4.16b, #0			C clear summation register
	movi	v5.16b, #0			C clear summation register

	tbz	n, #0, L(xx0)
	sub	n, n, #1
	ld1	{v0.1d}, [ap], #8		C load 1 limb
	ld1	{v16.1d}, [bp], #8		C load 1 limb
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h, v6.16b			C could also splat

L(xx0):	tbz	n, #1, L(x00)
	sub	n, n, #2
	ld1	{v0.2d}, [ap], #16		C load 2 limbs
	ld1	{v16.2d}, [bp], #16		C load 2 limbs
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h, v6.16b

L(x00):	tbz	n, #2, L(000)
	subs	n, n, #4
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	b.ls	L(sum)				C those 4 limbs were the last ones

L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	sub	n, n, #4
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	b	L(mid)				C enter the loop at its midpoint

L(000):	subs	n, n, #8
	b.lo	L(e0)				C fewer than 8 limbs left: only v4 holds counts

C Entry point for the large-count driver: expects v4,v5 cleared and
C n = (limbs to process) - 8, with the limb count a multiple of 8.
L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	cnt	v6.16b, v2.16b
	cnt	v7.16b, v3.16b
	subs	n, n, #8
	b.lo	L(end)

C Loop invariant at L(top): v6,v7 hold byte counts not yet accumulated,
C v0,v1 / v16,v17 hold 4 loaded limbs of each operand not yet xor-ed.
L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	uadalp	v4.8h, v6.16b
	cnt	v6.16b, v0.16b
	uadalp	v5.8h, v7.16b
	cnt	v7.16b, v1.16b
L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	subs	n, n, #8
	uadalp	v4.8h, v6.16b
	cnt	v6.16b, v2.16b
	uadalp	v5.8h, v7.16b
	cnt	v7.16b, v3.16b
	b.hs	L(top)

L(end):	uadalp	v4.8h, v6.16b			C drain the pending counts
	uadalp	v5.8h, v7.16b
L(sum):	eor	v0.16b, v0.16b, v16.16b		C last 4 limbs, loaded but not yet xor-ed
	eor	v1.16b, v1.16b, v17.16b
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	uadalp	v4.8h, v6.16b
	uadalp	v5.8h, v7.16b
	add	v4.8h, v4.8h, v5.8h
						C we have 8 16-bit counts
L(e0):	uaddlp	v4.4s, v4.8h			C we have 4 32-bit counts
	uaddlp	v4.2d, v4.4s			C we have 2 64-bit counts
	mov	x0, v4.d[0]
	mov	x1, v4.d[1]
	add	x0, x0, x1
	ret

C Code for count > maxsize.  Splits operand and calls above code.
define(`ap2', x5)			C caller-saves reg not used above
define(`bp2', x6)			C caller-saves reg not used above
L(gt8k):
	mov	x8, x30				C keep return address; bl below overwrites x30
	mov	x7, n				C full count (caller-saves reg not used above)
	mov	x4, #0				C total sum (caller-saves reg not used above)
	mov	x9, #chunksize*8		C caller-saves reg not used above
	mov	x10, #chunksize			C caller-saves reg not used above

1:	add	ap2, ap, x9			C point at subsequent block
	add	bp2, bp, x9			C point at subsequent block
	mov	n, #chunksize-8			C count for this invocation, adjusted for entry pt
	movi	v4.16b, #0			C clear chunk summation register
	movi	v5.16b, #0			C clear chunk summation register
	bl	L(chu)				C jump deep inside code
	add	x4, x4, x0
	mov	ap, ap2				C put chunk pointer in place for calls
	mov	bp, bp2				C put chunk pointer in place for calls
	sub	x7, x7, x10
	cmp	x7, x11				C x11 = maxsize, set at function entry
	b.hi	1b

	mov	n, x7				C count for final invocation
	bl	L(lt8k)
	add	x0, x4, x0
	mov	x30, x8
	ret
EPILOGUE()
dnl  Measurement V0.5 in percent
dnl  Processing time: 0.14 seconds
dnl  (preprocessed on 2026-06-10)
dnl
dnl  The information on this web page was compiled carefully and to the best
dnl  of our knowledge.  However, neither completeness, nor correctness,
dnl  nor quality of the information provided is guaranteed.
dnl  Remark:
dnl  The syntax colouring and the measurement are still experimental.