/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated poly1305 implementation for ppc64le.
#
# Copyright 2023 - IBM Corp. All rights reserved
#
#===================================================================================
# Written by Danny Tsen <dtsen@us.ibm.com>
#
# Poly1305 - this version mainly using vector/VSX/Scalar
# - 26 bits limbs
# - Handle multiple 64 byte blcok.
#
# Block size 16 bytes
# key = (r, s)
# clamp r &= 0 x0FFFFFFC0FFFFFFC 0 x0FFFFFFC0FFFFFFF
# p = 2 ^130 - 5
# a += m
# a = (r + a) % p
# a += s
#
# Improve performance by breaking down polynominal to the sum of products with
# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
#
# 07 /22 /21 - this revison based on the above sum of products. Setup r^4 , r^3 , r^2 , r and s3, s2, s1, s0
# to 9 vectors for multiplications.
#
# setup r^4 , r^3 , r^2 , r vectors
# vs [r^1 , r^3 , r^2 , r^4 ]
# vs0 = [r0,.....]
# vs1 = [r1,.....]
# vs2 = [r2,.....]
# vs3 = [r3,.....]
# vs4 = [r4,.....]
# vs5 = [r1*5 ,...]
# vs6 = [r2*5 ,...]
# vs7 = [r2*5 ,...]
# vs8 = [r4*5 ,...]
#
# Each word in a vector consists a member of a "r/s" in [a * r/s].
#
# r0, r4*5 , r3*5 , r2*5 , r1*5 ;
# r1, r0, r4*5 , r3*5 , r2*5 ;
# r2, r1, r0, r4*5 , r3*5 ;
# r3, r2, r1, r0, r4*5 ;
# r4, r3, r2, r1, r0 ;
#
#
# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
# k = 32 bytes key
# r3 = k (r, s)
# r4 = mlen
# r5 = m
#
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/asm-compat.h>
#include <linux/linkage.h>
.machine "any"
.text
.macro SAVE_GPR GPR OFFSET FRAME
std \GPR,\OFFSET(\FRAME)
.endm
.macro SAVE_VRS VRS OFFSET FRAME
li 16 , \OFFSET
stvx \VRS, 16 , \FRAME
.endm
.macro SAVE_VSX VSX OFFSET FRAME
li 16 , \OFFSET
stxvx \VSX, 16 , \FRAME
.endm
.macro RESTORE_GPR GPR OFFSET FRAME
ld \GPR,\OFFSET(\FRAME)
.endm
.macro RESTORE_VRS VRS OFFSET FRAME
li 16 , \OFFSET
lvx \VRS, 16 , \FRAME
.endm
.macro RESTORE_VSX VSX OFFSET FRAME
li 16 , \OFFSET
lxvx \VSX, 16 , \FRAME
.endm
.macro SAVE_REGS
mflr 0
std 0 , 16 (1 )
stdu 1 ,-752 (1 )
SAVE_GPR 14 , 112 , 1
SAVE_GPR 15 , 120 , 1
SAVE_GPR 16 , 128 , 1
SAVE_GPR 17 , 136 , 1
SAVE_GPR 18 , 144 , 1
SAVE_GPR 19 , 152 , 1
SAVE_GPR 20 , 160 , 1
SAVE_GPR 21 , 168 , 1
SAVE_GPR 22 , 176 , 1
SAVE_GPR 23 , 184 , 1
SAVE_GPR 24 , 192 , 1
SAVE_GPR 25 , 200 , 1
SAVE_GPR 26 , 208 , 1
SAVE_GPR 27 , 216 , 1
SAVE_GPR 28 , 224 , 1
SAVE_GPR 29 , 232 , 1
SAVE_GPR 30 , 240 , 1
SAVE_GPR 31 , 248 , 1
addi 9 , 1 , 256
SAVE_VRS 20 , 0 , 9
SAVE_VRS 21 , 16 , 9
SAVE_VRS 22 , 32 , 9
SAVE_VRS 23 , 48 , 9
SAVE_VRS 24 , 64 , 9
SAVE_VRS 25 , 80 , 9
SAVE_VRS 26 , 96 , 9
SAVE_VRS 27 , 112 , 9
SAVE_VRS 28 , 128 , 9
SAVE_VRS 29 , 144 , 9
SAVE_VRS 30 , 160 , 9
SAVE_VRS 31 , 176 , 9
SAVE_VSX 14 , 192 , 9
SAVE_VSX 15 , 208 , 9
SAVE_VSX 16 , 224 , 9
SAVE_VSX 17 , 240 , 9
SAVE_VSX 18 , 256 , 9
SAVE_VSX 19 , 272 , 9
SAVE_VSX 20 , 288 , 9
SAVE_VSX 21 , 304 , 9
SAVE_VSX 22 , 320 , 9
SAVE_VSX 23 , 336 , 9
SAVE_VSX 24 , 352 , 9
SAVE_VSX 25 , 368 , 9
SAVE_VSX 26 , 384 , 9
SAVE_VSX 27 , 400 , 9
SAVE_VSX 28 , 416 , 9
SAVE_VSX 29 , 432 , 9
SAVE_VSX 30 , 448 , 9
SAVE_VSX 31 , 464 , 9
.endm # SAVE_REGS
.macro RESTORE_REGS
addi 9 , 1 , 256
RESTORE_VRS 20 , 0 , 9
RESTORE_VRS 21 , 16 , 9
RESTORE_VRS 22 , 32 , 9
RESTORE_VRS 23 , 48 , 9
RESTORE_VRS 24 , 64 , 9
RESTORE_VRS 25 , 80 , 9
RESTORE_VRS 26 , 96 , 9
RESTORE_VRS 27 , 112 , 9
RESTORE_VRS 28 , 128 , 9
RESTORE_VRS 29 , 144 , 9
RESTORE_VRS 30 , 160 , 9
RESTORE_VRS 31 , 176 , 9
RESTORE_VSX 14 , 192 , 9
RESTORE_VSX 15 , 208 , 9
RESTORE_VSX 16 , 224 , 9
RESTORE_VSX 17 , 240 , 9
RESTORE_VSX 18 , 256 , 9
RESTORE_VSX 19 , 272 , 9
RESTORE_VSX 20 , 288 , 9
RESTORE_VSX 21 , 304 , 9
RESTORE_VSX 22 , 320 , 9
RESTORE_VSX 23 , 336 , 9
RESTORE_VSX 24 , 352 , 9
RESTORE_VSX 25 , 368 , 9
RESTORE_VSX 26 , 384 , 9
RESTORE_VSX 27 , 400 , 9
RESTORE_VSX 28 , 416 , 9
RESTORE_VSX 29 , 432 , 9
RESTORE_VSX 30 , 448 , 9
RESTORE_VSX 31 , 464 , 9
RESTORE_GPR 14 , 112 , 1
RESTORE_GPR 15 , 120 , 1
RESTORE_GPR 16 , 128 , 1
RESTORE_GPR 17 , 136 , 1
RESTORE_GPR 18 , 144 , 1
RESTORE_GPR 19 , 152 , 1
RESTORE_GPR 20 , 160 , 1
RESTORE_GPR 21 , 168 , 1
RESTORE_GPR 22 , 176 , 1
RESTORE_GPR 23 , 184 , 1
RESTORE_GPR 24 , 192 , 1
RESTORE_GPR 25 , 200 , 1
RESTORE_GPR 26 , 208 , 1
RESTORE_GPR 27 , 216 , 1
RESTORE_GPR 28 , 224 , 1
RESTORE_GPR 29 , 232 , 1
RESTORE_GPR 30 , 240 , 1
RESTORE_GPR 31 , 248 , 1
addi 1 , 1 , 752
ld 0 , 16 (1 )
mtlr 0
.endm # RESTORE_REGS
#
# p[0 ] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5 ;
# p[1 ] = a0*r1 + a1*r0 + a2*r4*5 + a3*r3*5 + a4*r2*5 ;
# p[2 ] = a0*r2 + a1*r1 + a2*r0 + a3*r4*5 + a4*r3*5 ;
# p[3 ] = a0*r3 + a1*r2 + a2*r1 + a3*r0 + a4*r4*5 ;
# p[4 ] = a0*r4 + a1*r3 + a2*r2 + a3*r1 + a4*r0 ;
#
# [r^2 , r^3 , r^1 , r^4 ]
# [m3, m2, m4, m1]
#
# multiply odd and even words
.macro mul_odd
vmulouw 14 , 4 , 26
vmulouw 10 , 5 , 3
vmulouw 11 , 6 , 2
vmulouw 12 , 7 , 1
vmulouw 13 , 8 , 0
vmulouw 15 , 4 , 27
vaddudm 14 , 14 , 10
vaddudm 14 , 14 , 11
vmulouw 10 , 5 , 26
vmulouw 11 , 6 , 3
vaddudm 14 , 14 , 12
vaddudm 14 , 14 , 13 # x0
vaddudm 15 , 15 , 10
vaddudm 15 , 15 , 11
vmulouw 12 , 7 , 2
vmulouw 13 , 8 , 1
vaddudm 15 , 15 , 12
vaddudm 15 , 15 , 13 # x1
vmulouw 16 , 4 , 28
vmulouw 10 , 5 , 27
vmulouw 11 , 6 , 26
vaddudm 16 , 16 , 10
vaddudm 16 , 16 , 11
vmulouw 12 , 7 , 3
vmulouw 13 , 8 , 2
vaddudm 16 , 16 , 12
vaddudm 16 , 16 , 13 # x2
vmulouw 17 , 4 , 29
vmulouw 10 , 5 , 28
vmulouw 11 , 6 , 27
vaddudm 17 , 17 , 10
vaddudm 17 , 17 , 11
vmulouw 12 , 7 , 26
vmulouw 13 , 8 , 3
vaddudm 17 , 17 , 12
vaddudm 17 , 17 , 13 # x3
vmulouw 18 , 4 , 30
vmulouw 10 , 5 , 29
vmulouw 11 , 6 , 28
vaddudm 18 , 18 , 10
vaddudm 18 , 18 , 11
vmulouw 12 , 7 , 27
vmulouw 13 , 8 , 26
vaddudm 18 , 18 , 12
vaddudm 18 , 18 , 13 # x4
.endm
.macro mul_even
vmuleuw 9 , 4 , 26
vmuleuw 10 , 5 , 3
vmuleuw 11 , 6 , 2
vmuleuw 12 , 7 , 1
vmuleuw 13 , 8 , 0
vaddudm 14 , 14 , 9
vaddudm 14 , 14 , 10
vaddudm 14 , 14 , 11
vaddudm 14 , 14 , 12
vaddudm 14 , 14 , 13 # x0
vmuleuw 9 , 4 , 27
vmuleuw 10 , 5 , 26
vmuleuw 11 , 6 , 3
vmuleuw 12 , 7 , 2
vmuleuw 13 , 8 , 1
vaddudm 15 , 15 , 9
vaddudm 15 , 15 , 10
vaddudm 15 , 15 , 11
vaddudm 15 , 15 , 12
vaddudm 15 , 15 , 13 # x1
vmuleuw 9 , 4 , 28
vmuleuw 10 , 5 , 27
vmuleuw 11 , 6 , 26
vmuleuw 12 , 7 , 3
vmuleuw 13 , 8 , 2
vaddudm 16 , 16 , 9
vaddudm 16 , 16 , 10
vaddudm 16 , 16 , 11
vaddudm 16 , 16 , 12
vaddudm 16 , 16 , 13 # x2
vmuleuw 9 , 4 , 29
vmuleuw 10 , 5 , 28
vmuleuw 11 , 6 , 27
vmuleuw 12 , 7 , 26
vmuleuw 13 , 8 , 3
vaddudm 17 , 17 , 9
vaddudm 17 , 17 , 10
vaddudm 17 , 17 , 11
vaddudm 17 , 17 , 12
vaddudm 17 , 17 , 13 # x3
vmuleuw 9 , 4 , 30
vmuleuw 10 , 5 , 29
vmuleuw 11 , 6 , 28
vmuleuw 12 , 7 , 27
vmuleuw 13 , 8 , 26
vaddudm 18 , 18 , 9
vaddudm 18 , 18 , 10
vaddudm 18 , 18 , 11
vaddudm 18 , 18 , 12
vaddudm 18 , 18 , 13 # x4
.endm
#
# poly1305_setup_r
#
# setup r^4 , r^3 , r^2 , r vectors
# [r, r^3 , r^2 , r^4 ]
# vs0 = [r0,...]
# vs1 = [r1,...]
# vs2 = [r2,...]
# vs3 = [r3,...]
# vs4 = [r4,...]
# vs5 = [r4*5 ,...]
# vs6 = [r3*5 ,...]
# vs7 = [r2*5 ,...]
# vs8 = [r1*5 ,...]
#
# r0, r4*5 , r3*5 , r2*5 , r1*5 ;
# r1, r0, r4*5 , r3*5 , r2*5 ;
# r2, r1, r0, r4*5 , r3*5 ;
# r3, r2, r1, r0, r4*5 ;
# r4, r3, r2, r1, r0 ;
#
.macro poly1305_setup_r
# save r
xxlor 26 , 58 , 58
xxlor 27 , 59 , 59
xxlor 28 , 60 , 60
xxlor 29 , 61 , 61
xxlor 30 , 62 , 62
xxlxor 31 , 31 , 31
# [r, r^3 , r^2 , r^4 ]
# compute r^2
vmr 4 , 26
vmr 5 , 27
vmr 6 , 28
vmr 7 , 29
vmr 8 , 30
bl do_mul # r^2 r^1
xxpermdi 58 , 58 , 36 , 0 x3 # r0
xxpermdi 59 , 59 , 37 , 0 x3 # r1
xxpermdi 60 , 60 , 38 , 0 x3 # r2
xxpermdi 61 , 61 , 39 , 0 x3 # r3
xxpermdi 62 , 62 , 40 , 0 x3 # r4
xxpermdi 36 , 36 , 36 , 0 x3
xxpermdi 37 , 37 , 37 , 0 x3
xxpermdi 38 , 38 , 38 , 0 x3
xxpermdi 39 , 39 , 39 , 0 x3
xxpermdi 40 , 40 , 40 , 0 x3
vspltisb 13 , 2
vsld 9 , 27 , 13
vsld 10 , 28 , 13
vsld 11 , 29 , 13
vsld 12 , 30 , 13
vaddudm 0 , 9 , 27
vaddudm 1 , 10 , 28
vaddudm 2 , 11 , 29
vaddudm 3 , 12 , 30
bl do_mul # r^4 r^3
vmrgow 26 , 26 , 4
vmrgow 27 , 27 , 5
vmrgow 28 , 28 , 6
vmrgow 29 , 29 , 7
vmrgow 30 , 30 , 8
vspltisb 13 , 2
vsld 9 , 27 , 13
vsld 10 , 28 , 13
vsld 11 , 29 , 13
vsld 12 , 30 , 13
vaddudm 0 , 9 , 27
vaddudm 1 , 10 , 28
vaddudm 2 , 11 , 29
vaddudm 3 , 12 , 30
# r^2 r^4
xxlor 0 , 58 , 58
xxlor 1 , 59 , 59
xxlor 2 , 60 , 60
xxlor 3 , 61 , 61
xxlor 4 , 62 , 62
xxlor 5 , 32 , 32
xxlor 6 , 33 , 33
xxlor 7 , 34 , 34
xxlor 8 , 35 , 35
vspltw 9 , 26 , 3
vspltw 10 , 26 , 2
vmrgow 26 , 10 , 9
vspltw 9 , 27 , 3
vspltw 10 , 27 , 2
vmrgow 27 , 10 , 9
vspltw 9 , 28 , 3
vspltw 10 , 28 , 2
vmrgow 28 , 10 , 9
vspltw 9 , 29 , 3
vspltw 10 , 29 , 2
vmrgow 29 , 10 , 9
vspltw 9 , 30 , 3
vspltw 10 , 30 , 2
vmrgow 30 , 10 , 9
vsld 9 , 27 , 13
vsld 10 , 28 , 13
vsld 11 , 29 , 13
vsld 12 , 30 , 13
vaddudm 0 , 9 , 27
vaddudm 1 , 10 , 28
vaddudm 2 , 11 , 29
vaddudm 3 , 12 , 30
.endm
SYM_FUNC_START_LOCAL(do_mul)
mul_odd
# do reduction ( h %= p )
# carry reduction
vspltisb 9 , 2
vsrd 10 , 14 , 31
vsrd 11 , 17 , 31
vand 7 , 17 , 25
vand 4 , 14 , 25
vaddudm 18 , 18 , 11
vsrd 12 , 18 , 31
vaddudm 15 , 15 , 10
vsrd 11 , 15 , 31
vand 8 , 18 , 25
vand 5 , 15 , 25
vaddudm 4 , 4 , 12
vsld 10 , 12 , 9
vaddudm 6 , 16 , 11
vsrd 13 , 6 , 31
vand 6 , 6 , 25
vaddudm 4 , 4 , 10
vsrd 10 , 4 , 31
vaddudm 7 , 7 , 13
vsrd 11 , 7 , 31
vand 7 , 7 , 25
vand 4 , 4 , 25
vaddudm 5 , 5 , 10
vaddudm 8 , 8 , 11
blr
SYM_FUNC_END(do_mul)
#
# init key
#
.macro do_poly1305_init
addis 10 , 2 , rmask@toc@ha
addi 10 , 10 , rmask@toc@l
ld 11 , 0 (10 )
ld 12 , 8 (10 )
li 14 , 16
li 15 , 32
addis 10 , 2 , cnum@toc@ha
addi 10 , 10 , cnum@toc@l
lvx 25 , 0 , 10 # v25 - mask
lvx 31 , 14 , 10 # v31 = 1 a
lvx 19 , 15 , 10 # v19 = 1 << 24
lxv 24 , 48 (10 ) # vs24
lxv 25 , 64 (10 ) # vs25
# initialize
# load key from r3 to vectors
ld 9 , 24 (3 )
ld 10 , 32 (3 )
and. 9 , 9 , 11
and. 10 , 10 , 12
# break 26 bits
extrdi 14 , 9 , 26 , 38
extrdi 15 , 9 , 26 , 12
extrdi 16 , 9 , 12 , 0
mtvsrdd 58 , 0 , 14
insrdi 16 , 10 , 14 , 38
mtvsrdd 59 , 0 , 15
extrdi 17 , 10 , 26 , 24
mtvsrdd 60 , 0 , 16
extrdi 18 , 10 , 24 , 0
mtvsrdd 61 , 0 , 17
mtvsrdd 62 , 0 , 18
# r1 = r1 * 5 , r2 = r2 * 5 , r3 = r3 * 5 , r4 = r4 * 5
li 9 , 5
mtvsrdd 36 , 0 , 9
vmulouw 0 , 27 , 4 # v0 = rr0
vmulouw 1 , 28 , 4 # v1 = rr1
vmulouw 2 , 29 , 4 # v2 = rr2
vmulouw 3 , 30 , 4 # v3 = rr3
.endm
#
# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
# k = 32 bytes key
# r3 = k (r, s)
# r4 = mlen
# r5 = m
#
SYM_FUNC_START(poly1305_p10le_4blocks)
.align 5
cmpdi 5 , 64
blt Out_no_poly1305
SAVE_REGS
do_poly1305_init
li 21 , 0 # counter to message
poly1305_setup_r
# load previous H state
# break/convert r6 to 26 bits
ld 9 , 0 (3 )
ld 10 , 8 (3 )
ld 19 , 16 (3 )
sldi 19 , 19 , 24
mtvsrdd 41 , 0 , 19
extrdi 14 , 9 , 26 , 38
extrdi 15 , 9 , 26 , 12
extrdi 16 , 9 , 12 , 0
mtvsrdd 36 , 0 , 14
insrdi 16 , 10 , 14 , 38
mtvsrdd 37 , 0 , 15
extrdi 17 , 10 , 26 , 24
mtvsrdd 38 , 0 , 16
extrdi 18 , 10 , 24 , 0
mtvsrdd 39 , 0 , 17
mtvsrdd 40 , 0 , 18
vor 8 , 8 , 9
# input m1 m2
add 20 , 4 , 21
xxlor 49 , 24 , 24
xxlor 50 , 25 , 25
lxvw4x 43 , 0 , 20
addi 17 , 20 , 16
lxvw4x 44 , 0 , 17
vperm 14 , 11 , 12 , 17
vperm 15 , 11 , 12 , 18
vand 9 , 14 , 25 # a0
vsrd 10 , 14 , 31 # >> 26
vsrd 11 , 10 , 31 # 12 bits left
vand 10 , 10 , 25 # a1
vspltisb 13 , 12
vand 16 , 15 , 25
vsld 12 , 16 , 13
vor 11 , 11 , 12
vand 11 , 11 , 25 # a2
vspltisb 13 , 14
vsrd 12 , 15 , 13 # >> 14
vsrd 13 , 12 , 31 # >> 26 , a4
vand 12 , 12 , 25 # a3
vaddudm 20 , 4 , 9
vaddudm 21 , 5 , 10
vaddudm 22 , 6 , 11
vaddudm 23 , 7 , 12
vaddudm 24 , 8 , 13
# m3 m4
addi 17 , 17 , 16
lxvw4x 43 , 0 , 17
addi 17 , 17 , 16
lxvw4x 44 , 0 , 17
vperm 14 , 11 , 12 , 17
vperm 15 , 11 , 12 , 18
vand 9 , 14 , 25 # a0
vsrd 10 , 14 , 31 # >> 26
vsrd 11 , 10 , 31 # 12 bits left
vand 10 , 10 , 25 # a1
vspltisb 13 , 12
vand 16 , 15 , 25
vsld 12 , 16 , 13
vspltisb 13 , 14
vor 11 , 11 , 12
vand 11 , 11 , 25 # a2
vsrd 12 , 15 , 13 # >> 14
vsrd 13 , 12 , 31 # >> 26 , a4
vand 12 , 12 , 25 # a3
# Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
vmrgow 4 , 9 , 20
vmrgow 5 , 10 , 21
vmrgow 6 , 11 , 22
vmrgow 7 , 12 , 23
vmrgow 8 , 13 , 24
vaddudm 8 , 8 , 19
addi 5 , 5 , -64 # len -= 64
addi 21 , 21 , 64 # offset += 64
li 9 , 64
divdu 31 , 5 , 9
cmpdi 31 , 0
ble Skip_block_loop
mtctr 31
# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
# Rewrite the polynominal sum of product as follows,
# h1 = (h0 + m1) * r^2 , h2 = (h0 + m2) * r^2
# h3 = (h1 + m3) * r^2 , h4 = (h2 + m4) * r^2 --> (h0 + m1) r*4 + (h3 + m3) r^2 , (h0 + m2) r^4 + (h0 + m4) r^2
# .... Repeat
# h5 = (h3 + m5) * r^2 , h6 = (h4 + m6) * r^2 -->
# h7 = (h5 + m7) * r^2 , h8 = (h6 + m8) * r^1 --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
#
loop_4blocks:
# Multiply odd words and even words
mul_odd
mul_even
# carry reduction
vspltisb 9 , 2
vsrd 10 , 14 , 31
vsrd 11 , 17 , 31
vand 7 , 17 , 25
vand 4 , 14 , 25
vaddudm 18 , 18 , 11
vsrd 12 , 18 , 31
vaddudm 15 , 15 , 10
vsrd 11 , 15 , 31
vand 8 , 18 , 25
vand 5 , 15 , 25
vaddudm 4 , 4 , 12
vsld 10 , 12 , 9
vaddudm 6 , 16 , 11
vsrd 13 , 6 , 31
vand 6 , 6 , 25
vaddudm 4 , 4 , 10
vsrd 10 , 4 , 31
vaddudm 7 , 7 , 13
vsrd 11 , 7 , 31
vand 7 , 7 , 25
vand 4 , 4 , 25
vaddudm 5 , 5 , 10
vaddudm 8 , 8 , 11
# input m1 m2 m3 m4
add 20 , 4 , 21
xxlor 49 , 24 , 24
xxlor 50 , 25 , 25
lxvw4x 43 , 0 , 20
addi 17 , 20 , 16
lxvw4x 44 , 0 , 17
vperm 14 , 11 , 12 , 17
vperm 15 , 11 , 12 , 18
addi 17 , 17 , 16
lxvw4x 43 , 0 , 17
addi 17 , 17 , 16
lxvw4x 44 , 0 , 17
vperm 17 , 11 , 12 , 17
vperm 18 , 11 , 12 , 18
vand 20 , 14 , 25 # a0
vand 9 , 17 , 25 # a0
vsrd 21 , 14 , 31 # >> 26
vsrd 22 , 21 , 31 # 12 bits left
vsrd 10 , 17 , 31 # >> 26
vsrd 11 , 10 , 31 # 12 bits left
vand 21 , 21 , 25 # a1
vand 10 , 10 , 25 # a1
vspltisb 13 , 12
vand 16 , 15 , 25
vsld 23 , 16 , 13
vor 22 , 22 , 23
vand 22 , 22 , 25 # a2
vand 16 , 18 , 25
vsld 12 , 16 , 13
vor 11 , 11 , 12
vand 11 , 11 , 25 # a2
vspltisb 13 , 14
vsrd 23 , 15 , 13 # >> 14
vsrd 24 , 23 , 31 # >> 26 , a4
vand 23 , 23 , 25 # a3
vsrd 12 , 18 , 13 # >> 14
vsrd 13 , 12 , 31 # >> 26 , a4
vand 12 , 12 , 25 # a3
vaddudm 4 , 4 , 20
vaddudm 5 , 5 , 21
vaddudm 6 , 6 , 22
vaddudm 7 , 7 , 23
vaddudm 8 , 8 , 24
# Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
vmrgow 4 , 9 , 4
vmrgow 5 , 10 , 5
vmrgow 6 , 11 , 6
vmrgow 7 , 12 , 7
vmrgow 8 , 13 , 8
vaddudm 8 , 8 , 19
addi 5 , 5 , -64 # len -= 64
addi 21 , 21 , 64 # offset += 64
bdnz loop_4blocks
Skip_block_loop:
xxlor 58 , 0 , 0
xxlor 59 , 1 , 1
xxlor 60 , 2 , 2
xxlor 61 , 3 , 3
xxlor 62 , 4 , 4
xxlor 32 , 5 , 5
xxlor 33 , 6 , 6
xxlor 34 , 7 , 7
xxlor 35 , 8 , 8
# Multiply odd words and even words
mul_odd
mul_even
# Sum the products.
xxpermdi 41 , 31 , 46 , 0
xxpermdi 42 , 31 , 47 , 0
vaddudm 4 , 14 , 9
xxpermdi 36 , 31 , 36 , 3
vaddudm 5 , 15 , 10
xxpermdi 37 , 31 , 37 , 3
xxpermdi 43 , 31 , 48 , 0
vaddudm 6 , 16 , 11
xxpermdi 38 , 31 , 38 , 3
xxpermdi 44 , 31 , 49 , 0
vaddudm 7 , 17 , 12
xxpermdi 39 , 31 , 39 , 3
xxpermdi 45 , 31 , 50 , 0
vaddudm 8 , 18 , 13
xxpermdi 40 , 31 , 40 , 3
# carry reduction
vspltisb 9 , 2
vsrd 10 , 4 , 31
vsrd 11 , 7 , 31
vand 7 , 7 , 25
vand 4 , 4 , 25
vaddudm 8 , 8 , 11
vsrd 12 , 8 , 31
vaddudm 5 , 5 , 10
vsrd 11 , 5 , 31
vand 8 , 8 , 25
vand 5 , 5 , 25
vaddudm 4 , 4 , 12
vsld 10 , 12 , 9
vaddudm 6 , 6 , 11
vsrd 13 , 6 , 31
vand 6 , 6 , 25
vaddudm 4 , 4 , 10
vsrd 10 , 4 , 31
vaddudm 7 , 7 , 13
vsrd 11 , 7 , 31
vand 7 , 7 , 25
vand 4 , 4 , 25
vaddudm 5 , 5 , 10
vsrd 10 , 5 , 31
vand 5 , 5 , 25
vaddudm 6 , 6 , 10
vaddudm 8 , 8 , 11
b do_final_update
do_final_update:
# combine 26 bit limbs
# v4, v5, v6, v7 and v8 are 26 bit vectors
vsld 5 , 5 , 31
vor 20 , 4 , 5
vspltisb 11 , 12
vsrd 12 , 6 , 11
vsld 6 , 6 , 31
vsld 6 , 6 , 31
vor 20 , 20 , 6
vspltisb 11 , 14
vsld 7 , 7 , 11
vor 21 , 7 , 12
mfvsrld 16 , 40 # save last 2 bytes
vsld 8 , 8 , 11
vsld 8 , 8 , 31
vor 21 , 21 , 8
mfvsrld 17 , 52
mfvsrld 19 , 53
srdi 16 , 16 , 24
std 17 , 0 (3 )
std 19 , 8 (3 )
stw 16 , 16 (3 )
Out_loop:
li 3 , 0
RESTORE_REGS
blr
Out_no_poly1305:
li 3 , 0
blr
SYM_FUNC_END(poly1305_p10le_4blocks)
#
# =======================================================================
# The following functions implement 64 x 64 bits multiplication poly1305.
#
SYM_FUNC_START_LOCAL(Poly1305_init_64)
# mask 0 x0FFFFFFC0FFFFFFC
# mask 0 x0FFFFFFC0FFFFFFF
addis 10 , 2 , rmask@toc@ha
addi 10 , 10 , rmask@toc@l
ld 11 , 0 (10 )
ld 12 , 8 (10 )
# initialize
# load key from r3
ld 9 , 24 (3 )
ld 10 , 32 (3 )
and. 9 , 9 , 11 # cramp mask r0
and. 10 , 10 , 12 # cramp mask r1
srdi 21 , 10 , 2
add 19 , 21 , 10 # s1: r19 - (r1 >> 2 ) *5
# setup r and s
li 25 , 0
mtvsrdd 32 +0 , 9 , 19 # r0, s1
mtvsrdd 32 +1 , 10 , 9 # r1, r0
mtvsrdd 32 +2 , 19 , 25 # s1
mtvsrdd 32 +3 , 9 , 25 # r0
blr
SYM_FUNC_END(Poly1305_init_64)
# Poly1305_mult
# v6 = (h0, h1), v8 = h2
# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0
#
# Output: v7, v10, v11
#
SYM_FUNC_START_LOCAL(Poly1305_mult)
#
# d0 = h0 * r0 + h1 * s1
vmsumudm 7 , 6 , 0 , 9 # h0 * r0, h1 * s1
# d1 = h0 * r1 + h1 * r0 + h2 * s1
vmsumudm 11 , 6 , 1 , 9 # h0 * r1, h1 * r0
vmsumudm 10 , 8 , 2 , 11 # d1 += h2 * s1
# d2 = r0
vmsumudm 11 , 8 , 3 , 9 # d2 = h2 * r0
blr
SYM_FUNC_END(Poly1305_mult)
#
# carry reduction
# h %=p
#
# Input: v7, v10, v11
# Output: r27, r28, r29
#
SYM_FUNC_START_LOCAL(Carry_reduction)
mfvsrld 27 , 32 +7
mfvsrld 28 , 32 +10
mfvsrld 29 , 32 +11
mfvsrd 20 , 32 +7 # h0.h
mfvsrd 21 , 32 +10 # h1.h
addc 28 , 28 , 20
adde 29 , 29 , 21
srdi 22 , 29 , 0 x2
sldi 23 , 22 , 0 x2
add 23 , 23 , 22 # (h2 & 3 ) * 5
addc 27 , 27 , 23 # h0
addze 28 , 28 # h1
andi. 29 , 29 , 0 x3 # h2
blr
SYM_FUNC_END(Carry_reduction)
#
# poly1305 multiplication
# h *= r, h %= p
# d0 = h0 * r0 + h1 * s1
# d1 = h0 * r1 + h1 * r0 + h2 * s1
# d2 = h0 * r0
#
#
# unsigned int poly1305_test_64s(unisgned char *state, const byte *src, size_t len, highbit)
# - no highbit if final leftover block (highbit = 0 )
#
SYM_FUNC_START(poly1305_64s)
cmpdi 5 , 0
ble Out_no_poly1305_64
mflr 0
std 0 , 16 (1 )
stdu 1 ,-400 (1 )
SAVE_GPR 14 , 112 , 1
SAVE_GPR 15 , 120 , 1
SAVE_GPR 16 , 128 , 1
SAVE_GPR 17 , 136 , 1
SAVE_GPR 18 , 144 , 1
SAVE_GPR 19 , 152 , 1
SAVE_GPR 20 , 160 , 1
SAVE_GPR 21 , 168 , 1
SAVE_GPR 22 , 176 , 1
SAVE_GPR 23 , 184 , 1
SAVE_GPR 24 , 192 , 1
SAVE_GPR 25 , 200 , 1
SAVE_GPR 26 , 208 , 1
SAVE_GPR 27 , 216 , 1
SAVE_GPR 28 , 224 , 1
SAVE_GPR 29 , 232 , 1
SAVE_GPR 30 , 240 , 1
SAVE_GPR 31 , 248 , 1
# Init poly1305
bl Poly1305_init_64
li 25 , 0 # offset to inp and outp
add 11 , 25 , 4
# load h
# h0, h1, h2?
ld 27 , 0 (3 )
ld 28 , 8 (3 )
lwz 29 , 16 (3 )
li 30 , 16
divdu 31 , 5 , 30
mtctr 31
mr 24 , 6 # highbit
Loop_block_64:
vxor 9 , 9 , 9
ld 20 , 0 (11 )
ld 21 , 8 (11 )
addi 11 , 11 , 16
addc 27 , 27 , 20
adde 28 , 28 , 21
adde 29 , 29 , 24
li 22 , 0
mtvsrdd 32 +6 , 27 , 28 # h0, h1
mtvsrdd 32 +8 , 29 , 22 # h2
bl Poly1305_mult
bl Carry_reduction
bdnz Loop_block_64
std 27 , 0 (3 )
std 28 , 8 (3 )
stw 29 , 16 (3 )
li 3 , 0
RESTORE_GPR 14 , 112 , 1
RESTORE_GPR 15 , 120 , 1
RESTORE_GPR 16 , 128 , 1
RESTORE_GPR 17 , 136 , 1
RESTORE_GPR 18 , 144 , 1
RESTORE_GPR 19 , 152 , 1
RESTORE_GPR 20 , 160 , 1
RESTORE_GPR 21 , 168 , 1
RESTORE_GPR 22 , 176 , 1
RESTORE_GPR 23 , 184 , 1
RESTORE_GPR 24 , 192 , 1
RESTORE_GPR 25 , 200 , 1
RESTORE_GPR 26 , 208 , 1
RESTORE_GPR 27 , 216 , 1
RESTORE_GPR 28 , 224 , 1
RESTORE_GPR 29 , 232 , 1
RESTORE_GPR 30 , 240 , 1
RESTORE_GPR 31 , 248 , 1
addi 1 , 1 , 400
ld 0 , 16 (1 )
mtlr 0
blr
Out_no_poly1305_64:
li 3 , 0
blr
SYM_FUNC_END(poly1305_64s)
#
# Input: r3 = h, r4 = s, r5 = mac
# mac = h + s
#
SYM_FUNC_START(poly1305_emit_64)
ld 10 , 0 (3 )
ld 11 , 8 (3 )
ld 12 , 16 (3 )
# compare modulus
# h + 5 + (-p)
mr 6 , 10
mr 7 , 11
mr 8 , 12
addic. 6 , 6 , 5
addze 7 , 7
addze 8 , 8
srdi 9 , 8 , 2 # overflow?
cmpdi 9 , 0
beq Skip_h64
mr 10 , 6
mr 11 , 7
mr 12 , 8
Skip_h64:
ld 6 , 0 (4 )
ld 7 , 8 (4 )
addc 10 , 10 , 6
adde 11 , 11 , 7
addze 12 , 12
std 10 , 0 (5 )
std 11 , 8 (5 )
blr
SYM_FUNC_END(poly1305_emit_64)
SYM_DATA_START_LOCAL(RMASK)
.align 5
rmask:
.byte 0 xff, 0 xff, 0 xff, 0 x0f, 0 xfc, 0 xff, 0 xff, 0 x0f, 0 xfc, 0 xff, 0 xff, 0 x0f, 0 xfc, 0 xff, 0 xff, 0 x0f
cnum:
.long 0 x03ffffff, 0 x00000000, 0 x03ffffff, 0 x00000000
.long 0 x1a, 0 x00, 0 x1a, 0 x00
.long 0 x01000000, 0 x01000000, 0 x01000000, 0 x01000000
.long 0 x00010203, 0 x04050607, 0 x10111213, 0 x14151617
.long 0 x08090a0b, 0 x0c0d0e0f, 0 x18191a1b, 0 x1c1d1e1f
SYM_DATA_END(RMASK)
Messung V0.5 in Prozent C=95 H=94 G=94