/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated chacha20 implementation for ppc64le.
#
# Copyright 2023 - IBM Corp. All rights reserved
#
#===================================================================================
# Written by Danny Tsen <dtsen@us.ibm.com>
#
# do rounds, 8 quarter rounds
# 1 . a += b; d ^= a; d <<<= 16 ;
# 2 . c += d; b ^= c; b <<<= 12 ;
# 3 . a += b; d ^= a; d <<<= 8 ;
# 4 . c += d; b ^= c; b <<<= 7
#
# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16
# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12
# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8
# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7
#
# 4 blocks (a b c d)
#
# a0 b0 c0 d0
# a1 b1 c1 d1
# ...
# a4 b4 c4 d4
# ...
# a8 b8 c8 d8
# ...
# a12 b12 c12 d12
# a13 ...
# a14 ...
# a15 b15 c15 d15
#
# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
# Diagnal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
#
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/asm-compat.h>
#include <linux/linkage.h>
.machine "any"
.text
.macro SAVE_GPR GPR OFFSET FRAME
std \GPR,\OFFSET(\FRAME)
.endm
.macro SAVE_VRS VRS OFFSET FRAME
li 16 , \OFFSET
stvx \VRS, 16 , \FRAME
.endm
.macro SAVE_VSX VSX OFFSET FRAME
li 16 , \OFFSET
stxvx \VSX, 16 , \FRAME
.endm
.macro RESTORE_GPR GPR OFFSET FRAME
ld \GPR,\OFFSET(\FRAME)
.endm
.macro RESTORE_VRS VRS OFFSET FRAME
li 16 , \OFFSET
lvx \VRS, 16 , \FRAME
.endm
.macro RESTORE_VSX VSX OFFSET FRAME
li 16 , \OFFSET
lxvx \VSX, 16 , \FRAME
.endm
.macro SAVE_REGS
mflr 0
std 0 , 16 (1 )
stdu 1 ,-752 (1 )
SAVE_GPR 14 , 112 , 1
SAVE_GPR 15 , 120 , 1
SAVE_GPR 16 , 128 , 1
SAVE_GPR 17 , 136 , 1
SAVE_GPR 18 , 144 , 1
SAVE_GPR 19 , 152 , 1
SAVE_GPR 20 , 160 , 1
SAVE_GPR 21 , 168 , 1
SAVE_GPR 22 , 176 , 1
SAVE_GPR 23 , 184 , 1
SAVE_GPR 24 , 192 , 1
SAVE_GPR 25 , 200 , 1
SAVE_GPR 26 , 208 , 1
SAVE_GPR 27 , 216 , 1
SAVE_GPR 28 , 224 , 1
SAVE_GPR 29 , 232 , 1
SAVE_GPR 30 , 240 , 1
SAVE_GPR 31 , 248 , 1
addi 9 , 1 , 256
SAVE_VRS 20 , 0 , 9
SAVE_VRS 21 , 16 , 9
SAVE_VRS 22 , 32 , 9
SAVE_VRS 23 , 48 , 9
SAVE_VRS 24 , 64 , 9
SAVE_VRS 25 , 80 , 9
SAVE_VRS 26 , 96 , 9
SAVE_VRS 27 , 112 , 9
SAVE_VRS 28 , 128 , 9
SAVE_VRS 29 , 144 , 9
SAVE_VRS 30 , 160 , 9
SAVE_VRS 31 , 176 , 9
SAVE_VSX 14 , 192 , 9
SAVE_VSX 15 , 208 , 9
SAVE_VSX 16 , 224 , 9
SAVE_VSX 17 , 240 , 9
SAVE_VSX 18 , 256 , 9
SAVE_VSX 19 , 272 , 9
SAVE_VSX 20 , 288 , 9
SAVE_VSX 21 , 304 , 9
SAVE_VSX 22 , 320 , 9
SAVE_VSX 23 , 336 , 9
SAVE_VSX 24 , 352 , 9
SAVE_VSX 25 , 368 , 9
SAVE_VSX 26 , 384 , 9
SAVE_VSX 27 , 400 , 9
SAVE_VSX 28 , 416 , 9
SAVE_VSX 29 , 432 , 9
SAVE_VSX 30 , 448 , 9
SAVE_VSX 31 , 464 , 9
.endm # SAVE_REGS
.macro RESTORE_REGS
addi 9 , 1 , 256
RESTORE_VRS 20 , 0 , 9
RESTORE_VRS 21 , 16 , 9
RESTORE_VRS 22 , 32 , 9
RESTORE_VRS 23 , 48 , 9
RESTORE_VRS 24 , 64 , 9
RESTORE_VRS 25 , 80 , 9
RESTORE_VRS 26 , 96 , 9
RESTORE_VRS 27 , 112 , 9
RESTORE_VRS 28 , 128 , 9
RESTORE_VRS 29 , 144 , 9
RESTORE_VRS 30 , 160 , 9
RESTORE_VRS 31 , 176 , 9
RESTORE_VSX 14 , 192 , 9
RESTORE_VSX 15 , 208 , 9
RESTORE_VSX 16 , 224 , 9
RESTORE_VSX 17 , 240 , 9
RESTORE_VSX 18 , 256 , 9
RESTORE_VSX 19 , 272 , 9
RESTORE_VSX 20 , 288 , 9
RESTORE_VSX 21 , 304 , 9
RESTORE_VSX 22 , 320 , 9
RESTORE_VSX 23 , 336 , 9
RESTORE_VSX 24 , 352 , 9
RESTORE_VSX 25 , 368 , 9
RESTORE_VSX 26 , 384 , 9
RESTORE_VSX 27 , 400 , 9
RESTORE_VSX 28 , 416 , 9
RESTORE_VSX 29 , 432 , 9
RESTORE_VSX 30 , 448 , 9
RESTORE_VSX 31 , 464 , 9
RESTORE_GPR 14 , 112 , 1
RESTORE_GPR 15 , 120 , 1
RESTORE_GPR 16 , 128 , 1
RESTORE_GPR 17 , 136 , 1
RESTORE_GPR 18 , 144 , 1
RESTORE_GPR 19 , 152 , 1
RESTORE_GPR 20 , 160 , 1
RESTORE_GPR 21 , 168 , 1
RESTORE_GPR 22 , 176 , 1
RESTORE_GPR 23 , 184 , 1
RESTORE_GPR 24 , 192 , 1
RESTORE_GPR 25 , 200 , 1
RESTORE_GPR 26 , 208 , 1
RESTORE_GPR 27 , 216 , 1
RESTORE_GPR 28 , 224 , 1
RESTORE_GPR 29 , 232 , 1
RESTORE_GPR 30 , 240 , 1
RESTORE_GPR 31 , 248 , 1
addi 1 , 1 , 752
ld 0 , 16 (1 )
mtlr 0
.endm # RESTORE_REGS
.macro QT_loop_8x
# QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
xxlor 0 , 32 +25 , 32 +25
xxlor 32 +25 , 20 , 20
vadduwm 0 , 0 , 4
vadduwm 1 , 1 , 5
vadduwm 2 , 2 , 6
vadduwm 3 , 3 , 7
vadduwm 16 , 16 , 20
vadduwm 17 , 17 , 21
vadduwm 18 , 18 , 22
vadduwm 19 , 19 , 23
vpermxor 12 , 12 , 0 , 25
vpermxor 13 , 13 , 1 , 25
vpermxor 14 , 14 , 2 , 25
vpermxor 15 , 15 , 3 , 25
vpermxor 28 , 28 , 16 , 25
vpermxor 29 , 29 , 17 , 25
vpermxor 30 , 30 , 18 , 25
vpermxor 31 , 31 , 19 , 25
xxlor 32 +25 , 0 , 0
vadduwm 8 , 8 , 12
vadduwm 9 , 9 , 13
vadduwm 10 , 10 , 14
vadduwm 11 , 11 , 15
vadduwm 24 , 24 , 28
vadduwm 25 , 25 , 29
vadduwm 26 , 26 , 30
vadduwm 27 , 27 , 31
vxor 4 , 4 , 8
vxor 5 , 5 , 9
vxor 6 , 6 , 10
vxor 7 , 7 , 11
vxor 20 , 20 , 24
vxor 21 , 21 , 25
vxor 22 , 22 , 26
vxor 23 , 23 , 27
xxlor 0 , 32 +25 , 32 +25
xxlor 32 +25 , 21 , 21
vrlw 4 , 4 , 25 #
vrlw 5 , 5 , 25
vrlw 6 , 6 , 25
vrlw 7 , 7 , 25
vrlw 20 , 20 , 25 #
vrlw 21 , 21 , 25
vrlw 22 , 22 , 25
vrlw 23 , 23 , 25
xxlor 32 +25 , 0 , 0
vadduwm 0 , 0 , 4
vadduwm 1 , 1 , 5
vadduwm 2 , 2 , 6
vadduwm 3 , 3 , 7
vadduwm 16 , 16 , 20
vadduwm 17 , 17 , 21
vadduwm 18 , 18 , 22
vadduwm 19 , 19 , 23
xxlor 0 , 32 +25 , 32 +25
xxlor 32 +25 , 22 , 22
vpermxor 12 , 12 , 0 , 25
vpermxor 13 , 13 , 1 , 25
vpermxor 14 , 14 , 2 , 25
vpermxor 15 , 15 , 3 , 25
vpermxor 28 , 28 , 16 , 25
vpermxor 29 , 29 , 17 , 25
vpermxor 30 , 30 , 18 , 25
vpermxor 31 , 31 , 19 , 25
xxlor 32 +25 , 0 , 0
vadduwm 8 , 8 , 12
vadduwm 9 , 9 , 13
vadduwm 10 , 10 , 14
vadduwm 11 , 11 , 15
vadduwm 24 , 24 , 28
vadduwm 25 , 25 , 29
vadduwm 26 , 26 , 30
vadduwm 27 , 27 , 31
xxlor 0 , 32 +28 , 32 +28
xxlor 32 +28 , 23 , 23
vxor 4 , 4 , 8
vxor 5 , 5 , 9
vxor 6 , 6 , 10
vxor 7 , 7 , 11
vxor 20 , 20 , 24
vxor 21 , 21 , 25
vxor 22 , 22 , 26
vxor 23 , 23 , 27
vrlw 4 , 4 , 28 #
vrlw 5 , 5 , 28
vrlw 6 , 6 , 28
vrlw 7 , 7 , 28
vrlw 20 , 20 , 28 #
vrlw 21 , 21 , 28
vrlw 22 , 22 , 28
vrlw 23 , 23 , 28
xxlor 32 +28 , 0 , 0
# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
xxlor 0 , 32 +25 , 32 +25
xxlor 32 +25 , 20 , 20
vadduwm 0 , 0 , 5
vadduwm 1 , 1 , 6
vadduwm 2 , 2 , 7
vadduwm 3 , 3 , 4
vadduwm 16 , 16 , 21
vadduwm 17 , 17 , 22
vadduwm 18 , 18 , 23
vadduwm 19 , 19 , 20
vpermxor 15 , 15 , 0 , 25
vpermxor 12 , 12 , 1 , 25
vpermxor 13 , 13 , 2 , 25
vpermxor 14 , 14 , 3 , 25
vpermxor 31 , 31 , 16 , 25
vpermxor 28 , 28 , 17 , 25
vpermxor 29 , 29 , 18 , 25
vpermxor 30 , 30 , 19 , 25
xxlor 32 +25 , 0 , 0
vadduwm 10 , 10 , 15
vadduwm 11 , 11 , 12
vadduwm 8 , 8 , 13
vadduwm 9 , 9 , 14
vadduwm 26 , 26 , 31
vadduwm 27 , 27 , 28
vadduwm 24 , 24 , 29
vadduwm 25 , 25 , 30
vxor 5 , 5 , 10
vxor 6 , 6 , 11
vxor 7 , 7 , 8
vxor 4 , 4 , 9
vxor 21 , 21 , 26
vxor 22 , 22 , 27
vxor 23 , 23 , 24
vxor 20 , 20 , 25
xxlor 0 , 32 +25 , 32 +25
xxlor 32 +25 , 21 , 21
vrlw 5 , 5 , 25
vrlw 6 , 6 , 25
vrlw 7 , 7 , 25
vrlw 4 , 4 , 25
vrlw 21 , 21 , 25
vrlw 22 , 22 , 25
vrlw 23 , 23 , 25
vrlw 20 , 20 , 25
xxlor 32 +25 , 0 , 0
vadduwm 0 , 0 , 5
vadduwm 1 , 1 , 6
vadduwm 2 , 2 , 7
vadduwm 3 , 3 , 4
vadduwm 16 , 16 , 21
vadduwm 17 , 17 , 22
vadduwm 18 , 18 , 23
vadduwm 19 , 19 , 20
xxlor 0 , 32 +25 , 32 +25
xxlor 32 +25 , 22 , 22
vpermxor 15 , 15 , 0 , 25
vpermxor 12 , 12 , 1 , 25
vpermxor 13 , 13 , 2 , 25
vpermxor 14 , 14 , 3 , 25
vpermxor 31 , 31 , 16 , 25
vpermxor 28 , 28 , 17 , 25
vpermxor 29 , 29 , 18 , 25
vpermxor 30 , 30 , 19 , 25
xxlor 32 +25 , 0 , 0
vadduwm 10 , 10 , 15
vadduwm 11 , 11 , 12
vadduwm 8 , 8 , 13
vadduwm 9 , 9 , 14
vadduwm 26 , 26 , 31
vadduwm 27 , 27 , 28
vadduwm 24 , 24 , 29
vadduwm 25 , 25 , 30
xxlor 0 , 32 +28 , 32 +28
xxlor 32 +28 , 23 , 23
vxor 5 , 5 , 10
vxor 6 , 6 , 11
vxor 7 , 7 , 8
vxor 4 , 4 , 9
vxor 21 , 21 , 26
vxor 22 , 22 , 27
vxor 23 , 23 , 24
vxor 20 , 20 , 25
vrlw 5 , 5 , 28
vrlw 6 , 6 , 28
vrlw 7 , 7 , 28
vrlw 4 , 4 , 28
vrlw 21 , 21 , 28
vrlw 22 , 22 , 28
vrlw 23 , 23 , 28
vrlw 20 , 20 , 28
xxlor 32 +28 , 0 , 0
.endm
.macro QT_loop_4x
# QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
vadduwm 0 , 0 , 4
vadduwm 1 , 1 , 5
vadduwm 2 , 2 , 6
vadduwm 3 , 3 , 7
vpermxor 12 , 12 , 0 , 20
vpermxor 13 , 13 , 1 , 20
vpermxor 14 , 14 , 2 , 20
vpermxor 15 , 15 , 3 , 20
vadduwm 8 , 8 , 12
vadduwm 9 , 9 , 13
vadduwm 10 , 10 , 14
vadduwm 11 , 11 , 15
vxor 4 , 4 , 8
vxor 5 , 5 , 9
vxor 6 , 6 , 10
vxor 7 , 7 , 11
vrlw 4 , 4 , 21
vrlw 5 , 5 , 21
vrlw 6 , 6 , 21
vrlw 7 , 7 , 21
vadduwm 0 , 0 , 4
vadduwm 1 , 1 , 5
vadduwm 2 , 2 , 6
vadduwm 3 , 3 , 7
vpermxor 12 , 12 , 0 , 22
vpermxor 13 , 13 , 1 , 22
vpermxor 14 , 14 , 2 , 22
vpermxor 15 , 15 , 3 , 22
vadduwm 8 , 8 , 12
vadduwm 9 , 9 , 13
vadduwm 10 , 10 , 14
vadduwm 11 , 11 , 15
vxor 4 , 4 , 8
vxor 5 , 5 , 9
vxor 6 , 6 , 10
vxor 7 , 7 , 11
vrlw 4 , 4 , 23
vrlw 5 , 5 , 23
vrlw 6 , 6 , 23
vrlw 7 , 7 , 23
# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
vadduwm 0 , 0 , 5
vadduwm 1 , 1 , 6
vadduwm 2 , 2 , 7
vadduwm 3 , 3 , 4
vpermxor 15 , 15 , 0 , 20
vpermxor 12 , 12 , 1 , 20
vpermxor 13 , 13 , 2 , 20
vpermxor 14 , 14 , 3 , 20
vadduwm 10 , 10 , 15
vadduwm 11 , 11 , 12
vadduwm 8 , 8 , 13
vadduwm 9 , 9 , 14
vxor 5 , 5 , 10
vxor 6 , 6 , 11
vxor 7 , 7 , 8
vxor 4 , 4 , 9
vrlw 5 , 5 , 21
vrlw 6 , 6 , 21
vrlw 7 , 7 , 21
vrlw 4 , 4 , 21
vadduwm 0 , 0 , 5
vadduwm 1 , 1 , 6
vadduwm 2 , 2 , 7
vadduwm 3 , 3 , 4
vpermxor 15 , 15 , 0 , 22
vpermxor 12 , 12 , 1 , 22
vpermxor 13 , 13 , 2 , 22
vpermxor 14 , 14 , 3 , 22
vadduwm 10 , 10 , 15
vadduwm 11 , 11 , 12
vadduwm 8 , 8 , 13
vadduwm 9 , 9 , 14
vxor 5 , 5 , 10
vxor 6 , 6 , 11
vxor 7 , 7 , 8
vxor 4 , 4 , 9
vrlw 5 , 5 , 23
vrlw 6 , 6 , 23
vrlw 7 , 7 , 23
vrlw 4 , 4 , 23
.endm
# Transpose
.macro TP_4x a0 a1 a2 a3
xxmrghw 10 , 32 +\a0, 32 +\a1 # a0, a1, b0, b1
xxmrghw 11 , 32 +\a2, 32 +\a3 # a2, a3, b2, b3
xxmrglw 12 , 32 +\a0, 32 +\a1 # c0, c1, d0, d1
xxmrglw 13 , 32 +\a2, 32 +\a3 # c2, c3, d2, d3
xxpermdi 32 +\a0, 10 , 11 , 0 # a0, a1, a2, a3
xxpermdi 32 +\a1, 10 , 11 , 3 # b0, b1, b2, b3
xxpermdi 32 +\a2, 12 , 13 , 0 # c0, c1, c2, c3
xxpermdi 32 +\a3, 12 , 13 , 3 # d0, d1, d2, d3
.endm
# key stream = working state + state
.macro Add_state S
vadduwm \S+0 , \S+0 , 16 -\S
vadduwm \S+4 , \S+4 , 17 -\S
vadduwm \S+8 , \S+8 , 18 -\S
vadduwm \S+12 , \S+12 , 19 -\S
vadduwm \S+1 , \S+1 , 16 -\S
vadduwm \S+5 , \S+5 , 17 -\S
vadduwm \S+9 , \S+9 , 18 -\S
vadduwm \S+13 , \S+13 , 19 -\S
vadduwm \S+2 , \S+2 , 16 -\S
vadduwm \S+6 , \S+6 , 17 -\S
vadduwm \S+10 , \S+10 , 18 -\S
vadduwm \S+14 , \S+14 , 19 -\S
vadduwm \S+3 , \S+3 , 16 -\S
vadduwm \S+7 , \S+7 , 17 -\S
vadduwm \S+11 , \S+11 , 18 -\S
vadduwm \S+15 , \S+15 , 19 -\S
.endm
#
# write 256 bytes
#
.macro Write_256 S
add 9 , 14 , 5
add 16 , 14 , 4
lxvw4x 0 , 0 , 9
lxvw4x 1 , 17 , 9
lxvw4x 2 , 18 , 9
lxvw4x 3 , 19 , 9
lxvw4x 4 , 20 , 9
lxvw4x 5 , 21 , 9
lxvw4x 6 , 22 , 9
lxvw4x 7 , 23 , 9
lxvw4x 8 , 24 , 9
lxvw4x 9 , 25 , 9
lxvw4x 10 , 26 , 9
lxvw4x 11 , 27 , 9
lxvw4x 12 , 28 , 9
lxvw4x 13 , 29 , 9
lxvw4x 14 , 30 , 9
lxvw4x 15 , 31 , 9
xxlxor \S+32 , \S+32 , 0
xxlxor \S+36 , \S+36 , 1
xxlxor \S+40 , \S+40 , 2
xxlxor \S+44 , \S+44 , 3
xxlxor \S+33 , \S+33 , 4
xxlxor \S+37 , \S+37 , 5
xxlxor \S+41 , \S+41 , 6
xxlxor \S+45 , \S+45 , 7
xxlxor \S+34 , \S+34 , 8
xxlxor \S+38 , \S+38 , 9
xxlxor \S+42 , \S+42 , 10
xxlxor \S+46 , \S+46 , 11
xxlxor \S+35 , \S+35 , 12
xxlxor \S+39 , \S+39 , 13
xxlxor \S+43 , \S+43 , 14
xxlxor \S+47 , \S+47 , 15
stxvw4x \S+32 , 0 , 16
stxvw4x \S+36 , 17 , 16
stxvw4x \S+40 , 18 , 16
stxvw4x \S+44 , 19 , 16
stxvw4x \S+33 , 20 , 16
stxvw4x \S+37 , 21 , 16
stxvw4x \S+41 , 22 , 16
stxvw4x \S+45 , 23 , 16
stxvw4x \S+34 , 24 , 16
stxvw4x \S+38 , 25 , 16
stxvw4x \S+42 , 26 , 16
stxvw4x \S+46 , 27 , 16
stxvw4x \S+35 , 28 , 16
stxvw4x \S+39 , 29 , 16
stxvw4x \S+43 , 30 , 16
stxvw4x \S+47 , 31 , 16
.endm
#
# void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, const u8 *src,
# unsigned int len, int nrounds);
#
SYM_FUNC_START(chacha_p10le_8x)
.align 5
cmpdi 6 , 0
ble Out_no_chacha
SAVE_REGS
# r17 - r31 mainly for Write_256 macro.
li 17 , 16
li 18 , 32
li 19 , 48
li 20 , 64
li 21 , 80
li 22 , 96
li 23 , 112
li 24 , 128
li 25 , 144
li 26 , 160
li 27 , 176
li 28 , 192
li 29 , 208
li 30 , 224
li 31 , 240
mr 15 , 6 # len
li 14 , 0 # offset to inp and outp
lxvw4x 48 , 0 , 3 # vr16, constants
lxvw4x 49 , 17 , 3 # vr17, key 1
lxvw4x 50 , 18 , 3 # vr18, key 2
lxvw4x 51 , 19 , 3 # vr19, counter, nonce
# create (0 , 1 , 2 , 3 ) counters
vspltisw 0 , 0
vspltisw 1 , 1
vspltisw 2 , 2
vspltisw 3 , 3
vmrghw 4 , 0 , 1
vmrglw 5 , 2 , 3
vsldoi 30 , 4 , 5 , 8 # vr30 counter, 4 (0 , 1 , 2 , 3 )
vspltisw 21 , 12
vspltisw 23 , 7
addis 11 , 2 , permx@toc@ha
addi 11 , 11 , permx@toc@l
lxvw4x 32 +20 , 0 , 11
lxvw4x 32 +22 , 17 , 11
sradi 8 , 7 , 1
mtctr 8
# save constants to vsx
xxlor 16 , 48 , 48
xxlor 17 , 49 , 49
xxlor 18 , 50 , 50
xxlor 19 , 51 , 51
vspltisw 25 , 4
vspltisw 26 , 8
xxlor 25 , 32 +26 , 32 +26
xxlor 24 , 32 +25 , 32 +25
vadduwm 31 , 30 , 25 # counter = (0 , 1 , 2 , 3 ) + (4 , 4 , 4 , 4 )
xxlor 30 , 32 +30 , 32 +30
xxlor 31 , 32 +31 , 32 +31
xxlor 20 , 32 +20 , 32 +20
xxlor 21 , 32 +21 , 32 +21
xxlor 22 , 32 +22 , 32 +22
xxlor 23 , 32 +23 , 32 +23
cmpdi 6 , 512
blt Loop_last
Loop_8x:
xxspltw 32 +0 , 16 , 0
xxspltw 32 +1 , 16 , 1
xxspltw 32 +2 , 16 , 2
xxspltw 32 +3 , 16 , 3
xxspltw 32 +4 , 17 , 0
xxspltw 32 +5 , 17 , 1
xxspltw 32 +6 , 17 , 2
xxspltw 32 +7 , 17 , 3
xxspltw 32 +8 , 18 , 0
xxspltw 32 +9 , 18 , 1
xxspltw 32 +10 , 18 , 2
xxspltw 32 +11 , 18 , 3
xxspltw 32 +12 , 19 , 0
xxspltw 32 +13 , 19 , 1
xxspltw 32 +14 , 19 , 2
xxspltw 32 +15 , 19 , 3
vadduwm 12 , 12 , 30 # increase counter
xxspltw 32 +16 , 16 , 0
xxspltw 32 +17 , 16 , 1
xxspltw 32 +18 , 16 , 2
xxspltw 32 +19 , 16 , 3
xxspltw 32 +20 , 17 , 0
xxspltw 32 +21 , 17 , 1
xxspltw 32 +22 , 17 , 2
xxspltw 32 +23 , 17 , 3
xxspltw 32 +24 , 18 , 0
xxspltw 32 +25 , 18 , 1
xxspltw 32 +26 , 18 , 2
xxspltw 32 +27 , 18 , 3
xxspltw 32 +28 , 19 , 0
xxspltw 32 +29 , 19 , 1
vadduwm 28 , 28 , 31 # increase counter
xxspltw 32 +30 , 19 , 2
xxspltw 32 +31 , 19 , 3
.align 5
quarter_loop_8x:
QT_loop_8x
bdnz quarter_loop_8x
xxlor 0 , 32 +30 , 32 +30
xxlor 32 +30 , 30 , 30
vadduwm 12 , 12 , 30
xxlor 32 +30 , 0 , 0
TP_4x 0 , 1 , 2 , 3
TP_4x 4 , 5 , 6 , 7
TP_4x 8 , 9 , 10 , 11
TP_4x 12 , 13 , 14 , 15
xxlor 0 , 48 , 48
xxlor 1 , 49 , 49
xxlor 2 , 50 , 50
xxlor 3 , 51 , 51
xxlor 48 , 16 , 16
xxlor 49 , 17 , 17
xxlor 50 , 18 , 18
xxlor 51 , 19 , 19
Add_state 0
xxlor 48 , 0 , 0
xxlor 49 , 1 , 1
xxlor 50 , 2 , 2
xxlor 51 , 3 , 3
Write_256 0
addi 14 , 14 , 256 # offset +=256
addi 15 , 15 , -256 # len -=256
xxlor 5 , 32 +31 , 32 +31
xxlor 32 +31 , 31 , 31
vadduwm 28 , 28 , 31
xxlor 32 +31 , 5 , 5
TP_4x 16 +0 , 16 +1 , 16 +2 , 16 +3
TP_4x 16 +4 , 16 +5 , 16 +6 , 16 +7
TP_4x 16 +8 , 16 +9 , 16 +10 , 16 +11
TP_4x 16 +12 , 16 +13 , 16 +14 , 16 +15
xxlor 32 , 16 , 16
xxlor 33 , 17 , 17
xxlor 34 , 18 , 18
xxlor 35 , 19 , 19
Add_state 16
Write_256 16
addi 14 , 14 , 256 # offset +=256
addi 15 , 15 , -256 # len +=256
xxlor 32 +24 , 24 , 24
xxlor 32 +25 , 25 , 25
xxlor 32 +30 , 30 , 30
vadduwm 30 , 30 , 25
vadduwm 31 , 30 , 24
xxlor 30 , 32 +30 , 32 +30
xxlor 31 , 32 +31 , 32 +31
cmpdi 15 , 0
beq Out_loop
cmpdi 15 , 512
blt Loop_last
mtctr 8
b Loop_8x
Loop_last:
lxvw4x 48 , 0 , 3 # vr16, constants
lxvw4x 49 , 17 , 3 # vr17, key 1
lxvw4x 50 , 18 , 3 # vr18, key 2
lxvw4x 51 , 19 , 3 # vr19, counter, nonce
vspltisw 21 , 12
vspltisw 23 , 7
addis 11 , 2 , permx@toc@ha
addi 11 , 11 , permx@toc@l
lxvw4x 32 +20 , 0 , 11
lxvw4x 32 +22 , 17 , 11
sradi 8 , 7 , 1
mtctr 8
Loop_4x:
vspltw 0 , 16 , 0
vspltw 1 , 16 , 1
vspltw 2 , 16 , 2
vspltw 3 , 16 , 3
vspltw 4 , 17 , 0
vspltw 5 , 17 , 1
vspltw 6 , 17 , 2
vspltw 7 , 17 , 3
vspltw 8 , 18 , 0
vspltw 9 , 18 , 1
vspltw 10 , 18 , 2
vspltw 11 , 18 , 3
vspltw 12 , 19 , 0
vadduwm 12 , 12 , 30 # increase counter
vspltw 13 , 19 , 1
vspltw 14 , 19 , 2
vspltw 15 , 19 , 3
.align 5
quarter_loop:
QT_loop_4x
bdnz quarter_loop
vadduwm 12 , 12 , 30
TP_4x 0 , 1 , 2 , 3
TP_4x 4 , 5 , 6 , 7
TP_4x 8 , 9 , 10 , 11
TP_4x 12 , 13 , 14 , 15
Add_state 0
Write_256 0
addi 14 , 14 , 256 # offset += 256
addi 15 , 15 , -256 # len += 256
# Update state counter
vspltisw 25 , 4
vadduwm 30 , 30 , 25
cmpdi 15 , 0
beq Out_loop
cmpdi 15 , 256
blt Out_loop
mtctr 8
b Loop_4x
Out_loop:
RESTORE_REGS
blr
Out_no_chacha:
li 3 , 0
blr
SYM_FUNC_END(chacha_p10le_8x)
SYM_DATA_START_LOCAL(PERMX)
.align 5
permx:
.long 0 x22330011, 0 x66774455, 0 xaabb8899, 0 xeeffccdd
.long 0 x11223300, 0 x55667744, 0 x99aabb88, 0 xddeeffcc
SYM_DATA_END(PERMX)
Messung V0.5 in Prozent C=97 H=96 G=96