// SPDX-License-Identifier: GPL-2.0-or-later
/*
* sm3-neon-core.S - SM3 secure hash using NEON instructions
*
* Linux/arm64 port of the libgcrypt SM3 implementation for AArch64
*
* Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
* Copyright (c) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
*/
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>
/*
 * Context structure: byte offsets of the eight 32-bit state words
 * h0..h7, written as (word index * word size).
 */
#define state_h0	(0 * 4)
#define state_h1	(1 * 4)
#define state_h2	(2 * 4)
#define state_h3	(3 * 4)
#define state_h4	(4 * 4)
#define state_h5	(5 * 4)
#define state_h6	(6 * 4)
#define state_h7	(7 * 4)
/*
 * Stack structure: a single scratch area for message words, starting at
 * offset STACK_W from the re-aligned sp.  Its 192 bytes are addressed as
 * three 64-byte slots by the IW_/XW_ address macros.
 */
#define STACK_W		(0)
#define STACK_W_SIZE	(3 * 2 * 32)
#define STACK_SIZE	(STACK_W_SIZE + STACK_W)
/*
 * Register macros.
 *
 * The three arguments arrive in x0-x2.  The block count is declared as a
 * 32-bit 'int' in the C prototype, so only w2 is defined on entry: under
 * AAPCS64 the upper half of x2 is unspecified and must not take part in
 * the loop-counter arithmetic.  RNBLKS is therefore the 32-bit view.
 */
#define RSTATE	x0	/* state words, read on entry and written back */
#define RDATA	x1	/* input pointer, post-incremented by the loader */
#define RNBLKS	w2	/* blocks still to be processed */
#define RKPTR	x28	/* base address of the round-constant table */
#define RFRAME	x29	/* sp as it was after the register saves */
/* Working variables a..h of the round function. */
#define ra w3
#define rb w4
#define rc w5
#define rd w6
#define re w7
#define rf w8
#define rg w9
#define rh w10
/* Scratch registers written inside every round. */
#define t0 w11
#define t1 w12
#define t2 w13
#define t3 w14
#define t4 w15
#define t5 w16
#define t6 w17
/* Round-constant pair filled by KL(); each is consumed by one round. */
#define k_even w19
#define k_odd w20
/* Address temporaries for the vector stores into the stack area. */
#define addr0 x21
#define addr1 x22
/* Previous state words, reloaded for the final XOR into the state. */
#define s0 w23
#define s1 w24
#define s2 w25
#define s3 w26
/* Message-word vectors handed to the scheduler in rotating order. */
#define W0 v0
#define W1 v1
#define W2 v2
#define W3 v3
#define W4 v4
#define W5 v5
/* Vector temporaries.  None of v8-v15 is used by this file. */
#define XTMP0 v6
#define XTMP1 v7
#define XTMP2 v16
#define XTMP3 v17
#define XTMP4 v18
#define XTMP5 v19
#define XTMP6 v20
/* Helper macros. */

/* Placeholder that swallows its arguments and expands to nothing; passed
 * to R() where no K_LOAD or IOP work is wanted. */
#define _(...) /*_*/

/* Zero all 128 bits of vector register x (eight 16-bit lanes). */
#define clear_vec(x) \
	movi	x.8h, #0;

/* o = a rotated left by n bits, expressed as a right rotate by 32 - n. */
#define rolw(o, a, n) \
	ror	o, a, #(32 - n);
/*
 * Round function macros.
 *
 * Each boolean function is cut into three steps (_1, _2, _3) so that R()
 * can spread them between other instructions.  x, y, z are the inputs,
 * o receives the result and t is a scratch register.
 */

/* Variant 1 (used by R1): GG1 = FF1 = x ^ y ^ z. */
#define GG1_1(x, y, z, o, t) \
	eor	o, x, y;
#define GG1_2(x, y, z, o, t) \
	eor	o, o, z;
#define GG1_3(x, y, z, o, t)

/* FF1 emits the same two instructions, but in steps 1 and 3. */
#define FF1_1(x, y, z, o, t) \
	eor	o, x, y;
#define FF1_2(x, y, z, o, t)
#define FF1_3(x, y, z, o, t) \
	eor	o, o, z;

/* Variant 2 (used by R2): GG2 = (z & ~x) ^ (y & x). */
#define GG2_1(x, y, z, o, t) \
	bic	o, z, x;
#define GG2_2(x, y, z, o, t) \
	and	t, y, x;
#define GG2_3(x, y, z, o, t) \
	eor	o, o, t;

/* Variant 2: FF2 = ((x ^ y) & z) ^ (x & y). */
#define FF2_1(x, y, z, o, t) \
	eor	o, x, y;
#define FF2_2(x, y, z, o, t) \
	and	t, x, y; \
	and	o, o, z;
#define FF2_3(x, y, z, o, t) \
	eor	o, o, t;
/*
 * One round of the compression function, expanded inline.
 *
 *   i           1 or 2; selects the FF<i>_n / GG<i>_n step macros
 *   a .. h      32-bit registers holding the working variables
 *   k           register holding the constant for this round; overwritten
 *   K_LOAD      invoked as K_LOAD(round) before anything else
 *   round, widx, wtype
 *               select the two stack words that are read through
 *               <wtype>_W1_ADDR and <wtype>_W1W2_ADDR
 *   IOP, iop_param
 *               hook invoked eight times as IOP(n, iop_param), n = 1..8,
 *               in between the scalar instructions
 *
 * Results stay in the argument registers: d accumulates
 * d + w1w2 + FF(a, b, c) + (rol(a, 12) ^ k'), h receives
 * P0(h + w1 + k' + GG(e, f, g)) where k' = rol(rol(a, 12) + e + k, 7),
 * b becomes rol(b, 9) and f becomes rol(f, 19).  t0-t6 are used as
 * scratch.  The statement order is an interleave and is left exactly
 * as written.
 */
#define R(i, a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
K_LOAD(round); \
ldr t5, [sp, #(wtype##_W1_ADDR(round, widx))]; \
rolw(t0, a, 12 ); /* rol(a, 12) => t0 */ \
IOP(1 , iop_param); \
FF##i##_1 (a, b, c, t1, t2); \
ldr t6, [sp, #(wtype##_W1W2_ADDR(round, widx))]; \
add k, k, e; \
IOP(2 , iop_param); \
GG##i##_1 (e, f, g, t3, t4); \
FF##i##_2 (a, b, c, t1, t2); \
IOP(3 , iop_param); \
add k, k, t0; \
add h, h, t5; \
add d, d, t6; /* w1w2 + d => d */ \
IOP(4 , iop_param); \
rolw(k, k, 7 ); /* rol (t0 + e + t), 7) => k */ \
GG##i##_2 (e, f, g, t3, t4); \
add h, h, k; /* h + w1 + k => h */ \
IOP(5 , iop_param); \
FF##i##_3 (a, b, c, t1, t2); \
eor t0, t0, k; /* k ^ t0 => t0 */ \
GG##i##_3 (e, f, g, t3, t4); \
add d, d, t1; /* FF(a,b,c) + d => d */ \
IOP(6 , iop_param); \
add t3, t3, h; /* GG(e,f,g) + h => t3 */ \
rolw(b, b, 9 ); /* rol(b, 9) => b */ \
eor h, t3, t3, ror #(32 -9 ); \
IOP(7 , iop_param); \
add d, d, t0; /* t0 + d => d */ \
rolw(f, f, 19 ); /* rol(f, 19) => f */ \
IOP(8 , iop_param); \
eor h, h, t3, ror #(32 -17 ); /* P0(t3) => h */
/*
 * Round wrappers: R1 expands R() with the FF1/GG1 steps, R2 with the
 * FF2/GG2 steps.  All other arguments are forwarded unchanged.
 * NOTE(review): the '##' in front of each register argument presumably
 * depends on how the preprocessor treats pasting in assembler mode;
 * confirm before touching it.
 */
#define R1(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
R(1 , ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)
#define R2(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
R(2 , ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)
/*
 * Load two consecutive 32-bit table entries, those for 'round' and
 * 'round + 1', into k_even and k_odd.  Entries are 4 bytes apart.
 */
#define KL(round) \
	ldp	k_even, k_odd, [RKPTR, #((round) * 4)];
/* Input expansion macros. */

/*
 * Byte-swapped input address: every group of four rounds owns one
 * 64-byte slot; widx picks the 32-bit word, offs the 16-byte row.
 */
#define IW_W_ADDR(round, widx, offs) \
	(STACK_W + (offs) + ((widx) * 4) + (((round) / 4) * 64))

/*
 * Expanded input address: groups of three rounds alternate between the
 * first two 64-byte slots.
 */
#define XW_W_ADDR(round, widx, offs) \
	(STACK_W + (offs) + ((widx) * 4) + (((((round) / 3) - 4) % 2) * 64))

/* Rounds 0-11: message word at +32, pre-XORed word pair at +48. */
#define IW_W1_ADDR(round, widx)		IW_W_ADDR(round, widx, 32)
#define IW_W1W2_ADDR(round, widx)	IW_W_ADDR(round, widx, 48)

/* Rounds 12-63: message word at +0, pre-XORed word pair at +16. */
#define XW_W1_ADDR(round, widx)		XW_W_ADDR(round, widx, 0)
#define XW_W1W2_ADDR(round, widx)	XW_W_ADDR(round, widx, 16)
/*
 * Input block loading.
 *
 * Reads one 64-byte block through RDATA (post-incremented), byte-swaps
 * every 32-bit word and stores, for rounds 0-11, the words w[j] together
 * with the pre-combined values w[j] ^ w[j + 4] into the IW stack rows.
 * W0-W5 are then seeded with the message words noted on each step.
 *
 * The work is cut into 3 x 8 single steps so that R() can issue one step
 * per IOP slot; this interleaving is needed for in-order CPUs.
 * Uses addr0, addr1 and XTMP0-XTMP6.
 */
#define LOAD_W_VEC_1_1() \
	add	addr0, sp, #IW_W1_ADDR(0, 0);
#define LOAD_W_VEC_1_2() \
	add	addr1, sp, #IW_W1_ADDR(4, 0);
#define LOAD_W_VEC_1_3() \
	ld1	{W0.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_4() \
	ld1	{W1.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_5() \
	ld1	{W2.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_6() \
	ld1	{W3.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_7() \
	rev32	XTMP0.16b, W0.16b;
#define LOAD_W_VEC_1_8() \
	rev32	XTMP1.16b, W1.16b;
#define LOAD_W_VEC_2_1() \
	rev32	XTMP2.16b, W2.16b;
#define LOAD_W_VEC_2_2() \
	rev32	XTMP3.16b, W3.16b;
#define LOAD_W_VEC_2_3() \
	eor	XTMP4.16b, XTMP1.16b, XTMP0.16b;
#define LOAD_W_VEC_2_4() \
	eor	XTMP5.16b, XTMP2.16b, XTMP1.16b;
#define LOAD_W_VEC_2_5() \
	st1	{XTMP0.16b}, [addr0], #16;
#define LOAD_W_VEC_2_6() \
	st1	{XTMP4.16b}, [addr0]; \
	add	addr0, sp, #IW_W1_ADDR(8, 0);
#define LOAD_W_VEC_2_7() \
	eor	XTMP6.16b, XTMP3.16b, XTMP2.16b;
#define LOAD_W_VEC_2_8() \
	ext	W0.16b, XTMP0.16b, XTMP0.16b, #8;  /* W0: xx, w0, xx, xx */
#define LOAD_W_VEC_3_1() \
	mov	W2.16b, XTMP1.16b;                 /* W2: xx, w6, w5, w4 */
#define LOAD_W_VEC_3_2() \
	st1	{XTMP1.16b}, [addr1], #16;
#define LOAD_W_VEC_3_3() \
	st1	{XTMP5.16b}, [addr1]; \
	ext	W1.16b, XTMP0.16b, XTMP0.16b, #4;  /* W1: xx, w3, w2, w1 */
#define LOAD_W_VEC_3_4() \
	ext	W3.16b, XTMP1.16b, XTMP2.16b, #12; /* W3: xx, w9, w8, w7 */
#define LOAD_W_VEC_3_5() \
	ext	W4.16b, XTMP2.16b, XTMP3.16b, #8;  /* W4: xx, w12, w11, w10 */
#define LOAD_W_VEC_3_6() \
	st1	{XTMP2.16b}, [addr0], #16;
#define LOAD_W_VEC_3_7() \
	st1	{XTMP6.16b}, [addr0];
#define LOAD_W_VEC_3_8() \
	ext	W5.16b, XTMP3.16b, XTMP3.16b, #4;  /* W5: xx, w15, w14, w13 */

/* IOP-compatible dispatchers: pick step iop_num of group 1, 2 or 3;
 * any further argument is ignored. */
#define LOAD_W_VEC_1(iop_num, ...) \
	LOAD_W_VEC_1_##iop_num()
#define LOAD_W_VEC_2(iop_num, ...) \
	LOAD_W_VEC_2_##iop_num()
#define LOAD_W_VEC_3(iop_num, ...) \
	LOAD_W_VEC_3_##iop_num()
/*
 * Message scheduling.  Note: 3 words per vector register.
 *
 * One complete pass (steps 1_1 .. 3_8) produces new message words
 *
 *	w[i] = P1(w[i - 16] ^ w[i - 9] ^ rol(w[i - 3], 15))
 *	       ^ rol(w[i - 13], 7) ^ w[i - 6]
 *
 * with P1(x) = x ^ rol(x, 15) ^ rol(x, 23), and leaves them in the
 * register passed as w0.  It then stores two vectors at
 * XW_W1_ADDR(round, 0): the words gathered from w4/w5, followed by those
 * words XORed with the new w0.
 *
 * The pass is cut into 3 x 8 single steps so that R() can issue one step
 * per IOP slot; this interleaving is needed for in-order CPUs.
 * Uses addr0 and XTMP0-XTMP5.
 */
#define SCHED_W_1_1(round, w0, w1, w2, w3, w4, w5) \
	/* Load (w[i - 16]) => XTMP0 */ \
	/* Load (w[i - 13]) => XTMP5 */ \
	ext	XTMP0.16b, w0.16b, w0.16b, #12;    /* XTMP0: w0, xx, xx, xx */
#define SCHED_W_1_2(round, w0, w1, w2, w3, w4, w5) \
	ext	XTMP5.16b, w1.16b, w1.16b, #12;
#define SCHED_W_1_3(round, w0, w1, w2, w3, w4, w5) \
	ext	XTMP0.16b, XTMP0.16b, w1.16b, #12; /* XTMP0: xx, w2, w1, w0 */
#define SCHED_W_1_4(round, w0, w1, w2, w3, w4, w5) \
	ext	XTMP5.16b, XTMP5.16b, w2.16b, #12;
#define SCHED_W_1_5(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 9] == w3 */ \
	/* w3 ^ XTMP0 => XTMP0 */ \
	eor	XTMP0.16b, XTMP0.16b, w3.16b;
#define SCHED_W_1_6(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 3] == w5 */ \
	/* rol(w5, 15) ^ XTMP0 => XTMP0 */ \
	/* rol(XTMP5, 7) => XTMP1 */ \
	add	addr0, sp, #XW_W1_ADDR((round), 0); \
	shl	XTMP2.4s, w5.4s, #15;
#define SCHED_W_1_7(round, w0, w1, w2, w3, w4, w5) \
	shl	XTMP1.4s, XTMP5.4s, #7;
#define SCHED_W_1_8(round, w0, w1, w2, w3, w4, w5) \
	sri	XTMP2.4s, w5.4s, #(32 - 15);
#define SCHED_W_2_1(round, w0, w1, w2, w3, w4, w5) \
	sri	XTMP1.4s, XTMP5.4s, #(32 - 7);
#define SCHED_W_2_2(round, w0, w1, w2, w3, w4, w5) \
	eor	XTMP0.16b, XTMP0.16b, XTMP2.16b;
#define SCHED_W_2_3(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 6] == w4 */ \
	/* w4 ^ XTMP1 => XTMP1 */ \
	eor	XTMP1.16b, XTMP1.16b, w4.16b;
#define SCHED_W_2_4(round, w0, w1, w2, w3, w4, w5) \
	/* P1(XTMP0) ^ XTMP1 => w0 */ \
	shl	XTMP3.4s, XTMP0.4s, #15;
#define SCHED_W_2_5(round, w0, w1, w2, w3, w4, w5) \
	shl	XTMP4.4s, XTMP0.4s, #23;
#define SCHED_W_2_6(round, w0, w1, w2, w3, w4, w5) \
	eor	w0.16b, XTMP1.16b, XTMP0.16b;
#define SCHED_W_2_7(round, w0, w1, w2, w3, w4, w5) \
	sri	XTMP3.4s, XTMP0.4s, #(32 - 15);
#define SCHED_W_2_8(round, w0, w1, w2, w3, w4, w5) \
	sri	XTMP4.4s, XTMP0.4s, #(32 - 23);
#define SCHED_W_3_1(round, w0, w1, w2, w3, w4, w5) \
	eor	w0.16b, w0.16b, XTMP3.16b;
#define SCHED_W_3_2(round, w0, w1, w2, w3, w4, w5) \
	/* Gather the top word of w4 and three words of w5 => XTMP2 */ \
	ext	XTMP2.16b, w4.16b, w4.16b, #12;
#define SCHED_W_3_3(round, w0, w1, w2, w3, w4, w5) \
	eor	w0.16b, w0.16b, XTMP4.16b;
#define SCHED_W_3_4(round, w0, w1, w2, w3, w4, w5) \
	ext	XTMP2.16b, XTMP2.16b, w5.16b, #12;
#define SCHED_W_3_5(round, w0, w1, w2, w3, w4, w5) \
	/* W1 ^ W2 => XTMP3 */ \
	eor	XTMP3.16b, XTMP2.16b, w0.16b;
#define SCHED_W_3_6(round, w0, w1, w2, w3, w4, w5)
#define SCHED_W_3_7(round, w0, w1, w2, w3, w4, w5) \
	st1	{XTMP2.16b-XTMP3.16b}, [addr0];
#define SCHED_W_3_8(round, w0, w1, w2, w3, w4, w5)
/*
 * IOP-compatible bindings of the scheduler steps to the six rotations of
 * the W0..W5 window.  The macro name spells the register order that is
 * passed as (w0, w1, w2, w3, w4, w5); the trailing _1/_2/_3 selects the
 * step group and iop_num (1..8) the step inside that group.
 */

/* Window starting at W0. */
#define SCHED_W_W0W1W2W3W4W5_1(iop_num, round) SCHED_W_1_##iop_num(round, W0, W1, W2, W3, W4, W5)
#define SCHED_W_W0W1W2W3W4W5_2(iop_num, round) SCHED_W_2_##iop_num(round, W0, W1, W2, W3, W4, W5)
#define SCHED_W_W0W1W2W3W4W5_3(iop_num, round) SCHED_W_3_##iop_num(round, W0, W1, W2, W3, W4, W5)

/* Window starting at W1. */
#define SCHED_W_W1W2W3W4W5W0_1(iop_num, round) SCHED_W_1_##iop_num(round, W1, W2, W3, W4, W5, W0)
#define SCHED_W_W1W2W3W4W5W0_2(iop_num, round) SCHED_W_2_##iop_num(round, W1, W2, W3, W4, W5, W0)
#define SCHED_W_W1W2W3W4W5W0_3(iop_num, round) SCHED_W_3_##iop_num(round, W1, W2, W3, W4, W5, W0)

/* Window starting at W2. */
#define SCHED_W_W2W3W4W5W0W1_1(iop_num, round) SCHED_W_1_##iop_num(round, W2, W3, W4, W5, W0, W1)
#define SCHED_W_W2W3W4W5W0W1_2(iop_num, round) SCHED_W_2_##iop_num(round, W2, W3, W4, W5, W0, W1)
#define SCHED_W_W2W3W4W5W0W1_3(iop_num, round) SCHED_W_3_##iop_num(round, W2, W3, W4, W5, W0, W1)

/* Window starting at W3. */
#define SCHED_W_W3W4W5W0W1W2_1(iop_num, round) SCHED_W_1_##iop_num(round, W3, W4, W5, W0, W1, W2)
#define SCHED_W_W3W4W5W0W1W2_2(iop_num, round) SCHED_W_2_##iop_num(round, W3, W4, W5, W0, W1, W2)
#define SCHED_W_W3W4W5W0W1W2_3(iop_num, round) SCHED_W_3_##iop_num(round, W3, W4, W5, W0, W1, W2)

/* Window starting at W4. */
#define SCHED_W_W4W5W0W1W2W3_1(iop_num, round) SCHED_W_1_##iop_num(round, W4, W5, W0, W1, W2, W3)
#define SCHED_W_W4W5W0W1W2W3_2(iop_num, round) SCHED_W_2_##iop_num(round, W4, W5, W0, W1, W2, W3)
#define SCHED_W_W4W5W0W1W2W3_3(iop_num, round) SCHED_W_3_##iop_num(round, W4, W5, W0, W1, W2, W3)

/* Window starting at W5. */
#define SCHED_W_W5W0W1W2W3W4_1(iop_num, round) SCHED_W_1_##iop_num(round, W5, W0, W1, W2, W3, W4)
#define SCHED_W_W5W0W1W2W3W4_2(iop_num, round) SCHED_W_2_##iop_num(round, W5, W0, W1, W2, W3, W4)
#define SCHED_W_W5W0W1W2W3W4_3(iop_num, round) SCHED_W_3_##iop_num(round, W5, W0, W1, W2, W3, W4)
/*
 * Transform blocks*64 bytes (blocks*16 32-bit words) at 'src'.
 *
 * void sm3_neon_transform(struct sm3_state *sst, u8 const *src,
 *                         int blocks)
 *
 * In:   RSTATE  eight 32-bit state words; updated after every block
 *       RDATA   input, consumed 64 bytes per block
 *       RNBLKS  number of blocks.  Must be at least 1: the first block is
 *               loaded before the counter is looked at, and the counter is
 *               only tested after it has been decremented.
 * Out:  nothing is returned; the new state is stored through RSTATE.
 *
 * x19-x26, x28 and x29 are saved on entry and restored on exit.  A
 * 64-byte aligned scratch area of STACK_SIZE bytes is carved out below
 * the saved registers.  Before returning, every vector register used
 * (W0-W5, XTMP0-XTMP6) and the whole scratch area are zeroed.
 */
	.text
	.align 3
SYM_TYPED_FUNC_START(sm3_neon_transform)
	/* Working variables a..h start out as the current state. */
	ldp		ra, rb, [RSTATE, #state_h0]
	ldp		rc, rd, [RSTATE, #state_h2]
	ldp		re, rf, [RSTATE, #state_h4]
	ldp		rg, rh, [RSTATE, #state_h6]

	stp		x28, x29, [sp, #-16]!
	stp		x19, x20, [sp, #-16]!
	stp		x21, x22, [sp, #-16]!
	stp		x23, x24, [sp, #-16]!
	stp		x25, x26, [sp, #-16]!
	/* Remember sp here: it is rounded down below and restored from
	 * RFRAME before the saved registers are popped. */
	mov		RFRAME, sp

	sub		addr0, sp, #STACK_SIZE
	adr_l		RKPTR, .LKtable
	and		sp, addr0, #(~63)

	/* Preload first block. */
	LOAD_W_VEC_1(1, 0)
	LOAD_W_VEC_1(2, 0)
	LOAD_W_VEC_1(3, 0)
	LOAD_W_VEC_1(4, 0)
	LOAD_W_VEC_1(5, 0)
	LOAD_W_VEC_1(6, 0)
	LOAD_W_VEC_1(7, 0)
	LOAD_W_VEC_1(8, 0)
	LOAD_W_VEC_2(1, 0)
	LOAD_W_VEC_2(2, 0)
	LOAD_W_VEC_2(3, 0)
	LOAD_W_VEC_2(4, 0)
	LOAD_W_VEC_2(5, 0)
	LOAD_W_VEC_2(6, 0)
	LOAD_W_VEC_2(7, 0)
	LOAD_W_VEC_2(8, 0)
	LOAD_W_VEC_3(1, 0)
	LOAD_W_VEC_3(2, 0)
	LOAD_W_VEC_3(3, 0)
	LOAD_W_VEC_3(4, 0)
	LOAD_W_VEC_3(5, 0)
	LOAD_W_VEC_3(6, 0)
	LOAD_W_VEC_3(7, 0)
	LOAD_W_VEC_3(8, 0)

	/*
	 * One iteration per block.  The roles of the working registers
	 * rotate through the argument order instead of being moved, with a
	 * period of four rounds.  Even rounds load a constant pair with KL,
	 * odd rounds pass the no-op '_'.
	 */
	.balign 16
.Loop:
	/* Transform 0-3 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 0, 0, IW, _, 0)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  1, 1, IW, _, 0)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 2, 2, IW, _, 0)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  3, 3, IW, _, 0)

	/* Transform 4-7 + Precalc 12-14 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 4, 0, IW, _, 0)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  5, 1, IW, _, 0)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 6, 2, IW, SCHED_W_W0W1W2W3W4W5_1, 12)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  7, 3, IW, SCHED_W_W0W1W2W3W4W5_2, 12)

	/* Transform 8-11 + Precalc 12-17 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 8, 0, IW, SCHED_W_W0W1W2W3W4W5_3, 12)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  9, 1, IW, SCHED_W_W1W2W3W4W5W0_1, 15)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 10, 2, IW, SCHED_W_W1W2W3W4W5W0_2, 15)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  11, 3, IW, SCHED_W_W1W2W3W4W5W0_3, 15)

	/* Transform 12-14 + Precalc 18-20 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 12, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 18)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  13, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 18)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 14, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 18)

	/* Transform 15-17 + Precalc 21-23 */
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  15, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 21)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 16, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 21)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  17, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 21)

	/* Transform 18-20 + Precalc 24-26 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 18, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 24)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  19, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 24)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 20, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 24)

	/* Transform 21-23 + Precalc 27-29 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  21, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 27)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 22, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 27)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  23, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 27)

	/* Transform 24-26 + Precalc 30-32 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 24, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 30)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  25, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 30)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 26, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 30)

	/* Transform 27-29 + Precalc 33-35 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  27, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 33)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 28, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 33)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  29, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 33)

	/* Transform 30-32 + Precalc 36-38 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 30, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 36)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  31, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 36)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 32, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 36)

	/* Transform 33-35 + Precalc 39-41 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  33, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 39)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 34, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 39)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  35, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 39)

	/* Transform 36-38 + Precalc 42-44 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 36, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 42)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  37, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 42)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 38, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 42)

	/* Transform 39-41 + Precalc 45-47 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  39, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 45)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 40, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 45)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  41, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 45)

	/* Transform 42-44 + Precalc 48-50 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 42, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 48)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  43, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 48)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 44, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 48)

	/* Transform 45-47 + Precalc 51-53 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  45, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 51)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 46, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 51)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  47, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 51)

	/* Transform 48-50 + Precalc 54-56 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 48, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 54)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  49, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 54)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 50, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 54)

	/* Transform 51-53 + Precalc 57-59 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  51, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 57)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 52, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 57)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  53, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 57)

	/* Transform 54-56 + Precalc 60-62 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 54, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 60)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  55, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 60)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 56, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 60)

	/* Transform 57-59 + Precalc 63 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  57, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 63)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 58, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 63)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  59, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 63)

	/* Transform 60 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 60, 0, XW, _, _)

	/* Last block?  Then finish without reading any further input. */
	subs		RNBLKS, RNBLKS, #1
	b.eq		.Lend

	/* Transform 61-63 + Preload next block */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  61, 1, XW, LOAD_W_VEC_1, _)
	ldp		s0, s1, [RSTATE, #state_h0]
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, LOAD_W_VEC_2, _)
	ldp		s2, s3, [RSTATE, #state_h2]
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  63, 0, XW, LOAD_W_VEC_3, _)

	/* Update the chaining variables: XOR the previous state into the
	 * working variables and store the result.  k_even/k_odd are free
	 * here and carry the last two previous state words. */
	eor		ra, ra, s0
	eor		rb, rb, s1
	ldp		s0, s1, [RSTATE, #state_h4]
	eor		rc, rc, s2
	ldp		k_even, k_odd, [RSTATE, #state_h6]
	eor		rd, rd, s3
	eor		re, re, s0
	stp		ra, rb, [RSTATE, #state_h0]
	eor		rf, rf, s1
	stp		rc, rd, [RSTATE, #state_h2]
	eor		rg, rg, k_even
	stp		re, rf, [RSTATE, #state_h4]
	eor		rh, rh, k_odd
	stp		rg, rh, [RSTATE, #state_h6]
	b		.Loop

.Lend:
	/* Transform 61-63 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  61, 1, XW, _, _)
	ldp		s0, s1, [RSTATE, #state_h0]
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, _, _)
	ldp		s2, s3, [RSTATE, #state_h2]
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  63, 0, XW, _, _)

	/* Update the chaining variables, interleaved with wiping every
	 * vector register that held message data. */
	eor		ra, ra, s0
	clear_vec(W0)
	eor		rb, rb, s1
	clear_vec(W1)
	ldp		s0, s1, [RSTATE, #state_h4]
	clear_vec(W2)
	eor		rc, rc, s2
	clear_vec(W3)
	ldp		k_even, k_odd, [RSTATE, #state_h6]
	clear_vec(W4)
	eor		rd, rd, s3
	clear_vec(W5)
	eor		re, re, s0
	clear_vec(XTMP0)
	stp		ra, rb, [RSTATE, #state_h0]
	clear_vec(XTMP1)
	eor		rf, rf, s1
	clear_vec(XTMP2)
	stp		rc, rd, [RSTATE, #state_h2]
	clear_vec(XTMP3)
	eor		rg, rg, k_even
	clear_vec(XTMP4)
	stp		re, rf, [RSTATE, #state_h4]
	clear_vec(XTMP5)
	eor		rh, rh, k_odd
	clear_vec(XTMP6)
	stp		rg, rh, [RSTATE, #state_h6]

	/* Clear message expansion area: W0-W3 are zero by now, so three
	 * 64-byte stores overwrite all STACK_W_SIZE bytes. */
	add		addr0, sp, #STACK_W
	st1		{W0.16b-W3.16b}, [addr0], #64
	st1		{W0.16b-W3.16b}, [addr0], #64
	st1		{W0.16b-W3.16b}, [addr0]

	/* Drop the aligned scratch area, then pop in reverse push order. */
	mov		sp, RFRAME

	ldp		x25, x26, [sp], #16
	ldp		x23, x24, [sp], #16
	ldp		x21, x22, [sp], #16
	ldp		x19, x20, [sp], #16
	ldp		x28, x29, [sp], #16

	ret
SYM_FUNC_END(sm3_neon_transform)
/*
 * Round-constant table, one 32-bit entry per round (64 entries), read in
 * pairs by KL().  Entry j is the base constant rotated left by j bits:
 * 0x79cc4519 for entries 0-15 and 0x7a879d8a for entries 16-63.  The
 * rotation wraps at 32, which is why entries 48-63 repeat entries 16-31.
 */
	.section	".rodata", "a"

	.align 4
.LKtable:
	.long	0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb
	.long	0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc
	.long	0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce
	.long	0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6
	.long	0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
	.long	0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
	.long	0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
	.long	0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
	.long	0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53
	.long	0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d
	.long	0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4
	.long	0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43
	.long	0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
	.long	0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
	.long	0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
	.long	0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
/* Measurement V0.5 in percent: C=98 H=98 G=97 */