/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* SM4 Cipher Algorithm for ARMv8 NEON
* as specified in
* https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
*
* Copyright (C) 2022, Alibaba Group.
* Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
/*
 * Register macros
 *
 * RTMP0-RTMP7 name the scratch vector registers v8-v15.
 * RX0, RX1, RKEY and RIV share v12-v15 with RTMP4-RTMP7, so any macro
 * that writes RTMP4-RTMP7 also overwrites RX0/RX1/RKEY/RIV (and vice
 * versa).
 */
#define RTMP0 v8
#define RTMP1 v9
#define RTMP2 v10
#define RTMP3 v11
#define RTMP4 v12
#define RTMP5 v13
#define RTMP6 v14
#define RTMP7 v15
/* Aliases of RTMP4-RTMP7 (same physical registers, see note above). */
#define RX0 v12
#define RX1 v13
#define RKEY v14
#define RIV v15
/* Helper macros. */
/*
 * SM4_PREPARE() - load the table at crypto_sm4_sbox into v16-v31.
 *
 * Four 64-byte loads (256 bytes in total) fill v16-v31 in order.
 * Clobbers: x5 (left pointing at the last 64-byte quarter of the table).
 */
#define SM4_PREPARE()					\
	adr_l		x5, crypto_sm4_sbox;		\
	ld1		{v16.16b-v19.16b}, [x5], #64;	\
	ld1		{v20.16b-v23.16b}, [x5], #64;	\
	ld1		{v24.16b-v27.16b}, [x5], #64;	\
	ld1		{v28.16b-v31.16b}, [x5];
/*
 * transpose_4x4(s0, s1, s2, s3) - in-place transpose of a 4x4 matrix of
 * 32-bit words whose rows are the vectors s0..s3.
 *
 * After the macro, s<i>.s[j] holds what s<j>.s[i] held before.
 * Clobbers: RTMP0-RTMP3.
 */
#define transpose_4x4(s0, s1, s2, s3)			\
	zip1		RTMP0.4s, s0.4s, s1.4s;		\
	zip1		RTMP1.4s, s2.4s, s3.4s;		\
	zip2		RTMP2.4s, s0.4s, s1.4s;		\
	zip2		RTMP3.4s, s2.4s, s3.4s;		\
	zip1		s0.2d, RTMP0.2d, RTMP1.2d;	\
	zip2		s1.2d, RTMP0.2d, RTMP1.2d;	\
	zip1		s2.2d, RTMP2.2d, RTMP3.2d;	\
	zip2		s3.2d, RTMP2.2d, RTMP3.2d;
/*
 * transpose_4x4_2x(s0..s3, s4..s7) - two independent 4x4 word transposes,
 * one on s0..s3 and one on s4..s7, interleaved in a single sequence.
 *
 * Clobbers: RTMP0-RTMP7. Because RTMP4-RTMP7 are v12-v15, this also
 * overwrites RX0, RX1, RKEY and RIV.
 */
#define transpose_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7)	\
	zip1		RTMP0.4s, s0.4s, s1.4s;		\
	zip1		RTMP1.4s, s2.4s, s3.4s;		\
	zip2		RTMP2.4s, s0.4s, s1.4s;		\
	zip2		RTMP3.4s, s2.4s, s3.4s;		\
	zip1		RTMP4.4s, s4.4s, s5.4s;		\
	zip1		RTMP5.4s, s6.4s, s7.4s;		\
	zip2		RTMP6.4s, s4.4s, s5.4s;		\
	zip2		RTMP7.4s, s6.4s, s7.4s;		\
	zip1		s0.2d, RTMP0.2d, RTMP1.2d;	\
	zip2		s1.2d, RTMP0.2d, RTMP1.2d;	\
	zip1		s2.2d, RTMP2.2d, RTMP3.2d;	\
	zip2		s3.2d, RTMP2.2d, RTMP3.2d;	\
	zip1		s4.2d, RTMP4.2d, RTMP5.2d;	\
	zip2		s5.2d, RTMP4.2d, RTMP5.2d;	\
	zip1		s6.2d, RTMP6.2d, RTMP7.2d;	\
	zip2		s7.2d, RTMP6.2d, RTMP7.2d;
/*
 * rotate_clockwise_4x4(s0, s1, s2, s3) - rotate a 4x4 matrix of 32-bit
 * words (rows s0..s3) by 90 degrees clockwise, in place.
 *
 * After the macro, s<i> = { old s3.s[i], old s2.s[i], old s1.s[i],
 * old s0.s[i] }, i.e. a transpose with the word order of each row
 * reversed.
 * Clobbers: RTMP0-RTMP3 only (RTMP4-RTMP7 / RIV are left untouched).
 */
#define rotate_clockwise_4x4(s0, s1, s2, s3)		\
	zip1		RTMP0.4s, s1.4s, s0.4s;		\
	zip2		RTMP1.4s, s1.4s, s0.4s;		\
	zip1		RTMP2.4s, s3.4s, s2.4s;		\
	zip2		RTMP3.4s, s3.4s, s2.4s;		\
	zip1		s0.2d, RTMP2.2d, RTMP0.2d;	\
	zip2		s1.2d, RTMP2.2d, RTMP0.2d;	\
	zip1		s2.2d, RTMP3.2d, RTMP1.2d;	\
	zip2		s3.2d, RTMP3.2d, RTMP1.2d;
/*
 * rotate_clockwise_4x4_2x(s0..s3, s4..s7) - two independent clockwise
 * 4x4 word rotations, one on s0..s3 and one on s4..s7, interleaved.
 *
 * Clobbers: RTMP0-RTMP7. Because RTMP4-RTMP7 are v12-v15, this also
 * overwrites RX0, RX1, RKEY and RIV; callers that need RIV preserved
 * must use two rotate_clockwise_4x4() invocations instead.
 */
#define rotate_clockwise_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7)	\
	zip1		RTMP0.4s, s1.4s, s0.4s;		\
	zip1		RTMP2.4s, s3.4s, s2.4s;		\
	zip2		RTMP1.4s, s1.4s, s0.4s;		\
	zip2		RTMP3.4s, s3.4s, s2.4s;		\
	zip1		RTMP4.4s, s5.4s, s4.4s;		\
	zip1		RTMP6.4s, s7.4s, s6.4s;		\
	zip2		RTMP5.4s, s5.4s, s4.4s;		\
	zip2		RTMP7.4s, s7.4s, s6.4s;		\
	zip1		s0.2d, RTMP2.2d, RTMP0.2d;	\
	zip2		s1.2d, RTMP2.2d, RTMP0.2d;	\
	zip1		s2.2d, RTMP3.2d, RTMP1.2d;	\
	zip2		s3.2d, RTMP3.2d, RTMP1.2d;	\
	zip1		s4.2d, RTMP6.2d, RTMP4.2d;	\
	zip2		s5.2d, RTMP6.2d, RTMP4.2d;	\
	zip1		s6.2d, RTMP7.2d, RTMP5.2d;	\
	zip2		s7.2d, RTMP7.2d, RTMP5.2d;
/*
 * ROUND4(round, s0, s1, s2, s3) - one SM4 round on four blocks held in
 * word-sliced form (s<i> holds word i of each of the four blocks).
 *
 *   x   = RKEY.s[round] ^ s1 ^ s2 ^ s3
 *   x   = sbox(x), byte-wise, via the 256-byte table in v16-v31
 *   s0 ^= x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24)
 *
 * The table lookup walks the four 64-byte quarters: TBL returns 0 for an
 * out-of-range index and TBX leaves the destination byte unchanged, so
 * subtracting 64 from the indices between lookups selects exactly one
 * quarter per byte.
 *
 * Inputs:   RKEY must hold four round keys; v16-v31 the S-box.
 * Clobbers: RX0, RTMP0-RTMP3.
 */
#define ROUND4(round, s0, s1, s2, s3)				\
	dup		RX0.4s, RKEY.s[round];			\
	/* rk ^ s1 ^ s2 ^ s3 */					\
	eor		RTMP1.16b, s2.16b, s3.16b;		\
	eor		RX0.16b, RX0.16b, s1.16b;		\
	eor		RX0.16b, RX0.16b, RTMP1.16b;		\
								\
	/* sbox, non-linear part */				\
	movi		RTMP3.16b, #64;	/* sizeof(sbox) / 4 */	\
	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;	\
	sub		RX0.16b, RX0.16b, RTMP3.16b;		\
	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;	\
	sub		RX0.16b, RX0.16b, RTMP3.16b;		\
	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;	\
	sub		RX0.16b, RX0.16b, RTMP3.16b;		\
	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;	\
								\
	/* linear part */					\
	shl		RTMP1.4s, RTMP0.4s, #8;			\
	shl		RTMP2.4s, RTMP0.4s, #16;		\
	shl		RTMP3.4s, RTMP0.4s, #24;		\
	sri		RTMP1.4s, RTMP0.4s, #(32-8);		\
	sri		RTMP2.4s, RTMP0.4s, #(32-16);		\
	sri		RTMP3.4s, RTMP0.4s, #(32-24);		\
	/* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */		\
	eor		RTMP1.16b, RTMP1.16b, RTMP0.16b;	\
	eor		RTMP1.16b, RTMP1.16b, RTMP2.16b;	\
	/* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */	\
	eor		RTMP3.16b, RTMP3.16b, RTMP0.16b;	\
	shl		RTMP2.4s, RTMP1.4s, #2;			\
	sri		RTMP2.4s, RTMP1.4s, #(32-2);		\
	eor		RTMP3.16b, RTMP3.16b, RTMP2.16b;	\
	/* s0 ^= RTMP3 */					\
	eor		s0.16b, s0.16b, RTMP3.16b;
/*
 * SM4_CRYPT_BLK4_BE(b0, b1, b2, b3) - run all 32 SM4 rounds on four
 * blocks that are already word-sliced and byte-swapped (b<i> holds word
 * i of each block), then convert the result back to one block per
 * register.
 *
 * The loop executes 8 times; each pass loads 16 bytes (four round keys)
 * from [x0] and performs four rounds. Afterwards every word is
 * byte-reversed (rev32), the matrix is rotated clockwise (which also
 * applies the final word-order reversal), and x0 is rewound by the 128
 * bytes consumed so it points at the round keys again.
 *
 * Clobbers: x6, condition flags, RKEY, RX0, RTMP0-RTMP3.
 * Uses numeric local label 4.
 */
#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3)		\
	mov		x6, #8;				\
4:							\
	ld1		{RKEY.4s}, [x0], #16;		\
	subs		x6, x6, #1;			\
							\
	ROUND4(0, b0, b1, b2, b3);			\
	ROUND4(1, b1, b2, b3, b0);			\
	ROUND4(2, b2, b3, b0, b1);			\
	ROUND4(3, b3, b0, b1, b2);			\
							\
	bne		4b;				\
							\
	rev32		b0.16b, b0.16b;			\
	rev32		b1.16b, b1.16b;			\
	rev32		b2.16b, b2.16b;			\
	rev32		b3.16b, b3.16b;			\
							\
	rotate_clockwise_4x4(b0, b1, b2, b3);		\
							\
	/* repoint to rkey */				\
	sub		x0, x0, #128;
/*
 * SM4_CRYPT_BLK4(b0, b1, b2, b3) - byte-reverse every 32-bit word of the
 * four word-sliced inputs, then run SM4_CRYPT_BLK4_BE() on them.
 *
 * Clobbers: everything SM4_CRYPT_BLK4_BE() clobbers.
 */
#define SM4_CRYPT_BLK4(b0, b1, b2, b3)			\
	rev32		b0.16b, b0.16b;			\
	rev32		b1.16b, b1.16b;			\
	rev32		b2.16b, b2.16b;			\
	rev32		b3.16b, b3.16b;			\
	SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);
/*
 * ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3) - one SM4 round on eight
 * blocks, organised as two word-sliced groups of four (s0..s3 and
 * t0..t3). The two groups are processed with interleaved instructions.
 *
 * For each group:
 *   x   = RKEY.s[round] ^ s1 ^ s2 ^ s3
 *   x   = sbox(x), byte-wise, via the 256-byte table in v16-v31
 *   s0 ^= x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24)
 *
 * The S-box lookup uses TBL for the first 64-byte quarter and TBX for the
 * remaining three, subtracting 64 from the indices between lookups.
 *
 * Inputs:   RKEY must hold four round keys; v16-v31 the S-box.
 * Clobbers: RX0, RX1, RTMP0-RTMP3.
 */
#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3)		\
	/* rk ^ s1 ^ s2 ^ s3 */					\
	dup		RX0.4s, RKEY.s[round];			\
	eor		RTMP0.16b, s2.16b, s3.16b;		\
	mov		RX1.16b, RX0.16b;			\
	eor		RTMP1.16b, t2.16b, t3.16b;		\
	eor		RX0.16b, RX0.16b, s1.16b;		\
	eor		RX1.16b, RX1.16b, t1.16b;		\
	eor		RX0.16b, RX0.16b, RTMP0.16b;		\
	eor		RX1.16b, RX1.16b, RTMP1.16b;		\
								\
	/* sbox, non-linear part */				\
	movi		RTMP3.16b, #64;	/* sizeof(sbox) / 4 */	\
	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;	\
	tbl		RTMP1.16b, {v16.16b-v19.16b}, RX1.16b;	\
	sub		RX0.16b, RX0.16b, RTMP3.16b;		\
	sub		RX1.16b, RX1.16b, RTMP3.16b;		\
	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;	\
	tbx		RTMP1.16b, {v20.16b-v23.16b}, RX1.16b;	\
	sub		RX0.16b, RX0.16b, RTMP3.16b;		\
	sub		RX1.16b, RX1.16b, RTMP3.16b;		\
	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;	\
	tbx		RTMP1.16b, {v24.16b-v27.16b}, RX1.16b;	\
	sub		RX0.16b, RX0.16b, RTMP3.16b;		\
	sub		RX1.16b, RX1.16b, RTMP3.16b;		\
	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;	\
	tbx		RTMP1.16b, {v28.16b-v31.16b}, RX1.16b;	\
								\
	/* linear part */					\
	shl		RX0.4s, RTMP0.4s, #8;			\
	shl		RX1.4s, RTMP1.4s, #8;			\
	shl		RTMP2.4s, RTMP0.4s, #16;		\
	shl		RTMP3.4s, RTMP1.4s, #16;		\
	sri		RX0.4s, RTMP0.4s, #(32-8);		\
	sri		RX1.4s, RTMP1.4s, #(32-8);		\
	sri		RTMP2.4s, RTMP0.4s, #(32-16);		\
	sri		RTMP3.4s, RTMP1.4s, #(32-16);		\
	/* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */		\
	eor		RX0.16b, RX0.16b, RTMP0.16b;		\
	eor		RX1.16b, RX1.16b, RTMP1.16b;		\
	eor		RX0.16b, RX0.16b, RTMP2.16b;		\
	eor		RX1.16b, RX1.16b, RTMP3.16b;		\
	/* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */	\
	shl		RTMP2.4s, RTMP0.4s, #24;		\
	shl		RTMP3.4s, RTMP1.4s, #24;		\
	sri		RTMP2.4s, RTMP0.4s, #(32-24);		\
	sri		RTMP3.4s, RTMP1.4s, #(32-24);		\
	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;	\
	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;	\
	shl		RTMP2.4s, RX0.4s, #2;			\
	shl		RTMP3.4s, RX1.4s, #2;			\
	sri		RTMP2.4s, RX0.4s, #(32-2);		\
	sri		RTMP3.4s, RX1.4s, #(32-2);		\
	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;	\
	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;	\
	/* s0/t0 ^= RTMP0/1 */					\
	eor		s0.16b, s0.16b, RTMP0.16b;		\
	eor		t0.16b, t0.16b, RTMP1.16b;
/*
 * SM4_CRYPT_BLK8_norotate(b0..b7) - run all 32 SM4 rounds on eight
 * blocks held as two word-sliced groups (b0..b3 and b4..b7), WITHOUT the
 * final rotation back to one-block-per-register layout.
 *
 * Every word is byte-reversed (rev32) on entry and again on exit. The
 * loop executes 8 times; each pass loads 16 bytes (four round keys) from
 * [x0] and performs four rounds on both groups. x0 is rewound by the 128
 * bytes consumed before the macro ends.
 *
 * The caller must still apply a clockwise 4x4 rotation to each group.
 *
 * Clobbers: x6, condition flags, RKEY, RX0, RX1, RTMP0-RTMP3.
 *           RIV (v15) is not written.
 * Uses numeric local label 8.
 */
#define SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7)	\
	rev32		b0.16b, b0.16b;				\
	rev32		b1.16b, b1.16b;				\
	rev32		b2.16b, b2.16b;				\
	rev32		b3.16b, b3.16b;				\
	rev32		b4.16b, b4.16b;				\
	rev32		b5.16b, b5.16b;				\
	rev32		b6.16b, b6.16b;				\
	rev32		b7.16b, b7.16b;				\
								\
	mov		x6, #8;					\
8:								\
	ld1		{RKEY.4s}, [x0], #16;			\
	subs		x6, x6, #1;				\
								\
	ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7);		\
	ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4);		\
	ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5);		\
	ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6);		\
								\
	bne		8b;					\
								\
	rev32		b0.16b, b0.16b;				\
	rev32		b1.16b, b1.16b;				\
	rev32		b2.16b, b2.16b;				\
	rev32		b3.16b, b3.16b;				\
	rev32		b4.16b, b4.16b;				\
	rev32		b5.16b, b5.16b;				\
	rev32		b6.16b, b6.16b;				\
	rev32		b7.16b, b7.16b;				\
								\
	/* repoint to rkey */					\
	sub		x0, x0, #128;
/*
 * SM4_CRYPT_BLK8(b0..b7) - full eight-block SM4 transform:
 * SM4_CRYPT_BLK8_norotate() followed by the clockwise rotation of both
 * four-register groups.
 *
 * Clobbers: everything SM4_CRYPT_BLK8_norotate() clobbers plus
 * RTMP4-RTMP7, which includes RIV (v15).
 *
 * The last line deliberately has no trailing line continuation, so the
 * macro body cannot absorb whatever line follows the definition.
 */
#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7)			\
	SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7);	\
	rotate_clockwise_4x4_2x(b0, b1, b2, b3, b4, b5, b6, b7);
/*
 * sm4_neon_crypt - run the SM4 block transform over nblocks 16-byte
 * blocks, using the round keys at x0 in the order they are stored.
 *
 * Blocks are processed 8 at a time, then 4, then a tail of 1-3 blocks.
 * A zero block count returns immediately without touching src or dst.
 *
 * Clobbers: x1, x2, w3, x5, x6, condition flags, v0-v14, v16-v31.
 */
.align 3
SYM_FUNC_START(sm4_neon_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */
	/* Nothing to do; the tail path below would process one block. */
	cbz		w3, .Lcrypt_end

	SM4_PREPARE()

.Lcrypt_loop_8x:
	sub		w3, w3, #8
	/* Fewer than 8 blocks left (w3 went negative)? */
	tbnz		w3, #31, .Lcrypt_4x

	/* ld4 de-interleaves words, giving the word-sliced layout directly. */
	ld4		{v0.4s-v3.4s}, [x2], #64
	ld4		{v4.4s-v7.4s}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w3, .Lcrypt_end
	b		.Lcrypt_loop_8x

.Lcrypt_4x:
	/* Undo the speculative subtraction: w3 = remaining blocks (0..7). */
	add		w3, w3, #8
	cmp		w3, #4
	blt		.Lcrypt_tail

	sub		w3, w3, #4

	ld4		{v0.4s-v3.4s}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w3, .Lcrypt_end

.Lcrypt_tail:
	/*
	 * 1..3 blocks remain. Load only those; the registers that are not
	 * loaded keep stale contents, and their results are never stored.
	 */
	cmp		w3, #2
	ld1		{v0.16b}, [x2], #16
	blt		.Lcrypt_tail_load_done
	ld1		{v1.16b}, [x2], #16
	beq		.Lcrypt_tail_load_done
	ld1		{v2.16b}, [x2], #16

.Lcrypt_tail_load_done:
	/* ld1 loaded one block per register; convert to word-sliced form. */
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	/* The flags were clobbered by the round loop; compare again. */
	cmp		w3, #2
	st1		{v0.16b}, [x1], #16
	blt		.Lcrypt_end
	st1		{v1.16b}, [x1], #16
	beq		.Lcrypt_end
	st1		{v2.16b}, [x1], #16

.Lcrypt_end:
	ret
SYM_FUNC_END(sm4_neon_crypt)
/*
 * sm4_neon_cbc_dec - CBC-mode decryption of nblocks 16-byte blocks.
 *
 * Each output block is the SM4 transform of the input block XORed with
 * the previous input block (the IV for the first one). The last input
 * block is written back to [x3] as the new IV.
 *
 * Blocks are processed 8 at a time, then 4, then a tail of 1-3 blocks.
 * With a zero block count the IV is stored back unchanged and neither
 * src nor dst is accessed.
 *
 * Clobbers: x1, x2, w4, x5, x6, condition flags, v0-v31.
 */
.align 3
SYM_FUNC_START(sm4_neon_cbc_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE()

	ld1		{RIV.16b}, [x3]

	/* Nothing to do; the tail path below would process one block. */
	cbz		w4, .Lcbc_dec_end

.Lcbc_dec_loop_8x:
	sub		w4, w4, #8
	/* Fewer than 8 blocks left (w4 went negative)? */
	tbnz		w4, #31, .Lcbc_dec_4x

	/*
	 * Load 8 blocks in word-sliced form. The second load does not
	 * advance x2, so that x2 can be rewound by 64 below and the same
	 * 128 bytes reloaded as plain blocks for the chaining XOR.
	 */
	ld4		{v0.4s-v3.4s}, [x2], #64
	ld4		{v4.4s-v7.4s}, [x2]

	SM4_CRYPT_BLK8_norotate(v0, v1, v2, v3, v4, v5, v6, v7)

	/* Avoid overwriting the RIV register */
	rotate_clockwise_4x4(v0, v1, v2, v3)
	rotate_clockwise_4x4(v4, v5, v6, v7)

	sub		x2, x2, #64

	/* RIV must be consumed before RTMP7 (same register) is loaded. */
	eor		v0.16b, v0.16b, RIV.16b

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor		v1.16b, v1.16b, RTMP0.16b
	eor		v2.16b, v2.16b, RTMP1.16b
	eor		v3.16b, v3.16b, RTMP2.16b
	eor		v4.16b, v4.16b, RTMP3.16b
	eor		v5.16b, v5.16b, RTMP4.16b
	eor		v6.16b, v6.16b, RTMP5.16b
	eor		v7.16b, v7.16b, RTMP6.16b

	/* Last ciphertext block becomes the next IV. */
	mov		RIV.16b, RTMP7.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lcbc_dec_end
	b		.Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	/* Undo the speculative subtraction: w4 = remaining blocks (0..7). */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcbc_dec_tail

	sub		w4, w4, #4

	/* Keep the ciphertext in v0-v3 for chaining; work on copies. */
	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v4.16b, v0.16b
	rev32		v5.16b, v1.16b
	rev32		v6.16b, v2.16b
	rev32		v7.16b, v3.16b

	transpose_4x4(v4, v5, v6, v7)

	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	eor		v4.16b, v4.16b, RIV.16b
	eor		v5.16b, v5.16b, v0.16b
	eor		v6.16b, v6.16b, v1.16b
	eor		v7.16b, v7.16b, v2.16b

	mov		RIV.16b, v3.16b

	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lcbc_dec_end

.Lcbc_dec_tail:
	/*
	 * 1..3 blocks remain. Load only those; registers that are not
	 * loaded keep stale contents, and their results are never stored.
	 */
	cmp		w4, #2
	ld1		{v0.16b}, [x2], #16
	blt		.Lcbc_dec_tail_load_done
	ld1		{v1.16b}, [x2], #16
	beq		.Lcbc_dec_tail_load_done
	ld1		{v2.16b}, [x2], #16

.Lcbc_dec_tail_load_done:
	rev32		v4.16b, v0.16b
	rev32		v5.16b, v1.16b
	rev32		v6.16b, v2.16b

	transpose_4x4(v4, v5, v6, v7)

	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	/* The flags were clobbered by the round loop; compare again. */
	cmp		w4, #2
	eor		v4.16b, v4.16b, RIV.16b
	mov		RIV.16b, v0.16b
	st1		{v4.16b}, [x1], #16
	blt		.Lcbc_dec_end

	eor		v5.16b, v5.16b, v0.16b
	mov		RIV.16b, v1.16b
	st1		{v5.16b}, [x1], #16
	beq		.Lcbc_dec_end

	eor		v6.16b, v6.16b, v1.16b
	mov		RIV.16b, v2.16b
	st1		{v6.16b}, [x1], #16

.Lcbc_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_neon_cbc_dec)
/*
 * inc_le128(vctr) - materialise the current counter into vctr as a
 * big-endian 128-bit block, then increment the counter by one.
 *
 * The counter lives in x7 (high 64 bits) and x8 (low 64 bits) as native
 * integers; rev64 byte-reverses each half to produce the big-endian
 * block. The carry from x8 propagates into x7.
 *
 * Clobbers: condition flags (adds).
 */
#define inc_le128(vctr)					\
		mov		vctr.d[1], x8;		\
		mov		vctr.d[0], x7;		\
		adds		x8, x8, #1;		\
		rev64		vctr.16b, vctr.16b;	\
		adc		x7, x7, xzr;

/*
 * sm4_neon_ctr_crypt - CTR-mode processing of nblocks 16-byte blocks.
 *
 * Each output block is the input block XORed with the SM4 transform of
 * the current counter value; the counter is incremented once per block
 * and the updated value is written back to [x3].
 *
 * Blocks are processed 8 at a time, then 4, then a tail of 1-3 blocks.
 * With a zero block count the counter is stored back unchanged and
 * neither src nor dst is accessed.
 *
 * Clobbers: x1, x2, w4, x5-x8, condition flags, v0-v31.
 */
.align 3
SYM_FUNC_START(sm4_neon_ctr_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE()

	/* Load the big-endian counter and convert both halves to native. */
	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

	/* Nothing to do; the tail path below would process one block. */
	cbz		w4, .Lctr_crypt_end

.Lctr_crypt_loop_8x:
	sub		w4, w4, #8
	/* Fewer than 8 blocks left (w4 went negative)? */
	tbnz		w4, #31, .Lctr_crypt_4x

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	/* One counter block per register -> word-sliced layout. */
	transpose_4x4_2x(v0, v1, v2, v3, v4, v5, v6, v7)

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	eor		v4.16b, v4.16b, RTMP4.16b
	eor		v5.16b, v5.16b, RTMP5.16b
	eor		v6.16b, v6.16b, RTMP6.16b
	eor		v7.16b, v7.16b, RTMP7.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lctr_crypt_end
	b		.Lctr_crypt_loop_8x

.Lctr_crypt_4x:
	/* Undo the speculative subtraction: w4 = remaining blocks (0..7). */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lctr_crypt_tail

	sub		w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1		{v4.16b-v7.16b}, [x2], #64

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lctr_crypt_end

.Lctr_crypt_tail:
	/*
	 * 1..3 blocks remain. inc_le128 clobbers the condition flags, so
	 * w4 is compared again after every use instead of reusing one cmp.
	 * Only the counters that are actually needed are generated, which
	 * keeps the stored counter in step with the number of blocks.
	 */
	ld1		{v4.16b}, [x2], #16
	inc_le128(v0)
	cmp		w4, #2
	blt		.Lctr_crypt_tail_load_done

	ld1		{v5.16b}, [x2], #16
	inc_le128(v1)
	cmp		w4, #2
	beq		.Lctr_crypt_tail_load_done

	ld1		{v6.16b}, [x2], #16
	inc_le128(v2)

.Lctr_crypt_tail_load_done:
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	/* The flags were clobbered by the round loop; compare again. */
	cmp		w4, #2

	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x1], #16
	blt		.Lctr_crypt_end

	eor		v1.16b, v1.16b, v5.16b
	st1		{v1.16b}, [x1], #16
	beq		.Lctr_crypt_end

	eor		v2.16b, v2.16b, v6.16b
	st1		{v2.16b}, [x1], #16

.Lctr_crypt_end:
	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_neon_ctr_crypt)
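/*
 * Note: the following footer was appended by the web page this file was
 * extracted from; it is not part of the source.
 *
 * Measurement V0.5, in percent: C=99 H=100 G=99
 * Processing time: 0.2 seconds (preprocessed on 2026-06-07)
 * © Formatika GbR, Germany
 */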