/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
* as specified in
* https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
*
* Copyright (C) 2022, Alibaba Group.
* Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
/* Supplies the SM4_PREPARE / SM4_CRYPT_BLK* helper macros used throughout. */
#include "sm4-ce-asm.h"

/* Allow Crypto Extension mnemonics/registers for the rest of this file. */
.arch	armv8-a+crypto
/*
 * For every register number N listed here, define the assembler symbol
 * ".LvN.4s" with value N.  The sm4e/sm4ekey macros prefix their operands
 * with ".L", so an operand written as "vN.4s" resolves to its register
 * index when the instruction word is assembled by hand with .inst.
 * Only the register numbers listed can be used as sm4e/sm4ekey operands.
 */
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
		20, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr
/*
 * sm4e vd, vn
 *
 * Emit the instruction as a raw word: base opcode 0xcec08400 with the
 * register number of \vn placed at bit 5 and that of \vd at bit 0.
 * Operands must be spelled "vN.4s" with N one of the numbers for which a
 * ".LvN.4s" symbol has been defined.
 */
.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm
/*
 * sm4ekey vd, vn, vm
 *
 * Emit the instruction as a raw word: base opcode 0xce60c800 with the
 * register number of \vm placed at bit 16, \vn at bit 5 and \vd at bit 0.
 * Operands must be spelled "vN.4s" with N one of the numbers for which a
 * ".LvN.4s" symbol has been defined.
 */
.macro sm4ekey, vd, vn, vm
	.inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm
/*
 * Register macros
 *
 * RTMP0-RTMP3: scratch vectors (used by tweak_next and while building RMASK).
 * RIV:         chaining value / IV in the CBC routines.
 * RMAC:        running MAC state in sm4_ce_mac_update; shares v20 with RIV,
 *              so the two must never be live at the same time.
 * RMASK:       carry/reduction mask consumed by tweak_next in the XTS routines.
 */
#define RTMP0 v16
#define RTMP1 v17
#define RTMP2 v18
#define RTMP3 v19
#define RIV v20
#define RMAC v20
#define RMASK v21
.align 3
SYM_FUNC_START(sm4_ce_expand_key)
	/*
	 * Expand a 128-bit key into the encryption and decryption round keys.
	 *
	 * input:
	 *   x0: 128-bit key
	 *   x1: rkey_enc (out, 8 x 16 bytes)
	 *   x2: rkey_dec (out, 8 x 16 bytes)
	 *   x3: fk array (16 bytes are read)
	 *   x4: ck array (128 bytes are read)
	 *
	 * Uses v0-v7, v16-v31 and x5; x1, x2 and x4 are advanced.
	 */
	ld1		{v0.16b}, [x0]
	rev32		v0.16b, v0.16b		/* byte-swap each 32-bit word */
	ld1		{v1.16b}, [x3]

	/* load ck */
	ld1		{v24.16b-v27.16b}, [x4], #64
	ld1		{v28.16b-v31.16b}, [x4]

	/* input ^ fk */
	eor		v0.16b, v0.16b, v1.16b

	/* each step derives the next 4 round keys from the previous 4 */
	sm4ekey		v0.4s, v0.4s, v24.4s
	sm4ekey		v1.4s, v0.4s, v25.4s
	sm4ekey		v2.4s, v1.4s, v26.4s
	sm4ekey		v3.4s, v2.4s, v27.4s
	sm4ekey		v4.4s, v3.4s, v28.4s
	sm4ekey		v5.4s, v4.4s, v29.4s
	sm4ekey		v6.4s, v5.4s, v30.4s
	sm4ekey		v7.4s, v6.4s, v31.4s

	/* v24 is free again: reuse it for the word-reversal permutation */
	adr_l		x5, .Lbswap128_mask
	ld1		{v24.16b}, [x5]

	/* encryption round keys, in generation order */
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1]

	/*
	 * Decryption round keys are the same keys in reverse order: walk
	 * the vectors from v7 down to v0 and reverse the four 32-bit words
	 * inside each one.
	 */
	tbl		v16.16b, {v7.16b}, v24.16b
	tbl		v17.16b, {v6.16b}, v24.16b
	tbl		v18.16b, {v5.16b}, v24.16b
	tbl		v19.16b, {v4.16b}, v24.16b
	tbl		v20.16b, {v3.16b}, v24.16b
	tbl		v21.16b, {v2.16b}, v24.16b
	tbl		v22.16b, {v1.16b}, v24.16b
	tbl		v23.16b, {v0.16b}, v24.16b

	st1		{v16.16b-v19.16b}, [x2], #64
	st1		{v20.16b-v23.16b}, [x2]

	ret
SYM_FUNC_END(sm4_ce_expand_key)
.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
	/*
	 * Process exactly one 16-byte block.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *
	 * SM4_PREPARE and SM4_CRYPT_BLK come from sm4-ce-asm.h; presumably
	 * the former loads the round keys at x0 and the latter runs the
	 * cipher on one vector in place - confirm against that header.
	 */
	SM4_PREPARE(x0)

	ld1		{v0.16b}, [x2]
	SM4_CRYPT_BLK(v0)
	st1		{v0.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_crypt_block)
.align 3
SYM_FUNC_START(sm4_ce_crypt)
	/*
	 * Process nblocks independent 16-byte blocks (ECB style), eight at
	 * a time, then four, then one by one.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 *
	 * NOTE: nblocks must be non-zero; with w3 == 0 the single-block loop
	 * would wrap the counter and run past the buffers.
	 */
	SM4_PREPARE(x0)

.Lcrypt_loop_blk:
	sub		w3, w3, #8
	tbnz		w3, #31, .Lcrypt_tail8	/* fewer than 8 blocks left */

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w3, .Lcrypt_end
	b		.Lcrypt_loop_blk

.Lcrypt_tail8:
	add		w3, w3, #8		/* undo the speculative subtract */
	cmp		w3, #4
	blt		.Lcrypt_tail4

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64
	SM4_CRYPT_BLK4(v0, v1, v2, v3)
	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w3, .Lcrypt_end

.Lcrypt_tail4:
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16
	SM4_CRYPT_BLK(v0)
	st1		{v0.16b}, [x1], #16

	cbnz		w3, .Lcrypt_tail4

.Lcrypt_end:
	ret
SYM_FUNC_END(sm4_ce_crypt)
.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
	/*
	 * CBC encryption.  The chain is inherently serial, so the 4x path
	 * only batches the loads/stores; each block is still encrypted on
	 * its own after being XORed with the previous ciphertext.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit); updated with the last ciphertext
	 *   w4: nblocks
	 *
	 * NOTE: nblocks must be non-zero (the 1x loop decrements before
	 * testing the counter).
	 */
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

.Lcbc_enc_loop_4x:
	cmp		w4, #4
	blt		.Lcbc_enc_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	eor		v0.16b, v0.16b, RIV.16b
	SM4_CRYPT_BLK(v0)
	eor		v1.16b, v1.16b, v0.16b
	SM4_CRYPT_BLK(v1)
	eor		v2.16b, v2.16b, v1.16b
	SM4_CRYPT_BLK(v2)
	eor		v3.16b, v3.16b, v2.16b
	SM4_CRYPT_BLK(v3)

	st1		{v0.16b-v3.16b}, [x1], #64
	mov		RIV.16b, v3.16b		/* last ciphertext chains on */

	cbz		w4, .Lcbc_enc_end
	b		.Lcbc_enc_loop_4x

.Lcbc_enc_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	/* encrypt in place in RIV so it already holds the next chain value */
	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	st1		{RIV.16b}, [x1], #16

	cbnz		w4, .Lcbc_enc_loop_1x

.Lcbc_enc_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_enc)
.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
	/*
	 * CBC decryption, eight / four / one block(s) at a time.
	 *
	 * The ciphertext is kept untouched in v0-v7 because it is needed
	 * afterwards as the XOR mask for the following block; the cipher
	 * runs on byte-swapped copies in v8-v15 through the *_BE macro
	 * variants from sm4-ce-asm.h.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit); updated with the last ciphertext
	 *   w4: nblocks
	 *
	 * NOTE: nblocks must be non-zero (the 1x loop decrements before
	 * testing the counter).
	 */
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

.Lcbc_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcbc_dec_4x	/* fewer than 8 blocks left */

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b
	rev32		v12.16b, v4.16b
	rev32		v13.16b, v5.16b
	rev32		v14.16b, v6.16b
	rev32		v15.16b, v7.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	/* P[i] = D(C[i]) ^ C[i-1], with C[-1] = IV */
	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b
	eor		v12.16b, v12.16b, v3.16b
	eor		v13.16b, v13.16b, v4.16b
	eor		v14.16b, v14.16b, v5.16b
	eor		v15.16b, v15.16b, v6.16b

	st1		{v8.16b-v11.16b}, [x1], #64
	st1		{v12.16b-v15.16b}, [x1], #64

	mov		RIV.16b, v7.16b

	cbz		w4, .Lcbc_dec_end
	b		.Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	add		w4, w4, #8		/* undo the speculative subtract */
	cmp		w4, #4
	blt		.Lcbc_dec_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b

	st1		{v8.16b-v11.16b}, [x1], #64

	mov		RIV.16b, v3.16b

	cbz		w4, .Lcbc_dec_end

.Lcbc_dec_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	rev32		v8.16b, v0.16b

	SM4_CRYPT_BLK_BE(v8)

	eor		v8.16b, v8.16b, RIV.16b
	st1		{v8.16b}, [x1], #16

	mov		RIV.16b, v0.16b

	cbnz		w4, .Lcbc_dec_loop_1x

.Lcbc_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_dec)
.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
	/*
	 * CBC encryption of the final two blocks with ciphertext stealing.
	 * Handles one full block followed by a trailing block of
	 * Ln = nbytes - 16 bytes.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit); read only, not written back
	 *   w4: nbytes
	 *
	 * NOTE(review): the table offsets and overlapping accesses assume
	 * 16 < nbytes <= 32 - confirm against the caller.
	 */
	SM4_PREPARE(x0)

	sub		w5, w4, #16		/* x5 = Ln, length of the tail */
	uxtw		x5, w5

	ld1		{RIV.16b}, [x3]

	/* En-1 = E(Pn-1 ^ IV) */
	ld1		{v0.16b}, [x2]
	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v0.16b, {RIV.16b}, v3.16b
	/* padding Pn with zeros */
	tbl		v1.16b, {v1.16b}, v4.16b

	eor		v1.16b, v1.16b, RIV.16b
	SM4_CRYPT_BLK(v1)

	/*
	 * overlapping stores: the 16-byte store at dst + Ln must come first
	 * because the store at dst then overwrites its leading bytes.
	 */
	add		x5, x1, x5
	st1		{v0.16b}, [x5]
	st1		{v1.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)
.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
	/*
	 * CBC decryption of the final two blocks with ciphertext stealing.
	 * Handles one full block followed by a trailing block of
	 * Ln = nbytes - 16 bytes.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit); read only, not written back
	 *   w4: nbytes
	 *
	 * NOTE(review): the table offsets and overlapping accesses assume
	 * 16 < nbytes <= 32 - confirm against the caller.
	 */
	SM4_PREPARE(x0)

	sub		w5, w4, #16		/* x5 = Ln, length of the tail */
	uxtw		x5, w5

	ld1		{RIV.16b}, [x3]

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	ld1		{v0.16b}, [x2], x5
	ld1		{v1.16b}, [x2]

	SM4_CRYPT_BLK(v0)
	/* select the first Ln bytes of Xn to create Pn */
	tbl		v2.16b, {v0.16b}, v3.16b
	eor		v2.16b, v2.16b, v1.16b

	/* overwrite the first Ln bytes with Cn to create En-1 */
	tbx		v0.16b, {v1.16b}, v4.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, RIV.16b

	/*
	 * overlapping stores: the 16-byte store at dst + Ln must come first
	 * because the store at dst then overwrites its leading bytes.
	 */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)
.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
	/*
	 * CTR mode: encrypt successive counter values and XOR the result
	 * with the input, eight / four / one block(s) at a time.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit); updated with the next counter
	 *   w4: nblocks
	 *
	 * The counter lives in x7 (high 64 bits) and x8 (low 64 bits) in
	 * native byte order for the duration of the function.
	 *
	 * NOTE: nblocks must be non-zero (the 1x loop decrements before
	 * testing the counter).
	 */
	SM4_PREPARE(x0)

	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

.Lctr_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lctr_4x	/* fewer than 8 blocks left */

/*
 * inc_le128(vctr): materialise the current counter x7:x8 in vctr as a
 * big-endian 128-bit value (rev64 byte-swaps each 64-bit lane), then
 * post-increment x7:x8 with carry from the low into the high half.
 */
#define inc_le128(vctr)					\
		mov		vctr.d[1], x8;		\
		mov		vctr.d[0], x7;		\
		adds		x8, x8, #1;		\
		rev64		vctr.16b, vctr.16b;	\
		adc		x7, x7, xzr;

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	ld1		{v8.16b-v11.16b}, [x2], #64
	ld1		{v12.16b-v15.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	/* keystream ^ input */
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lctr_end
	b		.Lctr_loop_8x

.Lctr_4x:
	add		w4, w4, #8		/* undo the speculative subtract */
	cmp		w4, #4
	blt		.Lctr_loop_1x

	sub		w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1		{v8.16b-v11.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lctr_end

.Lctr_loop_1x:
	sub		w4, w4, #1

	/* construct CTRs */
	inc_le128(v0)

	ld1		{v8.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	cbnz		w4, .Lctr_loop_1x

.Lctr_end:
	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_ce_ctr_enc)
/*
 * tweak_next(vt, vin, RTMP): vt = vin doubled as a 128-bit value, with
 * conditional feedback taken from RMASK.
 *
 *   sshr #63  - broadcast the top bit of each 64-bit lane over the lane
 *   and RMASK - keep the per-lane feedback constant where the top bit was set
 *   add       - shift each 64-bit lane left by one (vin + vin)
 *   ext #8    - swap the two lanes, so the bit shifted out of the low lane
 *               lands in the high lane and vice versa
 *   eor       - fold the feedback into the doubled value
 *
 * RMASK must have been initialised by the caller; RTMP is clobbered.
 * vt may be the same register as vin.
 */
#define tweak_next(vt, vin, RTMP)					\
		sshr		RTMP.2d, vin.2d, #63;			\
		and		RTMP.16b, RTMP.16b, RMASK.16b;		\
		add		vt.2d, vin.2d, vin.2d;			\
		ext		RTMP.16b, RTMP.16b, RTMP.16b, #8;	\
		eor		vt.16b, vt.16b, RTMP.16b;
.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
	/*
	 * XTS encryption with optional ciphertext stealing for a trailing
	 * partial block.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV
	 *
	 * If x5 is non-NULL the tweak at x3 is first encrypted with that
	 * key; if it is NULL the tweak is used as is.  The updated tweak is
	 * written back to x3 only when nbytes is a multiple of 16 (no
	 * stealing performed).
	 *
	 * After the prologue, w4 counts the full blocks handled by the bulk
	 * loops and x5 holds the number of tail bytes (nbytes % 16).
	 */
	ld1		{v8.16b}, [x3]

	cbz		x5, .Lxts_enc_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
	SM4_PREPARE(x0)

	ands		w5, w4, #15		/* w5 = tail bytes, Z set if none */
	lsr		w4, w4, #4		/* w4 = number of full blocks */
	sub		w6, w4, #1
	csel		w4, w4, w6, eq		/* keep last full block for CTS */
	uxtw		x5, w5

	/* RMASK.2d = { 0x1, 0x87 }: per-lane feedback for tweak_next */
	movi		RMASK.2s, #0x1
	movi		RTMP0.2s, #0x87
	uzp1		RMASK.4s, RMASK.4s, RTMP0.4s

	cbz		w4, .Lxts_enc_cts

.Lxts_enc_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lxts_enc_4x	/* fewer than 8 blocks left */

	/* v8 is the current tweak; derive the next seven */
	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	/* pre-whitening with the tweaks */
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	/* post-whitening with the same tweaks */
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	/* tweak for the next block */
	tweak_next(v8, v15, RTMP3)

	cbz		w4, .Lxts_enc_cts
	b		.Lxts_enc_loop_8x

.Lxts_enc_4x:
	add		w4, w4, #8		/* undo the speculative subtract */
	cmp		w4, #4
	blt		.Lxts_enc_loop_1x

	sub		w4, w4, #4

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz		w4, .Lxts_enc_cts

.Lxts_enc_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz		w4, .Lxts_enc_loop_1x

.Lxts_enc_cts:
	cbz		x5, .Lxts_enc_end	/* no tail bytes: no stealing */

	/* cipher text stealing */

	/* last full block uses tweak v8, the stolen block uses v9 */
	tweak_next(v9, v8, RTMP0)
	ld1		{v0.16b}, [x2]
	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx		v0.16b, {v1.16b}, v4.16b

	eor		v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v9.16b

	/*
	 * overlapping stores: the 16-byte store at dst + tail must come
	 * first because the store at dst then overwrites its leading bytes.
	 */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	b		.Lxts_enc_ret

.Lxts_enc_end:
	/* store new tweak */
	st1		{v8.16b}, [x3]

.Lxts_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_enc)
.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
	/*
	 * XTS decryption with optional ciphertext stealing for a trailing
	 * partial block.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV
	 *
	 * If x5 is non-NULL the tweak at x3 is first encrypted with that
	 * key; if it is NULL the tweak is used as is.  The updated tweak is
	 * written back to x3 only when nbytes is a multiple of 16 (no
	 * stealing performed).
	 *
	 * After the prologue, w4 counts the full blocks handled by the bulk
	 * loops and x5 holds the number of tail bytes (nbytes % 16).
	 */
	ld1		{v8.16b}, [x3]

	cbz		x5, .Lxts_dec_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
	SM4_PREPARE(x0)

	ands		w5, w4, #15		/* w5 = tail bytes, Z set if none */
	lsr		w4, w4, #4		/* w4 = number of full blocks */
	sub		w6, w4, #1
	csel		w4, w4, w6, eq		/* keep last full block for CTS */
	uxtw		x5, w5

	/* RMASK.2d = { 0x1, 0x87 }: per-lane feedback for tweak_next */
	movi		RMASK.2s, #0x1
	movi		RTMP0.2s, #0x87
	uzp1		RMASK.4s, RMASK.4s, RTMP0.4s

	cbz		w4, .Lxts_dec_cts

.Lxts_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lxts_dec_4x	/* fewer than 8 blocks left */

	/* v8 is the current tweak; derive the next seven */
	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	/* pre-whitening with the tweaks */
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	/* post-whitening with the same tweaks */
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	/* tweak for the next block */
	tweak_next(v8, v15, RTMP3)

	cbz		w4, .Lxts_dec_cts
	b		.Lxts_dec_loop_8x

.Lxts_dec_4x:
	add		w4, w4, #8		/* undo the speculative subtract */
	cmp		w4, #4
	blt		.Lxts_dec_loop_1x

	sub		w4, w4, #4

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz		w4, .Lxts_dec_cts

.Lxts_dec_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz		w4, .Lxts_dec_loop_1x

.Lxts_dec_cts:
	cbz		x5, .Lxts_dec_end	/* no tail bytes: no stealing */

	/* cipher text stealing */

	/*
	 * Tweak order is swapped relative to encryption: the last full
	 * ciphertext block is processed with the later tweak (v9), the
	 * reassembled block with the earlier one (v8).
	 */
	tweak_next(v9, v8, RTMP0)
	ld1		{v0.16b}, [x2]
	eor		v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v9.16b

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx		v0.16b, {v1.16b}, v4.16b

	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b

	/*
	 * overlapping stores: the 16-byte store at dst + tail must come
	 * first because the store at dst then overwrites its leading bytes.
	 */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	b		.Lxts_dec_ret

.Lxts_dec_end:
	/* store new tweak */
	st1		{v8.16b}, [x3]

.Lxts_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_dec)
.align 3
SYM_FUNC_START(sm4_ce_mac_update)
	/*
	 * CBC-MAC style update of a 16-byte digest.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: digest (read, updated in place)
	 *   x2: src
	 *   w3: nblocks
	 *   w4: enc_before
	 *   w5: enc_after
	 *
	 * enc_before != 0: the incoming digest is encrypted before any data
	 *                  is absorbed.
	 * enc_after  == 0: the final block is only XORed into the digest and
	 *                  left unencrypted.
	 * nblocks == 0 is handled: the (possibly encrypted) digest is simply
	 * stored back.
	 */
	SM4_PREPARE(x0)

	ld1		{RMAC.16b}, [x1]

	cbz		w4, .Lmac_update

	SM4_CRYPT_BLK(RMAC)

.Lmac_update:
	cbz		w3, .Lmac_ret

	/* when enc_after == 0, hold the last block back for .Lmac_end */
	sub		w6, w3, #1
	cmp		w5, wzr
	csel		w3, w3, w6, ne

	cbz		w3, .Lmac_end

.Lmac_loop_4x:
	cmp		w3, #4
	blt		.Lmac_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v3.16b
	SM4_CRYPT_BLK(RMAC)

	cbz		w3, .Lmac_end
	b		.Lmac_loop_4x

.Lmac_loop_1x:
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16

	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)

	cbnz		w3, .Lmac_loop_1x

.Lmac_end:
	cbnz		w5, .Lmac_ret

	/* enc_after == 0: absorb the held-back block without encrypting */
	ld1		{v0.16b}, [x2], #16
	eor		RMAC.16b, RMAC.16b, v0.16b

.Lmac_ret:
	st1		{RMAC.16b}, [x1]
	ret
SYM_FUNC_END(sm4_ce_mac_update)
.
section ".rodata" ,
"a"
.
align 4
.Lbswap128_mask:
.byte
0 x0c,
0 x0d,
0 x0e,
0 x0f,
0 x08,
0 x09,
0 x0a,
0 x0b
.byte
0 x04,
0 x05,
0 x06,
0 x07,
0 x00,
0 x01,
0 x02,
0 x03
.Lcts_permute_table:
.byte
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff
.byte
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff
.byte
0 x0,
0 x1,
0 x2,
0 x3,
0 x4,
0 x5,
0 x6,
0 x7
.byte
0 x8,
0 x9,
0 xa,
0 xb,
0 xc,
0 xd,
0 xe,
0 xf
.byte
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff
.byte
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff
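/*
 * NOTE: the lines below are residue of the web page this listing was
 * extracted from; they are not part of the assembly source.
 *
 *   Messung V0.5 in Prozent C=94 H=100 G=96
 *   Dauer der Verarbeitung: 0.9 Sekunden (vorverarbeitet am 2026-06-05)
 *   © Formatika GbR, Deutschland
 */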