/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
* as specified in rfc8998
* https://datatracker.ietf.org/doc/html/rfc8998
*
* Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
* Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
*/
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>
#include
"sm4-ce-asm.h"
.arch armv8-a+crypto
/*
 * For every vector register that may be handed to the sm4e macro below
 * (v0-v3 and v24-v31, always written with the ".4s" arrangement), define
 * a local symbol ".Lv<N>.4s" whose value is the register number N.
 */
.irp b, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

/*
 * sm4e vd, vn
 *
 * Emits one raw instruction word: base opcode 0xcec08400 with the
 * register number of \vn in bits [9:5] and the register number of \vd
 * in bits [4:0].  Both operands must be spelled "vN.4s" with N taken
 * from the .irp list above, otherwise the .L symbol lookup fails.
 *
 * NOTE(review): presumably hand-encoded so that the file still builds
 * with assemblers that lack the SM4 mnemonics -- confirm.
 */
.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm
/* Register macros */
/* Used for both encryption and decryption */
/*
 * RHASH:   running GHASH value; every routine in this file applies rbit
 *          right after loading it and again right before storing it.
 * RRCONST: reduction constant, loaded with ld1r from .Lghash_rconst
 *          (so both 64-bit lanes hold the same value).
 * RZERO:   cleared with "eor RZERO, RZERO, RZERO" at function entry and
 *          used as the zero operand of the ext shifts in the helpers.
 */
#define RHASH v21
#define RRCONST v22
#define RZERO v23
/* Helper macros. */
/*
 * PMUL_128x128: polynomial (carry-less) 128 x 128 -> 256-bit multiply.
 *
 * input:  m0, m1
 * output: r0:r1 (low 128-bits in r0, high in r1)
 * temps:  T0, T1 (clobbered)
 *
 * RZERO must contain zero.  Four 64x64 products are formed; the two
 * cross products are XORed together and split in half, the lower half
 * being folded into the upper 64 bits of r0 and the upper half into the
 * lower 64 bits of r1.
 */
#define PMUL_128x128(r0, r1, m0, m1, T0, T1)				\
		/* T0 = m1 with its two 64-bit halves swapped */	\
		ext		T0.16b, m1.16b, m1.16b, #8;		\
		/* r0 = m0.lo * m1.lo */				\
		pmull		r0.1q, m0.1d, m1.1d;			\
		/* T1 = m0.lo * m1.hi */				\
		pmull		T1.1q, m0.1d, T0.1d;			\
		/* T0 = m0.hi * m1.lo */				\
		pmull2		T0.1q, m0.2d, T0.2d;			\
		/* r1 = m0.hi * m1.hi */				\
		pmull2		r1.1q, m0.2d, m1.2d;			\
		/* T0 = sum of both cross products */			\
		eor		T0.16b, T0.16b, T1.16b;			\
		/* T1 = cross.lo moved to the upper 64 bits */		\
		ext		T1.16b, RZERO.16b, T0.16b, #8;		\
		/* T0 = cross.hi moved to the lower 64 bits */		\
		ext		T0.16b, T0.16b, RZERO.16b, #8;		\
		eor		r0.16b, r0.16b, T1.16b;			\
		eor		r1.16b, r1.16b, T0.16b;
/*
 * PMUL_128x128_4x: four independent PMUL_128x128 multiplications with
 * their instructions interleaved stage by stage.
 *
 * For k in {0, 2, 4, 6}:
 *   input:  m<k>, m<k+1>
 *   output: r<k>:r<k+1> (low 128 bits in r<k>, high in r<k+1>)
 *   temps:  T<k>, T<k+1> (clobbered)
 *
 * RZERO must contain zero.  The four 256-bit products are NOT summed
 * here; callers XOR them together before running REDUCTION.
 */
#define PMUL_128x128_4x(r0, r1, m0, m1, T0, T1,				\
			r2, r3, m2, m3, T2, T3,				\
			r4, r5, m4, m5, T4, T5,				\
			r6, r7, m6, m7, T6, T7)				\
		/* swap the halves of the second operands */		\
		ext		T0.16b, m1.16b, m1.16b, #8;		\
		ext		T2.16b, m3.16b, m3.16b, #8;		\
		ext		T4.16b, m5.16b, m5.16b, #8;		\
		ext		T6.16b, m7.16b, m7.16b, #8;		\
		/* lo * lo */						\
		pmull		r0.1q, m0.1d, m1.1d;			\
		pmull		r2.1q, m2.1d, m3.1d;			\
		pmull		r4.1q, m4.1d, m5.1d;			\
		pmull		r6.1q, m6.1d, m7.1d;			\
		/* lo * hi */						\
		pmull		T1.1q, m0.1d, T0.1d;			\
		pmull		T3.1q, m2.1d, T2.1d;			\
		pmull		T5.1q, m4.1d, T4.1d;			\
		pmull		T7.1q, m6.1d, T6.1d;			\
		/* hi * lo */						\
		pmull2		T0.1q, m0.2d, T0.2d;			\
		pmull2		T2.1q, m2.2d, T2.2d;			\
		pmull2		T4.1q, m4.2d, T4.2d;			\
		pmull2		T6.1q, m6.2d, T6.2d;			\
		/* hi * hi */						\
		pmull2		r1.1q, m0.2d, m1.2d;			\
		pmull2		r3.1q, m2.2d, m3.2d;			\
		pmull2		r5.1q, m4.2d, m5.2d;			\
		pmull2		r7.1q, m6.2d, m7.2d;			\
		/* combine the cross products */			\
		eor		T0.16b, T0.16b, T1.16b;			\
		eor		T2.16b, T2.16b, T3.16b;			\
		eor		T4.16b, T4.16b, T5.16b;			\
		eor		T6.16b, T6.16b, T7.16b;			\
		/* cross.lo -> upper 64 bits */				\
		ext		T1.16b, RZERO.16b, T0.16b, #8;		\
		ext		T3.16b, RZERO.16b, T2.16b, #8;		\
		ext		T5.16b, RZERO.16b, T4.16b, #8;		\
		ext		T7.16b, RZERO.16b, T6.16b, #8;		\
		/* cross.hi -> lower 64 bits */				\
		ext		T0.16b, T0.16b, RZERO.16b, #8;		\
		ext		T2.16b, T2.16b, RZERO.16b, #8;		\
		ext		T4.16b, T4.16b, RZERO.16b, #8;		\
		ext		T6.16b, T6.16b, RZERO.16b, #8;		\
		/* fold into the low 128 bits of each product */	\
		eor		r0.16b, r0.16b, T1.16b;			\
		eor		r2.16b, r2.16b, T3.16b;			\
		eor		r4.16b, r4.16b, T5.16b;			\
		eor		r6.16b, r6.16b, T7.16b;			\
		/* fold into the high 128 bits of each product */	\
		eor		r1.16b, r1.16b, T0.16b;			\
		eor		r3.16b, r3.16b, T2.16b;			\
		eor		r5.16b, r5.16b, T4.16b;			\
		eor		r7.16b, r7.16b, T6.16b;
/*
 * REDUCTION: fold a 256-bit product back to 128 bits using rconst.
 *
 * input:  r0:r1 (low 128-bits in r0, high in r1)
 * output: a
 * temps:  T0, T1 (clobbered); r0 and r1 are modified as well
 *
 * RZERO must contain zero.  Two folding steps are performed: first the
 * upper 64 bits of r1 are multiplied by rconst and spread over r1.lo
 * and r0.hi, then the (updated) lower 64 bits of r1 are multiplied by
 * rconst and XORed into r0 to give the result.
 */
#define REDUCTION(a, r0, r1, rconst, T0, T1)				\
		/* T0 = r1.hi * rconst */				\
		pmull2		T0.1q, r1.2d, rconst.2d;		\
		/* T1 = upper half of T0, placed in the low lane */	\
		ext		T1.16b, T0.16b, RZERO.16b, #8;		\
		/* T0 = lower half of T0, placed in the high lane */	\
		ext		T0.16b, RZERO.16b, T0.16b, #8;		\
		eor		r1.16b, r1.16b, T1.16b;			\
		eor		r0.16b, r0.16b, T0.16b;			\
		/* T0 = r1.lo * rconst */				\
		pmull		T0.1q, r1.1d, rconst.1d;		\
		eor		a.16b, r0.16b, T0.16b;
/*
 * SM4_CRYPT_PMUL_128x128_BLK: run the eight sm4e steps on block b0
 * while, interleaved with them, computing the same 128x128 polynomial
 * multiply as PMUL_128x128.
 *
 * in/out: b0            block run through sm4e with v24-v31, in place
 * input:  m0, m1        multiplicands
 * output: r0:r1         256-bit product (low 128 bits in r0)
 * temps:  T0, T1        (clobbered)
 *
 * RZERO must contain zero.  v24-v31 are used as the second sm4e
 * operand -- presumably the round keys loaded by SM4_PREPARE; verify
 * against sm4-ce-asm.h.
 */
#define SM4_CRYPT_PMUL_128x128_BLK(b0, r0, r1, m0, m1, T0, T1)		\
		rev32		b0.16b, b0.16b;				\
		ext		T0.16b, m1.16b, m1.16b, #8;		\
	sm4e	b0.4s, v24.4s;						\
		pmull		r0.1q, m0.1d, m1.1d;			\
	sm4e	b0.4s, v25.4s;						\
		pmull		T1.1q, m0.1d, T0.1d;			\
	sm4e	b0.4s, v26.4s;						\
		pmull2		T0.1q, m0.2d, T0.2d;			\
	sm4e	b0.4s, v27.4s;						\
		pmull2		r1.1q, m0.2d, m1.2d;			\
	sm4e	b0.4s, v28.4s;						\
		eor		T0.16b, T0.16b, T1.16b;			\
	sm4e	b0.4s, v29.4s;						\
		ext		T1.16b, RZERO.16b, T0.16b, #8;		\
	sm4e	b0.4s, v30.4s;						\
		ext		T0.16b, T0.16b, RZERO.16b, #8;		\
	sm4e	b0.4s, v31.4s;						\
		eor		r0.16b, r0.16b, T1.16b;			\
		/* reverse the order of the four output words */	\
		rev64		b0.4s, b0.4s;				\
		eor		r1.16b, r1.16b, T0.16b;			\
		ext		b0.16b, b0.16b, b0.16b, #8;		\
		rev32		b0.16b, b0.16b;
/*
 * SM4_CRYPT_PMUL_128x128_BLK3: three-block variant of
 * SM4_CRYPT_PMUL_128x128_BLK.
 *
 * in/out: b0, b1, b2     blocks run through sm4e with v24-v31, in place
 * input:  (m0, m1), (m2, m3), (m4, m5)   three multiplicand pairs
 * output: r0:r1          XOR of the three 256-bit products
 *                        (low 128 bits in r0, high in r1)
 * temps:  T0-T5, r2-r5   (clobbered)
 *
 * RZERO must contain zero.  Unlike PMUL_128x128_4x, this macro already
 * accumulates the partial products into r0:r1, so the caller can run
 * REDUCTION on r0:r1 directly.
 */
#define SM4_CRYPT_PMUL_128x128_BLK3(b0, b1, b2,				\
				    r0, r1, m0, m1, T0, T1,		\
				    r2, r3, m2, m3, T2, T3,		\
				    r4, r5, m4, m5, T4, T5)		\
		rev32		b0.16b, b0.16b;				\
		rev32		b1.16b, b1.16b;				\
		rev32		b2.16b, b2.16b;				\
		ext		T0.16b, m1.16b, m1.16b, #8;		\
		ext		T2.16b, m3.16b, m3.16b, #8;		\
		ext		T4.16b, m5.16b, m5.16b, #8;		\
	sm4e	b0.4s, v24.4s;						\
	sm4e	b1.4s, v24.4s;						\
	sm4e	b2.4s, v24.4s;						\
		pmull		r0.1q, m0.1d, m1.1d;			\
		pmull		r2.1q, m2.1d, m3.1d;			\
		pmull		r4.1q, m4.1d, m5.1d;			\
	sm4e	b0.4s, v25.4s;						\
	sm4e	b1.4s, v25.4s;						\
	sm4e	b2.4s, v25.4s;						\
		pmull		T1.1q, m0.1d, T0.1d;			\
		pmull		T3.1q, m2.1d, T2.1d;			\
		pmull		T5.1q, m4.1d, T4.1d;			\
	sm4e	b0.4s, v26.4s;						\
	sm4e	b1.4s, v26.4s;						\
	sm4e	b2.4s, v26.4s;						\
		pmull2		T0.1q, m0.2d, T0.2d;			\
		pmull2		T2.1q, m2.2d, T2.2d;			\
		pmull2		T4.1q, m4.2d, T4.2d;			\
	sm4e	b0.4s, v27.4s;						\
	sm4e	b1.4s, v27.4s;						\
	sm4e	b2.4s, v27.4s;						\
		pmull2		r1.1q, m0.2d, m1.2d;			\
		pmull2		r3.1q, m2.2d, m3.2d;			\
		pmull2		r5.1q, m4.2d, m5.2d;			\
	sm4e	b0.4s, v28.4s;						\
	sm4e	b1.4s, v28.4s;						\
	sm4e	b2.4s, v28.4s;						\
		eor		T0.16b, T0.16b, T1.16b;			\
		eor		T2.16b, T2.16b, T3.16b;			\
		eor		T4.16b, T4.16b, T5.16b;			\
	sm4e	b0.4s, v29.4s;						\
	sm4e	b1.4s, v29.4s;						\
	sm4e	b2.4s, v29.4s;						\
		ext		T1.16b, RZERO.16b, T0.16b, #8;		\
		ext		T3.16b, RZERO.16b, T2.16b, #8;		\
		ext		T5.16b, RZERO.16b, T4.16b, #8;		\
	sm4e	b0.4s, v30.4s;						\
	sm4e	b1.4s, v30.4s;						\
	sm4e	b2.4s, v30.4s;						\
		ext		T0.16b, T0.16b, RZERO.16b, #8;		\
		ext		T2.16b, T2.16b, RZERO.16b, #8;		\
		ext		T4.16b, T4.16b, RZERO.16b, #8;		\
	sm4e	b0.4s, v31.4s;						\
	sm4e	b1.4s, v31.4s;						\
	sm4e	b2.4s, v31.4s;						\
		eor		r0.16b, r0.16b, T1.16b;			\
		eor		r2.16b, r2.16b, T3.16b;			\
		eor		r4.16b, r4.16b, T5.16b;			\
		rev64		b0.4s, b0.4s;				\
		rev64		b1.4s, b1.4s;				\
		rev64		b2.4s, b2.4s;				\
		eor		r1.16b, r1.16b, T0.16b;			\
		eor		r3.16b, r3.16b, T2.16b;			\
		eor		r5.16b, r5.16b, T4.16b;			\
		ext		b0.16b, b0.16b, b0.16b, #8;		\
		ext		b1.16b, b1.16b, b1.16b, #8;		\
		ext		b2.16b, b2.16b, b2.16b, #8;		\
		/* accumulate product 2 into r0:r1 */			\
		eor		r0.16b, r0.16b, r2.16b;			\
		eor		r1.16b, r1.16b, r3.16b;			\
		rev32		b0.16b, b0.16b;				\
		rev32		b1.16b, b1.16b;				\
		rev32		b2.16b, b2.16b;				\
		/* accumulate product 3 into r0:r1 */			\
		eor		r0.16b, r0.16b, r4.16b;			\
		eor		r1.16b, r1.16b, r5.16b;
/*
 * inc32_le128: materialise the current counter block in vctr, then
 * advance the counter held in x8:x9.
 *
 * x8 holds the first and x9 the second 8 bytes of the counter, both
 * already byte-reversed by the caller (rev after ldp).  vctr receives
 * x8 in lane d[0] and x9 in lane d[1] and is byte-swapped per 64-bit
 * lane.  Only the low 32 bits of x9 are incremented (bfi of w9 + 1),
 * so the increment wraps at 32 bits and never carries upward.
 *
 * clobbers: x6
 */
#define inc32_le128(vctr)						\
		mov		vctr.d[1], x9;				\
		add		w6, w9, #1;				\
		mov		vctr.d[0], x8;				\
		bfi		x9, x6, #0, #32;			\
		rev64		vctr.16b, vctr.16b;
/*
 * GTAG_HASH_LENGTHS: absorb the lengths block into the hash and turn
 * the hash into the final tag.
 *
 * x7      pointer to the 16-byte lengths block
 * x8:x9   counter as kept by the caller; the low 32 bits of x9 are
 *         overwritten with 1 here
 * vctr0   scratch; receives the counter block and then its encryption
 * vlen    scratch; receives the bit-reversed lengths block
 * RHASH   in: running hash (bit-reversed); out: tag in memory bit order
 *
 * Relies on RR0, RR1, RH1 and RTMP0-RTMP3 as defined at the point of
 * expansion.  clobbers: x6
 */
#define GTAG_HASH_LENGTHS(vctr0, vlen)					\
		ld1		{vlen.16b}, [x7];			\
		/* build CTR0: */					\
		/* low 32 bits of the initial IV are always be32(1) */	\
		mov		x6, #0x1;				\
		bfi		x9, x6, #0, #32;			\
		mov		vctr0.d[0], x8;				\
		mov		vctr0.d[1], x9;				\
		rbit		vlen.16b, vlen.16b;			\
		rev64		vctr0.16b, vctr0.16b;			\
		/* tag = E(CTR0) ^ GHASH(... || lengths) */		\
		eor		RHASH.16b, RHASH.16b, vlen.16b;		\
		SM4_CRYPT_PMUL_128x128_BLK(vctr0, RR0, RR1, RHASH, RH1,	\
					   RTMP0, RTMP1);		\
		REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3);	\
		rbit		RHASH.16b, RHASH.16b;			\
		eor		RHASH.16b, RHASH.16b, vctr0.16b;
/* Register macros for encrypt and ghash */
/* can be the same as input v0-v3 */
/*
 * RR<odd>  (v0-v3): high 128 bits of the four partial products.  They
 *                   alias the data registers v0-v3; the high product is
 *                   written by the last pmull2 that reads the input.
 * RR<even> (v4-v7): low 128 bits of the four partial products.
 * RTMP0-7 (v8-v15): scratch for the multiply/reduce helpers.
 * RH1-RH4 (v16-v19): the four table entries H^1..H^4, loaded/stored as
 *                   one ld1/st1 register list.
 *
 * NOTE(review): v8-v15 are callee-saved under AAPCS64 (low 64 bits) and
 * are not preserved in this file -- presumably the callers bracket these
 * routines with kernel_neon_begin()/kernel_neon_end(); confirm.
 */
#define RR1 v0
#define RR3 v1
#define RR5 v2
#define RR7 v3
#define RR0 v4
#define RR2 v5
#define RR4 v6
#define RR6 v7
#define RTMP0 v8
#define RTMP1 v9
#define RTMP2 v10
#define RTMP3 v11
#define RTMP4 v12
#define RTMP5 v13
#define RTMP6 v14
#define RTMP7 v15
#define RH1 v16
#define RH2 v17
#define RH3 v18
#define RH4 v19
/*
 * sm4_ce_pmull_ghash_setup: derive the hash key H = E(K, 0^128) and
 * write the table { H^1, H^2, H^3, H^4 } (64 bytes, bit-reversed
 * representation) to the ghash table.
 */
.align 3
SYM_FUNC_START(sm4_ce_pmull_ghash_setup)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: ghash table
	 */
	SM4_PREPARE(x0)

	/* reduction constant in both lanes, and an all-zero register */
	adr_l		x2, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x2]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	/* H = E(K, 0^128) */
	rev32		v0.16b, RZERO.16b
	SM4_CRYPT_BLK_BE(v0)

	/* H ^ 1 */
	rbit		RH1.16b, v0.16b

	/* H ^ 2 = H * H */
	PMUL_128x128(RR0, RR1, RH1, RH1, RTMP0, RTMP1)
	REDUCTION(RH2, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 3 = H^2 * H */
	PMUL_128x128(RR0, RR1, RH2, RH1, RTMP0, RTMP1)
	REDUCTION(RH3, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 4 = H^2 * H^2 */
	PMUL_128x128(RR0, RR1, RH2, RH2, RTMP0, RTMP1)
	REDUCTION(RH4, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1		{RH1.16b-RH4.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_pmull_ghash_setup)
/*
 * pmull_ghash_update: absorb w3 whole 16-byte blocks from src into the
 * running GHASH value at x1, four blocks at a time while possible and
 * one block at a time for the remainder.
 *
 * The hash is kept bit-reversed (rbit) while in registers and converted
 * back before it is stored.  A block count of zero leaves the stored
 * hash unchanged and reads nothing from src.
 */
.align 3
SYM_FUNC_START(pmull_ghash_update)
	/* input:
	 *   x0: ghash table
	 *   x1: ghash result
	 *   x2: src
	 *   w3: nblocks
	 */
	ld1		{RH1.16b-RH4.16b}, [x0]

	ld1		{RHASH.16b}, [x1]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x4, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x4]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	/*
	 * Nothing to hash: without this guard the 1x loop below would
	 * decrement w3 from 0 to 0xffffffff and run far past the end of
	 * src.  Mirrors the "cbz w4" guard in the gcm enc/dec routines.
	 */
	cbz		w3, .Lghash_end

.Lghash_loop_4x:
	cmp		w3, #4
	blt		.Lghash_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rbit		v0.16b, v0.16b
	rbit		v1.16b, v1.16b
	rbit		v2.16b, v2.16b
	rbit		v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	/* sum the four 256-bit products into rr0:rr1, then reduce once */
	eor		RR0.16b, RR0.16b, RR2.16b
	eor		RR1.16b, RR1.16b, RR3.16b
	eor		RR0.16b, RR0.16b, RR4.16b
	eor		RR1.16b, RR1.16b, RR5.16b
	eor		RR0.16b, RR0.16b, RR6.16b
	eor		RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz		w3, .Lghash_end
	b		.Lghash_loop_4x

.Lghash_loop_1x:
	/* here 1 <= w3 <= 3 */
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbnz		w3, .Lghash_loop_1x

.Lghash_end:
	/* back to memory bit order and write out */
	rbit		RHASH.16b, RHASH.16b
	st1		{RHASH.2d}, [x1]

	ret
SYM_FUNC_END(pmull_ghash_update)
/*
 * sm4_ce_pmull_gcm_enc: CTR-encrypt w4 bytes from src to dst and feed
 * the resulting ciphertext into the running GHASH value at x5.
 *
 * Full groups of four blocks are handled first, then single blocks,
 * then a final partial block byte by byte.  When x7 is NULL the updated
 * counter is written back to x3 and the running hash to x5; otherwise
 * the lengths block at x7 is absorbed and the final tag is written to
 * x5 (the counter is not written back in that case).
 */
.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 */
	SM4_PREPARE(x0)

	/* x8:x9 = counter, byte-reversed for integer arithmetic */
	ldp		x8, x9, [x3]
	rev		x8, x8
	rev		x9, x9

	ld1		{RH1.16b-RH4.16b}, [x6]

	ld1		{RHASH.16b}, [x5]
	rbit		RHASH.16b, RHASH.16b

	/* x6 is free from here on and is reused as scratch */
	adr_l		x6, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x6]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	cbz		w4, .Lgcm_enc_hash_len

.Lgcm_enc_loop_4x:
	cmp		w4, #(4 * 16)
	blt		.Lgcm_enc_loop_1x

	sub		w4, w4, #(4 * 16)

	/* construct CTRs */
	inc32_le128(v0)			/* +0 */
	inc32_le128(v1)			/* +1 */
	inc32_le128(v2)			/* +2 */
	inc32_le128(v3)			/* +3 */

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	/* ciphertext = keystream ^ plaintext */
	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	/* ghash update */

	rbit		v0.16b, v0.16b
	rbit		v1.16b, v1.16b
	rbit		v2.16b, v2.16b
	rbit		v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	/* sum the four 256-bit products into rr0:rr1, then reduce once */
	eor		RR0.16b, RR0.16b, RR2.16b
	eor		RR1.16b, RR1.16b, RR3.16b
	eor		RR0.16b, RR0.16b, RR4.16b
	eor		RR1.16b, RR1.16b, RR5.16b
	eor		RR0.16b, RR0.16b, RR6.16b
	eor		RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz		w4, .Lgcm_enc_hash_len
	b		.Lgcm_enc_loop_4x

.Lgcm_enc_loop_1x:
	cmp		w4, #16
	blt		.Lgcm_enc_tail

	sub		w4, w4, #16

	/* construct CTRs */
	inc32_le128(v0)

	ld1		{RTMP0.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, RTMP0.16b
	st1		{v0.16b}, [x1], #16

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbz		w4, .Lgcm_enc_hash_len
	b		.Lgcm_enc_loop_1x

.Lgcm_enc_tail:
	/* 1 <= w4 <= 15 bytes remain */

	/* construct CTRs */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/*
	 * load permute table: the 16 bytes at (table + 32 - w4) select the
	 * top w4 bytes of the source for the first w4 result bytes and
	 * 0xff (out of range for tbl, i.e. zero) for the rest
	 */
	adr_l		x0, .Lcts_permute_table
	add		x0, x0, #32
	sub		x0, x0, w4, uxtw
	ld1		{v3.16b}, [x0]

.Lgcm_enc_tail_loop:
	/* do encrypt */
	ldrb		w0, [x2], #1	/* get 1 byte from input */
	umov		w6, v0.b[0]	/* get top crypted byte */
	eor		w6, w6, w0	/* w6 = CTR ^ input */
	strb		w6, [x1], #1	/* store out byte */

	/* shift right out one byte */
	ext		v0.16b, v0.16b, v0.16b, #1
	/* the last ciphertext is placed in high bytes */
	ins		v0.b[15], w6

	subs		w4, w4, #1
	bne		.Lgcm_enc_tail_loop

	/* padding last block with zeros */
	tbl		v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_enc_hash_len:
	/* no lengths block: this was an intermediate call */
	cbz		x7, .Lgcm_enc_end

	GTAG_HASH_LENGTHS(v1, v3)

	b		.Lgcm_enc_ret

.Lgcm_enc_end:
	/* store new CTR */
	rev		x8, x8
	rev		x9, x9
	stp		x8, x9, [x3]

	rbit		RHASH.16b, RHASH.16b

.Lgcm_enc_ret:
	/* store new MAC */
	st1		{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_enc)
/* Drop the encrypt/ghash register assignment before redefining it */
#undef RR1
#undef RR3
#undef RR5
#undef RR7
#undef RR0
#undef RR2
#undef RR4
#undef RR6
#undef RTMP0
#undef RTMP1
#undef RTMP2
#undef RTMP3
#undef RTMP4
#undef RTMP5
#undef RTMP6
#undef RTMP7
#undef RH1
#undef RH2
#undef RH3
#undef RH4
/* Register macros for decrypt */
/* v0-v2 for building CTRs, v3-v5 for saving inputs */
/*
 * The decrypt path works on three blocks at a time, so only three
 * product pairs (RR0-RR5), six temporaries and H^1..H^3 are needed.
 * RR1/RR3/RR5 (v6-v8) are also the registers the decrypt loop fills
 * with the bit-reversed ciphertext before multiplying.
 */
#define RR1 v6
#define RR3 v7
#define RR5 v8
#define RR0 v9
#define RR2 v10
#define RR4 v11
#define RTMP0 v12
#define RTMP1 v13
#define RTMP2 v14
#define RTMP3 v15
#define RTMP4 v16
#define RTMP5 v17
#define RH1 v18
#define RH2 v19
#define RH3 v20
/*
 * sm4_ce_pmull_gcm_dec: feed w4 bytes of ciphertext from src into the
 * running GHASH value at x5 and CTR-decrypt them to dst.
 *
 * Full groups of three blocks are handled first (keystream generation
 * and hash multiply interleaved), then single blocks, then a final
 * partial block byte by byte.  When x7 is NULL the updated counter is
 * written back to x3 and the running hash to x5; otherwise the lengths
 * block at x7 is absorbed and the final tag is written to x5 (the
 * counter is not written back in that case).
 */
.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 */
	SM4_PREPARE(x0)

	/* x8:x9 = counter, byte-reversed for integer arithmetic */
	ldp		x8, x9, [x3]
	rev		x8, x8
	rev		x9, x9

	/* only H^1..H^3 are needed for the 3-block loop */
	ld1		{RH1.16b-RH3.16b}, [x6]

	ld1		{RHASH.16b}, [x5]
	rbit		RHASH.16b, RHASH.16b

	/* x6 is free from here on and is reused as scratch */
	adr_l		x6, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x6]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	cbz		w4, .Lgcm_dec_hash_len

.Lgcm_dec_loop_3x:
	cmp		w4, #(3 * 16)
	blt		.Lgcm_dec_loop_1x

	sub		w4, w4, #(3 * 16)

	ld1		{v3.16b-v5.16b}, [x2], #(3 * 16)

	/* construct CTRs; bit-reversed ciphertext goes to v6-v8 */
	inc32_le128(v0)			/* +0 */
	rbit		v6.16b, v3.16b
	inc32_le128(v1)			/* +1 */
	rbit		v7.16b, v4.16b
	inc32_le128(v2)			/* +2 */
	rbit		v8.16b, v5.16b

	eor		RHASH.16b, RHASH.16b, v6.16b

	/* decrypt & ghash update */
	SM4_CRYPT_PMUL_128x128_BLK3(v0, v1, v2,
				    RR0, RR1, RHASH, RH3, RTMP0, RTMP1,
				    RR2, RR3, v7, RH2, RTMP2, RTMP3,
				    RR4, RR5, v8, RH1, RTMP4, RTMP5)

	/* plaintext = keystream ^ ciphertext */
	eor		v0.16b, v0.16b, v3.16b
	eor		v1.16b, v1.16b, v4.16b
	eor		v2.16b, v2.16b, v5.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	st1		{v0.16b-v2.16b}, [x1], #(3 * 16)

	cbz		w4, .Lgcm_dec_hash_len
	b		.Lgcm_dec_loop_3x

.Lgcm_dec_loop_1x:
	cmp		w4, #16
	blt		.Lgcm_dec_tail

	sub		w4, w4, #16

	ld1		{v3.16b}, [x2], #16

	/* construct CTRs */
	inc32_le128(v0)
	rbit		v6.16b, v3.16b

	eor		RHASH.16b, RHASH.16b, v6.16b

	SM4_CRYPT_PMUL_128x128_BLK(v0, RR0, RR1, RHASH, RH1, RTMP0, RTMP1)

	eor		v0.16b, v0.16b, v3.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1		{v0.16b}, [x1], #16

	cbz		w4, .Lgcm_dec_hash_len
	b		.Lgcm_dec_loop_1x

.Lgcm_dec_tail:
	/* 1 <= w4 <= 15 bytes remain */

	/* construct CTRs */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/*
	 * load permute table: the 16 bytes at (table + 32 - w4) select the
	 * top w4 bytes of the source for the first w4 result bytes and
	 * 0xff (out of range for tbl, i.e. zero) for the rest
	 */
	adr_l		x0, .Lcts_permute_table
	add		x0, x0, #32
	sub		x0, x0, w4, uxtw
	ld1		{v3.16b}, [x0]

.Lgcm_dec_tail_loop:
	/* do decrypt */
	ldrb		w0, [x2], #1	/* get 1 byte from input */
	umov		w6, v0.b[0]	/* get top crypted byte */
	eor		w6, w6, w0	/* w6 = CTR ^ input */
	strb		w6, [x1], #1	/* store out byte */

	/* shift right out one byte */
	ext		v0.16b, v0.16b, v0.16b, #1
	/*
	 * the last ciphertext is placed in high bytes; on this path the
	 * ciphertext is the input byte (w0), not the output byte
	 */
	ins		v0.b[15], w0

	subs		w4, w4, #1
	bne		.Lgcm_dec_tail_loop

	/* padding last block with zeros */
	tbl		v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_dec_hash_len:
	/* no lengths block: this was an intermediate call */
	cbz		x7, .Lgcm_dec_end

	GTAG_HASH_LENGTHS(v1, v3)

	b		.Lgcm_dec_ret

.Lgcm_dec_end:
	/* store new CTR */
	rev		x8, x8
	rev		x9, x9
	stp		x8, x9, [x3]

	rbit		RHASH.16b, RHASH.16b

.Lgcm_dec_ret:
	/* store new MAC */
	st1		{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_dec)
.
section ".rodata" ,
"a"
.
align 4
.Lcts_permute_table:
.byte
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff
.byte
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff
.byte
0 x0,
0 x1,
0 x2,
0 x3,
0 x4,
0 x5,
0 x6,
0 x7
.byte
0 x8,
0 x9,
0 xa,
0 xb,
0 xc,
0 xd,
0 xe,
0 xf
.byte
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff
.byte
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff,
0 xff
.Lghash_rconst:
.quad
0 x87
/*
 * Residue from the web page this file was extracted from (not part of
 * the source):
 *   Measurement V0.5 in percent: C=94 H=96 G=94
 *   Processing time: 0.12 seconds (preprocessed on 2026-06-07)
 *   (c) Formatika GbR, Germany
 */