/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Fast AES implementation for SPE instruction set (PPC)
*
* This code makes use of the SPE SIMD instruction set as defined in
* http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
* Implementation is based on optimization guide notes from
* http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
*
* Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
*/
#include <asm/ppc_asm.h>
#include "aes-spe-regs.h"
/*
 * EAD(in, bpos): table address calculation for word/enc-byte lookups.
 *
 * Rotates register 'in' so that its byte number 'bpos' (0 = least
 * significant byte ... 3 = most significant byte) lands in bits 20-27
 * (IBM bit numbering) and inserts only those bits into rT0. All other
 * bits of rT0 are kept, i.e. rT0 = (rT0 & ~0xff0) | (byte << 4): the byte
 * becomes a table index scaled by 16.
 * NOTE(review): this presumably relies on rT0 pointing to a suitably
 * aligned table so that the inserted field acts as an offset - confirm
 * against the caller that sets up rT0.
 */
#define EAD(in, bpos) \
	rlwimi		rT0,in,28-((bpos+3)%4)*8,20,27;

/*
 * DAD(in, bpos): table address calculation for dec-byte lookups.
 *
 * Same byte selection as EAD, but the byte is inserted unscaled into
 * bits 24-31 (the low 8 bits) of rT1; the remaining bits of rT1 are kept.
 * NOTE(review): presumably rT1 holds the base of a 256-entry byte table
 * aligned so that the low 8 bits are free - confirm against the caller.
 */
#define DAD(in, bpos) \
	rlwimi		rT1,in,24-((bpos+3)%4)*8,24,31;
/*
 * LWH(out, off): load word high.
 * Loads the word at off(rT0) with evlwwsplat, which places it in both
 * 32-bit halves of the 64-bit SPE register 'out'. A following LWL on the
 * same register replaces only the low half.
 */
#define LWH(out, off) \
	evlwwsplat	out,off(rT0);

/*
 * LWL(out, off): load word low.
 * Loads the word at off(rT0) into 'out' with a plain lwz.
 */
#define LWL(out, off) \
	lwz		out,off(rT0);

/*
 * LBZ(out, tab, off): load byte.
 * Loads the zero-extended byte at off(tab) into 'out'.
 */
#define LBZ(out, tab, off) \
	lbz		out,off(tab);
/* LAH(out, in, bpos, off): calc addr (EAD) + load word high (LWH). */
#define LAH(out, in, bpos, off) \
	EAD(in, bpos) \
	LWH(out, off)

/* LAL(out, in, bpos, off): calc addr (EAD) + load word low (LWL). */
#define LAL(out, in, bpos, off) \
	EAD(in, bpos) \
	LWL(out, off)

/*
 * LAE(out, in, bpos): calc addr (EAD) + load enc byte.
 * The byte is read from offset 8 of the 16-byte-strided entry that rT0
 * addresses after EAD.
 */
#define LAE(out, in, bpos) \
	EAD(in, bpos) \
	LBZ(out, rT0, 8)

/*
 * LBE(out): load enc byte from 8(rT0) using the address left in rT0 by a
 * preceding EAD.
 */
#define LBE(out) \
	LBZ(out, rT0, 8)

/* LAD(out, in, bpos): calc addr (DAD) + load dec byte from 0(rT1). */
#define LAD(out, in, bpos) \
	DAD(in, bpos) \
	LBZ(out, rT1, 0)

/*
 * LBD(out): load dec byte from 0(rT1) using the address left in rT1 by a
 * preceding DAD.
 */
#define LBD(out) \
	LBZ(out, rT1, 0)
/*
 * ppc_encrypt_block: The central encryption function for a single 16-byte
 * block. It does no stack handling or register saving, in order to support
 * fast calls via bl/blr. It expects that the caller has pre-xored the input
 * data with the first 4 words of the encryption key into rD0-rD3. The
 * pointer/counter registers (rT0, rKP, CTR) must also have been set up
 * beforehand. Output is stored in rD0-rD3 and rW0-rW3, and the caller must
 * execute a final xor on the output registers. All working registers
 * rD0-rD3 & rW0-rW7 are overwritten during processing.
 */
_GLOBAL(ppc_encrypt_block)
	/*
	 * In:    rD0-rD3 = block state, rT0 = lookup table pointer,
	 *        rKP = key pointer, CTR = loop iteration count.
	 * Out:   rW0-rW3 = words assembled from byte lookups,
	 *        rD0-rD3 = the four words loaded from 32(rKP)..44(rKP).
	 * Clobb: rD0-rD3, rW0-rW7, rT0, rKP, CTR.
	 * No stack frame and no register saving; returns with blr.
	 *
	 * The instruction order below interleaves address calculation, table
	 * loads and xors on purpose; do not reorder.
	 */

	/* Pre-load three table words that the first stage consumes. */
	LAH(rW4, rD1, 2, 4)
	LAH(rW6, rD0, 3, 0)
	LAH(rW3, rD0, 1, 8)
ppc_encrypt_block_loop:
	/* Stage A: table lookups, xor with key words at 16(rKP)/24(rKP). */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD3, 2, 4)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	evldw		rD1,16(rKP)
	EAD(rD3, 3)
	evxor		rW2,rW2,rW4
	LWL(rW7, 0)
	evxor		rW2,rW2,rW6
	EAD(rD2, 0)
	evxor		rD1,rD1,rW2
	LWL(rW1, 12)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1
	/* The next three loads pre-fetch table words for stage B. */
	EAD(rD1, 2)
	evxor		rW3,rW3,rW5
	LWH(rW4, 4)
	evxor		rW3,rW3,rW7
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW1
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)

	/* Stage B: same sequence, key words at 32(rKP)/40(rKP). */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD3, 2, 4)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	evldw		rD1,32(rKP)
	EAD(rD3, 3)
	evxor		rW2,rW2,rW4
	LWL(rW7, 0)
	evxor		rW2,rW2,rW6
	EAD(rD2, 0)
	evxor		rD1,rD1,rW2
	LWL(rW1, 12)
	evxor		rD1,rD1,rW0
	evldw		rD3,40(rKP)
	evmergehi	rD0,rD0,rD1
	/* Pre-fetch table words for the next stage A (or the tail). */
	EAD(rD1, 2)
	evxor		rW3,rW3,rW5
	LWH(rW4, 4)
	evxor		rW3,rW3,rW7
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW1
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)
	addi		rKP,rKP,32	/* two stages consumed 32 key bytes */
	bdnz		ppc_encrypt_block_loop

	/* Tail: one more word-table stage, key words at 16(rKP)/24(rKP). */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAH(rW5, rD3, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	evldw		rD1,16(rKP)
	EAD(rD3, 3)
	evxor		rW2,rW2,rW4
	LWL(rW7, 0)
	evxor		rW2,rW2,rW6
	EAD(rD2, 0)
	evxor		rD1,rD1,rW2
	LWL(rW1, 12)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1
	/* From here on single bytes are fetched (LBE/LAE) instead of words. */
	EAD(rD1, 0)
	evxor		rW3,rW3,rW5
	LBE(rW2)
	evxor		rW3,rW3,rW7
	EAD(rD0, 1)
	evxor		rD3,rD3,rW3
	LBE(rW6)
	evxor		rD3,rD3,rW1
	EAD(rD0, 0)
	evmergehi	rD2,rD2,rD3
	LBE(rW1)

	/*
	 * Final stage: byte lookups. Each rlwimi inserts the low byte of a
	 * freshly loaded rW4-rW7 into the next higher byte lane of rW0-rW3
	 * (bits 16-23, then 8-15, then 0-7). The four words at
	 * 32(rKP)..44(rKP) are loaded into rD0-rD3 for the caller's final xor.
	 */
	LAE(rW0, rD3, 0)
	LAE(rW1, rD0, 0)
	LAE(rW4, rD2, 1)
	LAE(rW5, rD3, 1)
	LAE(rW3, rD2, 0)
	LAE(rW7, rD1, 1)
	rlwimi		rW0,rW4,8,16,23
	rlwimi		rW1,rW5,8,16,23
	LAE(rW4, rD1, 2)
	LAE(rW5, rD2, 2)
	rlwimi		rW2,rW6,8,16,23
	rlwimi		rW3,rW7,8,16,23
	LAE(rW6, rD3, 2)
	LAE(rW7, rD0, 2)
	rlwimi		rW0,rW4,16,8,15
	rlwimi		rW1,rW5,16,8,15
	LAE(rW4, rD0, 3)
	LAE(rW5, rD1, 3)
	rlwimi		rW2,rW6,16,8,15
	lwz		rD0,32(rKP)
	rlwimi		rW3,rW7,16,8,15
	lwz		rD1,36(rKP)
	LAE(rW6, rD2, 3)
	LAE(rW7, rD3, 3)
	rlwimi		rW0,rW4,24,0,7
	lwz		rD2,40(rKP)
	rlwimi		rW1,rW5,24,0,7
	lwz		rD3,44(rKP)
	rlwimi		rW2,rW6,24,0,7
	rlwimi		rW3,rW7,24,0,7
	blr
/*
 * ppc_decrypt_block: The central decryption function for a single 16-byte
 * block. It does no stack handling or register saving, in order to support
 * fast calls via bl/blr. It expects that the caller has pre-xored the input
 * data with the first 4 words of the key into rD0-rD3 (the original wording
 * says "encryption key"; presumably the key schedule used for decryption is
 * meant - confirm against the caller). The pointer/counter registers
 * (rT0, rT1, rKP, CTR) must also have been set up beforehand; rT1 is used for
 * the byte lookups of the final stage. Output is stored in rD0-rD3 and
 * rW0-rW3, and the caller must execute a final xor on the output registers.
 * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
 */
_GLOBAL(ppc_decrypt_block)
	/*
	 * In:    rD0-rD3 = block state, rT0 = word lookup table pointer,
	 *        rT1 = byte lookup table pointer (used by DAD/LBD/LAD),
	 *        rKP = key pointer, CTR = loop iteration count.
	 * Out:   rW0-rW3 = words assembled from byte lookups,
	 *        rD0-rD3 = the four words loaded from 32(rKP)..44(rKP).
	 * Clobb: rD0-rD3, rW0-rW7, rT0, rT1, rKP, CTR.
	 * No stack frame and no register saving; returns with blr.
	 *
	 * The instruction order below interleaves address calculation, table
	 * loads and xors on purpose; do not reorder.
	 */

	/* Pre-load three table words that the first stage consumes. */
	LAH(rW0, rD1, 0, 12)
	LAH(rW6, rD0, 3, 0)
	LAH(rW3, rD0, 1, 8)
ppc_decrypt_block_loop:
	/* Stage A: table lookups, xor with key words at 16(rKP)/24(rKP). */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	evldw		rD1,16(rKP)
	EAD(rD0, 0)
	evxor		rW4,rW4,rW6
	LWL(rW1, 12)
	evxor		rW0,rW0,rW4
	EAD(rD2, 2)
	evxor		rW0,rW0,rW2
	LWL(rW5, 4)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1
	/* The next three loads pre-fetch table words for stage B. */
	EAD(rD1, 0)
	evxor		rW3,rW3,rW7
	LWH(rW0, 12)
	evxor		rW3,rW3,rW1
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW5
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)

	/* Stage B: same sequence, key words at 32(rKP)/40(rKP). */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	evldw		rD1,32(rKP)
	EAD(rD0, 0)
	evxor		rW4,rW4,rW6
	LWL(rW1, 12)
	evxor		rW0,rW0,rW4
	EAD(rD2, 2)
	evxor		rW0,rW0,rW2
	LWL(rW5, 4)
	evxor		rD1,rD1,rW0
	evldw		rD3,40(rKP)
	evmergehi	rD0,rD0,rD1
	/* Pre-fetch table words for the next stage A (or the tail). */
	EAD(rD1, 0)
	evxor		rW3,rW3,rW7
	LWH(rW0, 12)
	evxor		rW3,rW3,rW1
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW5
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)
	addi		rKP,rKP,32	/* two stages consumed 32 key bytes */
	bdnz		ppc_decrypt_block_loop

	/* Tail: one more word-table stage, key words at 16(rKP)/24(rKP). */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	evldw		rD1,16(rKP)
	EAD(rD0, 0)
	evxor		rW4,rW4,rW6
	LWL(rW1, 12)
	evxor		rW0,rW0,rW4
	EAD(rD2, 2)
	evxor		rW0,rW0,rW2
	LWL(rW5, 4)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1
	/* From here on single bytes are fetched via rT1 (DAD/LBD/LAD). */
	DAD(rD1, 0)
	evxor		rW3,rW3,rW7
	LBD(rW0)
	evxor		rW3,rW3,rW1
	DAD(rD0, 1)
	evxor		rD3,rD3,rW3
	LBD(rW6)
	evxor		rD3,rD3,rW5
	DAD(rD0, 0)
	evmergehi	rD2,rD2,rD3
	LBD(rW3)

	/*
	 * Final stage: byte lookups. Each rlwimi inserts the low byte of a
	 * freshly loaded rW4-rW7 into the next higher byte lane of rW0-rW3
	 * (bits 16-23, then 8-15, then 0-7). The four words at
	 * 32(rKP)..44(rKP) are loaded into rD0-rD3 for the caller's final xor.
	 */
	LAD(rW2, rD3, 0)
	LAD(rW1, rD2, 0)
	LAD(rW4, rD2, 1)
	LAD(rW5, rD3, 1)
	LAD(rW7, rD1, 1)
	rlwimi		rW0,rW4,8,16,23
	rlwimi		rW1,rW5,8,16,23
	LAD(rW4, rD3, 2)
	LAD(rW5, rD0, 2)
	rlwimi		rW2,rW6,8,16,23
	rlwimi		rW3,rW7,8,16,23
	LAD(rW6, rD1, 2)
	LAD(rW7, rD2, 2)
	rlwimi		rW0,rW4,16,8,15
	rlwimi		rW1,rW5,16,8,15
	LAD(rW4, rD0, 3)
	LAD(rW5, rD1, 3)
	rlwimi		rW2,rW6,16,8,15
	lwz		rD0,32(rKP)
	rlwimi		rW3,rW7,16,8,15
	lwz		rD1,36(rKP)
	LAD(rW6, rD2, 3)
	LAD(rW7, rD3, 3)
	rlwimi		rW0,rW4,24,0,7
	lwz		rD2,40(rKP)
	rlwimi		rW1,rW5,24,0,7
	lwz		rD3,44(rKP)
	rlwimi		rW2,rW6,24,0,7
	rlwimi		rW3,rW7,24,0,7
	blr
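/*
 * NOTE: the lines below are not part of the original source. They are a
 * footer appended by the web code viewer this file was extracted from
 * (translated from German):
 *
 *   Measurement V0.5 in percent: C=96 H=90 G=93
 *   Processing time: 0.11 seconds (preprocessed on 2026-06-07)
 *   (c) Formatika GbR, Germany
 */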