Quellcode-Bibliothek chacha-core.S
Sprache: unbekannt
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
* Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#define MASK_U32 0 x3c
#define CHACHA20_BLOCK_SIZE 64
#define STACK_SIZE 32
#define X0 $t0
#define X1 $t1
#define X2 $t2
#define X3 $t3
#define X4 $t4
#define X5 $t5
#define X6 $t6
#define X7 $t7
#define X8 $t8
#define X9 $t9
#define X10 $v1
#define X11 $s6
#define X12 $s5
#define X13 $s4
#define X14 $s3
#define X15 $s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0 $s1
#define T1 $s0
#define T(n) T ## n
#define X(n) X ## n
/* Input arguments */
#define STATE $a0
#define OUT $a1
#define IN $a2
#define BYTES $a3
/* Output argument */
/* NONCE[0] is kept in a register and not in memory.
* We don't want to touch original value in memory.
* Must be incremented every loop iteration.
*/
#define NONCE_0 $v0
/* SAVED_X and SAVED_CA are set in the jump table.
* Use regs which are overwritten on exit else we don't leak clear data.
* They are used to handling the last bytes which are not multiple of 4.
*/
#define SAVED_X X15
#define SAVED_CA $s7
#define IS_UNALIGNED $s7
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define CPU_TO_LE32(n) \
wsbh n, n; \
rotr n, 16 ;
#else
#define MSB 3
#define LSB 0
#define CPU_TO_LE32(n)
#endif
#define FOR_EACH_WORD(x) \
x( 0 ); \
x( 1 ); \
x( 2 ); \
x( 3 ); \
x( 4 ); \
x( 5 ); \
x( 6 ); \
x( 7 ); \
x( 8 ); \
x( 9 ); \
x(10 ); \
x(11 ); \
x(12 ); \
x(13 ); \
x(14 ); \
x(15 );
#define FOR_EACH_WORD_REV(x) \
x(15 ); \
x(14 ); \
x(13 ); \
x(12 ); \
x(11 ); \
x(10 ); \
x( 9 ); \
x( 8 ); \
x( 7 ); \
x( 6 ); \
x( 5 ); \
x( 4 ); \
x( 3 ); \
x( 2 ); \
x( 1 ); \
x( 0 );
#define PLUS_ONE_0 1
#define PLUS_ONE_1 2
#define PLUS_ONE_2 3
#define PLUS_ONE_3 4
#define PLUS_ONE_4 5
#define PLUS_ONE_5 6
#define PLUS_ONE_6 7
#define PLUS_ONE_7 8
#define PLUS_ONE_8 9
#define PLUS_ONE_9 10
#define PLUS_ONE_10 11
#define PLUS_ONE_11 12
#define PLUS_ONE_12 13
#define PLUS_ONE_13 14
#define PLUS_ONE_14 15
#define PLUS_ONE_15 16
#define PLUS_ONE(x) PLUS_ONE_ ## x
#define _CONCAT3(a,b,c) a ## b ## c
#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
.if (x != 12); \
lw T0, (x*4 )(STATE); \
.endif; \
lwl T1, (x*4 )+MSB ## (IN); \
lwr T1, (x*4 )+LSB ## (IN); \
.if (x == 12 ); \
addu X ## x, NONCE_0; \
.else; \
addu X ## x, T0; \
.endif; \
CPU_TO_LE32(X ## x); \
xor X ## x, T1; \
swl X ## x, (x*4 )+MSB ## (OUT); \
swr X ## x, (x*4 )+LSB ## (OUT);
#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
.if (x != 12); \
lw T0, (x*4 )(STATE); \
.endif; \
lw T1, (x*4 ) ## (IN); \
.if (x == 12 ); \
addu X ## x, NONCE_0; \
.else; \
addu X ## x, T0; \
.endif; \
CPU_TO_LE32(X ## x); \
xor X ## x, T1; \
sw X ## x, (x*4 ) ## (OUT);
/* Jump table macro.
* Used for setup and handling the last bytes, which are not multiple of 4.
* X15 is free to store Xn
* Every jumptable entry must be equal in size.
*/
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
.set noreorder; \
b .Lchacha_mips_xor_aligned_ ## x ## _b; \
.if (x == 12 ); \
addu SAVED_X, X ## x, NONCE_0; \
.else; \
addu SAVED_X, X ## x, SAVED_CA; \
.endif; \
.set reorder
#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
.set noreorder; \
b .Lchacha_mips_xor_unaligned_ ## x ## _b; \
.if (x == 12 ); \
addu SAVED_X, X ## x, NONCE_0; \
.else; \
addu SAVED_X, X ## x, SAVED_CA; \
.endif; \
.set reorder
#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
addu X(A), X(K); \
addu X(B), X(L); \
addu X(C), X(M); \
addu X(D), X(N); \
xor X(V), X(A); \
xor X(W), X(B); \
xor X(Y), X(C); \
xor X(Z), X(D); \
rotr X(V), 32 - S; \
rotr X(W), 32 - S; \
rotr X(Y), 32 - S; \
rotr X(Z), 32 - S;
.text
.set reorder
.set noat
.globl chacha_crypt_arch
.ent chacha_crypt_arch
chacha_crypt_arch:
.frame $sp, STACK_SIZE, $ra
/* Load number of rounds */
lw $at, 16 ($sp)
addiu $sp, -STACK_SIZE
/* Return bytes = 0. */
beqz BYTES, .Lchacha_mips_end
lw NONCE_0, 48 (STATE)
/* Save s0-s7 */
sw $s0, 0 ($sp)
sw $s1, 4 ($sp)
sw $s2, 8 ($sp)
sw $s3, 12 ($sp)
sw $s4, 16 ($sp)
sw $s5, 20 ($sp)
sw $s6, 24 ($sp)
sw $s7, 28 ($sp)
/* Test IN or OUT is unaligned.
* IS_UNALIGNED = ( IN | OUT ) & 0x00000003
*/
or IS_UNALIGNED, IN, OUT
andi IS_UNALIGNED, 0 x3
b .Lchacha_rounds_start
.align 4
.Loop_chacha_rounds:
addiu IN, CHACHA20_BLOCK_SIZE
addiu OUT, CHACHA20_BLOCK_SIZE
addiu NONCE_0, 1
.Lchacha_rounds_start:
lw X0, 0 (STATE)
lw X1, 4 (STATE)
lw X2, 8 (STATE)
lw X3, 12 (STATE)
lw X4, 16 (STATE)
lw X5, 20 (STATE)
lw X6, 24 (STATE)
lw X7, 28 (STATE)
lw X8, 32 (STATE)
lw X9, 36 (STATE)
lw X10, 40 (STATE)
lw X11, 44 (STATE)
move X12, NONCE_0
lw X13, 52 (STATE)
lw X14, 56 (STATE)
lw X15, 60 (STATE)
.Loop_chacha_xor_rounds:
addiu $at, -2
AXR( 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 12 ,13 ,14 ,15 , 16 );
AXR( 8 , 9 ,10 ,11 , 12 ,13 ,14 ,15 , 4 , 5 , 6 , 7 , 12 );
AXR( 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 12 ,13 ,14 ,15 , 8 );
AXR( 8 , 9 ,10 ,11 , 12 ,13 ,14 ,15 , 4 , 5 , 6 , 7 , 7 );
AXR( 0 , 1 , 2 , 3 , 5 , 6 , 7 , 4 , 15 ,12 ,13 ,14 , 16 );
AXR(10 ,11 , 8 , 9 , 15 ,12 ,13 ,14 , 5 , 6 , 7 , 4 , 12 );
AXR( 0 , 1 , 2 , 3 , 5 , 6 , 7 , 4 , 15 ,12 ,13 ,14 , 8 );
AXR(10 ,11 , 8 , 9 , 15 ,12 ,13 ,14 , 5 , 6 , 7 , 4 , 7 );
bnez $at, .Loop_chacha_xor_rounds
addiu BYTES, -(CHACHA20_BLOCK_SIZE)
/* Is data src/dst unaligned? Jump */
bnez IS_UNALIGNED, .Loop_chacha_unaligned
/* Set number rounds here to fill delayslot. */
lw $at, (STACK_SIZE+16 )($sp)
/* BYTES < 0, it has no full block. */
bltz BYTES, .Lchacha_mips_no_full_block_aligned
FOR_EACH_WORD_REV(STORE_ALIGNED)
/* BYTES > 0? Loop again. */
bgtz BYTES, .Loop_chacha_rounds
/* Place this here to fill delay slot */
addiu NONCE_0, 1
/* BYTES < 0? Handle last bytes */
bltz BYTES, .Lchacha_mips_xor_bytes
.Lchacha_mips_xor_done:
/* Restore used registers */
lw $s0, 0 ($sp)
lw $s1, 4 ($sp)
lw $s2, 8 ($sp)
lw $s3, 12 ($sp)
lw $s4, 16 ($sp)
lw $s5, 20 ($sp)
lw $s6, 24 ($sp)
lw $s7, 28 ($sp)
/* Write NONCE_0 back to right location in state */
sw NONCE_0, 48 (STATE)
.Lchacha_mips_end:
addiu $sp, STACK_SIZE
jr $ra
.Lchacha_mips_no_full_block_aligned:
/* Restore the offset on BYTES */
addiu BYTES, CHACHA20_BLOCK_SIZE
/* Get number of full WORDS */
andi $at, BYTES, MASK_U32
/* Load upper half of jump table addr */
lui T0, %hi (.Lchacha_mips_jmptbl_aligned_0)
/* Calculate lower half jump table offset */
ins T0, $at, 1 , 6
/* Add offset to STATE */
addu T1, STATE, $at
/* Add lower half jump table addr */
addiu T0, %lo (.Lchacha_mips_jmptbl_aligned_0)
/* Read value from STATE */
lw SAVED_CA, 0 (T1)
/* Store remaining bytecounter as negative value */
subu BYTES, $at, BYTES
jr T0
/* Jump table */
FOR_EACH_WORD(JMPTBL_ALIGNED)
.Loop_chacha_unaligned:
/* Set number rounds here to fill delayslot. */
lw $at, (STACK_SIZE+16 )($sp)
/* BYTES > 0, it has no full block. */
bltz BYTES, .Lchacha_mips_no_full_block_unaligned
FOR_EACH_WORD_REV(STORE_UNALIGNED)
/* BYTES > 0? Loop again. */
bgtz BYTES, .Loop_chacha_rounds
/* Write NONCE_0 back to right location in state */
sw NONCE_0, 48 (STATE)
.set noreorder
/* Fall through to byte handling */
bgez BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
/* Place this here to fill delay slot */
addiu NONCE_0, 1
.set reorder
.Lchacha_mips_xor_bytes:
addu IN, $at
addu OUT, $at
/* First byte */
lbu T1, 0 (IN)
addiu $at, BYTES, 1
xor T1, SAVED_X
sb T1, 0 (OUT)
beqz $at, .Lchacha_mips_xor_done
/* Second byte */
lbu T1, 1 (IN)
addiu $at, BYTES, 2
rotr SAVED_X, 8
xor T1, SAVED_X
sb T1, 1 (OUT)
beqz $at, .Lchacha_mips_xor_done
/* Third byte */
lbu T1, 2 (IN)
rotr SAVED_X, 8
xor T1, SAVED_X
sb T1, 2 (OUT)
b .Lchacha_mips_xor_done
.Lchacha_mips_no_full_block_unaligned:
/* Restore the offset on BYTES */
addiu BYTES, CHACHA20_BLOCK_SIZE
/* Get number of full WORDS */
andi $at, BYTES, MASK_U32
/* Load upper half of jump table addr */
lui T0, %hi (.Lchacha_mips_jmptbl_unaligned_0)
/* Calculate lower half jump table offset */
ins T0, $at, 1 , 6
/* Add offset to STATE */
addu T1, STATE, $at
/* Add lower half jump table addr */
addiu T0, %lo (.Lchacha_mips_jmptbl_unaligned_0)
/* Read value from STATE */
lw SAVED_CA, 0 (T1)
/* Store remaining bytecounter as negative value */
subu BYTES, $at, BYTES
jr T0
/* Jump table */
FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at
/* Input arguments
* STATE $a0
* OUT $a1
* NROUND $a2
*/
#undef X12
#undef X13
#undef X14
#undef X15
#define X12 $a3
#define X13 $at
#define X14 $v0
#define X15 STATE
.set noat
.globl hchacha_block_arch
.ent hchacha_block_arch
hchacha_block_arch:
.frame $sp, STACK_SIZE, $ra
addiu $sp, -STACK_SIZE
/* Save X11(s6) */
sw X11, 0 ($sp)
lw X0, 0 (STATE)
lw X1, 4 (STATE)
lw X2, 8 (STATE)
lw X3, 12 (STATE)
lw X4, 16 (STATE)
lw X5, 20 (STATE)
lw X6, 24 (STATE)
lw X7, 28 (STATE)
lw X8, 32 (STATE)
lw X9, 36 (STATE)
lw X10, 40 (STATE)
lw X11, 44 (STATE)
lw X12, 48 (STATE)
lw X13, 52 (STATE)
lw X14, 56 (STATE)
lw X15, 60 (STATE)
.Loop_hchacha_xor_rounds:
addiu $a2, -2
AXR( 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 12 ,13 ,14 ,15 , 16 );
AXR( 8 , 9 ,10 ,11 , 12 ,13 ,14 ,15 , 4 , 5 , 6 , 7 , 12 );
AXR( 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 12 ,13 ,14 ,15 , 8 );
AXR( 8 , 9 ,10 ,11 , 12 ,13 ,14 ,15 , 4 , 5 , 6 , 7 , 7 );
AXR( 0 , 1 , 2 , 3 , 5 , 6 , 7 , 4 , 15 ,12 ,13 ,14 , 16 );
AXR(10 ,11 , 8 , 9 , 15 ,12 ,13 ,14 , 5 , 6 , 7 , 4 , 12 );
AXR( 0 , 1 , 2 , 3 , 5 , 6 , 7 , 4 , 15 ,12 ,13 ,14 , 8 );
AXR(10 ,11 , 8 , 9 , 15 ,12 ,13 ,14 , 5 , 6 , 7 , 4 , 7 );
bnez $a2, .Loop_hchacha_xor_rounds
/* Restore used register */
lw X11, 0 ($sp)
sw X0, 0 (OUT)
sw X1, 4 (OUT)
sw X2, 8 (OUT)
sw X3, 12 (OUT)
sw X12, 16 (OUT)
sw X13, 20 (OUT)
sw X14, 24 (OUT)
sw X15, 28 (OUT)
addiu $sp, STACK_SIZE
jr $ra
.end hchacha_block_arch
.set at
Messung V0.5 in Prozent C=95 H=93 G=93
[0.20QuellennavigatorsProjekt 2026-06-07]
2026-06-09