Quelle blake2s-core.S

Sprache: ARM

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* BLAKE2s digest algorithm, ARM scalar implementation
*
* Copyright 2020 Google LLC
*
* Author: Eric Biggers <ebiggers@google.com>
*/

#include <linux/linkage.h>
#include <asm/assembler.h>

// Registers used to hold message words temporarily.  There aren't
// enough ARM registers to hold the whole message block, so we have to
// load the words on-demand.
//
// _blake2s_quarterround loads both aliases from the stack copy of the
// message block immediately before adding them into the state: M_0 carries
// the word for the first column/diagonal, M_1 the word for the second.
M_0  .req r12
M_1  .req r14

// The BLAKE2s initialization vector: eight consecutive 32-bit words,
// IV[0] first.  blake2s_compress walks this table starting from the label.
.Lblake2s_IV:
	.word	0x6A09E667	// IV[0]
	.word	0xBB67AE85	// IV[1]
	.word	0x3C6EF372	// IV[2]
	.word	0xA54FF53A	// IV[3]
	.word	0x510E527F	// IV[4]
	.word	0x9B05688C	// IV[5]
	.word	0x1F83D9AB	// IV[6]
	.word	0x5BE0CD19	// IV[7]

// Load two adjacent 32-bit words: a = [src + offset], b = [src + offset + 4].
// When __LINUX_ARM_ARCH__ is below 6 this expands to two single-word loads;
// otherwise a single ldrd is emitted.
.macro __ldrd  a, b, src, offset
#if __LINUX_ARM_ARCH__ < 6
	ldr	\a, [\src, #\offset]
	ldr	\b, [\src, #\offset + 4]
#else
	ldrd	\a, \b, [\src, #\offset]
#endif
.endm

// Store two adjacent 32-bit words: [dst + offset] = a, [dst + offset + 4] = b.
// When __LINUX_ARM_ARCH__ is below 6 this expands to two single-word stores;
// otherwise a single strd is emitted.
.macro __strd  a, b, dst, offset
#if __LINUX_ARM_ARCH__ < 6
	str	\a, [\dst, #\offset]
	str	\b, [\dst, #\offset + 4]
#else
	strd	\a, \b, [\dst, #\offset]
#endif
.endm

// Fix up the byte order of one little-endian 32-bit word held in register a.
// Only big-endian builds (__ARMEB__ defined) emit anything: they invoke
// rev_l on a, passing tmp along.  On other builds the macro expands to
// nothing.
// NOTE(review): rev_l is defined outside this file (presumably in
// <asm/assembler.h>); it is assumed to byte-reverse a, using tmp as
// scratch -- confirm against its definition.
.macro _le32_bswap a, tmp
#if defined(__ARMEB__)
	rev_l	\a, \tmp
#endif
.endm

// Run _le32_bswap over the eight registers a..h, in that order, handing the
// same tmp register to every invocation.
.macro _le32_bswap_8x a, b, c, d, e, f, g, h,  tmp
	.irp	reg, \a, \b, \c, \d, \e, \f, \g, \h
	_le32_bswap \reg, \tmp
	.endr
.endm

// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
// columns/diagonals.  s0-s1 are the word offsets to the message words the first
// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
//
// Note that to save instructions, the rotations don't happen when the
// pseudocode says they should, but rather they are delayed until the values are
// used.  See the comment above _blake2s_round().
//
// Rotation state: on entry the 'b' registers still need 'ror #brot' and the
// 'd' registers still need 'ror #drot' to hold their true values.  On exit the
// 'b' registers still need 'ror #7' and the 'd' registers still need 'ror #8'.
// The 'a' and 'c' registers never carry a pending rotation.
//
// The two columns/diagonals are processed in lock-step, one instruction each.
// Clobbers M_0 and M_1.
.macro _blake2s_quarterround  a0, b0, c0, d0,  a1, b1, c1, d1,  s0, s1, s2, s3

// Fetch the first message word for each column/diagonal.
ldr  M_0, [sp, #32 + 4 * \s0]
ldr  M_1, [sp, #32 + 4 * \s2]

// a += b + m[blake2s_sigma[r][2*i + 0]];
// (b is un-rotated on the fly by the caller-selected amount 'brot'.)
add  \a0, \a0, \b0, ror #brot
add  \a1, \a1, \b1, ror #brot
add  \a0, \a0, M_0
add  \a1, \a1, M_1

// d = ror32(d ^ a, 16);
// (Only the XOR is done here; the 'ror 16' stays pending in d.)
eor  \d0, \a0, \d0, ror #drot
eor  \d1, \a1, \d1, ror #drot

// c += d;
// (Applies the pending 'ror 16' to d as it is consumed.)
add  \c0, \c0, \d0, ror #16
add  \c1, \c1, \d1, ror #16

// b = ror32(b ^ c, 12);
// (Only the XOR is done here; the 'ror 12' stays pending in b.)
eor  \b0, \c0, \b0, ror #brot
eor  \b1, \c1, \b1, ror #brot

// Fetch the second message word for each column/diagonal.
ldr  M_0, [sp, #32 + 4 * \s1]
ldr  M_1, [sp, #32 + 4 * \s3]

// a += b + m[blake2s_sigma[r][2*i + 1]];
// (Applies the pending 'ror 12' to b as it is consumed.)
add  \a0, \a0, \b0, ror #12
add  \a1, \a1, \b1, ror #12
add  \a0, \a0, M_0
add  \a1, \a1, M_1

// d = ror32(d ^ a, 8);
// (Applies the pending 'ror 16' to the old d; the new 'ror 8' stays pending.)
eor  \d0, \a0, \d0, ror#16
eor  \d1, \a1, \d1, ror#16

// c += d;
// (Applies the pending 'ror 8' to d as it is consumed.)
add  \c0, \c0, \d0, ror#8
add  \c1, \c1, \d1, ror#8

// b = ror32(b ^ c, 7);
// (Applies the pending 'ror 12' to the old b; the new 'ror 7' stays pending.)
eor  \b0, \c0, \b0, ror#12
eor  \b1, \c1, \b1, ror#12
.endm

// Execute one round of BLAKE2s by updating the state matrix v[0..15].  v[0..9]
// are in r0..r9.  The stack pointer points to 8 bytes of scratch space for
// spilling v[8..9], then to v[10..15], then to the message block.  r10-r12 and
// r14 are free to use.  The macro arguments s0-s15 give the order in which the
// message words are used in this round.
//
// Stack layout relied on by this macro (byte offsets from sp):
//    0 ..  7   scratch slot for spilling v[8..9]
//    8 .. 31   v[10..15]
//   32 ..      message block (read by _blake2s_quarterround)
//
// All rotates are performed using the implicit rotate operand accepted by the
// 'add' and 'eor' instructions.  This is faster than using explicit rotate
// instructions.  To make this work, we allow the values in the second and last
// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
// wrong rotation amount.  The rotation amount is then fixed up just in time
// when the values are used.  'brot' is the number of bits the values in row 'b'
// need to be rotated right to arrive at the correct values, and 'drot'
// similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
// that they end up as (7, 8) after every round.
//
// On exit v[0..9] are again in r0..r9 and v[10..15] are back on the stack, so
// the macro can be invoked back to back.
.macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \
   s8, s9, s10, s11, s12, s13, s14, s15

// Mix first two columns:
// (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
__ldrd  r10, r11, sp, 16 // load v[12] and v[13]
_blake2s_quarterround r0, r4, r8, r10,  r1, r5, r9, r11, \
    \s0, \s1, \s2, \s3
__strd  r8, r9, sp, 0  // spill v[8] and v[9] to the scratch slot
__strd  r10, r11, sp, 16 // store v[12] and v[13]

// Mix second two columns:
// (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
__ldrd  r8, r9, sp, 8  // load v[10] and v[11]
__ldrd  r10, r11, sp, 24 // load v[14] and v[15]
_blake2s_quarterround r2, r6, r8, r10,  r3, r7, r9, r11, \
    \s4, \s5, \s6, \s7
str  r10, [sp, #24]  // store v[14]
// v[10], v[11], and v[15] are used below, so no need to store them yet.

// Every quarter-round leaves rows 'b' and 'd' needing 'ror #7' and 'ror #8'
// respectively, so from here on (including all later rounds, since these
// assembler symbols persist) the pending amounts are (7, 8).
.set brot, 7
.set drot, 8

// Mix first two diagonals:
// (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
ldr  r10, [sp, #16]  // load v[12]
_blake2s_quarterround r0, r5, r8, r11,  r1, r6, r9, r10, \
    \s8, \s9, \s10, \s11
__strd  r8, r9, sp, 8  // store v[10] and v[11]
str  r11, [sp, #28]  // store v[15]
str  r10, [sp, #16]  // store v[12]

// Mix second two diagonals:
// (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
__ldrd  r8, r9, sp, 0  // load v[8] and v[9]
__ldrd  r10, r11, sp, 20 // load v[13] and v[14]
_blake2s_quarterround r2, r7, r8, r10,  r3, r4, r9, r11, \
    \s12, \s13, \s14, \s15
__strd  r10, r11, sp, 20 // store v[13] and v[14]; v[8..9] stay in r8-r9
.endm

//
// void blake2s_compress(struct blake2s_state *state,
//    const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_state are used:
// u32 h[8]; (inout)
// u32 t[2]; (inout)
// u32 f[2]; (in)
//
// Register arguments on entry: r0 = state, r1 = block, r2 = nblocks,
// r3 = inc.  The code reads h at state + 0, t at state + 32 and f at
// state + 40.
//
// The loop body runs once before nblocks is tested, and the test is
// "decrement, then loop while non-zero", so this code does not handle
// nblocks == 0.
//
// Stack frame while the rounds execute (byte offsets from sp):
//    0 ..  7   scratch slot for spilling v[8..9]
//    8 .. 31   v[10..15]
//   32 .. 95   copy of the current message block (little-endian words
//              converted to CPU order)
//   96 ..      registers saved by the prologue; saved r0 (state) at 96,
//              saved r1 (block) at 100, saved r2 (nblocks) at 104
//
.align  5
ENTRY(blake2s_compress)
// Saves 12 registers (48 bytes).  r0-r2 are saved so the arguments can be
// reloaded and updated per block; r3 is not saved.
// NOTE(review): "even number" presumably keeps sp 8-byte aligned -- confirm.
push  {r0-r2,r4-r11,lr} // keep this an even number

.Lnext_block:
// r0 is 'state'
// r1 is 'block'
// r3 is 'inc'

// Load and increment the counter t[0..1].
// t is treated as a 64-bit value: 'inc' is added to the low word and the
// carry is propagated into the high word.  The updated pair stays in
// r10-r11 for building v[12..13] below.
__ldrd  r10, r11, r0, 32
adds  r10, r10, r3
adc  r11, r11, #0
__strd  r10, r11, r0, 32

// _blake2s_round is very short on registers, so copy the message block
// to the stack to save a register during the rounds.  This also has the
// advantage that misalignment only needs to be dealt with in one place.
sub  sp, sp, #64  // reserve the 64-byte block copy
mov  r12, sp   // r12 = write cursor into the copy
tst  r1, #3   // is 'block' 4-byte aligned?
bne  .Lcopy_block_misaligned
// Aligned case: move the block in two 32-byte halves.
ldmia  r1!, {r2-r9}
_le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9,  r14
stmia  r12!, {r2-r9}
ldmia  r1!, {r2-r9}
_le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9,  r14
stmia  r12, {r2-r9}
.Lcopy_block_done:
// Both copy paths leave r1 pointing just past the block.  sp + 68 is the
// saved-r1 slot: 64 bytes of block copy, then saved r0, then saved r1.
str  r1, [sp, #68]  // Update message pointer

// Calculate v[8..15].  Push v[10..15] onto the stack, and leave space
// for spilling v[8..9].  Leave v[8..9] in r8-r9.
mov  r14, r0   // r14 = state
adr  r12, .Lblake2s_IV
ldmia  r12!, {r8-r9} // load IV[0..1]
__ldrd  r0, r1, r14, 40  // load f[0..1]
ldm  r12, {r2-r7}  // load IV[2..7]
eor  r4, r4, r10  // v[12] = IV[4] ^ t[0]
eor  r5, r5, r11  // v[13] = IV[5] ^ t[1]
eor  r6, r6, r0  // v[14] = IV[6] ^ f[0]
eor  r7, r7, r1  // v[15] = IV[7] ^ f[1]
push  {r2-r7}   // push v[10..15]
sub  sp, sp, #8  // leave space for v[8..9]

// Load h[0..7] == v[0..7].
ldm  r14, {r0-r7}

// Execute the rounds.  Each round is provided the order in which it
// needs to use the message words.
// (brot, drot) are assembler symbols: they start at (0, 0) because nothing
// has a pending rotation yet, and _blake2s_round sets them to (7, 8).
.set brot, 0
.set drot, 0
_blake2s_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
_blake2s_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
_blake2s_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
_blake2s_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
_blake2s_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
_blake2s_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
_blake2s_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
_blake2s_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
_blake2s_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
_blake2s_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0

// Fold the final state matrix into the hash chaining value:
//
// for (i = 0; i < 8; i++)
//  h[i] ^= v[i] ^ v[i + 8];
//
// sp + 96 = 8 (scratch) + 24 (v[10..15]) + 64 (block copy): the saved r0,
// i.e. 'state', whose first field is h.
ldr  r14, [sp, #96]  // r14 = &h[0]
add  sp, sp, #8  // v[8..9] are already loaded.
pop  {r10-r11}  // load v[10..11]
eor  r0, r0, r8
eor  r1, r1, r9
eor  r2, r2, r10
eor  r3, r3, r11
ldm  r14, {r8-r11}  // load h[0..3]
eor  r0, r0, r8
eor  r1, r1, r9
eor  r2, r2, r10
eor  r3, r3, r11
stmia  r14!, {r0-r3} // store new h[0..3]
ldm  r14, {r0-r3}  // load old h[4..7]
pop  {r8-r11}  // load v[12..15]
// v[4..7] (row 'b') and v[12..15] (row 'd') still carry the rotation left
// pending by the last round, so it is applied here as they are folded in.
eor  r0, r0, r4, ror #brot
eor  r1, r1, r5, ror #brot
eor  r2, r2, r6, ror #brot
eor  r3, r3, r7, ror #brot
eor  r0, r0, r8, ror #drot
eor  r1, r1, r9, ror #drot
eor  r2, r2, r10, ror #drot
eor  r3, r3, r11, ror #drot
   add  sp, sp, #64  // skip copy of message block
stm  r14, {r0-r3}  // store new h[4..7]

// Advance to the next block, if there is one.  Note that if there are
// multiple blocks, then 'inc' (the counter increment amount) must be
// 64.  So we can simply set it to 64 without re-loading it.
// sp now points at the saved r0-r2 from the prologue.
ldm  sp, {r0, r1, r2} // load (state, block, nblocks)
mov  r3, #64   // set 'inc'
subs  r2, r2, #1  // nblocks--
str  r2, [sp, #8]  // write nblocks back to its saved slot
bne  .Lnext_block  // nblocks != 0?

// Restore the saved registers and return by popping the saved lr into pc.
pop  {r0-r2,r4-r11,pc}

// The next message block (pointed to by r1) isn't 4-byte aligned, so it
// can't be loaded using ldmia.  Copy it to the stack buffer (pointed to
// by r12) using an alternative method.  r2-r9 are free to use.
.Lcopy_block_misaligned:
mov  r2, #64   // r2 = bytes left to copy
1:
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
// One word load with post-increment, then the endianness fix-up.
ldr  r3, [r1], #4
_le32_bswap r3, r4
#else
// Assemble the little-endian word one byte at a time; no byte swap is
// needed because the shifts already fix the byte order.
ldrb  r3, [r1, #0]
ldrb  r4, [r1, #1]
ldrb  r5, [r1, #2]
ldrb  r6, [r1, #3]
add  r1, r1, #4
orr  r3, r3, r4, lsl #8
orr  r3, r3, r5, lsl #16
orr  r3, r3, r6, lsl #24
#endif
subs  r2, r2, #4  // sets the flags tested by 'bne' below
str  r3, [r12], #4  // store the word, advance the write cursor
bne  1b
b  .Lcopy_block_done
ENDPROC(blake2s_compress)

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.16 Sekunden (vorverarbeitet am 2026-06-08) ¤

Wurzel

Suchen

PVS Prover

Isabelle Prover

NIST Cobol Testsuite

Cephes Mathematical Library

Vienna Development Method

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereitgestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.