#define __ARM_ARCH__ __LINUX_ARM_ARCH__
@ SPDX-License-Identifier: GPL-2.0
@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
@ has relicensed it under the GPLv2. Therefore this program is free software;
@ you can redistribute it and/or modify it under the terms of the GNU General
@ Public License version 2 as published by the Free Software Foundation.
@
@ The original headers, including the original license headers, are
@ included below for completeness.
@ ====================================================================
@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see https://www.openssl.org/~appro/cryptogams/.
@ ====================================================================
@ sha1_block procedure for ARMv4.
@
@ January 2007.
@
@ Size/performance trade-off
@ ====================================================================
@ impl		size in bytes	comp cycles[*]	measured performance
@ ====================================================================
@ thumb		304		3212		4420
@ armv4-small	392/+29%	1958/+64%	2250/+96%
@ armv4-compact	740/+89%	1552/+26%	1840/+22%
@ armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
@ full unroll	~5100/+260%	~1260/+4%	~1300/+5%
@ ====================================================================
@ thumb		= same as 'small' but in Thumb instructions[**] and
@		  with recurring code in two private functions;
@ small		= detached Xload/update, loops are folded;
@ compact	= detached Xload/update, 5x unroll;
@ large		= interleaved Xload/update, 5x unroll;
@ full unroll	= interleaved Xload/update, full unroll, estimated[!];
@
@ [*]	Manually counted instructions in "grand" loop body. Measured
@	performance is affected by prologue and epilogue overhead,
@	i-cache availability, branch penalties, etc.
@ [**]	While each Thumb instruction is twice smaller, they are not as
@	diverse as ARM ones: e.g., there are only two arithmetic
@	instructions with 3 arguments, no [fixed] rotate, addressing
@	modes are limited. As result it takes more instructions to do
@	the same job in Thumb, therefore the code is never twice as
@	small and always slower.
@ [***]	which is also ~35% better than compiler generated code. Dual-
@	issue Cortex A8 core was measured to process input block in
@	~990 cycles.
@ August 2010.
@
@ Rescheduling for dual-issue pipeline resulted in 13% improvement on
@ Cortex A8 core and in absolute terms ~870 cycles per input block
@ [or 13.6 cycles per byte].
@ February 2011.
@
@ Profiler-assisted and platform-specific optimization resulted in 10%
@ improvement on Cortex A8 core and 12.2 cycles per byte.
#include <linux/linkage.h>
.text

.align	2
@ sha1_block_data_order: SHA-1 block transform (ARM, GNU as syntax).
@
@ Register use, as established by the code in this function:
@   r0      pointer to five 32-bit state words; they are loaded into
@           r3-r7 here and written back after every processed block
@   r1      input pointer; advanced by 4 for each message word consumed
@   r2      on entry a count (presumably the number of 64-byte blocks -
@           confirm against the C caller); it is scaled by 64 and added
@           to r1 so that it becomes the end-of-input pointer
@   r3-r7   working variables A, B, C, D, E
@ r4-r12 and lr are pushed here and popped (lr into pc) on return.
ENTRY(sha1_block_data_order)
	stmdb	sp!,{r4-r12,lr}
	add	r2,r1,r2,lsl#6	@ r2 to point at the end of r1
	ldmia	r0,{r3,r4,r5,r6,r7}
.Lloop:
	@ Per-block setup.
	@   r8  = round constant for rounds 0..19
	@   r14 = running pointer into the on-stack X[] schedule; words are
	@         pushed downwards from the current sp with "str ...,#-4]!"
	@   sp  = lowered by 15 words, so "r14 == sp" ends the first loop
	ldr	r8,.LK_00_19
	mov	r14,sp
	sub	sp,sp,#15*4
	@ C, D and E are kept rotated right by 30 bits; every later use
	@ applies ror#2, which together makes a full 32-bit rotation.
	mov	r5,r5,ror#30
	mov	r6,r6,ror#30
	mov	r7,r7,ror#30		@ [6]
.L_00_15:
	@ Rounds 0..14, five rounds per pass; the loop exits once r14 has
	@ walked down to sp (15 words were reserved, i.e. three passes).
	@ Each round:
	@   - assembles one big-endian message word in r9 (byte loads when
	@     __ARM_ARCH__ < 7, otherwise a word load plus "rev" on
	@     little-endian builds) and pushes it as X[i] at [r14,#-4]!
	@   - E += K_00_19 + ROR(A,27) + X[i] + F_00_19(B,C,D)
	@     with F_00_19(B,C,D) = (B & (C ^ D)) ^ D
	@ The A..E roles move by one register per round instead of the
	@ values being copied; C, D, E are stored rotated (hence the ror#2).

	@ -- round 5n+0: A=r3 B=r4 C=r5 D=r6 E=r7
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r5,r6			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	eor	r10,r5,r6			@ F_xx_xx
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r4,r10,ror#2
	add	r7,r7,r9			@ E+=X[i]
	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r7,r7,r10			@ E+=F_00_19(B,C,D)

	@ -- round 5n+1: A=r7 B=r3 C=r4 D=r5 E=r6
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r6,r8,r6,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r4,r5			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r6,r8,r6,ror#2			@ E+=K_00_19
	eor	r10,r4,r5			@ F_xx_xx
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r3,r10,ror#2
	add	r6,r6,r9			@ E+=X[i]
	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r6,r6,r10			@ E+=F_00_19(B,C,D)

	@ -- round 5n+2: A=r6 B=r7 C=r3 D=r4 E=r5
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r5,r8,r5,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r3,r4			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r5,r8,r5,ror#2			@ E+=K_00_19
	eor	r10,r3,r4			@ F_xx_xx
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r7,r10,ror#2
	add	r5,r5,r9			@ E+=X[i]
	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r5,r5,r10			@ E+=F_00_19(B,C,D)

	@ -- round 5n+3: A=r5 B=r6 C=r7 D=r3 E=r4
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r4,r8,r4,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r7,r3			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r4,r8,r4,ror#2			@ E+=K_00_19
	eor	r10,r7,r3			@ F_xx_xx
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r6,r10,ror#2
	add	r4,r4,r9			@ E+=X[i]
	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r4,r4,r10			@ E+=F_00_19(B,C,D)

	@ -- round 5n+4: A=r4 B=r5 C=r6 D=r7 E=r3
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r3,r8,r3,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r6,r7			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r3,r8,r3,ror#2			@ E+=K_00_19
	eor	r10,r6,r7			@ F_xx_xx
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r5,r10,ror#2
	add	r3,r3,r9			@ E+=X[i]
	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r3,r3,r10			@ E+=F_00_19(B,C,D)

	cmp	r14,sp
	bne	.L_00_15		@ [((11+4)*5+2)*3]
@ Rounds 15..19 (straight-line), still using K_00_19 / F_00_19.
@ sp drops by another 25 words: 5 are consumed by the X[] stores below and
@ the remaining 20 bound the loop that follows.
	sub	sp,sp,#25*4

	@ -- round 15: last word taken from the input (A=r3 B=r4 C=r5 D=r6 E=r7)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r5,r6			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	eor	r10,r5,r6			@ F_xx_xx
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r4,r10,ror#2
	add	r7,r7,r9			@ E+=X[i]
	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r7,r7,r10			@ E+=F_00_19(B,C,D)

	@ Rounds 16..19: X[i] is now derived from the schedule on the stack.
	@ r14 points at the most recently stored word, so offsets 2, 7, 13
	@ and 15 words above it are combined:
	@   X[i] = ROR(w15 ^ w13 ^ w7 ^ w2, 31)   (rotate left by one)
	@ and the result is pushed at [r14,#-4]!.

	@ -- round 16: A=r7 B=r3 C=r4 D=r5 E=r6
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r3,r10,ror#2		@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
	add	r6,r6,r10			@ E+=F_00_19(B,C,D)

	@ -- round 17: A=r6 B=r7 C=r3 D=r4 E=r5
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r7,r10,ror#2		@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
	add	r5,r5,r10			@ E+=F_00_19(B,C,D)

	@ -- round 18: A=r5 B=r6 C=r7 D=r3 E=r4
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r6,r10,ror#2		@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
	add	r4,r4,r10			@ E+=F_00_19(B,C,D)

	@ -- round 19: A=r4 B=r5 C=r6 D=r7 E=r3
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r5,r10,ror#2		@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
	add	r3,r3,r10			@ E+=F_00_19(B,C,D)

	@ Switch to the 20..39 constant.  "cmn sp,#0" adds zero, which cannot
	@ carry out, so C is cleared; the shared loop below uses C to tell
	@ its first use (20..39) from its second (60..79).
	ldr	r8,.LK_20_39		@ [+15+16*4]
	cmn	sp,#0			@ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
	@ Shared body for rounds 20..39 and 60..79 (r8 holds the matching
	@ constant on entry).  Five rounds per pass; each round:
	@   X[i] = ROR(w15 ^ w13 ^ w7 ^ w2, 31), pushed at [r14,#-4]!
	@   E   += K + ROR(A,27) + X[i] + (B ^ C ^ D)
	@ No instruction in the body sets flags, so the carry flag chosen by
	@ the code that branches/falls in here survives until the "bcs".

	@ -- round 5n+0: A=r3 B=r4 C=r5 D=r6 E=r7
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r5,r6			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r4,r10,ror#2		@ F_xx_xx
	add	r7,r7,r9			@ E+=X[i]
	add	r7,r7,r10			@ E+=F_20_39(B,C,D)

	@ -- round 5n+1: A=r7 B=r3 C=r4 D=r5 E=r6
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r3,r10,ror#2		@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	add	r6,r6,r10			@ E+=F_20_39(B,C,D)

	@ -- round 5n+2: A=r6 B=r7 C=r3 D=r4 E=r5
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r7,r10,ror#2		@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	add	r5,r5,r10			@ E+=F_20_39(B,C,D)

	@ -- round 5n+3: A=r5 B=r6 C=r7 D=r3 E=r4
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r6,r10,ror#2		@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	add	r4,r4,r10			@ E+=F_20_39(B,C,D)

	@ -- round 5n+4: A=r4 B=r5 C=r6 D=r7 E=r3
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r5,r10,ror#2		@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	add	r3,r3,r10			@ E+=F_20_39(B,C,D)

	@ Loop until r14 reaches sp.  teq is used instead of cmp so that the
	@ carry flag is left alone; the Thumb variant first copies sp into
	@ r11 (a scratch register at this point) and compares against that.
 ARM(	teq	r14,sp		)	@ preserve carry
 THUMB(	mov	r11,sp		)
 THUMB(	teq	r14,r11		)	@ preserve carry
	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes
@ Rounds 40..59: reached by falling through the "bcs" above with carry clear.
@ r8 takes the 40..59 constant and sp drops by 20 words to bound the loop.
	ldr	r8,.LK_40_59
	sub	sp,sp,#20*4		@ [+2]
.L_40_59:
	@ Five rounds per pass; each round:
	@   X[i] = ROR(w15 ^ w13 ^ w7 ^ w2, 31), pushed at [r14,#-4]!
	@   E   += K + ROR(A,27) + X[i] + (B & (C ^ D)) + (C & D)
	@ The two F terms are added separately: r10 = B & (C ^ D) and
	@ r11 = C & D on the stored (rotated) values, fixed up with ror#2.

	@ -- round 5n+0: A=r3 B=r4 C=r5 D=r6 E=r7
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r5,r6			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r4,r10,ror#2		@ F_xx_xx
	and	r11,r5,r6			@ F_xx_xx
	add	r7,r7,r9			@ E+=X[i]
	add	r7,r7,r10			@ E+=F_40_59(B,C,D)
	add	r7,r7,r11,ror#2

	@ -- round 5n+1: A=r7 B=r3 C=r4 D=r5 E=r6
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r3,r10,ror#2		@ F_xx_xx
	and	r11,r4,r5			@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	add	r6,r6,r10			@ E+=F_40_59(B,C,D)
	add	r6,r6,r11,ror#2

	@ -- round 5n+2: A=r6 B=r7 C=r3 D=r4 E=r5
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r7,r10,ror#2		@ F_xx_xx
	and	r11,r3,r4			@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	add	r5,r5,r10			@ E+=F_40_59(B,C,D)
	add	r5,r5,r11,ror#2

	@ -- round 5n+3: A=r5 B=r6 C=r7 D=r3 E=r4
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r6,r10,ror#2		@ F_xx_xx
	and	r11,r7,r3			@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	add	r4,r4,r10			@ E+=F_40_59(B,C,D)
	add	r4,r4,r11,ror#2

	@ -- round 5n+4: A=r4 B=r5 C=r6 D=r7 E=r3
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r5,r10,ror#2		@ F_xx_xx
	and	r11,r6,r7			@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	add	r3,r3,r10			@ E+=F_40_59(B,C,D)
	add	r3,r3,r11,ror#2

	cmp	r14,sp
	bne	.L_40_59		@ [+((12+5)*5+2)*4]

	@ Rounds 60..79 reuse the 20..39 loop body with the last constant.
	@ "cmp sp,#0" subtracts zero, which never borrows, so C is set and
	@ the shared loop will leave through "bcs .L_done" when it finishes.
	ldr	r8,.LK_60_79
	sub	sp,sp,#20*4
	cmp	sp,#0			@ set carry to denote 60_79
	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
.L_done:
	@ End of one block: release the 80-word X[] area (15+25+20+20 words
	@ were subtracted from sp on the way here).
	add	sp,sp,#80*4		@ "deallocate" stack frame
	@ Add the working variables to the state words at [r0] and store the
	@ result back.  C, D and E are still held rotated, so ror#2 restores
	@ them as part of the addition; r3-r7 stay loaded for the next block.
	ldmia	r0,{r8,r9,r10,r11,r12}
	add	r3,r8,r3
	add	r4,r9,r4
	add	r5,r10,r5,ror#2
	add	r6,r11,r6,ror#2
	add	r7,r12,r7,ror#2
	stmia	r0,{r3,r4,r5,r6,r7}
	@ Continue until the input pointer reaches the end pointer in r2.
	teq	r1,r2
	bne	.Lloop			@ [+18], total 1307

	@ Restore the registers saved on entry; loading pc returns.
	ldmia	sp!,{r4-r12,pc}
.align	2
@ Round constants, one per group of 20 rounds; each is fetched into r8 with
@ a literal "ldr r8,.LK_xx_yy" from the code above.
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
ENDPROC(sha1_block_data_order)
@ Identification string emitted into the object after the function.
.asciz	"SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
.align	2
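@ NOTE: the lines below are not part of the assembly source. They are
@ residue from the web page this file was extracted from, kept as
@ comments so that the file can still be assembled.
@ Messung V0.5 in Prozent C=80 H=94 G=87
@ ¤ Dauer der Verarbeitung: 0.9 Sekunden
@ (vorverarbeitet am 2026-06-08)
@ ¤
@ *© Formatika GbR, Deutschland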