Quelle blowfish-x86_64-asm_64.S

Sprache: x86-64-Assembler (GNU as, AT&T-Syntax)

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Blowfish Cipher Algorithm (x86_64)
*
* Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
*/

#include <linux/linkage.h>

.file "blowfish-x86_64-asm.S"
.text

/*
 * Structure of crypto context: byte offsets relative to CTX.
 *
 * "p" starts at offset 0 and spans 16 + 2 entries; s0..s3 follow back to
 * back with 256 entries each.  Every entry is 4 bytes wide, which matches
 * the scale factor of 4 used when the s-tables are indexed.
 */
#define p 0
#define s0 ((16 + 2) * 4)
#define s1 ((16 + 2 + (1 * 256)) * 4)
#define s2 ((16 + 2 + (2 * 256)) * 4)
#define s3 ((16 + 2 + (3 * 256)) * 4)

/*
 * Register macros.
 *
 * CTX: base pointer of the crypto context.  %r12 is callee-saved, so every
 *      entry point in this file saves it on entry and restores it before
 *      returning.
 * RIO: pointer used by the block load/store macros.  It is the same
 *      register as RT1, so it does not survive F()/F4().
 */
#define CTX %r12
#define RIO %rsi

/* Block registers: one 64-bit register per block being processed. */
#define RX0 %rax
#define RX1 %rbx
#define RX2 %rcx
#define RX3 %rdx

/* 32-bit views of the block registers. */
#define RX0d %eax
#define RX1d %ebx
#define RX2d %ecx
#define RX3d %edx

/* Lowest byte (bits 0..7) of each block register. */
#define RX0bl %al
#define RX1bl %bl
#define RX2bl %cl
#define RX3bl %dl

/*
 * Second-lowest byte (bits 8..15) of each block register.  These
 * high-byte views exist only for %rax/%rbx/%rcx/%rdx.
 */
#define RX0bh %ah
#define RX1bh %bh
#define RX2bh %ch
#define RX3bh %dh

/* Temporaries used for table indices and lookup results. */
#define RT0 %rdi
#define RT1 %rsi
#define RT2 %r8
#define RT3 %r9

/* 32-bit views of the temporaries. */
#define RT0d %edi
#define RT1d %esi
#define RT2d %r8d
#define RT3d %r9d

/*
 * Round-key pair preloaded by the 4-way code.  The 1-way entry points do
 * not use this macro and park dst in %r10 directly.
 */
#define RKEY %r10

/***********************************************************************
* 1-way blowfish
***********************************************************************/
/*
 * F(): one Feistel step on RX0, without the round-key XOR.
 *
 * The four bytes a, b, c, d of the low 32 bits of RX0 (a being the most
 * significant) index s0, s1, s2 and s3 respectively; the lookups are
 * combined in 32-bit arithmetic as ((s0[a] + s1[b]) ^ s2[c]) + s3[d].
 * The rotates add up to 32 bits, so the halves of RX0 swap, and the
 * combined value is XORed into the new low half.
 *
 * Clobbers RT0, RT1 (the same register as RIO), RT2 and the flags.
 */
#define F() \
rorq $16,  RX0; \
movzbl RX0bh,  RT0d; \
movzbl RX0bl,  RT1d; \
rolq $16,  RX0; \
movl s0(CTX,RT0,4), RT0d; \
addl s1(CTX,RT1,4), RT0d; \
movzbl RX0bh,  RT1d; \
movzbl RX0bl,  RT2d; \
rolq $32,  RX0; \
xorl s2(CTX,RT1,4), RT0d; \
addl s3(CTX,RT2,4), RT0d; \
xorq RT0,  RX0;

/*
 * add_roundkey_enc(n): XOR the 8 bytes at p + 4*n into RX0, i.e. two
 * adjacent entries of p at once, with entry n going into the low half.
 */
#define add_roundkey_enc(n) \
xorq p+4*(n)(CTX),  RX0;

/*
 * round_enc(n): key addition with entries n and n+1 of p, followed by two
 * F() steps.
 */
#define round_enc(n) \
add_roundkey_enc(n); \
\
F(); \
F();

/*
 * add_roundkey_dec(n): load entries n-1 and n of p as one 64-bit value,
 * swap its halves so that entry n ends up in the low half, and XOR the
 * result into RX0.  Clobbers RT0.
 *
 * The argument is parenthesised so that an expression argument expands
 * correctly, matching preload_roundkey_dec().
 */
#define add_roundkey_dec(n) \
movq p+4*((n)-1)(CTX), RT0; \
rorq $32,  RT0; \
xorq RT0,  RX0;

/*
 * round_dec(n): key addition with entries n and n-1 of p, followed by two
 * F() steps.
 *
 * The last line carries no continuation backslash, so the macro ends here
 * and cannot absorb whatever line follows it.
 */
#define round_dec(n) \
add_roundkey_dec(n); \
\
F(); \
F();

/*
 * read_block(): load 8 bytes from (RIO) into RX0 as two big-endian 32-bit
 * words.  The word at offset 0 ends up in the low half of RX0 and the
 * word at offset 4 in the high half.
 */
#define read_block() \
movq (RIO),   RX0; \
rorq $32,   RX0; \
bswapq    RX0;

/*
 * write_block(): byte-swap RX0 and store it to (RIO).  The high half of
 * RX0 is written as the big-endian word at offset 0 and the low half as
 * the big-endian word at offset 4.  RX0 is left byte-swapped.
 */
#define write_block() \
bswapq    RX0; \
movq RX0,   (RIO);

SYM_FUNC_START(blowfish_enc_blk)
/*
 * Encrypt one 8-byte block.
 *
 * In:
 *	%rdi: ctx
 *	%rsi: dst
 *	%rdx: src
 *
 * No stack is used: dst is parked in %r10 and the caller's %r12 in %r11
 * while the rounds run, since F() overwrites %rdi, %rsi and %r8.
 */

/* Park dst first, then reuse %rsi as the source pointer. */
movq %rsi, %r10;
movq %rdx, RIO;

/* Keep the caller's %r12 before it becomes the context pointer. */
movq %r12, %r11;
movq %rdi, CTX;

read_block();

/* Each round_enc() consumes two entries of p and runs two F() steps. */
round_enc(0);
round_enc(2);
round_enc(4);
round_enc(6);
round_enc(8);
round_enc(10);
round_enc(12);
round_enc(14);

/* Final key addition with entries 16 and 17. */
add_roundkey_enc(16);

/* Store the result to dst. */
movq %r10, RIO;
write_block();

/* Hand the caller's %r12 back. */
movq %r11, %r12;
RET;
SYM_FUNC_END(blowfish_enc_blk)

SYM_FUNC_START(blowfish_dec_blk)
/*
 * Decrypt one 8-byte block.
 *
 * In:
 *	%rdi: ctx
 *	%rsi: dst
 *	%rdx: src
 *
 * No stack is used: the caller's %r12 is parked in %r11 and dst in %r10
 * while the rounds run, since F() overwrites %rdi, %rsi and %r8.
 */

/* Save what the rounds would otherwise destroy. */
movq %r12, %r11;
movq %rsi, %r10;

/* Set up the context pointer and the source pointer. */
movq %rdi, CTX;
movq %rdx, RIO;

read_block();

/* Walk p downwards; each round_dec() runs two F() steps. */
round_dec(17);
round_dec(15);
round_dec(13);
round_dec(11);
round_dec(9);
round_dec(7);
round_dec(5);
round_dec(3);

/* Final key addition with entries 1 and 0. */
add_roundkey_dec(1);

/* The context pointer is no longer needed: restore %r12, then store. */
movq %r11, %r12;
movq %r10, RIO;
write_block();

RET;
SYM_FUNC_END(blowfish_dec_blk)

/**********************************************************************
  4-way blowfish, four blocks parallel
**********************************************************************/

/*
 * F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * x must be one of RX0..RX3: the byte views are formed by pasting "bh" and
 * "bl" onto the macro name passed in.
 *
 * With a, b, c, d being the bytes of the low 32 bits of x (a the most
 * significant), c and d are extracted before the first 16-bit rotate and
 * a and b after it.  The lookups are combined in 32-bit arithmetic as
 * ((s0[a] + s1[b]) ^ s2[c]) + s3[d].  The two 16-bit rotates together swap
 * the halves of x, and the combined value is XORed into the new low half.
 *
 * Clobbers RT0, RT1 (the same register as RIO), RT2, RT3 and the flags.
 */
#define F4(x) \
movzbl x ## bh,  RT1d; \
movzbl x ## bl,  RT3d; \
rorq $16,  x; \
movzbl x ## bh,  RT0d; \
movzbl x ## bl,  RT2d; \
rorq $16,  x; \
movl s0(CTX,RT0,4), RT0d; \
addl s1(CTX,RT2,4), RT0d; \
xorl s2(CTX,RT1,4), RT0d; \
addl s3(CTX,RT3,4), RT0d; \
xorq RT0,  x;

/* XOR the round-key pair currently held in RKEY into all four blocks. */
#define add_preloaded_roundkey4() \
xorq RKEY, RX0; \
xorq RKEY, RX1; \
xorq RKEY, RX2; \
xorq RKEY, RX3;

/* Run one F4() step on each of the four blocks, RX0 first. */
#define __bf_F4_all() \
F4(RX0); \
F4(RX1); \
F4(RX2); \
F4(RX3);

/* Load entries n and n+1 of p into RKEY (entry n in the low half). */
#define preload_roundkey_enc(n) \
movq p+4*(n)(CTX), RKEY;

/*
 * Apply the pair already sitting in RKEY, then fetch the pair that the
 * next encryption round will need.
 */
#define add_roundkey_enc4(n) \
add_preloaded_roundkey4(); \
preload_roundkey_enc(n + 2);

/* Key addition plus two F4() steps on every block. */
#define round_enc4(n) \
add_roundkey_enc4(n); \
\
__bf_F4_all(); \
\
__bf_F4_all();

/*
 * Load entries n-1 and n of p into RKEY and swap the halves, so that
 * entry n ends up in the low half.
 */
#define preload_roundkey_dec(n) \
movq p+4*((n)-1)(CTX), RKEY; \
rorq $32, RKEY;

/*
 * Apply the pair already sitting in RKEY, then fetch the pair that the
 * next decryption round will need.
 */
#define add_roundkey_dec4(n) \
add_preloaded_roundkey4(); \
preload_roundkey_dec(n - 2);

/* Key addition plus two F4() steps on every block. */
#define round_dec4(n) \
add_roundkey_dec4(n); \
\
__bf_F4_all(); \
\
__bf_F4_all();

/*
 * Load one 8-byte block from mem into reg as two big-endian 32-bit words;
 * the word at the lower address ends up in the low half of reg.
 */
#define __bf_load(mem, reg) \
movq mem, reg; \
rorq $32, reg; \
bswapq reg;

/* Byte-swap reg and store it to mem; reg is left byte-swapped. */
#define __bf_store(reg, mem) \
bswapq reg; \
movq reg, mem;

/* XOR the byte-swapped 8 bytes at mem into reg, using tmp as scratch. */
#define __bf_xor(mem, tmp, reg) \
movq mem, tmp; \
bswapq tmp; \
xorq tmp, reg;

/* Load four consecutive blocks from (RIO) into RX0..RX3. */
#define read_block4() \
__bf_load((RIO), RX0); \
\
__bf_load(8(RIO), RX1); \
\
__bf_load(16(RIO), RX2); \
\
__bf_load(24(RIO), RX3);

/* Store RX0..RX3 as four consecutive blocks at (RIO). */
#define write_block4() \
__bf_store(RX0, (RIO)); \
\
__bf_store(RX1, 8(RIO)); \
\
__bf_store(RX2, 16(RIO)); \
\
__bf_store(RX3, 24(RIO));

/*
 * XOR the first three 8-byte blocks at (RIO) into RX1, RX2 and RX3
 * respectively.  RX0 is not touched.  Clobbers RT0, RT2 and RT3.
 */
#define xor_block4() \
__bf_xor((RIO), RT0, RX1); \
\
__bf_xor(8(RIO), RT2, RX2); \
\
__bf_xor(16(RIO), RT3, RX3);

SYM_FUNC_START(blowfish_enc_blk_4way)
/*
 * Encrypt four consecutive 8-byte blocks.
 *
 * input:
 *	%rdi: ctx
 *	%rsi: dst
 *	%rdx: src
 *
 * %r12 (CTX) and %rbx (RX1) are callee-saved and are preserved on the
 * stack.  dst is parked in %r11 because %rsi doubles as RIO/RT1 and is
 * overwritten by F4().
 */
pushq %r12;
pushq %rbx;

movq %rdi, CTX;
movq %rsi, %r11;
movq %rdx, RIO;

/* RKEY always holds the pair for the upcoming key addition. */
preload_roundkey_enc(0);

read_block4();

round_enc4(0);
round_enc4(2);
round_enc4(4);
round_enc4(6);
round_enc4(8);
round_enc4(10);
round_enc4(12);
round_enc4(14);
/* round_enc4(14) preloaded entries 16 and 17: apply them last. */
add_preloaded_roundkey4();

movq %r11, RIO;
write_block4();

popq %rbx;
popq %r12;
RET;
SYM_FUNC_END(blowfish_enc_blk_4way)

SYM_FUNC_START(__blowfish_dec_blk_4way)
/*
 * Decrypt four consecutive 8-byte blocks, optionally XORing blocks 1..3
 * of the result with source blocks 0..2.
 *
 * input:
 *	%rdi: ctx
 *	%rsi: dst
 *	%rdx: src
 *	%rcx: cbc (bool)
 *
 * %r12 (CTX) and %rbx (RX1) are callee-saved and are preserved on the
 * stack.  %rcx and %rdx are pushed as well because they double as RX2 and
 * RX3 and are overwritten by read_block4().  dst is parked in %r11 because
 * %rsi doubles as RIO/RT1.
 */
pushq %r12;
pushq %rbx;
pushq %rcx;
pushq %rdx;

movq %rdi, CTX;
movq %rsi, %r11;
movq %rdx, RIO;

/* RKEY always holds the pair for the upcoming key addition. */
preload_roundkey_dec(17);
read_block4();

round_dec4(17);
round_dec4(15);
round_dec4(13);
round_dec4(11);
round_dec4(9);
round_dec4(7);
round_dec4(5);
round_dec4(3);
/* round_dec4(3) preloaded entries 1 and 0: apply them last. */
add_preloaded_roundkey4();

/* Recover src, and the cbc flag into %r12 (CTX is no longer needed). */
popq RIO;
popq %r12;
/*
 * Only the low byte of a bool argument is defined by the calling
 * convention, so test just that byte rather than the full register.
 */
testb %r12b, %r12b;
jz .L_no_cbc_xor;

/*
 * Blocks 1..3 are XORed with source blocks 0..2; block 0 is left as is
 * (presumably the caller combines it with the IV -- confirm in the glue
 * code).  src is read before anything is written to dst.
 */
xor_block4();

.L_no_cbc_xor:
movq %r11, RIO;
write_block4();

popq %rbx;
popq %r12;

RET;
SYM_FUNC_END(__blowfish_dec_blk_4way)

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.11 Sekunden (vorverarbeitet am 2026-06-07) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereitgestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.