// This function will only be called if AVX2 or AVX are supported // AVX512 uses a different function. if (VM_Version::supports_avx2()) {
vector_len = Assembler::AVX_256bit;
outlen = 256;
} elseif (VM_Version::supports_avx()) {
vector_len = Assembler::AVX_128bit;
outlen = 128;
}
__ enter();
// Load the initial state in columnar orientation and then copy // that starting state to the working register set. // Also load the address of the add mask for later use in handling // multi-block counter increments.
__ lea(rotAddr, ExternalAddress(chacha20_lrot_consts()));
__ lea(rax, ExternalAddress(chacha20_ctradd_avx())); if (vector_len == Assembler::AVX_128bit) {
__ movdqu(aState, Address(state, 0)); // Bytes 0 - 15 -> a1Vec
__ movdqu(bState, Address(state, 16)); // Bytes 16 - 31 -> b1Vec
__ movdqu(cState, Address(state, 32)); // Bytes 32 - 47 -> c1Vec
__ movdqu(dState, Address(state, 48)); // Bytes 48 - 63 -> d1Vec
__ movdqu(a2Vec, aState);
__ movdqu(b2Vec, bState);
__ movdqu(c2Vec, cState);
__ vpaddd(d2State, dState, Address(rax, 16), vector_len);
__ movdqu(d2Vec, d2State);
__ movdqu(lrot8, Address(rotAddr, 0)); // Load 8-bit lrot const
__ movdqu(lrot16, Address(rotAddr, 32)); // Load 16-bit lrot const
} else { // We will broadcast each 128-bit segment of the state array into // the high and low halves of ymm state registers. Then apply the add // mask to the dState register. These will then be copied into the // a/b/c/d1Vec working registers.
__ vbroadcastf128(aState, Address(state, 0), vector_len);
__ vbroadcastf128(bState, Address(state, 16), vector_len);
__ vbroadcastf128(cState, Address(state, 32), vector_len);
__ vbroadcastf128(dState, Address(state, 48), vector_len);
__ vpaddd(dState, dState, Address(rax, 0), vector_len);
__ vpaddd(d2State, dState, Address(rax, 32), vector_len);
__ movl(loopCounter, 10); // Set 10 2-round iterations
__ BIND(L_twoRounds);
// The first quarter round macro call covers the first 4 QR operations: // Qround(state, 0, 4, 8,12) // Qround(state, 1, 5, 9,13) // Qround(state, 2, 6,10,14) // Qround(state, 3, 7,11,15)
cc20_quarter_round_avx(a1Vec, b1Vec, c1Vec, d1Vec, scratch,
lrot8, lrot16, vector_len);
cc20_quarter_round_avx(a2Vec, b2Vec, c2Vec, d2Vec, scratch,
lrot8, lrot16, vector_len);
// Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors // to diagonals. The a1Vec does not need to change orientation.
cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, vector_len, true);
cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, vector_len, true);
// The second set of operations on the vectors covers the second 4 quarter // round operations, now acting on the diagonals: // Qround(state, 0, 5,10,15) // Qround(state, 1, 6,11,12) // Qround(state, 2, 7, 8,13) // Qround(state, 3, 4, 9,14)
cc20_quarter_round_avx(a1Vec, b1Vec, c1Vec, d1Vec, scratch,
lrot8, lrot16, vector_len);
cc20_quarter_round_avx(a2Vec, b2Vec, c2Vec, d2Vec, scratch,
lrot8, lrot16, vector_len);
// Before we start the next iteration, we need to perform shuffles // on the b/c/d vectors to move them back to columnar organizations // from their current diagonal orientation.
cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, vector_len, false);
cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, vector_len, false);
// Add the original start state back into the current state.
__ vpaddd(a1Vec, a1Vec, aState, vector_len);
__ vpaddd(b1Vec, b1Vec, bState, vector_len);
__ vpaddd(c1Vec, c1Vec, cState, vector_len);
__ vpaddd(d1Vec, d1Vec, dState, vector_len);
// This function will always write 128 or 256 bytes into the // key stream buffer, depending on the length of the SIMD // registers. That length should be returned through %rax.
__ mov64(rax, outlen);
// Load the initial state in columnar orientation. // We will broadcast each 128-bit segment of the state array into // all four double-quadword slots on ZMM State registers. They will // be copied into the working ZMM registers and then added back in // at the very end of the block function. The add mask should be // applied to the dState register so it does not need to be fetched // when adding the start state back into the final working state.
__ lea(rax, ExternalAddress(chacha20_ctradd_avx512()));
__ evbroadcasti32x4(aState, Address(state, 0), Assembler::AVX_512bit);
__ evbroadcasti32x4(bState, Address(state, 16), Assembler::AVX_512bit);
__ evbroadcasti32x4(cState, Address(state, 32), Assembler::AVX_512bit);
__ evbroadcasti32x4(dState, Address(state, 48), Assembler::AVX_512bit);
__ vpaddd(dState, dState, Address(rax, 0), Assembler::AVX_512bit);
__ evmovdqul(scratch, Address(rax, 64), Assembler::AVX_512bit);
__ vpaddd(d2State, dState, scratch, Assembler::AVX_512bit);
__ vpaddd(d3State, d2State, scratch, Assembler::AVX_512bit);
__ vpaddd(d4State, d3State, scratch, Assembler::AVX_512bit);
__ movl(loopCounter, 10); // Set 10 2-round iterations
__ BIND(L_twoRounds);
// The first set of operations on the vectors covers the first 4 quarter // round operations: // Qround(state, 0, 4, 8,12) // Qround(state, 1, 5, 9,13) // Qround(state, 2, 6,10,14) // Qround(state, 3, 7,11,15)
cc20_quarter_round_avx(a1Vec, b1Vec, c1Vec, d1Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
cc20_quarter_round_avx(a2Vec, b2Vec, c2Vec, d2Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
cc20_quarter_round_avx(a3Vec, b3Vec, c3Vec, d3Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
cc20_quarter_round_avx(a4Vec, b4Vec, c4Vec, d4Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
// Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors // to diagonals. The a1Vec does not need to change orientation.
cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, Assembler::AVX_512bit, true);
cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, Assembler::AVX_512bit, true);
cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, Assembler::AVX_512bit, true);
cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, Assembler::AVX_512bit, true);
// The second set of operations on the vectors covers the second 4 quarter // round operations, now acting on the diagonals: // Qround(state, 0, 5,10,15) // Qround(state, 1, 6,11,12) // Qround(state, 2, 7, 8,13) // Qround(state, 3, 4, 9,14)
cc20_quarter_round_avx(a1Vec, b1Vec, c1Vec, d1Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
cc20_quarter_round_avx(a2Vec, b2Vec, c2Vec, d2Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
cc20_quarter_round_avx(a3Vec, b3Vec, c3Vec, d3Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
cc20_quarter_round_avx(a4Vec, b4Vec, c4Vec, d4Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
// Before we start the next iteration, we need to perform shuffles // on the b/c/d vectors to move them back to columnar organizations // from their current diagonal orientation.
cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, Assembler::AVX_512bit, false);
cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, Assembler::AVX_512bit, false);
cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, Assembler::AVX_512bit, false);
cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, Assembler::AVX_512bit, false);
// Add the initial state now held on the a/b/c/dState registers to the // final working register values. We will also add in the counter add // mask onto zmm3 after adding in the start state.
__ vpaddd(a1Vec, a1Vec, aState, Assembler::AVX_512bit);
__ vpaddd(b1Vec, b1Vec, bState, Assembler::AVX_512bit);
__ vpaddd(c1Vec, c1Vec, cState, Assembler::AVX_512bit);
__ vpaddd(d1Vec, d1Vec, dState, Assembler::AVX_512bit);
// Write the ZMM state registers out to the key stream buffer // Each ZMM is divided into 4 128-bit segments. Each segment // is written to memory at 64-byte displacements from one // another. The result is that all 4 blocks will be in their // proper order when serialized.
cc20_keystream_collate_avx512(a1Vec, b1Vec, c1Vec, d1Vec, result, 0);
cc20_keystream_collate_avx512(a2Vec, b2Vec, c2Vec, d2Vec, result, 256);
cc20_keystream_collate_avx512(a3Vec, b3Vec, c3Vec, d3Vec, result, 512);
cc20_keystream_collate_avx512(a4Vec, b4Vec, c4Vec, d4Vec, result, 768);
// This function will always write 1024 bytes into the key stream buffer // and that length should be returned through %rax.
__ mov64(rax, 1024);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.