/* * Copyright (c) 2019, 2021, Intel Corporation. All rights reserved. * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. *
*/
// registers holding the four results in the parallelized loop const XMMRegister xmm_result0 = xmm5; const XMMRegister xmm_result1 = xmm6; const XMMRegister xmm_result2 = xmm7; const XMMRegister xmm_result3 = xmm8; const XMMRegister xmm_result4 = xmm9; const XMMRegister xmm_result5 = xmm10;
const XMMRegister xmm_from0 = xmm11; const XMMRegister xmm_from1 = xmm12; const XMMRegister xmm_from2 = xmm13; const XMMRegister xmm_from3 = xmm14; //the last one is xmm14. we have to preserve it on WIN64. const XMMRegister xmm_from4 = xmm3; //reuse xmm3~4. Because xmm_key_tmp0~1 are useless when loading input text const XMMRegister xmm_from5 = xmm4;
// k == 0 : generate code for key_128 // k == 1 : generate code for key_192 // k == 2 : generate code for key_256 for (int k = 0; k < 3; ++k) { //multi blocks starts here
__ align(OptoLoopAlignment);
__ BIND(L_multiBlock_loopTop[k]);
__ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
__ jcc(Assembler::less, L_singleBlockLoopTop[k]);
load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
//load, then increase counters
CTR_DoSix(movdqa, xmm_curr_counter);
inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]);
inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
CTR_DoSix(pxor, xmm_key_tmp0); //PXOR with Round 0 key
//load two ROUND_KEYs at a time for (int i = 1; i < rounds[k]; ) {
load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
CTR_DoSix(aesenc, xmm_key_tmp1);
i++; if (i != rounds[k]) {
CTR_DoSix(aesenc, xmm_key_tmp0);
} else {
CTR_DoSix(aesenclast, xmm_key_tmp0);
}
i++;
}
__ movdqu(Address(saved_encCounter_start, 0), xmm_result0); // 2. Perform pxor of the encrypted counter and plaintext Bytes.
__ pxor(xmm_result0, xmm_from0); // Also the encrypted counter is saved for next invocation.
constRegister from = c_rarg0; // source array address constRegister to = c_rarg1; // destination array address constRegister key = c_rarg2; // key array address constRegister rvec = c_rarg3; // r byte array initialized from initvector array address // and left with the results of the last encryption block #ifndef _WIN64 constRegister len_reg = c_rarg4; // src len (must be multiple of blocksize 16) #else const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 constRegister len_reg = r11; // pick the volatile windows register #endif
#ifdef _WIN64 // on win64, fill len_reg from stack position
__ movl(len_reg, len_mem); #else
__ push(len_reg); // Save #endif
__ push(rbx);
__ vzeroupper();
// Calculate number of rounds from key size: 44 for 10-rounds, 52 for 12-rounds, 60 for 14-rounds constRegister rounds = rbx;
__ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
const XMMRegister IV = xmm0; // Load IV and broadcast value to 512-bits
__ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);
// for decryption java expanded key ordering is rotated one position from what we want // so we start from 0x10 here and hit 0x00 last // we don't know if the key is aligned, hence not using load-execute form
load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
// for decryption the aesdeclast operation is always on key+0x00
__ aesdeclast(xmm_result, xmm_temp3);
__ movdqu(Address(to, 0), xmm_result); // store the result
__ xorptr(rax, rax); // return 0
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
__ enter(); // required for proper stackwalking of RuntimeStub frame
#ifdef _WIN64 // on win64, fill len_reg from stack position
__ movl(len_reg, len_mem); #else
__ push(len_reg); // Save #endif
const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), r10 /*rscratch*/); // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
offset += 0x10;
}
__ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec
// now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
__ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
__ cmpl(rax, 44);
__ jcc(Assembler::notEqual, L_key_192_256);
// 128 bit code follows here
__ movptr(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_loopTop_128);
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
__ pxor (xmm_result, xmm_temp); // xor with the current r vector
__ pxor (xmm_result, xmm_key0); // do the aes rounds for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
__ aesenc(xmm_result, as_XMMRegister(rnum));
}
__ aesenclast(xmm_result, xmm_key10);
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output // no need to store r to memory until we exit
__ addptr(pos, AESBlockSize);
__ subptr(len_reg, AESBlockSize);
__ jcc(Assembler::notEqual, L_loopTop_128);
__ BIND(L_exit);
__ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object
__ BIND(L_key_192_256); // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
__ cmpl(rax, 52);
__ jcc(Assembler::notEqual, L_key_256);
// 192-bit code follows here (could be changed to use more xmm registers)
__ movptr(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_loopTop_192);
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
__ pxor (xmm_result, xmm_temp); // xor with the current r vector
__ pxor (xmm_result, xmm_key0); // do the aes rounds for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
__ aesenc(xmm_result, as_XMMRegister(rnum));
}
__ aesenclast(xmm_result, xmm_key12);
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output // no need to store r to memory until we exit
__ addptr(pos, AESBlockSize);
__ subptr(len_reg, AESBlockSize);
__ jcc(Assembler::notEqual, L_loopTop_192);
__ jmp(L_exit);
__ BIND(L_key_256); // 256-bit code follows here (could be changed to use more xmm registers)
load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
__ movptr(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_loopTop_256);
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
__ pxor (xmm_result, xmm_temp); // xor with the current r vector
__ pxor (xmm_result, xmm_key0); // do the aes rounds for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
__ aesenc(xmm_result, as_XMMRegister(rnum));
}
load_key(xmm_temp, key, 0xe0, r10 /*rscratch*/);
__ aesenclast(xmm_result, xmm_temp);
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output // no need to store r to memory until we exit
__ addptr(pos, AESBlockSize);
__ subptr(len_reg, AESBlockSize);
__ jcc(Assembler::notEqual, L_loopTop_256);
__ jmp(L_exit);
return start;
}
// This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
// to hide instruction latency
//
// Arguments:
//
// Inputs:
//   c_rarg0   - source byte array address
//   c_rarg1   - destination byte array address
//   c_rarg2   - K (key) in little endian int array
//   c_rarg3   - r vector byte array address
//   c_rarg4   - input length
//
// Output:
//   rax       - input length
//
address StubGenerator::generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
assert(UseAES, "need AES instructions and misaligned SSE support");
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
address start = __ pc();
constRegister from = c_rarg0; // source array address constRegister to = c_rarg1; // destination array address constRegister key = c_rarg2; // key array address constRegister rvec = c_rarg3; // r byte array initialized from initvector array address // and left with the results of the last encryption block #ifndef _WIN64 constRegister len_reg = c_rarg4; // src len (must be multiple of blocksize 16) #else const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 constRegister len_reg = r11; // pick the volatile windows register #endif constRegister pos = rax;
__ enter(); // required for proper stackwalking of RuntimeStub frame
#ifdef _WIN64 // on win64, fill len_reg from stack position
__ movl(len_reg, len_mem); #else
__ push(len_reg); // Save #endif
__ push(rbx); // the java expanded key ordering is rotated one position from what we want // so we start from 0x10 here and hit 0x00 last const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), rbx /*rscratch*/); // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
offset += 0x10;
}
load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
// registers holding the four results in the parallelized loop const XMMRegister xmm_result0 = xmm0; const XMMRegister xmm_result1 = xmm2; const XMMRegister xmm_result2 = xmm3; const XMMRegister xmm_result3 = xmm4;
__ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec
__ xorptr(pos, pos);
// now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
__ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
__ cmpl(rbx, 52);
__ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
__ cmpl(rbx, 60);
__ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
// registers used in the non-parallelized loops // xmm register assignments for the loops below const XMMRegister xmm_result = xmm0; const XMMRegister xmm_prev_block_cipher_save = xmm2; const XMMRegister xmm_key11 = xmm3; const XMMRegister xmm_key12 = xmm4; const XMMRegister key_tmp = xmm4;
__ BIND(L_singleBlock_loopTopHead[k]); if (k == 1) {
__ addptr(rsp, 6 * wordSize);
} elseif (k == 2) {
__ addptr(rsp, 10 * wordSize);
}
__ cmpptr(len_reg, 0); // any blocks left??
__ jcc(Assembler::equal, L_exit);
__ BIND(L_singleBlock_loopTopHead2[k]); if (k == 1) {
load_key(xmm_key11, key, 0xb0, rbx /*rscratch*/); // 0xb0; 192-bit key goes up to 0xc0
load_key(xmm_key12, key, 0xc0, rbx /*rscratch*/); // 0xc0; 192-bit key goes up to 0xc0
} if (k == 2) {
load_key(xmm_key11, key, 0xb0, rbx /*rscratch*/); // 0xb0; 256-bit key goes up to 0xe0
}
__ align(OptoLoopAlignment);
__ BIND(L_singleBlock_loopTop[k]);
__ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
__ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
__ pxor(xmm_result, xmm_key_first); // do the aes dec rounds for (int rnum = 1; rnum <= 9 ; rnum++) {
__ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
} if (k == 1) {
__ aesdec(xmm_result, xmm_key11);
__ aesdec(xmm_result, xmm_key12);
} if (k == 2) {
__ aesdec(xmm_result, xmm_key11);
load_key(key_tmp, key, 0xc0, rbx /*rscratch*/);
__ aesdec(xmm_result, key_tmp);
load_key(key_tmp, key, 0xd0, rbx /*rscratch*/);
__ aesdec(xmm_result, key_tmp);
load_key(key_tmp, key, 0xe0, rbx /*rscratch*/);
__ aesdec(xmm_result, key_tmp);
}
__ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
__ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output // no need to store r to memory until we exit
__ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
__ addptr(pos, AESBlockSize);
__ subptr(len_reg, AESBlockSize);
__ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]); if (k != 2) {
__ jmp(L_exit);
}
} //for 128/192/256
__ BIND(L_exit);
__ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object
__ pop(rbx); #ifdef _WIN64
__ movl(rax, len_mem); #else
__ pop(rax); // return length #endif
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
// Utility routine for loading a 128-bit key word in little endian format void StubGenerator::load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask) {
__ movdqu(xmmdst, Address(key, offset));
__ pshufb(xmmdst, xmm_shuf_mask);
}
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge // context for the registers used, where all instructions below are using 128-bit mode // On EVEX without VL and BW, these instructions will all be AVX. if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
__ push(len); // Save
__ push(rbx);
__ vzeroupper();
__ xorptr(pos, pos);
// Calculate number of rounds based on key length(128, 192, 256):44 for 10-rounds, 52 for 12-rounds, 60 for 14-rounds
__ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
// Load Key shuf mask const XMMRegister xmm_key_shuf_mask = xmm31; // used temporarily to swap key bytes up front
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), rbx /*rscratch*/);
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge // context for the registers used, where all instructions below are using 128-bit mode // On EVEX without VL and BW, these instructions will all be AVX. if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
__ push(len); // Save
__ push(rbx);
__ vzeroupper();
__ xorptr(pos, pos); // Calculate number of rounds i.e. based on key length(128, 192, 256):44 for 10-rounds, 52 for 12-rounds, 60 for 14-rounds
__ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
// Load Key shuf mask const XMMRegister xmm_key_shuf_mask = xmm31; // used temporarily to swap key bytes up front
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), rbx /*rscratch*/);
// Load and shuffle round keys. The java expanded key ordering is rotated one position in decryption. // So the first round key is loaded from 1*16 here and last round key is loaded from 0*16
ev_load_key(xmm9, key, 1 * 16, xmm_key_shuf_mask);
ev_load_key(xmm10, key, 2 * 16, xmm_key_shuf_mask);
ev_load_key(xmm11, key, 3 * 16, xmm_key_shuf_mask);
ev_load_key(xmm12, key, 4 * 16, xmm_key_shuf_mask);
ev_load_key(xmm13, key, 5 * 16, xmm_key_shuf_mask);
ev_load_key(xmm14, key, 6 * 16, xmm_key_shuf_mask);
ev_load_key(xmm15, key, 7 * 16, xmm_key_shuf_mask);
ev_load_key(xmm16, key, 8 * 16, xmm_key_shuf_mask);
ev_load_key(xmm17, key, 9 * 16, xmm_key_shuf_mask);
ev_load_key(xmm18, key, 10 * 16, xmm_key_shuf_mask);
ev_load_key(xmm27, key, 0 * 16, xmm_key_shuf_mask);
__ cmpl(rounds, 52);
__ jcc(Assembler::greaterEqual, KEY_192);
__ jmp(Loop_start);
__ bind(KEY_256);
ev_load_key(xmm21, key, 13 * 16, xmm_key_shuf_mask);
ev_load_key(xmm22, key, 14 * 16, xmm_key_shuf_mask);
__ bind(Loop_start);
__ movq(rbx, len); // Convert input length to number of blocks
__ shrq(len, 4);
__ shlq(rbx, 60);
__ jcc(Assembler::equal, NO_PARTS);
__ addq(len, 1); // Check if number of blocks is greater than/ equal to 32 // If true, blocks then 512 bytes are processed at a time (code marked by label LOOP) // If not, 16 bytes are processed (code marked by label REMAINDER)
__ bind(NO_PARTS);
__ movq(rbx, len);
__ shrq(len, 5);
__ jcc(Assembler::equal, REMAINDER);
__ movl(r13, len); // Compute number of blocks that will be processed as 512 bytes at a time // Subtract this from the total number of blocks, which will then be processed by REMAINDER loop.
__ shlq(r13, 5);
__ subq(rbx, r13);
// Calculate number of rounds i.e. 10, 12, 14, based on key length(128, 192, 256).
__ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
__ vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit); // Move initial counter value in xmm0
__ movdqu(xmm0, Address(counter, 0)); // broadcast counter value to zmm8
__ evshufi64x2(xmm8, xmm0, xmm0, 0, Assembler::AVX_512bit);
//shuffle counter using lbswap_mask
__ vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_512bit);
// pre-increment and propagate counter values to zmm9-zmm15 registers. // Linc0 increments the zmm8 by 1 (initial value being 0), Linc4 increments the counters zmm9-zmm15 by 4 // The counter is incremented after each block i.e. 16 bytes is processed; // each zmm register has 4 counter values as its MSB // the counters are incremented in parallel
__ vpaddd(xmm8, xmm8, ExternalAddress(counter_mask_linc0_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
__ vpaddd(xmm9, xmm8, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
__ vpaddd(xmm10, xmm9, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
__ vpaddd(xmm11, xmm10, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
__ vpaddd(xmm12, xmm11, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
__ vpaddd(xmm13, xmm12, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
__ vpaddd(xmm14, xmm13, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
__ vpaddd(xmm15, xmm14, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
// load linc32 mask in zmm register.linc32 increments counter by 32
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc32_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
// xmm31 contains the key shuffle mask.
__ movdqu(xmm31, ExternalAddress(key_shuffle_mask_addr()), r15 /*rscratch*/); // Load key function loads 128 bit key and shuffles it. Then we broadcast the shuffled key to convert it into a 512 bit value. // For broadcasting the values to ZMM, vshufi64 is used instead of evbroadcasti64x2 as the source in this case is ZMM register // that holds shuffled key value.
ev_load_key(xmm20, key, 0, xmm31);
ev_load_key(xmm21, key, 1 * 16, xmm31);
ev_load_key(xmm22, key, 2 * 16, xmm31);
ev_load_key(xmm23, key, 3 * 16, xmm31);
ev_load_key(xmm24, key, 4 * 16, xmm31);
ev_load_key(xmm25, key, 5 * 16, xmm31);
ev_load_key(xmm26, key, 6 * 16, xmm31);
ev_load_key(xmm27, key, 7 * 16, xmm31);
ev_load_key(xmm28, key, 8 * 16, xmm31);
ev_load_key(xmm29, key, 9 * 16, xmm31);
ev_load_key(xmm30, key, 10 * 16, xmm31);
// Process 32 blocks or 512 bytes of data
__ bind(LOOP);
__ cmpl(len_reg, 512);
__ jcc(Assembler::less, REMAINDER);
__ subq(len_reg, 512); //Shuffle counter and Exor it with roundkey1. Result is stored in zmm0-7
__ vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm4, xmm12, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm4, xmm4, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm5, xmm13, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm5, xmm5, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm6, xmm14, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm6, xmm6, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm7, xmm15, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm7, xmm7, xmm20, Assembler::AVX_512bit); // Perform AES encode operations and put results in zmm0-zmm7. // This is followed by incrementing counter values in zmm8-zmm15. // Since we will be processing 32 blocks at a time, the counter is incremented by 32.
roundEnc(xmm21, 7);
__ vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
roundEnc(xmm22, 7);
__ vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit);
roundEnc(xmm23, 7);
__ vpaddq(xmm10, xmm10, xmm19, Assembler::AVX_512bit);
roundEnc(xmm24, 7);
__ vpaddq(xmm11, xmm11, xmm19, Assembler::AVX_512bit);
roundEnc(xmm25, 7);
__ vpaddq(xmm12, xmm12, xmm19, Assembler::AVX_512bit);
roundEnc(xmm26, 7);
__ vpaddq(xmm13, xmm13, xmm19, Assembler::AVX_512bit);
roundEnc(xmm27, 7);
__ vpaddq(xmm14, xmm14, xmm19, Assembler::AVX_512bit);
roundEnc(xmm28, 7);
__ vpaddq(xmm15, xmm15, xmm19, Assembler::AVX_512bit);
roundEnc(xmm29, 7);
// Encode 256, 128, 64 or 16 bytes at a time if length is less than 512 bytes
__ bind(REMAINDER);
__ cmpl(len_reg, 0);
__ jcc(Assembler::equal, END);
__ cmpl(len_reg, 256);
__ jcc(Assembler::aboveEqual, REMAINDER_16);
__ cmpl(len_reg, 128);
__ jcc(Assembler::aboveEqual, REMAINDER_8);
__ cmpl(len_reg, 64);
__ jcc(Assembler::aboveEqual, REMAINDER_4); // At this point, we will process 16 bytes of data at a time. // So load xmm19 with counter increment value as 1
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc1_addr()), Assembler::AVX_128bit, r15 /*rscratch*/);
__ jmp(REMAINDER_LOOP);
// Each ZMM register can be used to encode 64 bytes of data, so we have 4 ZMM registers to encode 256 bytes of data
__ bind(REMAINDER_16);
__ subq(len_reg, 256); // As we process 16 blocks at a time, load mask for incrementing the counter value by 16
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc16_addr()), Assembler::AVX_512bit, r15 /*rscratch*/); // shuffle counter and XOR counter with roundkey1
__ vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit); // Increment counter values by 16
__ vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
__ vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit); // AES encode rounds
roundEnc(xmm21, 3);
roundEnc(xmm22, 3);
roundEnc(xmm23, 3);
roundEnc(xmm24, 3);
roundEnc(xmm25, 3);
roundEnc(xmm26, 3);
roundEnc(xmm27, 3);
roundEnc(xmm28, 3);
roundEnc(xmm29, 3);
__ cmpl(len_reg, 64);
__ jcc(Assembler::aboveEqual, REMAINDER_4); //load mask for incrementing the counter value by 1
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc1_addr()), Assembler::AVX_128bit, r15 /*rscratch*/);
__ jmp(REMAINDER_LOOP);
// Each ZMM register can be used to encode 64 bytes of data, so we have 2 ZMM registers to encode 128 bytes of data
__ bind(REMAINDER_8);
__ subq(len_reg, 128); // As we process 8 blocks at a time, load mask for incrementing the counter value by 8
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc8_addr()), Assembler::AVX_512bit, r15 /*rscratch*/); // shuffle counters and xor with roundkey1
__ vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit); // increment counter by 8
__ vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit); // AES encode
roundEnc(xmm21, 1);
roundEnc(xmm22, 1);
roundEnc(xmm23, 1);
roundEnc(xmm24, 1);
roundEnc(xmm25, 1);
roundEnc(xmm26, 1);
roundEnc(xmm27, 1);
roundEnc(xmm28, 1);
roundEnc(xmm29, 1);
__ bind(END_REMAINDER_LOOP); // If the length register is less than the blockSize i.e. 16 // then we store only those bytes of the CT to the destination // corresponding to the length register value // extracting the exact number of bytes is handled by EXTRACT_TAILBYTES
__ cmpl(len_reg, 16);
__ jcc(Assembler::less, EXTRACT_TAILBYTES);
__ subl(len_reg, 16); // After AES encode rounds, the encrypted block cipher lies in xmm0. // If the length register is equal to 16 bytes, store CT in dest after XOR operation.
__ evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_128bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_128bit);
__ addl(pos, 16);
__ jmp(REMAINDER_LOOP);
__ bind(EXTRACT_TAILBYTES); // Save encrypted counter value in xmm0 for next invocation, before XOR operation
__ movdqu(Address(saved_encCounter_start, 0), xmm0); // XOR encryted block cipher in xmm0 with PT to produce CT
__ evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_128bit); // extract up to 15 bytes of CT from xmm0 as specified by length register
__ testptr(len_reg, 8);
__ jcc(Assembler::zero, EXTRACT_TAIL_4BYTES);
__ pextrq(Address(dest_addr, pos), xmm0, 0);
__ psrldq(xmm0, 8);
__ addl(pos, 8);
__ bind(EXTRACT_TAIL_4BYTES);
__ testptr(len_reg, 4);
__ jcc(Assembler::zero, EXTRACT_TAIL_2BYTES);
__ pextrd(Address(dest_addr, pos), xmm0, 0);
__ psrldq(xmm0, 4);
__ addq(pos, 4);
__ bind(EXTRACT_TAIL_2BYTES);
__ testptr(len_reg, 2);
__ jcc(Assembler::zero, EXTRACT_TAIL_1BYTE);
__ pextrw(Address(dest_addr, pos), xmm0, 0);
__ psrldq(xmm0, 2);
__ addl(pos, 2);
__ bind(EXTRACT_TAIL_1BYTE);
__ testptr(len_reg, 1);
__ jcc(Assembler::zero, END);
__ pextrb(Address(dest_addr, pos), xmm0, 0);
__ addl(pos, 1);
__ bind(END); // If there are no tail bytes, store counter value and exit
__ cmpl(len_reg, 0);
__ jcc(Assembler::equal, STORE_CTR);
__ movl(Address(used_addr, 0), len_reg);
__ movl(pos, 0); // Total length processed // Min data size processed = 768 bytes
__ cmpl(len, 768);
__ jcc(Assembler::less, ENC_DEC_DONE);
// Generate 48 constants for htbl
__ call(GENERATE_HTBL_48_BLKS, relocInfo::none); int index = 0; // Index for choosing subkeyHtbl entry
__ movl(ghash_pos, 0); // Pointer for ghash read and store operations
// Move initial counter value and STATE value into variables
__ movdqu(CTR_BLOCKx, Address(counter, 0));
__ movdqu(AAD_HASHx, Address(state, 0)); // Load lswap mask for ghash
__ movdqu(xmm24, ExternalAddress(ghash_long_swap_mask_addr()), rbx /*rscratch*/); // Shuffle input state using lswap mask
__ vpshufb(AAD_HASHx, AAD_HASHx, xmm24, Assembler::AVX_128bit);
// Compute #rounds for AES based on the length of the key array
__ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
// Begin 32 blocks of AES processing
__ bind(AES_32_BLOCKS); // Save incremented counter before overwriting it with AES data
__ evmovdquq(CTR_BLOCKx, ZTMP8, Assembler::AVX_512bit);
__ bind(GHASH_AES_PARALLEL); // Ghash16_encrypt16_parallel takes place in the order with three reduction values: // 1) First time -> cipher xor input ghash // 2) No reduction -> accumulate multiplication values // 3) Final reduction post 48 blocks -> new ghash value is computed for the next round // Reduction value = first time
ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, true, xmm24, true, rounds, ghash_pos, false, index, COUNTER_INC_MASK);
__ addl(pos, 256);
__ addl(ghash_pos, 256);
index += 4;
// At this point we have processed 768 bytes of AES and 256 bytes of GHASH. // If the remaining length is less than 768, process remaining 512 bytes of ghash in GHASH_LAST_32 code
__ subl(len, 768);
__ cmpl(len, 768);
__ jcc(Assembler::less, GHASH_LAST_32);
// AES 16 blocks and GHASH 16 blocks in parallel // For multiples of 48 blocks we will do ghash16_encrypt16 interleaved multiple times // Reduction value = no reduction means that the carryless multiplication values are accumulated for further calculations // Each call uses 4 subkeyHtbl values, so increment the index by 4.
__ bind(GHASH_16_AES_16); // Reduction value = no reduction
ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, false, xmm24, false, rounds, ghash_pos, false, index, COUNTER_INC_MASK);
__ addl(pos, 256);
__ addl(ghash_pos, 256);
index += 4; // Reduction value = final reduction means that the accumulated values have to be reduced as we have completed 48 blocks of ghash
ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, false, xmm24, false, rounds, ghash_pos, true, index, COUNTER_INC_MASK);
__ addl(pos, 256);
__ addl(ghash_pos, 256); // Calculated ghash value needs to be __ moved to AAD_HASHX so that we can restart the ghash16-aes16 pipeline
__ movdqu(AAD_HASHx, ZTMP5);
index = 0; // Reset subkeyHtbl index
// Restart the pipeline // Reduction value = first time
ghash16_encrypt16_parallel(key, avx512_subkeyHtbl, CTR_BLOCKx, AAD_HASHx, in, out, ct, pos, true, xmm24, true, rounds, ghash_pos, false, index, COUNTER_INC_MASK);
__ addl(pos, 256);
__ addl(ghash_pos, 256);
index += 4;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.