/*
 * Copyright (c) 2019, 2021, Intel Corporation. All rights reserved.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */
// registers holding the four results in the parallelized loop const XMMRegister xmm_result0 = xmm5; const XMMRegister xmm_result1 = xmm6; const XMMRegister xmm_result2 = xmm7; const XMMRegister xmm_result3 = xmm8; const XMMRegister xmm_result4 = xmm9; const XMMRegister xmm_result5 = xmm10;
const XMMRegister xmm_from0 = xmm11; const XMMRegister xmm_from1 = xmm12; const XMMRegister xmm_from2 = xmm13; const XMMRegister xmm_from3 = xmm14; //the last one is xmm14. we have to preserve it on WIN64. const XMMRegister xmm_from4 = xmm3; //reuse xmm3~4. Because xmm_key_tmp0~1 are useless when loading input text const XMMRegister xmm_from5 = xmm4;
// k == 0 : generate code for key_128 // k == 1 : generate code for key_192 // k == 2 : generate code for key_256 for (int k = 0; k < 3; ++k) { //multi blocks starts here
__ align(OptoLoopAlignment);
__ BIND(L_multiBlock_loopTop[k]);
__ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
__ jcc(Assembler::less, L_singleBlockLoopTop[k]);
load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
//load, then increase counters
CTR_DoSix(movdqa, xmm_curr_counter);
inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]);
inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
CTR_DoSix(pxor, xmm_key_tmp0); //PXOR with Round 0 key
//load two ROUND_KEYs at a time for (int i = 1; i < rounds[k]; ) {
load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
CTR_DoSix(aesenc, xmm_key_tmp1);
i++; if (i != rounds[k]) {
CTR_DoSix(aesenc, xmm_key_tmp0);
} else {
CTR_DoSix(aesenclast, xmm_key_tmp0);
}
i++;
}
__ movdqu(Address(saved_encCounter_start, 0), xmm_result0); // 2. Perform pxor of the encrypted counter and plaintext Bytes.
__ pxor(xmm_result0, xmm_from0); // Also the encrypted counter is saved for next invocation.
constRegister from = c_rarg0; // source array address constRegister to = c_rarg1; // destination array address constRegister key = c_rarg2; // key array address constRegister rvec = c_rarg3; // r byte array initialized from initvector array address // and left with the results of the last encryption block #ifndef _WIN64 constRegister len_reg = c_rarg4; // src len (must be multiple of blocksize 16) #else const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 constRegister len_reg = r11; // pick the volatile windows register #endif
#ifdef _WIN64 // on win64, fill len_reg from stack position
__ movl(len_reg, len_mem); #else
__ push(len_reg); // Save #endif
__ push(rbx);
__ vzeroupper();
// Calculate number of rounds from key size: 44 for 10-rounds, 52 for 12-rounds, 60 for 14-rounds constRegister rounds = rbx;
__ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
const XMMRegister IV = xmm0; // Load IV and broadcast value to 512-bits
__ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);
// for decryption java expanded key ordering is rotated one position from what we want // so we start from 0x10 here and hit 0x00 last // we don't know if the key is aligned, hence not using load-execute form
load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
// for decryption the aesdeclast operation is always on key+0x00
__ aesdeclast(xmm_result, xmm_temp3);
__ movdqu(Address(to, 0), xmm_result); // store the result
__ xorptr(rax, rax); // return 0
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
__ enter(); // required for proper stackwalking of RuntimeStub frame
#ifdef _WIN64 // on win64, fill len_reg from stack position
__ movl(len_reg, len_mem); #else
__ push(len_reg); // Save #endif
const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), r10 /*rscratch*/); // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
offset += 0x10;
}
__ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec
// now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
__ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
__ cmpl(rax, 44);
__ jcc(Assembler::notEqual, L_key_192_256);
// 128 bit code follows here
__ movptr(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_loopTop_128);
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
__ pxor (xmm_result, xmm_temp); // xor with the current r vector
__ pxor (xmm_result, xmm_key0); // do the aes rounds for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
__ aesenc(xmm_result, as_XMMRegister(rnum));
}
__ aesenclast(xmm_result, xmm_key10);
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output // no need to store r to memory until we exit
__ addptr(pos, AESBlockSize);
__ subptr(len_reg, AESBlockSize);
__ jcc(Assembler::notEqual, L_loopTop_128);
__ BIND(L_exit);
__ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object
__ BIND(L_key_192_256); // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
__ cmpl(rax, 52);
__ jcc(Assembler::notEqual, L_key_256);
// 192-bit code follows here (could be changed to use more xmm registers)
__ movptr(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_loopTop_192);
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
__ pxor (xmm_result, xmm_temp); // xor with the current r vector
__ pxor (xmm_result, xmm_key0); // do the aes rounds for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
__ aesenc(xmm_result, as_XMMRegister(rnum));
}
__ aesenclast(xmm_result, xmm_key12);
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output // no need to store r to memory until we exit
__ addptr(pos, AESBlockSize);
__ subptr(len_reg, AESBlockSize);
__ jcc(Assembler::notEqual, L_loopTop_192);
__ jmp(L_exit);
__ BIND(L_key_256); // 256-bit code follows here (could be changed to use more xmm registers)
load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
__ movptr(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_loopTop_256);
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
__ pxor (xmm_result, xmm_temp); // xor with the current r vector
__ pxor (xmm_result, xmm_key0); // do the aes rounds for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
__ aesenc(xmm_result, as_XMMRegister(rnum));
}
load_key(xmm_temp, key, 0xe0, r10 /*rscratch*/);
__ aesenclast(xmm_result, xmm_temp);
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output // no need to store r to memory until we exit
__ addptr(pos, AESBlockSize);
__ subptr(len_reg, AESBlockSize);
__ jcc(Assembler::notEqual, L_loopTop_256);
__ jmp(L_exit);
return start;
}
// This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
// to hide instruction latency
//
// Arguments:
//
// Inputs:
//   c_rarg0   - source byte array address
//   c_rarg1   - destination byte array address
//   c_rarg2   - K (key) in little endian int array
//   c_rarg3   - r vector byte array address
//   c_rarg4   - input length
//
// Output:
//   rax       - input length
//
address StubGenerator::generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
assert(UseAES, "need AES instructions and misaligned SSE support");
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
address start = __ pc();
constRegister from = c_rarg0; // source array address constRegister to = c_rarg1; // destination array address constRegister key = c_rarg2; // key array address constRegister rvec = c_rarg3; // r byte array initialized from initvector array address // and left with the results of the last encryption block #ifndef _WIN64 constRegister len_reg = c_rarg4; // src len (must be multiple of blocksize 16) #else const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 constRegister len_reg = r11; // pick the volatile windows register #endif constRegister pos = rax;
__ enter(); // required for proper stackwalking of RuntimeStub frame
#ifdef _WIN64 // on win64, fill len_reg from stack position
__ movl(len_reg, len_mem); #else
__ push(len_reg); // Save #endif
__ push(rbx); // the java expanded key ordering is rotated one position from what we want // so we start from 0x10 here and hit 0x00 last const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), rbx /*rscratch*/); // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
offset += 0x10;
}
load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
// registers holding the four results in the parallelized loop const XMMRegister xmm_result0 = xmm0; const XMMRegister xmm_result1 = xmm2; const XMMRegister xmm_result2 = xmm3; const XMMRegister xmm_result3 = xmm4;
__ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec
__ xorptr(pos, pos);
// now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
__ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
__ cmpl(rbx, 52);
__ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
__ cmpl(rbx, 60);
__ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
// registers used in the non-parallelized loops // xmm register assignments for the loops below const XMMRegister xmm_result = xmm0; const XMMRegister xmm_prev_block_cipher_save = xmm2; const XMMRegister xmm_key11 = xmm3; const XMMRegister xmm_key12 = xmm4; const XMMRegister key_tmp = xmm4;
__ BIND(L_singleBlock_loopTopHead[k]); if (k == 1) {
__ addptr(rsp, 6 * wordSize);
} elseif (k == 2) {
__ addptr(rsp, 10 * wordSize);
}
__ cmpptr(len_reg, 0); // any blocks left??
__ jcc(Assembler::equal, L_exit);
__ BIND(L_singleBlock_loopTopHead2[k]); if (k == 1) {
load_key(xmm_key11, key, 0xb0, rbx /*rscratch*/); // 0xb0; 192-bit key goes up to 0xc0
load_key(xmm_key12, key, 0xc0, rbx /*rscratch*/); // 0xc0; 192-bit key goes up to 0xc0
} if (k == 2) {
load_key(xmm_key11, key, 0xb0, rbx /*rscratch*/); // 0xb0; 256-bit key goes up to 0xe0
}
__ align(OptoLoopAlignment);
__ BIND(L_singleBlock_loopTop[k]);
__ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
__ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
__ pxor(xmm_result, xmm_key_first); // do the aes dec rounds for (int rnum = 1; rnum <= 9 ; rnum++) {
__ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
} if (k == 1) {
__ aesdec(xmm_result, xmm_key11);
__ aesdec(xmm_result, xmm_key12);
} if (k == 2) {
__ aesdec(xmm_result, xmm_key11);
load_key(key_tmp, key, 0xc0, rbx /*rscratch*/);
__ aesdec(xmm_result, key_tmp);
load_key(key_tmp, key, 0xd0, rbx /*rscratch*/);
__ aesdec(xmm_result, key_tmp);
load_key(key_tmp, key, 0xe0, rbx /*rscratch*/);
__ aesdec(xmm_result, key_tmp);
}
__ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
__ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output // no need to store r to memory until we exit
__ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
__ addptr(pos, AESBlockSize);
__ subptr(len_reg, AESBlockSize);
__ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]); if (k != 2) {
__ jmp(L_exit);
}
} //for 128/192/256
__ BIND(L_exit);
__ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object
__ pop(rbx); #ifdef _WIN64
__ movl(rax, len_mem); #else
__ pop(rax); // return length #endif
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
// Utility routine for loading a 128-bit key word in little endian format.
// Emits two instructions into the stub being generated:
//   movdqu xmmdst, [key + offset]   -- unaligned 16-byte load of one round key
//   pshufb xmmdst, xmm_shuf_mask    -- permute the key bytes per the mask
// The caller supplies xmm_shuf_mask preloaded from key_shuffle_mask_addr()
// (see call sites above); presumably it byte-swaps the round key from the
// Java expanded-key layout into the order AES-NI instructions expect —
// confirm against the mask's definition, which is outside this view.
void StubGenerator::load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask) {
  // movdqu tolerates an unaligned source; the key array's alignment is not guaranteed.
  __ movdqu(xmmdst, Address(key, offset));
  __ pshufb(xmmdst, xmm_shuf_mask);
}
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge // context for the registers used, where all instructions below are using 128-bit mode // On EVEX without VL and BW, these instructions will all be AVX. if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
__ push(len); // Save
__ push(rbx);
__ vzeroupper();
__ xorptr(pos, pos);
// Calculate number of rounds based on key length(128, 192, 256):44 for 10-rounds, 52 for 12-rounds, 60 for 14-rounds
__ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
// Load Key shuf mask const XMMRegister xmm_key_shuf_mask = xmm31; // used temporarily to swap key bytes up front
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), rbx /*rscratch*/);
__ bind(Loop_start);
__ movq(rbx, len); // Divide length by 16 to convert it to number of blocks
__ shrq(len, 4);
__ shlq(rbx, 60);
__ jcc(Assembler::equal, NO_PARTS);
__ addq(len, 1); // Check if number of blocks is greater than or equal to 32 // If true, 512 bytes are processed at a time (code marked by label LOOP) // If not, 16 bytes are processed (code marked by REMAINDER label)
__ bind(NO_PARTS);
__ movq(rbx, len);
__ shrq(len, 5);
__ jcc(Assembler::equal, REMAINDER);
__ movl(r13, len); // Compute number of blocks that will be processed 512 bytes at a time // Subtract this from the total number of blocks which will then be processed by REMAINDER loop
__ shlq(r13, 5);
__ subq(rbx, r13); //Begin processing 512 bytes
__ bind(LOOP);
--> --------------------
--> maximum size reached
--> --------------------
¤ Dauer der Verarbeitung: 0.21 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung ist noch experimentell.