/* * Copyright (c) 2019, 2021, Intel Corporation. All rights reserved. * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. *
*/
// NOTE(review): this is the tail of a GHASH stub-generation function whose
// beginning lies outside this chunk; the xmm_temp* register aliases, the
// L_exit label, and 'state'/'start' are declared earlier — verify against
// the full source.
//
// Fold the middle 128-bit partial product (xmm4) into the <xmm6:xmm3> pair.
__ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5
__ psrldq(xmm_temp4, 8); // shift xmm4 right by 64 bits
__ pslldq(xmm_temp5, 8); // shift xmm5 left by 64 bits
__ pxor(xmm_temp3, xmm_temp5);
// Register pair <xmm6:xmm3> now holds the 256-bit result of the
// carry-less multiplication of xmm0 by xmm1.
__ pxor(xmm_temp6, xmm_temp4);

// We shift the result of the multiplication by one bit position
// to the left to cope for the fact that the bits are reversed.
__ movdqu(xmm_temp7, xmm_temp3);
__ movdqu(xmm_temp8, xmm_temp6);
__ pslld(xmm_temp3, 1);
__ pslld(xmm_temp6, 1);
__ psrld(xmm_temp7, 31); // capture the bit shifted out of each dword
__ psrld(xmm_temp8, 31);
__ movdqu(xmm_temp9, xmm_temp7);
__ pslldq(xmm_temp8, 4); // align carried bits with the next-higher dword
__ pslldq(xmm_temp7, 4);
__ psrldq(xmm_temp9, 12); // carry crossing from the low to the high 128 bits
__ por(xmm_temp3, xmm_temp7);
__ por(xmm_temp6, xmm_temp8);
__ por(xmm_temp6, xmm_temp9);

//
// First phase of the reduction
//
// Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
// independently.
__ movdqu(xmm_temp7, xmm_temp3);
__ movdqu(xmm_temp8, xmm_temp3);
__ movdqu(xmm_temp9, xmm_temp3);
__ pslld(xmm_temp7, 31); // packed shift left by 31
__ pslld(xmm_temp8, 30); // packed shift left by 30
__ pslld(xmm_temp9, 25); // packed shift left by 25
__ pxor(xmm_temp7, xmm_temp8); // xor the shifted versions
__ pxor(xmm_temp7, xmm_temp9);
__ movdqu(xmm_temp8, xmm_temp7);
__ pslldq(xmm_temp7, 12);
__ psrldq(xmm_temp8, 4); // xmm8 is consumed by the second phase below
__ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete

//
// Second phase of the reduction
//
// Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
// shift operations.
__ movdqu(xmm_temp2, xmm_temp3);
__ movdqu(xmm_temp4, xmm_temp3);
__ movdqu(xmm_temp5, xmm_temp3);
__ psrld(xmm_temp2, 1); // packed shift right by 1
__ psrld(xmm_temp4, 2); // packed shift right by 2
__ psrld(xmm_temp5, 7); // packed shift right by 7
__ pxor(xmm_temp2, xmm_temp4); // xor the shifted versions
__ pxor(xmm_temp2, xmm_temp5);
__ pxor(xmm_temp2, xmm_temp8);
__ pxor(xmm_temp3, xmm_temp2);
__ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6

__ bind(L_exit);
__ pshufb(xmm_temp6, xmm_temp10); // byte swap 16-byte result
__ movdqu(Address(state, 0), xmm_temp6); // store the result
__ pop(rbx);
__ leave();
__ ret(0);
return start;
}
// Ghash single and multi block operations using AVX instructions
//
// NOTE(review): this function body looks incompletely extracted — the label
// declarations (BEGIN_PROCESS, PROCESS_8_BLOCKS, ONE_BLK_INIT, SAVE_STATE,
// EXIT_GHASH, ...), the register aliases (htbl, blocks, input_data, state,
// data, tmp0..tmp3, bswap_mask, ...), the stub prologue ('address start'),
// most of the 8-block multiply/reduction sequence, and the final
// 'return start;' are not visible in this chunk. Code is left byte-identical;
// restore the missing pieces from the original source.
address StubGenerator::generate_avx_ghash_processBlocks() {
__ align(CodeEntryAlignment);
// Check if Hashtable (1*16) has been already generated.
// For anything less than 8 blocks, we generate only the first power of H.
__ movdqu(tmp2, Address(htbl, 1 * 16));
__ ptest(tmp2, tmp2); // ZF set iff the table entry is still all-zero
__ jcc(Assembler::notZero, BEGIN_PROCESS);
__ call(GENERATE_HTBL_1_BLK, relocInfo::none);
__ cmpl(blocks, 8);
// If we have 8 blocks or more data, then generate remaining powers of H.
__ jcc(Assembler::below, ONE_BLK_INIT);
__ movdqu(tmp2, Address(htbl, 8 * 16));
__ ptest(tmp2, tmp2);
__ jcc(Assembler::notZero, PROCESS_8_BLOCKS);
__ call(GENERATE_HTBL_8_BLKS, relocInfo::none);
// Do 8 multiplies followed by a reduction, processing 8 blocks of data at a
// time. Each block = 16 bytes.
__ bind(PROCESS_8_BLOCKS);
__ subl(blocks, 8);
__ movdqu(bswap_mask, ExternalAddress(ghash_byte_swap_mask_addr()), rbx /*rscratch*/);
__ movdqu(data, Address(input_data, 16 * 7));
__ vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
// Loading 1*16, as the calculated powers of H required start at that location.
__ movdqu(xmm15, Address(htbl, 1 * 16));
// Perform carry-less multiplication of (H*2, data block #7).
__ vpclmulhqlqdq(tmp2, data, xmm15); // a0 * b1
__ vpclmulldq(tmp0, data, xmm15); // a0 * b0
__ vpclmulhdq(tmp1, data, xmm15); // a1 * b1
__ vpclmullqhqdq(tmp3, data, xmm15); // a1 * b0
__ vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)
// We have the two 128-bit partially accumulated multiplication results in
// tmp0:tmp1, with the higher 128 bits in tmp1 and the lower 128 bits in the
// corresponding tmp0. Follows the Shift-XOR reduction described in
// Gueron-Kounavis, May 2010.
__ bind(BLOCK8_REDUCTION);
// First phase of the reduction
__ vpslld(xmm8, tmp0, 31, Assembler::AVX_128bit); // packed shift left by 31
__ vpslld(xmm9, tmp0, 30, Assembler::AVX_128bit); // packed shift left by 30
__ vpslld(xmm10, tmp0, 25, Assembler::AVX_128bit); // packed shift left by 25
// xor the shifted versions
__ vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
__ vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
// Since this is one block operation we will only use H * 2, i.e. the first
// power of H.
__ bind(ONE_BLK_INIT);
__ movdqu(tmp0, Address(htbl, 1 * 16));
__ movdqu(bswap_mask, ExternalAddress(ghash_byte_swap_mask_addr()), rbx /*rscratch*/);
// Do one (128 bit x 128 bit) carry-less multiplication at a time followed
// by a reduction.
__ bind(PROCESS_1_BLOCK);
__ cmpl(blocks, 0);
__ jcc(Assembler::equal, SAVE_STATE); // no blocks left: store state and exit
__ subl(blocks, 1);
__ movdqu(data, Address(input_data, 0));
__ vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
// gfmul(H*2, state)
__ vpxor(state, state, data, Assembler::AVX_128bit);
__ call(GFMUL, relocInfo::none);
__ addptr(input_data, 16); // advance to the next 16-byte block
__ jmp(PROCESS_1_BLOCK);
// Zero out xmm registers used for Htbl storage.
__ bind(EXIT_GHASH);
__ vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
__ vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit);
__ vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit);
__ vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit);
}
// Multiply two 128-bit numbers resulting in a 256-bit value.
// Result of the multiplication followed by reduction is stored in state.
// NOTE(review): the definition belonging to this comment was fused into this
// single comment line by the extraction and its body is missing from this
// chunk — restore from the original source:
//   void StubGenerator::gfmul(XMMRegister tmp0, XMMRegister state) {
//     const XMMRegister tmp1 = xmm4;
//     const XMMRegister tmp2 = xmm5;
//     const XMMRegister tmp3 = xmm6;
//     const XMMRegister tmp4 = xmm7;
// This method takes the subkey after expansion as input and generates 1 * 16 power of subkey H. // The power of H is used in reduction process for one block ghash void StubGenerator::generateHtbl_one_block(Register htbl, Register rscratch) { const XMMRegister t = xmm13;
// load the original subkey hash
__ movdqu(t, Address(htbl, 0)); // shuffle using long swap mask
__ movdqu(xmm10, ExternalAddress(ghash_long_swap_mask_addr()), rscratch);
__ vpshufb(t, t, xmm10, Assembler::AVX_128bit);
//Adding p(x)<<1 to xmm5 which holds the reduction polynomial
__ vpxor(t, t, xmm5, Assembler::AVX_128bit);
__ movdqu(Address(htbl, 1 * 16), t); // H * 2
__ ret(0);
}
// This method takes the subkey after expansion as input and generates the
// remaining powers of subkey H. The power of H is used in the reduction
// process for eight-block ghash.
// NOTE(review): the definition header was fused into the comment line by the
// extraction and the rest of the function body is missing from this chunk —
// restore from the original source:
//   void StubGenerator::generateHtbl_eight_blocks(Register htbl) {
//     const XMMRegister t = xmm13;
//     const XMMRegister tmp0 = xmm1;
// The 'Label GFMUL;' below belongs inside that function.
Label GFMUL;
// NOTE(review): the following text is German website-disclaimer boilerplate
// that was accidentally embedded in this source file during extraction; it is
// not code and should be removed. Translation:
// "The information on this website has been carefully compiled to the best of
//  our knowledge. However, neither completeness, nor correctness, nor quality
//  of the information provided is guaranteed.
//  Remark: the colored syntax display is still experimental."