/* * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. *
*/
// Return point for a Java call if there's an exception thrown in
// Java code.  The exception is caught and transformed into a
// pending exception stored in JavaThread that can be tested from
// within the VM.
//
// Note: Usually the parameters are removed by the callee.  In case
// of an exception crossing an activation frame boundary, that is
// not the case if the callee is compiled code => need to set up the
// rsp.
//
// rax: exception oop
//
// NOTE(review): only the tail of this stub is visible in this chunk;
// the stub entry and the code that stores the pending exception are
// outside this view.

  // complete return to VM
  assert(StubRoutines::_call_stub_return_address != NULL,
         "_call_stub_return_address must have been generated before");
  __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));

  return start;
}
// Continuation point for runtime calls returning with a pending
// exception.  The pending exception check happened in the runtime
// or native call stub.  The pending exception in Thread is
// converted into a Java-level exception.
//
// Contract with Java-level exception handlers:
// rax: exception
// rdx: throwing pc
//
// NOTE: At entry of this stub, exception-pc must be on stack !!

  // Upon entry, the sp points to the return address returning into
  // Java (interpreted or compiled) code; i.e., the return address
  // becomes the throwing pc.
  //
  // Arguments pushed before the runtime call are still on the stack
  // but the exception handler will reset the stack pointer ->
  // ignore them.  A potential result in registers can be ignored as
  // well.
  //
  // NOTE(review): the stub's entry/prologue and everything after this
  // debug-only sanity check are outside this chunk.

#ifdef ASSERT
  // make sure this code is only executed if there is a pending exception
  {
    Label L;
    __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
    __ jcc(Assembler::notEqual, L);
    __ stop("StubRoutines::forward exception: no pending exception (1)");
    __ bind(L);
  }
#endif
// Support for intptr_t get_previous_sp()
//
// This routine is used to find the previous stack pointer for the
// caller.
address StubGenerator::generate_get_previous_sp() {
  StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
  address start = __ pc();

  // The caller's sp just before its call instruction is the current rsp
  // plus one slot for the return address that the call pushed.
  __ movptr(rax, rsp);
  __ addptr(rax, 8); // return address is at the top of the stack.
  __ ret(0);

  return start;
}
//----------------------------------------------------------------------------------------------------
// Support for void verify_mxcsr()
//
// This routine is used with -Xcheck:jni to verify that native
// JNI code does not return to Java code without restoring the
// MXCSR register to our expected state.

// NOTE(review): the body of the verify_mxcsr stub is not present in this
// chunk.  The code below is the interior of the verify_oop stub: the tail
// of its stack-layout enum, the oop sanity checks, and the error path.
// The enum's earlier entries (oop_to_verify, saved_rax, saved_r10), the
// Label declarations (exit, error) and the stub prologue that pushes the
// saved registers are all outside this view.

    // Before the call to MacroAssembler::debug(), see below.
    return_addr = 16 * wordSize,
    error_msg   = 17 * wordSize
};

  // get object
  __ movptr(rax, Address(rsp, oop_to_verify));

  // make sure object is 'reasonable'
  __ testptr(rax, rax);
  __ jcc(Assembler::zero, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
  if (UseZGC) {
    // Check if metadata bits indicate a bad oop
    __ testptr(rax, Address(r15_thread, ZThreadLocalData::address_bad_mask_offset()));
    __ jcc(Assembler::notZero, error);
  }
#endif

  // Check if the oop is in the right area of memory
  __ movptr(c_rarg2, rax);
  __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
  __ andptr(c_rarg2, c_rarg3);
  __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
  __ cmpptr(c_rarg2, c_rarg3);
  __ jcc(Assembler::notZero, error);

  // make sure klass is 'reasonable', which is not zero.
  __ load_klass(rax, rax, rscratch1);  // get klass
  __ testptr(rax, rax);
  __ jcc(Assembler::zero, error); // if klass is NULL it is broken

  // return if everything seems ok
  __ bind(exit);
  __ movptr(rax, Address(rsp, saved_rax));       // get saved rax back
  __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
  __ pop(c_rarg3);                               // restore c_rarg3
  __ pop(c_rarg2);                               // restore c_rarg2
  __ pop(r12);                                   // restore r12
  __ popf();                                     // restore flags
  __ ret(4 * wordSize);                          // pop caller saved stuff

  // handle errors
  __ bind(error);
  __ movptr(rax, Address(rsp, saved_rax));       // get saved rax back
  __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
  __ pop(c_rarg3);                               // get saved c_rarg3 back
  __ pop(c_rarg2);                               // get saved c_rarg2 back
  __ pop(r12);                                   // get saved r12 back
  __ popf();                                     // get saved flags off stack --
                                                 // will be ignored

  __ pusha();                                    // push registers
                                                 // (rip is already
                                                 // already pushed)

  // debug(char* msg, int64_t pc, int64_t regs[])
  // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and
  // pushed all the registers, so now the stack looks like:
  //   [tos +  0] 16 saved registers
  //   [tos + 16] return address
  // * [tos + 17] error message (char*)
  // * [tos + 18] object to verify (oop)
  // * [tos + 19] saved rax - saved by caller and bashed
  // * [tos + 20] saved r10 (rscratch1) - saved by caller
  // * = popped on exit

  __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
  __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
  __ movq(c_rarg2, rsp);                          // pass address of regs on stack
  __ mov(r12, rsp);                               // remember rsp
  __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
  __ andptr(rsp, -16);                            // align stack as required by ABI
  BLOCK_COMMENT("call MacroAssembler::debug");
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
  __ hlt();

  return start;
}
// Shuffle first three arg regs on Windows into Linux/Solaris locations. // // Outputs: // rdi - rcx // rsi - rdx // rdx - r8 // rcx - r9 // // Registers r9 and r10 are used to save rdi and rsi on Windows, which latter // are non-volatile. r9 and r10 should not be used by the caller. // void StubGenerator::setup_arg_regs(int nargs) { constRegister saved_rdi = r9; constRegister saved_rsi = r10;
assert(nargs == 3 || nargs == 4, "else fix"); #ifdef _WIN64
assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9, "unexpected argument registers"); if (nargs >= 4)
__ mov(rax, r9); // r9 is also saved_rdi
__ movptr(saved_rdi, rdi);
__ movptr(saved_rsi, rsi);
__ mov(rdi, rcx); // c_rarg0
__ mov(rsi, rdx); // c_rarg1
__ mov(rdx, r8); // c_rarg2 if (nargs >= 4)
__ mov(rcx, rax); // c_rarg3 (via rax) #else
assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx, "unexpected argument registers"); #endif
DEBUG_ONLY(_regs_in_thread = false;)
}
// This is used in places where r10 is a scratch register, and can // be adapted if r9 is needed also. void StubGenerator::setup_arg_regs_using_thread() { constRegister saved_r15 = r9; #ifdef _WIN64
__ mov(saved_r15, r15); // r15 is callee saved and needs to be restored
__ get_thread(r15_thread);
assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9, "unexpected argument registers");
__ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())), rdi);
__ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())), rsi);
void StubGenerator::restore_arg_regs_using_thread() {
assert(_regs_in_thread, "wrong call to restore_arg_regs"); constRegister saved_r15 = r9; #ifdef _WIN64
__ get_thread(r15_thread);
__ movptr(rsi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())));
__ movptr(rdi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())));
__ mov(r15, saved_r15); // r15 is callee saved and needs to be restored #endif
}
// Select the argument-register setup scheme based on the element type.
// Dispatches to setup_arg_regs (r9/r10 save scheme) for T_BYTE/T_SHORT,
// otherwise to setup_arg_regs_using_thread (saves through the thread).
void StubGenerator::setup_argument_regs(BasicType type) {
  if (type == T_BYTE || type == T_SHORT) {
    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                      // r9 and r10 may be used to save non-volatile registers
  } else {
    setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
                                   // r9 is used to save r15_thread
  }
}
// Restore argument registers saved by the matching setup_argument_regs call.
// Must use the same BasicType-based selection as setup_argument_regs.
void StubGenerator::restore_argument_regs(BasicType type) {
  if (type == T_BYTE || type == T_SHORT) {
    restore_arg_regs();
  } else {
    restore_arg_regs_using_thread();
  }
}
// ofs and limit are used for multi-block byte array.
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
//
// NOTE(review): only the head of this stub is present in this chunk; the
// SHA-1 compression body and the return are truncated away.  The original
// also had a token garble ("constchar"), fixed here.
address StubGenerator::generate_sha1_implCompress(bool multi_block, const char *name) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();
// Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
address StubGenerator::generate_pshuffle_byte_flip_mask_sha512() {
  __ align32();
  StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
  address start = __ pc();
  // NOTE(review): the emitted mask data (emit_data64 calls) and the
  // `return start;` are truncated away in this chunk.
// Code for 512-bit VBMI encoding.  Encodes 48 input bytes into 64
// output bytes.  We read 64 input bytes and ignore the last 16, so be
// sure not to read past the end of the input buffer.
//
// NOTE(review): this chunk shows only the interior of the base64
// encodeBlock stub.  The register aliases used below (length, source,
// dest, dp, isURL, encode_table) and the labels (L_not512, ...) are
// declared earlier in the function, outside this view, and the rest of
// the VBMI path appears to be elided after the store below.
if (VM_Version::supports_avx512_vbmi()) {
  __ cmpl(length, 64); // Do not overrun input buffer.
  __ jcc(Assembler::below, L_not512);

  // Select regular vs. URL-safe alphabet: isURL << 6 is used as a byte
  // offset into the encoding table area (presumably 64 bytes per flavour
  // — confirm against the table layout).
  __ shll(isURL, 6); // index into decode table based on isURL
  __ lea(encode_table, ExternalAddress(StubRoutines::x86::base64_encoding_table_addr()));
  __ addptr(encode_table, isURL);
  __ shrl(isURL, 6); // restore isURL

  // Put the input bytes into the proper lanes for writing, then
  // encode them.
  __ evpmultishiftqb(xmm0, xmm1, xmm0, Assembler::AVX_512bit);
  __ vpermb(xmm0, xmm0, xmm2, Assembler::AVX_512bit);

  // Write to destination
  __ evmovdquq(Address(dest, dp), xmm0, Assembler::AVX_512bit);
__ BIND(L_not512);

  if (VM_Version::supports_avx2()
      && VM_Version::supports_avx512vlbw()) {
    /*
    ** This AVX2 encoder is based off the paper at:
    **      https://dl.acm.org/doi/10.1145/3132709
    **
    ** We use AVX2 SIMD instructions to encode 24 bytes into 32
    ** output bytes.
    */

    // Lengths under 32 bytes are done with scalar routine
    __ cmpl(length, 31);
    __ jcc(Assembler::belowEqual, L_process3);

    // Set up supporting constant table data
    __ vmovdqu(xmm9, ExternalAddress(StubRoutines::x86::base64_avx2_shuffle_addr()), rax);
    // 6-bit mask for 2nd and 4th (and multiples) 6-bit values
    __ movl(rax, 0x0fc0fc00);
    __ vmovdqu(xmm1, ExternalAddress(StubRoutines::x86::base64_avx2_input_mask_addr()), rax);
    __ evpbroadcastd(xmm8, rax, Assembler::AVX_256bit);

    // Multiplication constant for "shifting" right by 6 and 10
    // bits
    __ movl(rax, 0x04000040);

    // For the first load, we mask off reading of the first 4
    // bytes into the register.  This is so we can get 4 3-byte
    // chunks into each lane of the register, avoiding having to
    // handle end conditions.  We then shuffle these bytes into a
    // specific order so that manipulation is easier.
    //
    // The initial read loads the XMM register like this:
    //
    // Lower 128-bit lane:
    // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
    // | XX | XX | XX | XX | A0 | A1 | A2 | B0 | B1 | B2 | C0 | C1 | C2 | D0 | D1 | D2 |
    // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
    //
    // Upper 128-bit lane:
    // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
    // | E0 | E1 | E2 | F0 | F1 | F2 | G0 | G1 | G2 | H0 | H1 | H2 | XX | XX | XX | XX |
    // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
    //
    // Where A0 is the first input byte, B0 is the fourth, etc.
    // The alphabetical significance denotes the 3 bytes to be
    // consumed and encoded into 4 bytes.
    //
    // We then shuffle the register so each 32-bit word contains
    // the sequence:
    //    A1 A0 A2 A1, B1 B0 B2 B1, etc.
    // Each of these byte sequences are then manipulated into 4
    // 6-bit values ready for encoding.
    //
    // If we focus on one set of 3-byte chunks, changing the
    // nomenclature such that A0 => a, A1 => b, and A2 => c, we
    // shuffle such that each 24-bit chunk contains:
    //
    //    b7 b6 b5 b4 b3 b2 b1 b0 | a7 a6 a5 a4 a3 a2 a1 a0 |
    //    c7 c6 c5 c4 c3 c2 c1 c0 | b7 b6 b5 b4 b3 b2 b1 b0
    //
    // We first mask off all but bits 4-9 and 16-21 (c5..c0 and
    // a5..a0) and shift them using a vector multiplication
    // operation (vpmulhuw) which effectively shifts c right by 6
    // bits and a right by 10 bits.  We similarly mask bits 10-15
    // (d5..d0) and 22-27 (b5..b0) and shift them left by 8 and 4
    // bits respectively.  This is done using vpmullw.  We end up
    // with 4 6-bit values, thus splitting the 3 input bytes,
    // ready for encoding:
    //    0 0 d5..d0 0 0 c5..c0 0 0 b5..b0 0 0 a5..a0
    //
    // For translation, we recognize that there are 5 distinct
    // ranges of legal Base64 characters as below:
    //
    //   +-------------+-------------+------------+
    //   | 6-bit value | ASCII range |   offset   |
    //   +-------------+-------------+------------+
    //   |    0..25    |    A..Z     |     65     |
    //   |   26..51    |    a..z     |     71     |
    //   |   52..61    |    0..9     |     -4     |
    //   |     62      |   + or -    | -19 or -17 |
    //   |     63      |   / or _    | -16 or 32  |
    //   +-------------+-------------+------------+
    //
    // We note that vpshufb does a parallel lookup in a
    // destination register using the lower 4 bits of bytes from a
    // source register.  If we use a saturated subtraction and
    // subtract 51 from each 6-bit value, bytes from [0,51]
    // saturate to 0, and [52,63] map to a range of [1,12].  We
    // distinguish the [0,25] and [26,51] ranges by assigning a
    // value of 13 for all 6-bit values less than 26.  We end up
    // with:
    //
    //   +-------------+-------------+------------+
    //   | 6-bit value |   Reduced   |   offset   |
    //   +-------------+-------------+------------+
    //   |    0..25    |     13      |     65     |
    //   |   26..51    |      0      |     71     |
    //   |   52..61    |    0..9     |     -4     |
    //   |     62      |     11      | -19 or -17 |
    //   |     63      |     12      | -16 or 32  |
    //   +-------------+-------------+------------+
    //
    // We then use a final vpshufb to add the appropriate offset,
    // translating the bytes.
    //
    // Load input bytes - only 28 bytes.  Mask the first load to
    // not load into the full register.
    __ vpmaskmovd(xmm1, xmm1, Address(source, start_offset, Address::times_1, -4), Assembler::AVX_256bit);

    // Move 3-byte chunks of input (12 bytes) into 16 bytes,
    // ordering by:
    //    1, 0, 2, 1; 4, 3, 5, 4; etc.
    // This groups 6-bit chunks for easy masking.
    __ vpshufb(xmm1, xmm1, xmm9, Assembler::AVX_256bit);

    __ addl(start_offset, 24);

    // Load masking register for first and third (and multiples)
    // 6-bit values.
    __ movl(rax, 0x003f03f0);
    __ evpbroadcastd(xmm6, rax, Assembler::AVX_256bit);
    // Multiplication constant for "shifting" left by 4 and 8 bits
    __ movl(rax, 0x01000010);
    __ evpbroadcastd(xmm5, rax, Assembler::AVX_256bit);

    // Shift output bytes 0 and 2 into proper lanes
    __ vpmulhuw(xmm2, xmm0, xmm7, Assembler::AVX_256bit);

    // Mask and shift output bytes 1 and 3 into proper lanes and
    // combine
    __ vpand(xmm0, xmm6, xmm1, Assembler::AVX_256bit);
    __ vpmullw(xmm0, xmm5, xmm0, Assembler::AVX_256bit);
    __ vpor(xmm0, xmm0, xmm2, Assembler::AVX_256bit);

    // Find out which are 0..25.  This indicates which input
    // values fall in the range of 'A'-'Z', which require an
    // additional offset (see comments above)
    __ vpcmpgtb(xmm2, xmm0, xmm3, Assembler::AVX_256bit);
    __ vpsubusb(xmm1, xmm0, xmm4, Assembler::AVX_256bit);
    __ vpsubb(xmm1, xmm1, xmm2, Assembler::AVX_256bit);

    // Shuffle the offsets based on the range calculation done
    // above.  This allows us to add the correct offset to the
    // 6-bit value corresponding to the range documented above.
    __ vpshufb(xmm1, xmm2, xmm1, Assembler::AVX_256bit);
    __ vpaddb(xmm0, xmm1, xmm0, Assembler::AVX_256bit);

    // Store the encoded bytes
    __ vmovdqu(Address(dest, dp), xmm0);
    __ addl(dp, 32);

    // Get next 32 bytes
    __ vmovdqu(xmm1, Address(source, start_offset, Address::times_1, -4));

    __ subl(length, 24);
    __ addl(start_offset, 24);

    // This logic is identical to the above, with only constant
    // register loads removed.  Shuffle the input, mask off 6-bit
    // chunks, shift them into place, then add the offset to
    // encode.
    __ vpshufb(xmm1, xmm1, xmm9, Assembler::AVX_256bit);
  // NOTE(review): this is a fragment of the scalar (three-bytes-at-a-time)
  // base64 encoding loop; the loads that populate rax, r13 and r15 happen
  // earlier in the function, outside this chunk, and the remainder of the
  // loop is truncated after the last instruction below.
  __ orl(rax, r13);
  // At this point, rax contains | byte1 | byte2 | byte0 | byte1

  // r13 has byte2 << 16 - need low-order 6 bits to translate.
  // This translated byte is the fourth output byte.
  __ shrl(r13, 16);
  __ andl(r13, 0x3f);

  // The high-order 6 bits of r15 (byte0) is translated.
  // The translated byte is the first output byte.
  __ shrl(r15, 10);

  // Extract high-order 4 bits of byte1 and low-order 2 bits of byte0.
  // This translated byte is the second output byte.
  __ shrl(rax, 4);
  __ movl(r10, rax);
  __ andl(rax, 0x3f);
--> --------------------
--> maximum size reached
--> --------------------
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.0.24Bemerkung:
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung ist noch experimentell.