/* * Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2022 SAP SE. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. *
*/
// Declaration and definition of StubGenerator (no .hpp file). // For a more detailed description of the stub routine structure // see the comment in stubRoutines.hpp.
// Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later. // FIXME: why not simply use SP+frame::top_ijava_frame_size?
__ addi(r_top_of_arguments_addr,
R1_SP, frame::top_ijava_frame_abi_size);
__ add(r_top_of_arguments_addr,
r_top_of_arguments_addr, r_frame_alignment_in_bytes);
// any arguments to copy?
__ cmpdi(CCR0, r_arg_argument_count, 0);
__ beq(CCR0, arguments_copied);
// prepare loop and copy arguments in reverse order
{ // init CTR with arg_argument_count
__ mtctr(r_arg_argument_count);
// let r_argumentcopy_addr point to last outgoing Java arguments P
__ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
// let r_argument_addr point to last incoming java argument
__ add(r_argument_addr,
r_arg_argument_addr, r_argument_size_in_bytes);
__ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
// now loop while CTR > 0 and copy arguments
{
Label next_argument;
__ bind(next_argument);
// initialize call_stub locals (step 2) // now save tos as arguments_tos_address
__ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
// Set R15_prev_state to 0 for simplifying checks in callee.
__ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R11_scratch1); // Stack on entry to frame manager / native entry: // // F0 [TOP_IJAVA_FRAME_ABI] // alignment (optional) // [outgoing Java arguments] // [ENTRY_FRAME_LOCALS] // F1 [C_FRAME] // ... //
// global toc register
__ load_const_optimized(R29_TOC, MacroAssembler::global_toc(), R11_scratch1); // Remember the senderSP so we interpreter can pop c2i arguments off of the stack // when called via a c2i.
// Pass initial_caller_sp to framemanager.
__ mr(R21_sender_SP, R1_SP);
// Do a light-weight C-call here, r_new_arg_entry holds the address // of the interpreter entry point (frame manager or native entry) // and save runtime-value of LR in return_address.
assert(r_new_arg_entry != tos && r_new_arg_entry != R19_method && r_new_arg_entry != R16_thread, "trashed r_new_arg_entry");
return_address = __ call_stub(r_new_arg_entry);
}
{
BLOCK_COMMENT("Returned from frame manager or native entry."); // Returned from frame manager or native entry. // Now pop frame, process result, and return to caller.
// Stack on exit from frame manager / native entry: // // F0 [ABI] // ... // [ENTRY_FRAME_LOCALS] // F1 [C_FRAME] // ... // // Just pop the topmost frame ... //
// Reload some volatile registers which we've spilled before the call // to frame manager / native entry. // Access all locals via frame pointer, because we know nothing about // the topmost frame's size.
__ ld(r_entryframe_fp, _abi0(callers_sp), R1_SP);
assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
__ ld(r_arg_result_addr,
_entry_frame_locals_neg(result_address), r_entryframe_fp);
__ ld(r_arg_result_type,
_entry_frame_locals_neg(result_type), r_entryframe_fp);
__ ld(r_cr, _abi0(cr), r_entryframe_fp);
__ ld(r_lr, _abi0(lr), r_entryframe_fp);
// pop frame and restore non-volatiles, LR and CR
__ mr(R1_SP, r_entryframe_fp);
__ pop_cont_fastpath();
__ mtcr(r_cr);
__ mtlr(r_lr);
// Store result depending on type. Everything that is not // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
__ cmpwi(CCR0, r_arg_result_type, T_OBJECT);
__ cmpwi(CCR1, r_arg_result_type, T_LONG);
__ cmpwi(CCR5, r_arg_result_type, T_FLOAT);
__ cmpwi(CCR6, r_arg_result_type, T_DOUBLE);
// case T_OBJECT:
__ bind(ret_is_object);
__ std(R3_RET, 0, r_arg_result_addr);
__ blr(); // return to caller
// case T_LONG:
__ bind(ret_is_long);
__ std(R3_RET, 0, r_arg_result_addr);
__ blr(); // return to caller
// case T_FLOAT:
__ bind(ret_is_float);
__ stfs(F1_RET, 0, r_arg_result_addr);
__ blr(); // return to caller
// case T_DOUBLE:
__ bind(ret_is_double);
__ stfd(F1_RET, 0, r_arg_result_addr);
__ blr(); // return to caller
}
return start;
}
// Return point for a Java call if there's an exception thrown in // Java code. The exception is caught and transformed into a // pending exception stored in JavaThread that can be tested from // within the VM. //
address generate_catch_exception() {
  StubCodeMark mark(this, "StubRoutines", "catch_exception");

  // Entry point of the stub; this is the address handed back to the caller.
  address start = __ pc();

  // Registers read by the generated code:
  //   R16_thread - base register for the JavaThread field stores below
  //   R3_ARG1    - value recorded as the thread's pending exception
  //   R4_ARG2    - address moved into LR and returned to (call stub)
  //
  // NOTE(review): exception_file and exception_line are not declared anywhere
  // in this function as it stands. They presumably name scratch registers
  // preloaded with the source file / line of this stub - confirm and restore
  // their declarations and loads.

  // Record the exception as pending in the thread.
  __ std(R3_ARG1, in_bytes(JavaThread::pending_exception_offset()), R16_thread);
  // store into `char *'
  __ std(exception_file, in_bytes(JavaThread::exception_file_offset()), R16_thread);
  // store into `int'
  __ stw(exception_line, in_bytes(JavaThread::exception_line_offset()), R16_thread);

  // complete return to VM
  assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");

  // continue in call stub
  __ mtlr(R4_ARG2);
  __ blr();

  return start;
}
// Continuation point for runtime calls returning with a pending // exception. The pending exception check happened in the runtime // or native call stub. The pending exception in Thread is // converted into a Java-level exception. // // Read: // // LR: The pc the runtime library callee wants to return to. // Since the exception occurred in the callee, the return pc // from the point of view of Java is the exception pc. // thread: Needed for method handles. // // Invalidate: // // volatile registers (except below). // // Update: // // R4_ARG2: exception // // (LR is unchanged and is live out). //
address generate_forward_exception() {
  StubCodeMark mark(this, "StubRoutines", "forward_exception");
  address start = __ pc();

  if (VerifyOops) {
    // Get pending exception oop.
    __ ld(R3_ARG1,
          in_bytes(Thread::pending_exception_offset()),
          R16_thread);
    // Make sure that this code is only executed if there is a pending exception.
    {
      Label L;
      __ cmpdi(CCR0, R3_ARG1, 0);
      __ bne(CCR0, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
    __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
  }

  // Save LR/CR and copy exception pc (LR) into R4_ARG2.
  __ save_LR_CR(R4_ARG2);
  __ push_frame_reg_args(0, R0);
  // Find exception handler.
  __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                                   SharedRuntime::exception_handler_for_return_address),
                  R16_thread,
                  R4_ARG2);
  // Copy handler's address.
  __ mtctr(R3_RET);
  __ pop_frame();
  __ restore_LR_CR(R0);

  // Set up the arguments for the exception handler:
  //  - R3_ARG1: exception oop
  //  - R4_ARG2: exception pc.

  // Load pending exception oop.
  // This load must happen after the leaf call: the call returns the handler
  // address in R3_RET (the same R3 that R3_ARG1 names), so whatever R3_ARG1
  // held before the call is gone. Without VerifyOops nothing had loaded it
  // at all.
  __ ld(R3_ARG1,
        in_bytes(Thread::pending_exception_offset()),
        R16_thread);

  // The exception pc is the return address in the caller.
  // Must load it into R4_ARG2.
  __ mflr(R4_ARG2);

#ifdef ASSERT
  // Make sure exception is set.
  {
    Label L;
    __ cmpdi(CCR0, R3_ARG1, 0);
    __ bne(CCR0, L);
    __ stop("StubRoutines::forward exception: no pending exception (2)");
    __ bind(L);
  }
#endif

  // Clear the pending exception.
  __ li(R0, 0);
  __ std(R0,
         in_bytes(Thread::pending_exception_offset()),
         R16_thread);
  // Jump to exception handler.
  __ bctr();

  return start;
}
#undef __ #define __ masm-> // Continuation point for throwing of implicit exceptions that are // not handled in the current activation. Fabricates an exception // oop and initiates normal exception dispatching in this // frame. Only callee-saved registers are preserved (through the // normal register window / RegisterMap handling). If the compiler // needs all registers to be preserved between the fault point and // the exception handler then it must assume responsibility for that // in AbstractCompiler::continuation_for_implicit_null_exception or // continuation_for_implicit_division_by_zero_exception. All other // implicit exceptions (e.g., NullPointerException or // AbstractMethodError on entry) are either at call sites or // otherwise assume that stack unwinding will be initiated, so // caller saved registers were assumed volatile in the compiler. // // Note that we generate only this stub into a RuntimeStub, because // it needs to be properly traversed and ignored during GC, so we // change the meaning of the "__" macro within this method. // // Note: the routine set_pc_not_at_call_for_caller in // SharedRuntime.cpp requires that this code be generated into a // RuntimeStub.
address generate_throw_exception(constchar* name, address runtime_entry, bool restore_saved_exception_pc, Register arg1 = noreg, Register arg2 = noreg) {
CodeBuffer code(name, 1024 DEBUG_ONLY(+ 512), 0);
MacroAssembler* masm = new MacroAssembler(&code);
OopMapSet* oop_maps = new OopMapSet(); int frame_size_in_bytes = frame::abi_reg_args_size;
OopMap* map = new OopMap(frame_size_in_bytes / sizeof(jint), 0);
address start = __ pc();
__ save_LR_CR(R11_scratch1);
// Push a frame.
__ push_frame_reg_args(0, R11_scratch1);
address frame_complete_pc = __ pc();
if (restore_saved_exception_pc) {
__ unimplemented("StubGenerator::throw_exception with restore_saved_exception_pc");
}
// Note that we always have a runtime stub frame on the top of // stack by this point. Remember the offset of the instruction // whose address will be moved to R11_scratch1.
address gc_map_pc = __ get_PC_trash_LR(R11_scratch1);
// Set an oopmap for the call site.
oop_maps->add_gc_map((int)(gc_map_pc - start), map);
__ reset_last_Java_frame();
#ifdef ASSERT // Make sure that this code is only executed if there is a pending // exception.
{
Label L;
__ ld(R0,
in_bytes(Thread::pending_exception_offset()),
R16_thread);
__ cmpdi(CCR0, R0, 0);
__ bne(CCR0, L);
__ stop("StubRoutines::throw_exception: no pending exception");
__ bind(L);
} #endif
// Procedure for large arrays (uses data cache block zero instruction).
Label dwloop, fast, fastloop, restloop, lastdword, done; int cl_size = VM_Version::L1_data_cache_line_size(); int cl_dwords = cl_size >> 3; int cl_dwordaddr_bits = exact_log2(cl_dwords); int min_dcbz = 2; // Needs to be positive, apply dcbz only to at least min_dcbz cache lines.
// Clear up to 128byte boundary if long enough, dword_cnt=(16-(base>>3))%16.
__ dcbtst(base_ptr_reg); // Indicate write access to first cache line ...
__ andi(tmp2_reg, cnt_dwords_reg, 1); // to check if number of dwords is even.
__ srdi_(tmp1_reg, cnt_dwords_reg, 1); // number of double dwords
__ load_const_optimized(zero_reg, 0L); // Use as zero register.
__ cmpdi(CCR1, tmp2_reg, 0); // cnt_dwords even?
__ beq(CCR0, lastdword); // size <= 1
__ mtctr(tmp1_reg); // Speculatively preload counter for rest loop (>0).
__ cmpdi(CCR0, cnt_dwords_reg, (min_dcbz+1)*cl_dwords-1); // Big enough to ensure >=min_dcbz cache lines are included?
__ neg(tmp1_reg, base_ptr_reg); // bit 0..58: bogus, bit 57..60: (16-(base>>3))%16, bit 61..63: 000
__ blt(CCR0, restloop); // Too small. (<31=(2*cl_dwords)-1 is sufficient, but bigger performs better.)
__ rldicl_(tmp1_reg, tmp1_reg, 64-3, 64-cl_dwordaddr_bits); // Extract number of dwords to 128byte boundary=(16-(base>>3))%16.
__ beq(CCR0, fast); // already 128byte aligned
__ mtctr(tmp1_reg); // Set ctr to hit 128byte boundary (0<ctr<cnt).
__ subf(cnt_dwords_reg, tmp1_reg, cnt_dwords_reg); // rest (>0 since size>=256-8)
// Clear in first cache line dword-by-dword if not already 128byte aligned.
__ bind(dwloop);
__ std(zero_reg, 0, base_ptr_reg); // Clear 8byte aligned block.
__ addi(base_ptr_reg, base_ptr_reg, 8);
__ bdnz(dwloop);
// clear 128byte blocks
__ bind(fast);
__ srdi(tmp1_reg, cnt_dwords_reg, cl_dwordaddr_bits); // loop count for 128byte loop (>0 since size>=256-8)
__ andi(tmp2_reg, cnt_dwords_reg, 1); // to check if rest even
#if !defined(PRODUCT) // Wrapper which calls oopDesc::is_oop_or_null() // Only called by MacroAssembler::verify_oop staticvoid verify_oop_helper(constchar* message, oopDesc* o) { if (!oopDesc::is_oop_or_null(o)) {
fatal("%s. oop: " PTR_FORMAT, message, p2i(o));
}
++ StubRoutines::_verify_oop_count;
} #endif
// Return address of code to be called from code generated by // MacroAssembler::verify_oop. // // Don't generate, rather use C++ code.
address generate_verify_oop() { // this is actually a `FunctionDescriptor*'.
address start = 0;
// -XX:+OptimizeFill : convert fill/copy loops into intrinsic // // The code is implemented(ported from sparc) as we believe it benefits JVM98, however // tracing(-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all! // // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition // for turning on loop predication optimization, and hence the behavior of "array range check" // and "loop invariant check" could be influenced, which potentially boosted JVM98. // // Generate stub for disjoint short fill. If "aligned" is true, the // "to" address is assumed to be heapword aligned. // // Arguments for generated stub: // to: R3_ARG1 // value: R4_ARG2 // count: R5_ARG3 treated as signed //
address generate_fill(BasicType t, bool aligned, constchar* name) {
StubCodeMark mark(this, "StubRoutines", name);
address start = __ function_entry();
constRegister to = R3_ARG1; // source array address constRegister value = R4_ARG2; // fill value constRegister count = R5_ARG3; // elements count constRegister temp = R6_ARG4; // temp register
//assert_clean_int(count, O3); // Make sure 'count' is clean int.
int shift = -1; switch (t) { case T_BYTE:
shift = 2; // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
__ rldimi(value, value, 8, 48); // 8 bit -> 16 bit
__ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
__ blt(CCR0, L_fill_elements);
__ rldimi(value, value, 16, 32); // 16 bit -> 32 bit break; case T_SHORT:
shift = 1; // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
__ rldimi(value, value, 16, 32); // 16 bit -> 32 bit
__ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
__ blt(CCR0, L_fill_elements); break; case T_INT:
shift = 0;
__ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
__ blt(CCR0, L_fill_4_bytes); break; default: ShouldNotReachHere();
}
if (!aligned && (t == T_BYTE || t == T_SHORT)) { // Align source address at 4 bytes address boundary. if (t == T_BYTE) { // One byte misalignment happens only for byte arrays.
__ andi_(temp, to, 1);
__ beq(CCR0, L_skip_align1);
__ stb(value, 0, to);
__ addi(to, to, 1);
__ addi(count, count, -1);
__ bind(L_skip_align1);
} // Two bytes misalignment happens only for byte and short (char) arrays.
__ andi_(temp, to, 2);
__ beq(CCR0, L_skip_align2);
__ sth(value, 0, to);
__ addi(to, to, 2);
__ addi(count, count, -(1 << (shift - 1)));
__ bind(L_skip_align2);
}
if (!aligned) { // Align to 8 bytes, we know we are 4 byte aligned to start.
__ andi_(temp, to, 7);
__ beq(CCR0, L_fill_32_bytes);
__ stw(value, 0, to);
__ addi(to, to, 4);
__ addi(count, count, -(1 << shift));
__ bind(L_fill_32_bytes);
}
__ li(temp, 8<<shift); // Prepare for 32 byte loop. // Clone bytes int->long as above.
__ rldimi(value, value, 32, 0); // 32 bit -> 64 bit
// // Length is too short, just fill 8 bytes at a time. //
Label L_fill_8_bytes_loop;
__ bind(L_fill_8_bytes_loop);
__ std(value, 0, to);
__ addic_(count, count, -(2 << shift));
__ addi(to, to, 8);
__ bge(CCR0, L_fill_8_bytes_loop);
// Generate overlap test for array copy stubs. // // Input: // R3_ARG1 - from // R4_ARG2 - to // R5_ARG3 - element count // void array_overlap_test(address no_overlap_target, int log2_elem_size) { Register tmp1 = R6_ARG4; Register tmp2 = R7_ARG5;
assert_positive_int(R5_ARG3);
__ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
__ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
__ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
__ cmpld(CCR1, tmp1, tmp2);
__ crnand(CCR0, Assembler::less, CCR1, Assembler::less); // Overlaps if Src before dst and distance smaller than size. // Branch to forward copy routine otherwise (within range of 32kB).
__ bc(Assembler::bcondCRbiIs1, Assembler::bi0(CCR0, Assembler::less), no_overlap_target);
// need to copy backwards
}
// This is the common error-exit stub for UnsafeCopyMemory.
address generate_unsafecopy_common_error_exit() {
  // Address of the first emitted instruction; returned to the caller.
  address exit_entry = __ pc();
  Register dscr_scratch = R6_ARG4;

  // A copy stub has probably changed the DSCR value, so write the VM's value
  // (VM_Version::_dscr_val) back - only where the CPU supports DSCR access.
  if (VM_Version::has_mfdscr()) {
    __ load_const_optimized(dscr_scratch, VM_Version::_dscr_val);
    __ mtdscr(dscr_scratch);
  }

  // return 0
  __ li(R3_RET, 0);
  __ blr();

  return exit_entry;
}
// The guideline in the implementations of generate_disjoint_xxx_copy // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with // single instructions, but to avoid alignment interrupts (see subsequent // comment). Furthermore, we try to minimize misaligned access, even // though they cause no alignment interrupt. // // In Big-Endian mode, the PowerPC architecture requires implementations to // handle automatically misaligned integer halfword and word accesses, // word-aligned integer doubleword accesses, and word-aligned floating-point // accesses. Other accesses may or may not generate an Alignment interrupt // depending on the implementation. // Alignment interrupt handling may require on the order of hundreds of cycles, // so every effort should be made to avoid misaligned memory values. // // // Generate stub for disjoint byte copy. If "aligned" is true, the // "from" and "to" addresses are assumed to be heapword aligned. // // Arguments for generated stub: // from: R3_ARG1 // to: R4_ARG2 // count: R5_ARG3 treated as signed //
address generate_disjoint_byte_copy(bool aligned, constchar * name) {
StubCodeMark mark(this, "StubRoutines", name);
address start = __ function_entry();
assert_positive_int(R5_ARG3);
// Don't try anything fancy if arrays don't have many elements.
__ li(tmp3, 0);
__ cmpwi(CCR0, R5_ARG3, 17);
__ ble(CCR0, l_6); // copy 4 at a time
if (!aligned) {
__ xorr(tmp1, R3_ARG1, R4_ARG2);
__ andi_(tmp1, tmp1, 3);
__ bne(CCR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
// Copy elements if necessary to align to 4 bytes.
__ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
__ andi_(tmp1, tmp1, 3);
__ beq(CCR0, l_2);
// copy 8 elements at a time
__ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
__ andi_(tmp1, tmp2, 7);
__ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8
// copy a 2-element word if necessary to align to 8 bytes
__ andi_(R0, R3_ARG1, 7);
__ beq(CCR0, l_7);
__ bind(l_8); // Use unrolled version for mass copying (copy 32 elements a time) // Load feeding store gets zero latency on Power6, however not on Power5. // Therefore, the following sequence is made for the good of both.
__ ld(tmp1, 0, R3_ARG1);
__ ld(tmp2, 8, R3_ARG1);
__ ld(tmp3, 16, R3_ARG1);
__ ld(tmp4, 24, R3_ARG1);
__ std(tmp1, 0, R4_ARG2);
__ std(tmp2, 8, R4_ARG2);
__ std(tmp3, 16, R4_ARG2);
__ std(tmp4, 24, R4_ARG2);
__ addi(R3_ARG1, R3_ARG1, 32);
__ addi(R4_ARG2, R4_ARG2, 32);
__ bdnz(l_8);
} else { // Processor supports VSX, so use it to mass copy.
// Prefetch the data into the L2 cache.
__ dcbt(R3_ARG1, 0);
// If supported set DSCR pre-fetch to deepest. if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32-byte. Not 16-byte align as // loop contains < 8 instructions that fit inside a single // i-cache sector.
__ align(32);
__ bind(l_10); // Use loop with VSX load/store instructions to // copy 32 elements a time.
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
__ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32
__ bdnz(l_10); // Dec CTR and loop if not zero.
// Generate stub for conjoint byte copy. If "aligned" is true, the // "from" and "to" addresses are assumed to be heapword aligned. // // Arguments for generated stub: // from: R3_ARG1 // to: R4_ARG2 // count: R5_ARG3 treated as signed //
address generate_conjoint_byte_copy(bool aligned, const char * name) {
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ function_entry();
  assert_positive_int(R5_ARG3);

  // Scratch register carrying the byte in flight. R6_ARG4 is the register the
  // other copy helpers in this file use for tmp1.
  Register tmp1 = R6_ARG4;

  // Branch to the forward copy when the regions do not overlap
  // (element size 1 byte => log2 size 0).
  // NOTE(review): nooverlap_target is not declared in this function as it
  // stands. It presumably holds the entry of the matching disjoint byte copy
  // stub - confirm and restore its definition.
  array_overlap_test(nooverlap_target, 0);

  // Do reverse copy. We assume the case of actual overlap is rare enough
  // that we don't have to optimize it.
  Label l_1, l_2;
  {
    // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
    UnsafeCopyMemoryMark ucmm(this, !aligned, false);

    // Enter at the decrement so that the first store follows a load.
    __ b(l_2);

    __ bind(l_1);
    __ stbx(tmp1, R4_ARG2, R5_ARG3);     // to[count] = byte loaded below

    __ bind(l_2);
    __ addic_(R5_ARG3, R5_ARG3, -1);     // --count, result recorded in CCR0
    __ lbzx(tmp1, R3_ARG1, R5_ARG3);     // tmp1 = from[count]
    __ bge(CCR0, l_1);                   // keep going while count >= 0
  }

  __ li(R3_RET, 0); // return 0
  __ blr();

  return start;
}
// Generate stub for disjoint short copy. If "aligned" is true, the // "from" and "to" addresses are assumed to be heapword aligned. // // Arguments for generated stub: // from: R3_ARG1 // to: R4_ARG2 // elm.count: R5_ARG3 treated as signed // // Strategy for aligned==true: // // If length <= 9: // 1. copy 2 elements at a time (l_6) // 2. copy last element if original element count was odd (l_1) // // If length > 9: // 1. copy 4 elements at a time until less than 4 elements are left (l_7) // 2. copy 2 elements at a time until less than 2 elements are left (l_6) // 3. copy last element if one was left in step 2. (l_1) // // // Strategy for aligned==false: // // If length <= 9: same as aligned==true case, but NOTE: load/stores // can be unaligned (see comment below) // // If length > 9: // 1. continue with step 6. if the alignment of from and to mod 4 // is different. // 2. align from and to to 4 bytes by copying 1 element if necessary // 3. at l_2 from and to are 4 byte aligned; continue with // 5. if they cannot be aligned to 8 bytes because they have // got different alignment mod 8. // 4. at this point we know that both, from and to, have the same // alignment mod 8, now copy one element if necessary to get // 8 byte alignment of from and to. // 5. copy 4 elements at a time until less than 4 elements are // left; depending on step 3. all load/stores are aligned or // either all loads or all stores are unaligned. // 6. copy 2 elements at a time until less than 2 elements are // left (l_6); arriving here from step 1., there is a chance // that all accesses are unaligned. // 7. copy last element if one was left in step 6. (l_1) // // There are unaligned data accesses using integer load/store // instructions in this stub. POWER allows such accesses. // // According to the manuals (PowerISA_V2.06_PUBLIC, Book II, // Chapter 2: Effect of Operand Placement on Performance) unaligned // integer load/stores have good performance. 
// Only unaligned floating point load/stores can have poor performance.
//
// TODO:
//
// 1. check if aligning the backbranch target of loops is beneficial
//
address generate_disjoint_short_copy(bool aligned, constchar * name) {
StubCodeMark mark(this, "StubRoutines", name);
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
{ // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
UnsafeCopyMemoryMark ucmm(this, !aligned, false); // don't try anything fancy if arrays don't have many elements
__ li(tmp3, 0);
__ cmpwi(CCR0, R5_ARG3, 9);
__ ble(CCR0, l_6); // copy 2 at a time
if (!aligned) {
__ xorr(tmp1, R3_ARG1, R4_ARG2);
__ andi_(tmp1, tmp1, 3);
__ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
// At this point it is guaranteed that both, from and to have the same alignment mod 4.
// Copy 1 element if necessary to align to 4 bytes.
__ andi_(tmp1, R3_ARG1, 3);
__ beq(CCR0, l_2);
// At this point the positions of both, from and to, are at least 4 byte aligned.
// Copy 4 elements at a time. // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
__ xorr(tmp2, R3_ARG1, R4_ARG2);
__ andi_(tmp1, tmp2, 7);
__ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
// Copy a 2-element word if necessary to align to 8 bytes.
__ andi_(R0, R3_ARG1, 7);
__ beq(CCR0, l_7);
__ bind(l_8); // Use unrolled version for mass copying (copy 16 elements a time). // Load feeding store gets zero latency on Power6, however not on Power5. // Therefore, the following sequence is made for the good of both.
__ ld(tmp1, 0, R3_ARG1);
__ ld(tmp2, 8, R3_ARG1);
__ ld(tmp3, 16, R3_ARG1);
__ ld(tmp4, 24, R3_ARG1);
__ std(tmp1, 0, R4_ARG2);
__ std(tmp2, 8, R4_ARG2);
__ std(tmp3, 16, R4_ARG2);
__ std(tmp4, 24, R4_ARG2);
__ addi(R3_ARG1, R3_ARG1, 32);
__ addi(R4_ARG2, R4_ARG2, 32);
__ bdnz(l_8);
} else { // Processor supports VSX, so use it to mass copy.
// Prefetch src data into L2 cache.
__ dcbt(R3_ARG1, 0);
// If supported set DSCR pre-fetch to deepest. if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32-byte. It's not aligned 16-byte // as loop contains < 8 instructions that fit inside a single // i-cache sector.
__ align(32);
__ bind(l_9); // Use loop with VSX load/store instructions to // copy 16 elements a time.
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load from src.
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst.
__ lxvd2x(tmp_vsr2, R3_ARG1, tmp1); // Load from src + 16.
__ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32.
__ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32.
__ bdnz(l_9); // Dec CTR and loop if not zero.
// Generate stub for conjoint short copy. If "aligned" is true, the // "from" and "to" addresses are assumed to be heapword aligned. // // Arguments for generated stub: // from: R3_ARG1 // to: R4_ARG2 // count: R5_ARG3 treated as signed //
address generate_conjoint_short_copy(bool aligned, constchar * name) {
StubCodeMark mark(this, "StubRoutines", name);
address start = __ function_entry();
assert_positive_int(R5_ARG3);
// Generate core code for disjoint int copy (and oop copy on 32-bit). If "aligned" // is true, the "from" and "to" addresses are assumed to be heapword aligned. // // Arguments: // from: R3_ARG1 // to: R4_ARG2 // count: R5_ARG3 treated as signed // void generate_disjoint_int_copy_core(bool aligned) { Register tmp1 = R6_ARG4; Register tmp2 = R7_ARG5; Register tmp3 = R8_ARG6; Register tmp4 = R0;
// for short arrays, just do single element copy
__ li(tmp3, 0);
__ cmpwi(CCR0, R5_ARG3, 5);
__ ble(CCR0, l_2);
if (!aligned) { // check if arrays have same alignment mod 8.
__ xorr(tmp1, R3_ARG1, R4_ARG2);
__ andi_(R0, tmp1, 7); // Not the same alignment, but ld and std just need to be 4 byte aligned.
__ bne(CCR0, l_4); // to OR from is 8 byte aligned -> copy 2 at a time
// copy 1 element to align to and from on an 8 byte boundary
__ andi_(R0, R3_ARG1, 7);
__ beq(CCR0, l_4);
__ bind(l_6); // Use unrolled version for mass copying (copy 8 elements a time). // Load feeding store gets zero latency on power6, however not on power 5. // Therefore, the following sequence is made for the good of both.
__ ld(tmp1, 0, R3_ARG1);
__ ld(tmp2, 8, R3_ARG1);
__ ld(tmp3, 16, R3_ARG1);
__ ld(tmp4, 24, R3_ARG1);
__ std(tmp1, 0, R4_ARG2);
__ std(tmp2, 8, R4_ARG2);
__ std(tmp3, 16, R4_ARG2);
__ std(tmp4, 24, R4_ARG2);
__ addi(R3_ARG1, R3_ARG1, 32);
__ addi(R4_ARG2, R4_ARG2, 32);
__ bdnz(l_6);
} else { // Processor supports VSX, so use it to mass copy.
// Prefetch the data into the L2 cache.
__ dcbt(R3_ARG1, 0);
// If supported set DSCR pre-fetch to deepest. if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32-byte. Not 16-byte align as // loop contains < 8 instructions that fit inside a single // i-cache sector.
__ align(32);
__ bind(l_7); // Use loop with VSX load/store instructions to // copy 8 elements a time.
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
__ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32
__ bdnz(l_7); // Dec CTR and loop if not zero.
// Generate stub for disjoint int copy. If "aligned" is true, the // "from" and "to" addresses are assumed to be heapword aligned. // // Arguments for generated stub: // from: R3_ARG1 // to: R4_ARG2 // count: R5_ARG3 treated as signed //
address generate_disjoint_int_copy(bool aligned, const char * name) {
  // aligned: forwarded to generate_disjoint_int_copy_core(); its negation is
  //          passed to the UnsafeCopyMemoryMark constructed below.
  // name:    stub name given to the StubCodeMark.
  // Returns the function entry of the generated stub.
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ function_entry();
  assert_positive_int(R5_ARG3);

  {
    // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
    // The mark object lives only for this scope, i.e. while the copy core is
    // being generated.
    UnsafeCopyMemoryMark ucmm(this, !aligned, false);
    generate_disjoint_int_copy_core(aligned);
  }

  __ li(R3_RET, 0); // return 0
  __ blr();

  return start;
}
// Generate core code for conjoint int copy (and oop copy on // 32-bit). If "aligned" is true, the "from" and "to" addresses // are assumed to be heapword aligned. // // Arguments: // from: R3_ARG1 // to: R4_ARG2 // count: R5_ARG3 treated as signed // void generate_conjoint_int_copy_core(bool aligned) { // Do reverse copy. We assume the case of actual overlap is rare enough // that we don't have to optimize it.
if (!aligned) { // check if arrays have same alignment mod 8.
__ xorr(tmp1, R3_ARG1, R4_ARG2);
__ andi_(R0, tmp1, 7); // Not the same alignment, but ld and std just need to be 4 byte aligned.
__ bne(CCR0, l_7); // to OR from is 8 byte aligned -> copy 2 at a time
// copy 1 element to align to and from on an 8 byte boundary
__ andi_(R0, R3_ARG1, 7);
__ beq(CCR0, l_7);
if (!VM_Version::has_vsx()) {
__ bind(l_4); // Use unrolled version for mass copying (copy 4 elements a time). // Load feeding store gets zero latency on Power6, however not on Power5. // Therefore, the following sequence is made for the good of both.
__ addi(R3_ARG1, R3_ARG1, -32);
__ addi(R4_ARG2, R4_ARG2, -32);
__ ld(tmp4, 24, R3_ARG1);
__ ld(tmp3, 16, R3_ARG1);
__ ld(tmp2, 8, R3_ARG1);
__ ld(tmp1, 0, R3_ARG1);
__ std(tmp4, 24, R4_ARG2);
__ std(tmp3, 16, R4_ARG2);
__ std(tmp2, 8, R4_ARG2);
__ std(tmp1, 0, R4_ARG2);
__ bdnz(l_4);
} else { // Processor supports VSX, so use it to mass copy. // Prefetch the data into the L2 cache.
__ dcbt(R3_ARG1, 0);
// If supported set DSCR pre-fetch to deepest. if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32-byte. Not 16-byte align as // loop contains < 8 instructions that fit inside a single // i-cache sector.
__ align(32);
// Backward VSX copy loop: each pass first moves both pointers down by 32,
// then copies the data at src and src+16 to dst and dst+16.
// Use loop with VSX load/store instructions to
// copy 8 elements at a time.
__ bind(l_4);
__ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
__ addi(R4_ARG2, R4_ARG2, -32); // Update dst-=32
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16 (tmp1 was set to 16 before the loop)
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
__ bdnz(l_4); // Dec CTR and loop if not zero.
if (!VM_Version::has_vsx()) {
__ bind(l_4); // Use unrolled version for mass copying (copy 4 elements a time). // Load feeding store gets zero latency on Power6, however not on Power5. // Therefore, the following sequence is made for the good of both.
__ ld(tmp1, 0, R3_ARG1);
__ ld(tmp2, 8, R3_ARG1);
__ ld(tmp3, 16, R3_ARG1);
__ ld(tmp4, 24, R3_ARG1);
__ std(tmp1, 0, R4_ARG2);
__ std(tmp2, 8, R4_ARG2);
__ std(tmp3, 16, R4_ARG2);
__ std(tmp4, 24, R4_ARG2);
__ addi(R3_ARG1, R3_ARG1, 32);
__ addi(R4_ARG2, R4_ARG2, 32);
__ bdnz(l_4);
} else { // Processor supports VSX, so use it to mass copy.
// Prefetch the data into the L2 cache.
__ dcbt(R3_ARG1, 0);
// If supported set DSCR pre-fetch to deepest. if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32-byte. Not 16-byte align as // loop contains < 8 instructions that fit inside a single // i-cache sector.
__ align(32);
// Forward VSX copy loop: each pass copies the data at src and src+16 to
// dst and dst+16, then moves both pointers up by 32.
// Use loop with VSX load/store instructions to
// copy 4 elements at a time.
__ bind(l_5);
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16 (tmp1 was set to 16 before the loop)
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
__ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
__ bdnz(l_5); // Dec CTR and loop if not zero.
if (!VM_Version::has_vsx()) {
__ bind(l_4); // Use unrolled version for mass copying (copy 4 elements a time). // Load feeding store gets zero latency on Power6, however not on Power5. // Therefore, the following sequence is made for the good of both.
__ addi(R3_ARG1, R3_ARG1, -32);
__ addi(R4_ARG2, R4_ARG2, -32);
__ ld(tmp4, 24, R3_ARG1);
__ ld(tmp3, 16, R3_ARG1);
__ ld(tmp2, 8, R3_ARG1);
__ ld(tmp1, 0, R3_ARG1);
__ std(tmp4, 24, R4_ARG2);
__ std(tmp3, 16, R4_ARG2);
__ std(tmp2, 8, R4_ARG2);
__ std(tmp1, 0, R4_ARG2);
__ bdnz(l_4);
} else { // Processor supports VSX, so use it to mass copy. // Prefetch the data into the L2 cache.
__ dcbt(R3_ARG1, 0);
// If supported set DSCR pre-fetch to deepest. if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32-byte. Not 16-byte align as // loop contains < 8 instructions that fit inside a single // i-cache sector.
__ align(32);
// Backward VSX copy loop: each pass first moves both pointers down by 32,
// then copies the data at src and src+16 to dst and dst+16.
// Use loop with VSX load/store instructions to
// copy 4 elements at a time.
__ bind(l_4);
__ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
__ addi(R4_ARG2, R4_ARG2, -32); // Update dst-=32
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16 (tmp1 was set to 16 before the loop)
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
__ bdnz(l_4); // Dec CTR and loop if not zero.
// Generate stub for conjoint oop copy. If "aligned" is true, the // "from" and "to" addresses are assumed to be heapword aligned. // // Arguments for generated stub: // from: R3_ARG1 // to: R4_ARG2 // count: R5_ARG3 treated as signed // dest_uninitialized: G1 support //
address generate_conjoint_oop_copy(bool aligned, constchar * name, bool dest_uninitialized) {
StubCodeMark mark(this, "StubRoutines", name);
// Generate stub for disjoint oop copy. If "aligned" is true, the // "from" and "to" addresses are assumed to be heapword aligned. // // Arguments for generated stub: // from: R3_ARG1 // to: R4_ARG2 // count: R5_ARG3 treated as signed // dest_uninitialized: G1 support //
address generate_disjoint_oop_copy(bool aligned, constchar * name, bool dest_uninitialized) {
StubCodeMark mark(this, "StubRoutines", name);
address start = __ function_entry();
assert_positive_int(R5_ARG3);
// Helper for generating a dynamic type check. // Smashes only the given temp registers. void generate_type_check(Register sub_klass, Register super_check_offset, Register super_klass, Register temp,
Label& L_success) {
assert_different_registers(sub_klass, super_check_offset, super_klass);
// Generate stub for checked oop copy. // // Arguments for generated stub: // from: R3 // to: R4 // count: R5 treated as signed // ckoff: R6 (super_check_offset) // ckval: R7 (super_klass) // ret: R3 zero for success; (-1^K) where K is partial transfer count //
address generate_checkcast_copy(constchar *name, bool dest_uninitialized) {
__ addi(R8_offset, R8_offset, heapOopSize); // Step to next offset.
__ add_(R9_remain, R2_minus1, R9_remain); // Decrement the count.
__ beq(CCR0, success);
// ======== loop entry is here ========
__ bind(load_element);
__ load_heap_oop(R10_oop, R8_offset, R3_from,
R11_scratch1, R12_tmp,
MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
AS_RAW, &store_null);
__ load_klass(R11_klass, R10_oop); // Query the object klass.
generate_type_check(R11_klass, R6_ckoff, R7_ckval, R12_tmp, // Branch to this on success:
store_element); // ======== end loop ========
// It was a real error; we must depend on the caller to finish the job. // Register R9_remain has number of *remaining* oops, R5_count number of *total* oops. // Emit GC store barriers for the oops we have copied (R5_count minus R9_remain), // and report their number to the caller.
__ subf_(R5_count, R9_remain, R5_count);
__ nand(R3_RET, R5_count, R5_count); // report (-1^K) to caller
__ bne(CCR0, do_epilogue);
__ blr();
// Generate 'unsafe' array copy stub. // Though just as safe as the other stubs, it takes an unscaled // size_t argument instead of an element count. // // Arguments for generated stub: // from: R3 // to: R4 // count: R5 byte count, treated as ssize_t, can be zero // // Examines the alignment of the operands and dispatches // to a long, int, short, or byte copy loop. //
address generate_unsafe_copy(constchar* name,
address byte_copy_entry,
address short_copy_entry,
address int_copy_entry,
address long_copy_entry) {
constRegister R3_from = R3_ARG1; // source array address constRegister R4_to = R4_ARG2; // destination array address constRegister R5_count = R5_ARG3; // elements count (as long on PPC64)
constRegister R6_bits = R6_ARG4; // test copy of low bits constRegister R7_tmp = R7_ARG5;
// Bump this on entry, not on exit: //inc_counter_np(SharedRuntime::_generic_array_copy_ctr, lh, temp);
// In principle, the int arguments could be dirty.
//----------------------------------------------------------------------- // Assembler stubs will be used for this call to arraycopy // if the following conditions are met: // // (1) src and dst must not be null. // (2) src_pos must not be negative. // (3) dst_pos must not be negative. // (4) length must not be negative. // (5) src klass and dst klass should be the same and not NULL. // (6) src and dst should be arrays. // (7) src_pos + length must not exceed length of src. // (8) dst_pos + length must not exceed length of dst.
BLOCK_COMMENT("arraycopy initial argument checks");
// At this point, it is known to be a typeArray (array_tag 0x3). #ifdef ASSERT
{ Label L;
jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
__ load_const_optimized(temp, lh_prim_tag_in_place, R0);
__ cmpw(CCR0, lh, temp);
__ bge(CCR0, L);
__ stop("must be a primitive array");
__ bind(L);
} #endif
// Next registers should be set before the jump to corresponding stub. constRegister from = R3_ARG1; // source array address constRegister to = R4_ARG2; // destination array address constRegister count = R5_ARG3; // elements count
// 'from', 'to', 'count' registers should be set in this order // since they are the same as 'src', 'src_pos', 'dst'.
__ bind(L_checkcast_copy); // live at this point: src_klass, dst_klass
{ // Before looking at dst.length, make sure dst is also an objArray.
__ lwz(temp, lh_offset, dst_klass);
__ cmpw(CCR0, lh, temp);
__ bne(CCR0, L_failed);
// It is safe to examine both src.length and dst.length.
arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
temp, lh, L_failed);
Register sco_temp = R6_ARG4; // This register is free now.
assert_different_registers(from, to, count, sco_temp,
dst_klass, src_klass);
// Generate the type check. int sco_offset = in_bytes(Klass::super_check_offset_offset());
__ lwz(sco_temp, sco_offset, dst_klass);
generate_type_check(src_klass, sco_temp, dst_klass,
temp, L_disjoint_plain_copy);
// Fetch destination element klass from the ObjArrayKlass header. int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
// The checkcast_copy loop needs two extra arguments:
__ ld(R7_ARG5, ek_offset, dst_klass); // dest elem klass
__ lwz(R6_ARG4, sco_offset, R7_ARG5); // sco of elem klass
__ b(entry_checkcast_arraycopy);
}
// C2 does not sign extend signed parameters to full 64 bits registers:
__ rldic (R5_ARG3, R5_ARG3, 2, 32); // always positive
__ clrldi(R6_ARG4, R6_ARG4, 32); // force zero bits on higher word
__ clrldi(R7_ARG5, R7_ARG5, 32); // force zero bits on higher word
/** * Arguments: * * Input: * R3_ARG1 - in address * R4_ARG2 - in length * R5_ARG3 - out address * R6_ARG4 - out length
*/
address generate_squareToLen() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "squareToLen");
address start = __ function_entry();
// args - higher word is cleaned (unsignedly) due to int to long casting constRegister in = R3_ARG1; constRegister in_len = R4_ARG2;
__ clrldi(in_len, in_len, 32); constRegister out = R5_ARG3; constRegister out_len = R6_ARG4;
__ clrldi(out_len, out_len, 32);
// Store the squares, right shifted one bit (i.e., divided by 2)
__ subi (out_aux, out, 8);
__ subi (in_aux, in, 4);
__ cmpwi (CCR0, in_len, 0); // Initialize lplw outside of the loop
__ xorr (lplw, lplw, lplw);
__ ble (CCR0, SKIP_LOOP_SQUARE); // in_len <= 0
__ mtctr (in_len);
__ bind(LOOP_SQUARE);
__ lwzu (piece, 4, in_aux);
__ mulld (product, piece, piece); // shift left 63 bits and only keep the MSB
__ rldic (lplw_s, lplw, 63, 0);
__ mr (lplw, product); // shift right 1 bit without sign extension
__ srdi (product, product, 1); // join them to the same register and store it
__ orr (product, lplw_s, product); #ifdef VM_LITTLE_ENDIAN // Swap low and high words for little endian
__ rldicl (product, product, 32, 0); #endif
__ stdu (product, 8, out_aux);
__ bdnz (LOOP_SQUARE);
__ bind(SKIP_LOOP_SQUARE);
// Add in off-diagonal sums
__ cmpwi (CCR0, in_len, 0);
__ ble (CCR0, SKIP_DIAGONAL_SUM); // Avoid CTR usage here in order to use it at mulAdd
__ subi (i_minus1, in_len, 1);
__ li (offset, 4);
__ bind(LOOP_DIAGONAL_SUM);
__ sldi (off_aux, out_len, 2);
__ sub (off_aux, off_aux, offset);
// Shift back up and set low bit // Shifts 1 bit left up to len positions. Assumes no leading zeros // begin<primitiveLeftShift>
__ cmpwi (CCR0, out_len, 0);
__ ble (CCR0, SKIP_LSHIFT);
__ li (i, 0);
__ lwz (c, 0, out);
__ subi (b, out_len, 1);
__ mtctr (b);
int nbytes_save = MacroAssembler::num_volatile_regs * BytesPerWord;
__ save_volatile_gprs(R1_SP, -nbytes_save, true);
// Link register points to instruction in prologue of the guarded nmethod. // As the stub requires one layer of indirection (argument is of type address* and not address), // passing the link register's value directly doesn't work. // Since we have to save the link register on the stack anyway, we calculate the corresponding stack address // and pass that one instead.
__ addi(R3_ARG1, R1_SP, _abi0(lr));
__ pop_frame();
__ restore_LR_CR(R3_RET /* used as tmp register */);
__ restore_volatile_gprs(R1_SP, -nbytes_save, true);
__ cmpdi(CCR0, R0, 0);
// Return to prologue if no deoptimization is required (beqlr: taken when R0 == 0)
__ bclr(Assembler::bcondCRbiIs1, Assembler::bi0(CCR0, Assembler::equal), Assembler::bhintIsTaken);
// Deoptimization required. // For actually handling the deoptimization, the 'wrong method stub' is invoked.
__ load_const_optimized(R0, SharedRuntime::get_handle_wrong_method_stub());
__ mtctr(R0);
// Pop the frame built in the prologue.
__ pop_frame();
// Restore link register. Required as the 'wrong method stub' needs the caller's frame // to properly deoptimize this method (e.g. by re-resolving the call site for compiled methods). // This method's prologue is aborted.
__ restore_LR_CR(R0);
__ bctr(); return stub_address;
}
#ifdef VM_LITTLE_ENDIAN // The following Base64 decode intrinsic is based on an algorithm outlined // in here: // http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html // in the section titled "Vector lookup (pshufb with bitmask)" // // This implementation differs in the following ways: // * Instead of Intel SSE instructions, Power AltiVec VMX and VSX instructions // are used instead. It turns out that some of the vector operations // needed in the algorithm require fewer AltiVec instructions. // * The algorithm in the above mentioned paper doesn't handle the // Base64-URL variant in RFC 4648. Adjustments to both the code and to two // lookup tables are needed for this. // * The "Pack" section of the code is a complete rewrite for Power because we // can utilize better instructions for this step. //
// In little-endian mode, the lxv instruction loads the element at EA into // element 15 of the vector register, EA+1 goes into element 14, and so // on. // // To make a look-up table easier to read, ARRAY_TO_LXV_ORDER reverses the // order of the elements in a vector initialization. #define ARRAY_TO_LXV_ORDER(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15) e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0
// In the following table_*_val constants, a 0 value means the // character is not in the Base64 character set
.table_32_47_val = {
ARRAY_TO_LXV_ORDER ( /* space .. '*' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '+' = 62 */ VB64(62), /* ',' .. '.' = 0 */ 0, 0, 0, /* '/' = 63 */ VB64(63) ) },
// The first 4 index values are "don't care" because // we only use the first 12 bytes of the vector, // which are decoded from 16 bytes of Base64 characters.
.pack_permute_val = {
ARRAY_TO_LXV_ORDER(
0, 0, 0, 0,
0, 1, 2,
4, 5, 6,
8, 9, 10,
12, 13, 14 ) }
};
constunsigned block_size = 16; // number of bytes to process in each pass through the loop constunsigned block_size_shift = 4;
// According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore Register s = R3_ARG1; // source starting address of Base64 characters Register sp = R4_ARG2; // source offset Register sl = R5_ARG3; // source length = # of Base64 characters to be processed Register d = R6_ARG4; // destination address Register dp = R7_ARG5; // destination offset Register isURL = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding Register isMIME = R9_ARG7; // boolean, if non-zero indicates use of RFC 2045 MIME encoding - not used
// Local variables Register const_ptr = R9; // used for loading constants Register tmp_reg = R10; // used for speeding up load_constant_optimized()
// Re-use R9 and R10 to avoid using non-volatile registers (requires save/restore) Register out = R9; // moving out (destination) pointer Register in = R10; // moving in (source) pointer
// The upper 32 bits of the non-pointer parameter registers are not // guaranteed to be zero, so mask off those upper bits.
__ clrldi(sp, sp, 32);
__ clrldi(sl, sl, 32);
// Don't handle the last 4 characters of the source, because this // VSX-based algorithm doesn't handle padding characters. Also the // vector code will always write 16 bytes of decoded data on each pass, // but only the first 12 of those 16 bytes are valid data (16 base64 // characters become 12 bytes of binary data), so for this reason we // need to subtract an additional 8 bytes from the source length, in // order not to write past the end of the destination buffer. The // result of this subtraction implies that a Java function in the // Base64 class will be used to process the last 12 characters.
__ sub(sl, sl, sp);
__ subi(sl, sl, 12);
// Load CTR with the number of passes through the loop // = sl >> block_size_shift. After the shift, if sl <= 0, there's too // little data to be processed by this intrinsic.
__ srawi_(sl, sl, block_size_shift);
__ ble(CCR0, return_zero);
__ mtctr(sl);
// Clear the other two parameter registers upper 32 bits.
__ clrldi(isURL, isURL, 32);
__ clrldi(dp, dp, 32);
// Load constant vec registers that need to be loaded from memory
__ load_const_optimized(const_ptr, (address)&const_block, tmp_reg);
__ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr);
__ lxv(pack_rshift->to_vsr(), BLK_OFFSETOF(pack_rshift_val), const_ptr);
__ lxv(pack_lshift->to_vsr(), BLK_OFFSETOF(pack_lshift_val), const_ptr);
__ lxv(pack_permute, BLK_OFFSETOF(pack_permute_val), const_ptr);
// Splat the constants that can use xxspltib
__ xxspltib(vec_0s->to_vsr(), 0);
__ xxspltib(vec_8s->to_vsr(), 8); if (PowerArchitecturePPC64 >= 10) { // Using VALID_B64 for the offsets effectively strips the upper bit // of each byte that was selected from the table. Setting the upper // bit gives us a way to distinguish between the 6-bit value of 0 // from an error code of 0, which will happen if the character is // outside the range of the lookup, or is an illegal Base64 // character, such as %.
__ xxspltib(offsets->to_vsr(), VALID_B64);
// // Lookup // if (PowerArchitecturePPC64 >= 10) { // Use xxpermx to do a lookup of each Base64 character in the // input vector and translate it to a 6-bit value + 0x80. // Characters which are not valid Base64 characters will result // in a zero in the corresponding byte. // // Note that due to align(32) call above, the xxpermx instructions do // not require align_prefix() calls, since the final xxpermx // prefix+opcode is at byte 24.
__ xxpermx(xlate_a, table_32_47, table_48_63, input->to_vsr(), 1); // offset=4
__ xxpermx(xlate_b, table_64_79, table_80_95, input->to_vsr(), 2); // offset=12
__ xxlor(xlate_b, xlate_a, xlate_b); // offset=20
__ xxpermx(xlate_a, table_96_111, table_112_127, input->to_vsr(), 3); // offset=24
__ xxlor(input->to_vsr(), xlate_a, xlate_b); // Check for non-Base64 characters by comparing each byte to zero.
__ vcmpequb_(non_match, input, vec_0s);
} else { // Isolate the upper 4 bits of each character by shifting it right 4 bits
__ vsrb(higher_nibble, input, vec_4s); // Isolate the lower 4 bits by masking
__ xxland(lower_nibble, input->to_vsr(), vec_0xfs);
// Get the offset (the value to subtract from the byte) by using // a lookup table indexed by the upper 4 bits of the character
__ xxperm(offsets->to_vsr(), offsetLUT, higher_nibble->to_vsr());
// Find out which elements are the special case character (isURL ? '/' : '-')
__ vcmpequb(eq_special_case_char, input, vec_special_case_char);
// For each character in the input which is a special case // character, replace its offset with one that is special for that // character.
__ xxsel(offsets->to_vsr(), offsets->to_vsr(), vec_special_case_offset, eq_special_case_char->to_vsr());
// Use the lower_nibble to select a mask "M" from the lookup table.
__ xxperm(M, maskLUT, lower_nibble);
// "bit" is used to isolate which of the bits in M is relevant.
__ xxperm(bit, bitposLUT, higher_nibble->to_vsr());
// Each element of non_match correspond to one each of the 16 input // characters. Those elements that become 0x00 after the xxland // instruction are invalid Base64 characters.
__ xxland(non_match->to_vsr(), M, bit);
// Compare each element to zero
__ vcmpequb_(non_match, non_match, vec_0s);
} // vcmpequb_ sets the EQ bit of CCR6 if no elements compare equal.
// Any element comparing equal to zero means there is an error in
// that element. Note that the comparison result register
// non_match is not referenced again. Only CCR6-EQ matters.
// Leave the loop (branch predicted not taken) when EQ is not set.
__ bne_predict_not_taken(CCR6, loop_exit);
// The Base64 characters had no errors, so add the offsets, which in // the case of Power10 is a constant vector of all 0x80's (see earlier // comment where the offsets register is loaded).
__ vaddubm(input, input, offsets);
// Pack // // In the tables below, b0, b1, .. b15 are the bytes of decoded // binary data, the first line of each of the cells (except for // the constants) uses the bit-field nomenclature from the // above-linked paper, whereas the second line is more specific // about which exact bits are present, and is constructed using the // Power ISA 3.x document style, where: // // * The specifier after the colon depicts which bits are there. // * The bit numbering is big endian style (bit 0 is the most // significant). // * || is a concatenate operator. // * Strings of 0's are a field of zeros with the shown length, and // likewise for strings of 1's.
// Note that only e12..e15 are shown here because the shifting // and OR'ing pattern replicates for e8..e11, e4..7, and // e0..e3. // // +======================+=================+======================+======================+=============+ // | Vector | e12 | e13 | e14 | e15 | // | Element | | | | | // +======================+=================+======================+======================+=============+ // | after vaddubm | 00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa | // | | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 | // +----------------------+-----------------+----------------------+----------------------+-------------+ // | pack_lshift | | << 6 | << 4 | << 2 | // +----------------------+-----------------+----------------------+----------------------+-------------+ // | l after vslb | 00dddddd | cc000000 | bbbb0000 | aaaaaa00 | // | | 00||b2:2..7 | b2:0..1||000000 | b1:0..3||0000 | b0:0..5||00 | // +----------------------+-----------------+----------------------+----------------------+-------------+ // | l after vslo | cc000000 | bbbb0000 | aaaaaa00 | 00000000 | // | | b2:0..1||000000 | b1:0..3||0000 | b0:0..5||00 | 00000000 | // +----------------------+-----------------+----------------------+----------------------+-------------+ // | pack_rshift | | >> 2 | >> 4 | | // +----------------------+-----------------+----------------------+----------------------+-------------+ // | r after vsrb | 00dddddd | 0000cccc | 000000bb | 00aaaaaa | // | | 00||b2:2..7 | 0000||b1:4..7 | 000000||b0:6..7 | 00||b0:0..5 | // +----------------------+-----------------+----------------------+----------------------+-------------+ // | gathered after xxlor | ccdddddd | bbbbcccc | aaaaaabb | 00aaaaaa | // | | b2:0..7 | b1:0..7 | b0:0..7 | 00||b0:0..5 | // +======================+=================+======================+======================+=============+ // // Note: there is a typo in the above-linked paper that shows the result of the gathering process is: 
// [ddddddcc|bbbbcccc|aaaaaabb] // but should be: // [ccdddddd|bbbbcccc|aaaaaabb] //
__ vslb(l, input, pack_lshift); // vslo of vec_8s shifts the vector by one octet toward lower // element numbers, discarding element 0. This means it actually // shifts to the right (not left) according to the order of the // table above.
__ vslo(l, l, vec_8s);
__ vsrb(r, input, pack_rshift);
__ xxlor(gathered->to_vsr(), l->to_vsr(), r->to_vsr());
// Final rearrangement of bytes into their correct positions. // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+ // | Vector | e0 | e1 | e2 | e3 | e4 | e5 | e6 | e7 | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 | // | Elements | | | | | | | | | | | | | | | | | // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+ // | after xxlor | b11 | b10 | b9 | xx | b8 | b7 | b6 | xx | b5 | b4 | b3 | xx | b2 | b1 | b0 | xx | // +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+ // | pack_permute | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 4 | 5 | 6 | 8 | 9 | 10 | 12 | 13 | 14 | // +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+ // | after xxperm | b11* | b11* | b11* | b11* | b11 | b10 | b9 | b8 | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 | // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+ // xx bytes are not used to form the final data // b0..b15 are the decoded and reassembled 8-bit bytes of data // b11 with asterisk is a "don't care", because these bytes will be // overwritten on the next iteration.
__ xxperm(gathered->to_vsr(), gathered->to_vsr(), pack_permute);
// We cannot use a static displacement on the store, since it's a // multiple of 12, not 16. Note that this stxv instruction actually // writes 16 bytes, even though only the first 12 are valid data.
__ stxv(gathered->to_vsr(), 0, out);
__ addi(out, out, 12);
__ addi(in, in, 16);
__ bdnz(loop_start);
__ bind(loop_exit);
// Return the number of out bytes produced, which is (out - (d + dp)) == out - d - dp;
__ sub(R3_RET, out, d);
__ sub(R3_RET, R3_RET, dp);
// This algorithm is based on the methods described in this paper:
// http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html
//
// The details of this implementation vary from the paper due to the
// difference in the ISA between SSE and AltiVec, especially in the
// splitting bytes section where there is no need on Power to mask after
// the shift because the shift is byte-wise rather than across an entire
// 128-bit word.
//
// For the lookup part of the algorithm, different logic is used than
// described in the paper because of the availability of vperm, which can
// do a 64-byte table lookup in four instructions, while preserving the
// branchless nature.
//
// Description of the ENCODE_CORE macro
//
// Expand first 12 x 8-bit data bytes into 16 x 6-bit bytes (upper 2
// bits of each byte are zeros)
//
// (Note: e7..e0 are not shown because they follow the same pattern as
// e8..e15)
//
// In the table below, b0, b1, .. b15 are the bytes of unencoded
// binary data, the first line of each of the cells (except for
// the constants) uses the bit-field nomenclature from the
// above-linked paper, whereas the second line is more specific
// about which exact bits are present, and is constructed using the
// Power ISA 3.x document style, where:
//
// * The specifier after the colon depicts which bits are there.
// * The bit numbering is big endian style (bit 0 is the most
//   significant).
// * || is a concatenate operator.
// * Strings of 0's are a field of zeros with the shown length, and
//   likewise for strings of 1's.
// // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+ // | Vector | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 | // | Element | | | | | | | | | // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+ // | after lxv | jjjjkkkk | iiiiiijj | gghhhhhh | ffffgggg | eeeeeeff | ccdddddd | bbbbcccc | aaaaaabb | // | | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 | // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ // | xxperm indexes | 0 | 10 | 11 | 12 | 0 | 13 | 14 | 15 | // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ // | (1) after xxperm | | gghhhhhh | ffffgggg | eeeeeeff | | ccdddddd | bbbbcccc | aaaaaabb | // | | (b15) | b5 | b4 | b3 | (b15) | b2 | b1 | b0 | // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ // | rshift_amount | 0 | 6 | 4 | 2 | 0 | 6 | 4 | 2 | // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ // | after vsrb | | 000000gg | 0000ffff | 00eeeeee | | 000000cc | 0000bbbb | 00aaaaaa | // | | (b15) | 000000||b5:0..1 | 0000||b4:0..3 | 00||b3:0..5 | (b15) | 000000||b2:0..1 | 0000||b1:0..3 | 00||b0:0..5 | // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ // | rshift_mask | 
00000000 | 000000||11 | 0000||1111 | 00||111111 | 00000000 | 000000||11 | 0000||1111 | 00||111111 | // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ // | rshift after vand | 00000000 | 000000gg | 0000ffff | 00eeeeee | 00000000 | 000000cc | 0000bbbb | 00aaaaaa | // | | 00000000 | 000000||b5:0..1 | 0000||b4:0..3 | 00||b3:0..5 | 00000000 | 000000||b2:0..1 | 0000||b1:0..3 | 00||b0:0..5 | // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ // | 1 octet lshift (1) | gghhhhhh | ffffgggg | eeeeeeff | | ccdddddd | bbbbcccc | aaaaaabb | 00000000 | // | | b5 | b4 | b3 | (b15) | b2 | b1 | b0 | 00000000 | // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ // | lshift_amount | 0 | 2 | 4 | 0 | 0 | 2 | 4 | 0 | // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ // | after vslb | gghhhhhh | ffgggg00 | eeff0000 | | ccdddddd | bbcccc00 | aabb0000 | 00000000 | // | | b5 | b4:2..7||00 | b3:4..7||0000 | (b15) | b2:0..7 | b1:2..7||00 | b0:4..7||0000 | 00000000 | // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ // | lshift_mask | 00||111111 | 00||1111||00 | 00||11||0000 | 00000000 | 00||111111 | 00||1111||00 | 00||11||0000 | 00000000 | // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ // | lshift after vand | 00hhhhhh | 
00gggg00 | 00ff0000 | 00000000 | 00dddddd | 00cccc00 | 00bb0000 | 00000000 | // | | 00||b5:2..7 | 00||b4:4..7||00 | 00||b3:6..7||0000 | 00000000 | 00||b2:2..7 | 00||b1:4..7||00 | 00||b0:6..7||0000 | 00000000 | // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ // | after vor lshift, rshift | 00hhhhhh | 00gggggg | 00ffffff | 00eeeeee | 00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa | // | | 00||b5:2..7 | 00||b4:4..7||b5:0..1 | 00||b3:6..7||b4:0..3 | 00||b3:0..5 | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 | // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+ // // Expand the first 12 bytes into 16 bytes, leaving every 4th byte // blank for now. // __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute); // // Generate two bit-shifted pieces - rshift and lshift - that will // later be OR'd together. // // First the right-shifted piece // __ vsrb(rshift, input, expand_rshift); // __ vand(rshift, rshift, expand_rshift_mask); // // Now the left-shifted piece, which is done by octet shifting // the input one byte to the left, then doing a variable shift, // followed by a mask operation. // // __ vslo(lshift, input, vec_8s); // __ vslb(lshift, lshift, expand_lshift); // __ vand(lshift, lshift, expand_lshift_mask); // // Combine the two pieces by OR'ing // __ vor(expanded, rshift, lshift); // // At this point, expanded is a vector containing a 6-bit value in each // byte. These values are used as indexes into a 64-byte lookup table that // is contained in four vector registers. The lookup operation is done // using vperm instructions with the same indexes for the lower 32 and // upper 32 bytes. 
// To figure out which of the two looked-up bytes to use
// at each location, all values in expanded are compared to 31. Using
// vsel, values higher than 31 use the results from the upper 32 bytes of
// the lookup operation, while values less than or equal to 31 use the
// lower 32 bytes of the lookup operation.
//
// Note: it's tempting to use a xxpermx,xxpermx,vor sequence here on
// Power10 (or later), but experiments doing so on Power10 yielded a slight
// performance drop, perhaps due to the need for xxpermx instruction
// prefixes.
// Number of bytes to process in each pass through the main loop. // 12 of the 16 bytes from each lxv are encoded to 16 Base64 bytes. constunsigned block_size = 12;
// According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore Register src = R3_ARG1; // source starting address of Base64 characters Register sp = R4_ARG2; // source starting position Register sl = R5_ARG3; // total source length of the Base64 characters to be processed Register dst = R6_ARG4; // destination address Register dp = R7_ARG5; // destination starting position Register isURL = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding
// Local variables Register const_ptr = R12; // used for loading constants (reuses isURL's register) Register tmp_reg = R9; // used for speeding up load_constant()
Register size = R9; // number of bytes to process (reuses tmp_reg's register) Register blocked_size = R10; // number of bytes to process a block at a time Register block_modulo = R12; // == block_size (reuse const_ptr) Register remaining = R12; // bytes remaining to process after the blocks are completed (reuse block_modulo's reg) Register in = R4; // current input (source) pointer (reuse sp's register) Register num_blocks = R11; // number of blocks to be processed by the loop Register out = R8; // current output (destination) pointer (reuse const_ptr's register) Register three = R9; // constant divisor (reuse size's register) Register bytes_to_write = R10; // number of bytes to write with the stxvl instr (reused blocked_size's register) Register tmp1 = R7; // temp register for lxvl length (reuse dp's register) Register modulo_chars = R7; // number of bytes written during the final write % 4 (reuse tmp1's register) Register pad_char = R6; // literal '=' (reuse dst's register)
// The upper 32 bits of the non-pointer parameter registers are not // guaranteed to be zero, so mask off those upper bits.
__ clrldi(sp, sp, 32);
__ clrldi(sl, sl, 32);
__ clrldi(dp, dp, 32);
__ clrldi(isURL, isURL, 32);
// Splat the constants that can use xxspltib
__ xxspltib(vec_8s->to_vsr(), 8);
__ xxspltib(vec_31s->to_vsr(), 31);
// Use a different translation lookup table depending on the // setting of isURL
__ cmpdi(CCR0, isURL, 0);
__ beq(CCR0, not_URL);
__ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_URL_val), const_ptr);
__ b(calculate_size);
// size = sl - sp - 4 (*) // (*) Don't process the last four bytes in the main loop because // we don't want the lxv instruction to read past the end of the src // data, in case those four bytes are on the start of an unmapped or // otherwise inaccessible page. //
__ sub(size, sl, sp);
__ subi(size, size, 4);
__ cmpdi(CCR7, size, block_size);
__ bgt(CCR7, calculate_blocked_size);
__ mr(remaining, size); // Add the 4 back into remaining again
__ addi(remaining, remaining, 4); // make "in" point to the beginning of the source data: in = src + sp
__ add(in, src, sp); // out = dst + dp
__ add(out, dst, dp);
__ b(skip_loop);
// When there are less than 16 bytes left, we need to be careful not to // read beyond the end of the src buffer, which might be in an unmapped // page. // Load the remaining bytes using lxvl.
__ rldicr(tmp1, remaining, 56, 7);
__ lxvl(input->to_vsr(), in, tmp1);
// We've processed 12 of the 13-15 data bytes, so advance the pointers, // and do one final pass for the remaining 1-3 bytes.
__ addi(in, in, 12);
__ addi(out, out, 16);
__ subi(remaining, remaining, 12);
__ subi(bytes_to_write, bytes_to_write, 16);
__ rldicr(tmp1, bytes_to_write, 56, 7);
__ lxvl(input->to_vsr(), in, tmp1);
ENCODE_CORE
__ bind(le_16_to_write); // shift bytes_to_write into the upper 8 bits of t1 for use by stxvl
__ rldicr(tmp1, bytes_to_write, 56, 7);
__ stxvl(expanded->to_vsr(), out, tmp1);
__ add(out, out, bytes_to_write);
// R3_RET contains the size of the frames to thaw, 0 if overflow or no more frames
Label thaw_success;
__ cmpdi(CCR0, R3_RET, 0);
__ bne(CCR0, thaw_success);
__ load_const_optimized(tmp1, (StubRoutines::throw_StackOverflowError_entry()), R0);
__ mtctr(tmp1); __ bctr();
__ bind(thaw_success);
__ addi(R3_RET, R3_RET, frame::abi_reg_args_size); // Large abi required for C++ calls.
__ neg(R3_RET, R3_RET); // align down resulting in a smaller negative offset
__ clrrdi(R3_RET, R3_RET, exact_log2(frame::alignment_in_bytes));
DEBUG_ONLY(__ mr(tmp1, R1_SP);)
__ resize_frame(R3_RET, tmp2); // make room for the thawed frames
__ li(R4_ARG2, kind);
__ call_VM_leaf(Continuation::thaw_entry(), R16_thread, R4_ARG2);
__ mr(R1_SP, R3_RET); // R3_RET contains the SP of the thawed top frame
if (return_barrier) { // we're now in the caller of the frame that returned to the barrier
__ mr(R3_RET, nvtmp); __ fmr(F1_RET, nvftmp); // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
} else { // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down)
__ li(R3_RET, 0); // return 0 (success) from doYield
}
if (return_barrier_exception) { Register ex_pc = R17_tos; // nonvolatile register
__ ld(ex_pc, _abi0(lr), R1_SP); // LR
__ mr(nvtmp, R3_RET); // save return value containing the exception oop
__ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), R16_thread, ex_pc);
__ mtlr(R3_RET); // the exception handler // See OptoRuntime::generate_exception_blob for register arguments
__ mr(R3_ARG1, nvtmp); // exception oop
__ mr(R4_ARG2, ex_pc); // exception pc
} else { // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
__ ld(R0, _abi0(lr), R1_SP); // LR
__ mtlr(R0);
}
__ blr();
// For c2: c_rarg0 is junk, call to runtime to write a checkpoint. // It returns a jobject handle to the event writer. // The handle is dereferenced and the return value is the event writer oop.
RuntimeStub* generate_jfr_write_checkpoint() {
  // The stub is emitted into its own code buffer through a private assembler.
  CodeBuffer code("jfr_write_checkpoint", 512, 64);
  MacroAssembler* _masm = new MacroAssembler(&code);

  // Record one GC map, keyed by the offset of the call's return pc from the
  // start of the stub.
  OopMapSet* gc_maps = new OopMapSet();
  gc_maps->add_gc_map(calls_return_pc - start, new OopMap(framesize, 0));

  // codeBlob framesize is in words (not VMRegImpl::slot_size), hence the shift.
  return RuntimeStub::new_runtime_stub(code.name(),
                                       &code,
                                       frame_complete,
                                       (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                       gc_maps,
                                       false);
}
#endif// INCLUDE_JFR
// Initialization void generate_initial() { // Generates all stubs and initializes the entry points
// Entry points that exist in all platforms. // Note: This is code that could be shared among different platforms - however the // benefit seems to be smaller than the disadvantage of having a // much more complicated generator structure. See also comment in // stubRoutines.hpp.
// Build this early so it's available for the interpreter.
StubRoutines::_throw_StackOverflowError_entry =
generate_throw_exception("StackOverflowError throw_exception",
CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
StubRoutines::_throw_delayed_StackOverflowError_entry =
generate_throw_exception("delayed StackOverflowError throw_exception",
CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError), false);
void generate_all() { // Generates all stubs and initializes the entry points
// These entry points require SharedInfo::stack0 to be set up in // non-core builds
StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError), false); // Handle IncompatibleClassChangeError in itable stubs.
StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError), false);
StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
// support for verify_oop (must happen after universe_init)
StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
// nmethod entry barriers for concurrent class unloading
BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); if (bs_nm != NULL) {
StubRoutines::ppc::_nmethod_entry_barrier = generate_nmethod_entry_barrier();
}
// arraycopy stubs used by compilers
generate_arraycopy_stubs();
#ifdef COMPILER2 if (UseMultiplyToLenIntrinsic) {
StubRoutines::_multiplyToLen = generate_multiplyToLen();
} if (UseSquareToLenIntrinsic) {
StubRoutines::_squareToLen = generate_squareToLen();
} if (UseMulAddIntrinsic) {
StubRoutines::_mulAdd = generate_mulAdd();
} if (UseMontgomeryMultiplyIntrinsic) {
StubRoutines::_montgomeryMultiply
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
} if (UseMontgomerySquareIntrinsic) {
StubRoutines::_montgomerySquare
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
} #endif
// data cache line writeback if (VM_Version::supports_data_cache_line_flush()) {
StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
}
if (UseAESIntrinsics) {
StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
}
#ifdef VM_LITTLE_ENDIAN // Currently supported on PPC64LE only if (UseBASE64Intrinsics) {
StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
} #endif
}
public:
// Constructs the generator on top of 'code' and immediately runs the
// generation step selected by 'phase':
//   0            -> generate_initial()
//   1            -> generate_phase1()
//   anything else -> generate_all()
StubGenerator(CodeBuffer* code, int phase) : StubCodeGenerator(code) {
  if (phase == 0) {
    generate_initial();
  } else if (phase == 1) {
    generate_phase1(); // stubs that must be available for the interpreter
  } else {
    generate_all();
  }
}
};
#define UCM_TABLE_MAX_ENTRIES 8 void StubGenerator_generate(CodeBuffer* code, int phase) { if (UnsafeCopyMemory::_table == NULL) {
UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
}
StubGenerator g(code, phase);
}
Messung V0.5 in Prozent
¤ Dauer der Verarbeitung: 0.97 Sekunden
(vorverarbeitet am 2026-04-26)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.