/*
 * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp
// Call stubs are used to call Java from C
//
// Arguments:
//    c_rarg0:   call wrapper address                   address
//    c_rarg1:   result                                 address
//    c_rarg2:   result type                            BasicType
//    c_rarg3:   method                                 Method*
//    c_rarg4:   (interpreter) entry point              address
//    c_rarg5:   parameters                             intptr_t*
//    c_rarg6:   parameter size (in words)              int
//    c_rarg7:   thread                                 Thread*
//
// There is no return from the stub itself as any Java result
// is written to result
//
// we save r30 (lr) as the return PC at the base of the frame and
// link r29 (fp) below it as the frame pointer installing sp (r31)
// into fp.
//
// we save r0-r7, which accounts for all the c arguments.
//
// TODO: strictly do we need to save them all? they are treated as
// volatile by C so could we omit saving the ones we are going to
// place in global registers (thread? method?) or those we only use
// during setup of the Java call?
//
// we don't need to save r8 which C uses as an indirect result location
// return register.
//
// we don't need to save r9-r15 which both C and Java treat as
// volatile
//
// we don't need to save r16-r18 because Java does not use them
//
// we save r19-r28 which Java uses as scratch registers and C
// expects to be callee-save
//
// we save the bottom 64 bits of each value stored in v8-v15; it is
// the responsibility of the caller to preserve larger values.
//
// so the stub frame looks like this when we enter Java code
//
//     [ return_from_Java     ] <--- sp
//     [ argument word n      ]
//      ...
// -27 [ argument word 1      ]
// -26 [ saved v15            ] <--- sp_after_call
// -25 [ saved v14            ]
// -24 [ saved v13            ]
// -23 [ saved v12            ]
// -22 [ saved v11            ]
// -21 [ saved v10            ]
// -20 [ saved v9             ]
// -19 [ saved v8             ]
// -18 [ saved r28            ]
// -17 [ saved r27            ]
// -16 [ saved r26            ]
// -15 [ saved r25            ]
// -14 [ saved r24            ]
// -13 [ saved r23            ]
// -12 [ saved r22            ]
// -11 [ saved r21            ]
// -10 [ saved r20            ]
//  -9 [ saved r19            ]
//  -8 [ call wrapper   (r0)  ]
//  -7 [ result         (r1)  ]
//  -6 [ result type    (r2)  ]
//  -5 [ method         (r3)  ]
//  -4 [ entry point    (r4)  ]
//  -3 [ parameters     (r5)  ]
//  -2 [ parameter size (r6)  ]
//  -1 [ thread         (r7)  ]
//   0 [ saved fp      (r29)  ] <--- fp == saved sp (r31)
//   1 [ saved lr      (r30)  ]
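// For reference, a minimal sketch of how the VM invokes this stub from
// C++, modelled on the CallStub typedef in stubRoutines.hpp (the
// parameter names and register annotations here are illustrative):
//
//   typedef void (*CallStub)(
//     address   link,                // call wrapper      -> c_rarg0
//     intptr_t* result,              // result slot       -> c_rarg1
//     BasicType result_type,         //                   -> c_rarg2
//     Method*   method,              //                   -> c_rarg3
//     address   entry_point,         // interpreter entry -> c_rarg4
//     intptr_t* parameters,          //                   -> c_rarg5
//     int       size_of_parameters,  // in words          -> c_rarg6
//     Thread*   thread);             // (TRAPS)           -> c_rarg7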
// Call stub stack layout word offsets from fp
enum call_stub_layout {
sp_after_call_off = -26,
// set up frame and move sp to end of save area
__ enter();
__ sub(sp, rfp, -sp_after_call_off * wordSize);
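// n.b. with sp_after_call_off == -26 and wordSize == 8 this computes
// sp = rfp - 208, leaving sp pointing at the saved-v15 slot
// (sp_after_call in the layout above).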
// save register parameters and Java scratch/global registers
// n.b. we save thread even though it gets installed in
// rthread because we want to sanity check rthread later
__ str(c_rarg7, thread);
__ strw(c_rarg6, parameter_size);
__ stp(c_rarg4, c_rarg5, entry_point);
__ stp(c_rarg2, c_rarg3, result_type);
__ stp(c_rarg0, c_rarg1, call_wrapper);
// install Java thread in global register now we have saved
// whatever value it held
__ mov(rthread, c_rarg7);

// And method
__ mov(rmethod, c_rarg3);
// set up the heapbase register
__ reinit_heapbase();
#ifdef ASSERT
// make sure we have no pending exceptions
{
Label L;
__ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
__ cmp(rscratch1, (u1)NULL_WORD);
__ br(Assembler::EQ, L);
__ stop("StubRoutines::call_stub: entered with pending exception");
__ BIND(L);
}
#endif

// pass parameters if any
__ mov(esp, sp);
__ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
__ andr(sp, rscratch1, -2 * wordSize);
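// n.b. rscratch1 is sp minus the parameter area in bytes; anding with
// -2 * wordSize (== -16) rounds it down to the 16-byte boundary the
// AArch64 ABI requires of sp. E.g. for 3 parameters: sp - 24, rounded
// down to the next multiple of 16.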
BLOCK_COMMENT("pass parameters if any");
Label parameters_done;
// parameter count is still in c_rarg6
// and parameter pointer identifying param 1 is in c_rarg5
__ cbzw(c_rarg6, parameters_done);
// we do this here because the notify will already have been done
// if we get to the next instruction via an exception
//
// n.b. adding this instruction here affects the calculation of
// whether or not a routine returns to the call stub (used when
// doing stack walks) since the normal test is to check the return
// pc against the address saved below. so we may need to allow for
// this extra instruction in the check.
// save current address for use by exception handling code
return_address = __ pc();
// store result depending on type (everything that is not
// T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
// n.b. this assumes Java returns an integral result in r0
// and a floating result in j_farg0
__ ldr(j_rarg2, result);
Label is_long, is_float, is_double, exit;
__ ldr(j_rarg1, result_type);
__ cmp(j_rarg1, (u1)T_OBJECT);
__ br(Assembler::EQ, is_long);
__ cmp(j_rarg1, (u1)T_LONG);
__ br(Assembler::EQ, is_long);
__ cmp(j_rarg1, (u1)T_FLOAT);
__ br(Assembler::EQ, is_float);
__ cmp(j_rarg1, (u1)T_DOUBLE);
__ br(Assembler::EQ, is_double);
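// In outline, this dispatch implements the following (a sketch; the
// stores for the is_long/is_float/is_double cases sit in code elided
// here, and T_OBJECT shares the 64-bit path because an oop in r0 is a
// full word; the floating result is assumed in j_farg0 == v0, per the
// comment above):
//
//   switch (result_type) {
//     case T_OBJECT:
//     case T_LONG:   *(jlong*)result   = r0; break;
//     case T_FLOAT:  *(jfloat*)result  = v0; break;
//     case T_DOUBLE: *(jdouble*)result = v0; break;
//     default:       *(jint*)result    = (jint)r0;  // T_INT et al.
//   }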
// handle T_INT case
__ strw(r0, Address(j_rarg2));
__ BIND(exit);
// pop parameters
__ sub(esp, rfp, -sp_after_call_off * wordSize);
// Return point for a Java call if there's an exception thrown in
// Java code. The exception is caught and transformed into a
// pending exception stored in JavaThread that can be tested from
// within the VM.
//
// Note: Usually the parameters are removed by the callee. In case
// of an exception crossing an activation frame boundary, that is
// not the case if the callee is compiled code => need to setup the
// rsp.
//
// r0: exception oop
// complete return to VM
assert(StubRoutines::_call_stub_return_address != NULL,
       "_call_stub_return_address must have been generated before");
__ b(StubRoutines::_call_stub_return_address);
return start;
}
// Continuation point for runtime calls returning with a pending
// exception. The pending exception check happened in the runtime
// or native call stub. The pending exception in Thread is
// converted into a Java-level exception.
//
// Contract with Java-level exception handlers:
// r0: exception
// r3: throwing pc
//
// NOTE: At entry of this stub, exception-pc must be in LR !!
// NOTE: this is always used as a jump target within generated code
// so it just needs to be generated code with no x86 prolog
// Upon entry, LR points to the return address returning into
// Java (interpreted or compiled) code; i.e., the return address
// becomes the throwing pc.
//
// Arguments pushed before the runtime call are still on the stack
// but the exception handler will reset the stack pointer ->
// ignore them. A potential result in registers can be ignored as
// well.
#ifdef ASSERT
// make sure this code is only executed if there is a pending exception
{
Label L;
__ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
__ cbnz(rscratch1, L);
__ stop("StubRoutines::forward exception: no pending exception (1)");
__ bind(L);
}
#endif
// compute exception handler into r19
// call the VM to find the handler address associated with the
// caller address. pass thread in r0 and caller pc (ret address)
// in r1. n.b. the caller pc is in lr, unlike x86 where it is on
// the stack.
__ mov(c_rarg1, lr);

// lr will be trashed by the VM call so we move it to R19
// (callee-saved) because we also need to pass it to the handler
// returned by this call.
__ mov(r19, lr);
BLOCK_COMMENT("call exception_handler_for_return_address");
__ call_VM_leaf(CAST_FROM_FN_PTR(address,
SharedRuntime::exception_handler_for_return_address),
rthread, c_rarg1);

// Reinitialize the ptrue predicate register, in case the external runtime
// call clobbers ptrue reg, as we may return to SVE compiled code.
__ reinitialize_ptrue();
// we should not really care that lr is no longer the callee
// address. we saved the value the handler needs in r19 so we can
// just copy it to r3. however, the C2 handler will push its own
// frame and then call into the VM, and the VM code asserts that
// the PC for the frame above the handler belongs to a compiled
// Java method. So, we restore lr here to satisfy that assert.
__ mov(lr, r19);

// setup r0 & r3 & clear pending exception
__ mov(r3, r19);
__ mov(r19, r0);
__ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
__ str(zr, Address(rthread, Thread::pending_exception_offset()));
#ifdef ASSERT
// make sure exception is set
{
Label L;
__ cbnz(r0, L);
__ stop("StubRoutines::forward exception: no pending exception (2)");
__ bind(L);
}
#endif
// continue at exception handler
// r0: exception
// r3: throwing pc
// r19: exception handler
__ verify_oop(r0);
__ br(r19);
// object is in r0
// make sure object is 'reasonable'
__ cbz(r0, exit); // if obj is NULL it is OK
#if INCLUDE_ZGC
if (UseZGC) {
// Check if mask is good.
// verifies that ZAddressBadMask & r0 == 0
__ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
__ andr(c_rarg2, r0, c_rarg3);
__ cbnz(c_rarg2, error);
}
#endif
// Check if the oop is in the right area of memory
__ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
__ andr(c_rarg2, r0, c_rarg3);
__ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
// Compare c_rarg2 and c_rarg3. We don't use a compare
// instruction here because the flags register is live.
__ eor(c_rarg2, c_rarg2, c_rarg3);
__ cbnz(c_rarg2, error);
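// In C terms the two instructions above implement (a sketch):
//
//   if ((obj & Universe::verify_oop_mask()) != Universe::verify_oop_bits())
//     goto error;
//
// eor/cbnz is used rather than cmp/br so that the condition flags,
// which are live here, stay untouched.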
// make sure klass is 'reasonable', which is not zero.
__ load_klass(r0, r0); // get klass
__ cbz(r0, error); // if klass is NULL it is broken
// The inner part of zero_words(). This is the bulk operation,
// zeroing words in blocks, possibly using DC ZVA to do it. The
// caller is responsible for zeroing the last few words.
//
// Inputs:
// r10: the HeapWord-aligned base address of an array to zero.
// r11: the count in HeapWords, r11 > 0.
//
// Returns r10 and r11, adjusted for the caller to clear.
// r10: the base address of the tail of words left to clear.
// r11: the number of words in the tail.
//      r11 < MacroAssembler::zero_words_block_size.
// Bulk copy of blocks of 8 words.
//
// count is a count of words.
//
// Precondition: count >= 8
//
// Postconditions:
//
// The least significant bit of count contains the remaining count
// of words to copy. The rest of count is trash.
//
// s and d are adjusted to point to the remaining words to copy
//
void generate_copy_longs(Label &start, Register s, Register d, Register count,
                         copy_direction direction) {
int unit = wordSize * direction;
int bias = (UseSIMDForMemoryOps ? 4 : 2) * wordSize;
#ifdef ASSERT
// Make sure we are never given < 8 words
{
Label L;
__ cmp(count, (u1)8);
__ br(Assembler::GE, L);
__ stop("genrate_copy_longs called with < 8 words");
__ bind(L);
}
#endif
__ tbz(count, 1, L2);
__ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
__ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
__ bind(L2);
}
__ ret(lr);
if (AvoidUnalignedAccesses) {
Label drain, again;

// Register order for storing. Order is different for backward copy.
__ bind(unaligned_copy_long);
// source address is even aligned, target odd aligned
//
// when forward copying word pairs we read long pairs at offsets
// {0, 2, 4, 6} (in long words). when backwards copying we read
// long pairs at offsets {-2, -4, -6, -8}. We adjust the source
// address by -2 in the forwards case so we can compute the
// source offsets for both as {2, 4, 6, 8} * unit where unit = 1
// or -1.
//
// when forward copying we need to store 1 word, 3 pairs and
// then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
// zero offset we adjust the destination by -1 which means we
// have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
//
// When backwards copying we need to store 1 word, 3 pairs and
// then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
// offsets {1, 3, 5, 7, 8} * unit.
// Fill 8 registers
//
// for forwards copy s was offset by -16 from the original input
// value of s so the register contents are at these offsets
// relative to the 64 byte block addressed by that original input
// and so on for each successive 64 byte block when s is updated
//
// t0 at offset 0,  t1 at offset 8
// t2 at offset 16, t3 at offset 24
// t4 at offset 32, t5 at offset 40
// t6 at offset 48, t7 at offset 56
// for backwards copy s was not offset so the register contents
// are at these offsets into the preceding 64 byte block
// relative to that original input and so on for each successive
// preceding 64 byte block when s is updated. this explains the
// slightly counter-intuitive looking pattern of register usage
// in the stp instructions for backwards copy.
//
// t0 at offset -16, t1 at offset -8
// t2 at offset -32, t3 at offset -24
// t4 at offset -48, t5 at offset -40
// t6 at offset -64, t7 at offset -56
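// A worked example for the forwards case (unit == +8): in one
// iteration the load instructions address s+16, s+32, s+48 and s+64
// (the last with writeback), while the store instructions target d+8,
// d+16, d+32, d+48 and d+64 (the last with writeback), matching the
// {2, 4, 6, 8} * unit load offsets and {1, 2, 4, 6, 8} * unit store
// offsets described above.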
if (direction == copy_forwards) {
// allowing for the offset of -8 the store instructions place
// registers into the target 64 byte block at the following
// offsets
//
// t0 at offset 0
// t1 at offset 8,  t2 at offset 16
// t3 at offset 24, t4 at offset 32
// t5 at offset 40, t6 at offset 48
// t7 at offset 56
__ str(t0, Address(d, 1 * unit));
__ stp(t1, t2, Address(d, 2 * unit));
__ ldp(t0, t1, Address(s, 2 * unit));
__ stp(t3, t4, Address(d, 4 * unit));
__ ldp(t2, t3, Address(s, 4 * unit));
__ stp(t5, t6, Address(d, 6 * unit));
__ ldp(t4, t5, Address(s, 6 * unit));
__ str(t7, Address(__ pre(d, 8 * unit)));
__ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
} else {
// d was not offset when we started so the registers are
// written into the 64 byte block preceding d with the following
// offsets
//
// t1 at offset -8
// t3 at offset -24, t0 at offset -16
// t5 at offset -40, t2 at offset -32
// t7 at offset -56, t4 at offset -48
// t6 at offset -64
//
// note that this matches the offsets previously noted for the
// loads
// Drain
//
// this uses the same pattern of offsets and register arguments
// as above
__ bind(drain);
if (direction == copy_forwards) {
__ str(t0, Address(d, 1 * unit));
__ stp(t1, t2, Address(d, 2 * unit));
__ stp(t3, t4, Address(d, 4 * unit));
__ stp(t5, t6, Address(d, 6 * unit));
__ str(t7, Address(__ pre(d, 8 * unit)));
} else {
__ str(t1, Address(d, 1 * unit));
__ stp(t3, t0, Address(d, 3 * unit));
__ stp(t5, t2, Address(d, 5 * unit));
__ stp(t7, t4, Address(d, 7 * unit));
__ str(t6, Address(__ pre(d, 8 * unit)));
}

// now we need to copy any remaining part block which may
// include a 4 word block subblock and/or a 2 word subblock.
// bits 2 and 1 in the count are the tell-tale for whether we
// have each such subblock
{
Label L1, L2;
__ tbz(count, exact_log2(4), L1);

// this is the same as above but copying only 4 longs hence
// with only one intervening stp between the str instructions
// but note that the offsets and registers still follow the
// same pattern
__ ldp(t0, t1, Address(s, 2 * unit));
__ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
if (direction == copy_forwards) {
__ str(t0, Address(d, 1 * unit));
__ stp(t1, t2, Address(d, 2 * unit));
__ str(t3, Address(__ pre(d, 4 * unit)));
} else {
__ str(t1, Address(d, 1 * unit));
__ stp(t3, t0, Address(d, 3 * unit));
__ str(t2, Address(__ pre(d, 4 * unit)));
}
__ bind(L1);
__ tbz(count, 1, L2);

// this is the same as above but copying only 2 longs hence
// there is no intervening stp between the str instructions
// but note that the offset and register patterns are still
// the same
__ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
if (direction == copy_forwards) {
__ str(t0, Address(d, 1 * unit));
__ str(t1, Address(__ pre(d, 2 * unit)));
} else {
__ str(t1, Address(d, 1 * unit));
__ str(t0, Address(__ pre(d, 2 * unit)));
}
__ bind(L2);
// for forwards copy we need to re-adjust the offsets we
// applied so that s and d follow the last words written
// Small copy: less than 16 bytes.
//
// NB: Ignores all of the bits of count which represent more than 15
// bytes, so a caller doesn't have to mask them.
void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
bool is_backwards = step < 0;
size_t granularity = uabs(step);
int direction = is_backwards ? -1 : 1;
int unit = wordSize * direction;
Label Lword, Lint, Lshort, Lbyte;
assert(granularity
&& granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
// ??? I don't know if this bit-test-and-branch is the right thing
// to do. It does a lot of jumping, resulting in several
// mispredicted branches. It might make more sense to do this
// with something like Duff's device with a single computed branch.
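// For example, with granularity == 1 (a byte copy) the tbz below tests
// bit 3 of count, i.e. whether at least 8 bytes remain and hence one
// whole word can be moved; with granularity == 4 (an int copy) it
// tests bit 1, i.e. whether a pair of ints (one word) remains.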
__ tbz(count, 3 - exact_log2(granularity), Lword);
__ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
__ str(tmp, Address(__ adjust(d, unit, is_backwards)));
__ bind(Lword);
// All-singing all-dancing memory copy.
//
// Copy count units of memory from s to d. The size of a unit is
// step, which can be positive or negative depending on the direction
// of copy. If is_aligned is false, we align the source address.
//
// 65..80/96 bytes
// (96 bytes if SIMD because we do 32 bytes per instruction)
__ bind(copy80);
if (UseSIMDForMemoryOps) {
__ ldpq(v0, v1, Address(s, 0));
__ ldpq(v2, v3, Address(s, 32));

// Unaligned pointers can be an issue for copying.
// The issue has more chances to happen when granularity of data is
// less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
// 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
// The most performance drop has been seen for the range 65-80 bytes.
// For such cases using the pair of ldp/stp instead of the third pair of
// ldpq/stpq fixes the performance issue.
if (granularity < sizeof (jint)) {
Label copy96;
__ cmp(count, u1(80/granularity));
__ br(Assembler::HI, copy96);
__ ldp(t0, t1, Address(send, -16));
// Now we've got the small case out of the way we can align the
// source address on a 2-word boundary.
Label aligned;
if (is_aligned) {
// We may have to adjust by 1 word to get s 2-word-aligned.
__ tbz(s, exact_log2(wordSize), aligned);
__ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
__ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
__ sub(count, count, wordSize/granularity);
} else {
if (is_backwards) {
__ andr(rscratch2, s, 2 * wordSize - 1);
} else {
__ neg(rscratch2, s);
__ andr(rscratch2, rscratch2, 2 * wordSize - 1);
}
// rscratch2 is the byte adjustment needed to align s.
__ cbz(rscratch2, aligned);
int shift = exact_log2(granularity);
if (shift) __ lsr(rscratch2, rscratch2, shift);
__ sub(count, count, rscratch2);
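// In C terms the adjustment just computed is (a sketch):
//
//   size_t misalign_bytes = is_backwards
//       ? s & (2 * wordSize - 1)      // bytes above the 2-word boundary
//       : (-s) & (2 * wordSize - 1);  // bytes up to the 2-word boundary
//   count -= misalign_bytes >> exact_log2(granularity);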
#if 0
// ?? This code is only correct for a disjoint copy. It may or
// may not make sense to use it in that case.
// Copy the first pair; s and d may not be aligned.
__ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
__ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
// We have a count of units and some trailing bytes. Adjust the
// count and do a bulk copy of words.
__ lsr(rscratch2, count, exact_log2(wordSize/granularity));
if (direction == copy_forwards)
__ bl(copy_f);
else
__ bl(copy_b);
// And the tail.
copy_memory_small(s, d, count, tmp, step);
if (granularity >= 8) __ bind(copy8);
if (granularity >= 4) __ bind(copy4);
__ bind(finish);
}
// Scan over array at a for count oops, verifying each one.
// Preserves a and count, clobbers rscratch1 and rscratch2.
void verify_oop_array(int size, Register a, Register count, Register temp) {
Label loop, end;
__ mov(rscratch1, a);
__ mov(rscratch2, zr);
__ bind(loop);
__ cmp(rscratch2, count);
__ br(Assembler::HS, end);
if (size == wordSize) {
__ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
__ verify_oop(temp);
} else {
__ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
__ decode_heap_oop(temp); // calls verify_oop
}
__ add(rscratch2, rscratch2, 1);
__ b(loop);
__ bind(end);
}
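// Roughly, in C terms (a sketch):
//
//   for (size_t i = 0; i < count; i++)
//     verify_oop(size == wordSize ? ((oop*)a)[i]
//                                 : decode_heap_oop(((narrowOop*)a)[i]));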
// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   is_oop  - true => oop array, so generate store check code
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects:
//   disjoint_int_copy_entry is set to the no-overlap entry point
//   used by generate_conjoint_int_oop_copy().
//
address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
                               const char *name, bool dest_uninitialized = false) {
Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
RegSet saved_reg = RegSet::of(s, d, count);
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
__ enter();
if (entry != NULL) {
*entry = __ pc();
// caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
BLOCK_COMMENT("Entry:");
}
// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   is_oop  - true => oop array, so generate store check code
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
//
address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
                               address *entry, const char *name,
                               bool dest_uninitialized = false) {
Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
RegSet saved_regs = RegSet::of(s, d, count);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
__ enter();
if (entry != NULL) {
*entry = __ pc();
// caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
BLOCK_COMMENT("Entry:");
}
// use fwd copy when (d-s) above_equal (count*size)
__ sub(rscratch1, d, s);
__ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
__ br(Assembler::HS, nooverlap_target);
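// In C terms (a sketch): because the comparison is unsigned, d < s
// also selects the forward copy, since d - s then wraps to a huge
// positive value:
//
//   if ((uintptr_t)(d - s) >= (uintptr_t)count << exact_log2(size))
//     goto nooverlap_target;  // no destructive overlap, copy forwards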
DecoratorSet decorators = IN_HEAP | IS_ARRAY;
if (dest_uninitialized) {
decorators |= IS_DEST_UNINITIALIZED;
}
if (aligned) {
decorators |= ARRAYCOPY_ALIGNED;
}
// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
// we let the hardware handle it. The one to eight bytes within words,
// dwords or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
// Side Effects:
//   disjoint_byte_copy_entry is set to the no-overlap entry point
//   used by generate_conjoint_byte_copy().
//
address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
const bool not_oop = false;
return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
}
// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
// we let the hardware handle it. The one to eight bytes within words,
// dwords or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                    address* entry, const char *name) {
const bool not_oop = false;
return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
}
// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
// let the hardware handle it. The two or four words within dwords
// or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
// Side Effects:
//   disjoint_short_copy_entry is set to the no-overlap entry point
//   used by generate_conjoint_short_copy().
//
address generate_disjoint_short_copy(bool aligned,
                                     address* entry, const char *name) {
const bool not_oop = false;
return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
}
// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
// let the hardware handle it. The two or four words within dwords
// or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name) {
const bool not_oop = false;
return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
}

// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects:
//   disjoint_int_copy_entry is set to the no-overlap entry point
//   used by generate_conjoint_int_oop_copy().
//
address generate_disjoint_int_copy(bool aligned, address *entry,
                                   const char *name, bool dest_uninitialized = false) {
const bool not_oop = false;
return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
}
// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
//
address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                   address *entry, const char *name,
                                   bool dest_uninitialized = false) {
const bool not_oop = false;
return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
}
// Arguments:
//   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as size_t, can be zero
//
// Side Effects:
//   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
//   no-overlap entry point used by generate_conjoint_long_oop_copy().
//
address generate_disjoint_long_copy(bool aligned, address *entry,
                                    const char *name, bool dest_uninitialized = false) {
const bool not_oop = false;
return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
}
// Arguments:
//   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as size_t, can be zero
//
address generate_conjoint_long_copy(bool aligned,
                                    address nooverlap_target, address *entry,
                                    const char *name, bool dest_uninitialized = false) {
const bool not_oop = false;
return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
}
// Arguments:
//   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as size_t, can be zero
//
// Side Effects:
//   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
//   no-overlap entry point used by generate_conjoint_long_oop_copy().
//
address generate_disjoint_oop_copy(bool aligned, address *entry,
                                   const char *name, bool dest_uninitialized) {
const bool is_oop = true;
const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
}
// Arguments:
//   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as size_t, can be zero
//
address generate_conjoint_oop_copy(bool aligned,
                                   address nooverlap_target, address *entry,
                                   const char *name, bool dest_uninitialized) {
const bool is_oop = true;
const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                              name, dest_uninitialized);
}
// Helper for generating a dynamic type check.
// Smashes rscratch1, rscratch2.
void generate_type_check(Register sub_klass, Register super_check_offset, Register super_klass,
Label& L_success) {
assert_different_registers(sub_klass, super_check_offset, super_klass);
// Registers used as temps (r19, r20, r21, r22 are save-on-entry)
const Register copied_oop = r22;  // actual oop copied
const Register count_save = r21;  // orig elements count
const Register start_to   = r20;  // destination array start address
const Register r19_klass  = r19;  // oop._klass
//---------------------------------------------------------------
// Assembler stub will be used for this call to arraycopy
// if the two arrays are subtypes of Object[] but the
// destination array type is not equal to or a supertype
// of the source type. Each element must be separately
// checked.
assert_different_registers(from, to, count, ckoff, ckval, start_to,
copied_oop, r19_klass, count_save);
__ enter(); // required for proper stackwalking of RuntimeStub frame
#ifdef ASSERT
// caller guarantees that the arrays really are different
// otherwise, we would have to make conjoint checks
{
Label L;
__ b(L); // conjoint check not yet implemented
__ stop("checkcast_copy within a single array");
__ bind(L);
}
#endif //ASSERT
// Caller of this entry point must set up the argument registers.
if (entry != NULL) {
*entry = __ pc();
BLOCK_COMMENT("Entry:");
}
// ======== loop entry is here ========
__ BIND(L_load_element);
__ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
__ cbz(copied_oop, L_store_element);
__ load_klass(r19_klass, copied_oop); // query the object klass
generate_type_check(r19_klass, ckoff, ckval, L_store_element);

// ======== end loop ========
// It was a real error; we must depend on the caller to finish the job.
// Register count = remaining oops, count_orig = total oops.