/* * Copyright (c) 2016, 2022, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2016, 2022 SAP SE. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. *
*/
// Declaration and definition of StubGenerator (no .hpp file). // For a more detailed description of the stub routine structure // see the comment in stubRoutines.hpp.
// These static, partially const, variables are for the AES intrinsics. // They are declared/initialized here to make them available across function bodies.
#ifdefined(JIT_TIMER) staticconstint JIT_TIMER_space = 8; // extra space for JIT_TIMER data #else staticconstint JIT_TIMER_space = 0; #endif staticconstint AES_parmBlk_align = 32; // octoword alignment.
staticint AES_ctrVal_len = 0; // ctr init value len (in bytes), expected: length of dataBlk (16) staticint AES_ctrVec_len = 0; // # of ctr vector elements. That many block can be ciphered with one instruction execution staticint AES_ctrArea_len = 0; // reserved stack space (in bytes) for ctr (= ctrVal_len * ctrVec_len)
staticint AES_parmBlk_addspace = 0; // Must be multiple of AES_parmblk_align. // Will be set by stub generator to stub specific value. staticint AES_dataBlk_space = 0; // Must be multiple of AES_parmblk_align. // Will be set by stub generator to stub specific value.
// Calculate top_of_arguments_addr which will be tos (not prepushed) later. // Wimply use SP + frame::top_ijava_frame_size.
__ add2reg(r_top_of_arguments_addr,
frame::z_top_ijava_frame_abi_size - BytesPerWord, Z_SP);
// Any arguments to copy?
__ load_and_test_int2long(Z_R1, Address(r_entryframe_fp, d_arg_argument_count));
__ z_bre(arguments_copied);
// Prepare loop and copy arguments in reverse order.
{ // Calculate argument size in bytes.
__ z_sllg(r_argument_size_in_bytes, Z_R1, LogBytesPerWord);
// Get addr of first incoming Java argument.
__ z_lg(r_argument_addr, d_arg_argument_addr, r_entryframe_fp);
// Let r_argumentcopy_addr point to last outgoing Java argument.
__ add2reg(r_argumentcopy_addr, BytesPerWord, r_top_of_arguments_addr); // = Z_SP+160 effectively.
// Let r_argument_addr point to last incoming Java argument.
__ add2reg_with_index(r_argument_addr, -BytesPerWord,
r_argument_size_in_bytes, r_argument_addr);
// Now loop while Z_R1 > 0 and copy arguments.
{
Label next_argument;
__ bind(next_argument); // Mem-mem move.
__ z_mvc(0, BytesPerWord-1, r_argumentcopy_addr, 0, r_argument_addr);
__ add2reg(r_argument_addr, -BytesPerWord);
__ add2reg(r_argumentcopy_addr, BytesPerWord);
__ z_brct(Z_R1, next_argument);
}
} // End of argument copy loop.
BLOCK_COMMENT("call {");
{ // Call frame manager or native entry.
// // Register state on entry to frame manager / native entry: // // Z_ARG1 = r_top_of_arguments_addr - intptr_t *sender tos (prepushed) // Lesp = (SP) + copied_arguments_offset - 8 // Z_method - method // Z_thread - JavaThread* //
// Here, the usual SP is the initial_caller_sp.
__ z_lgr(Z_R10, Z_SP);
// Z_esp points to the slot below the last argument.
__ z_lgr(Z_esp, r_top_of_arguments_addr);
// // Stack on entry to frame manager / native entry: // // F0 [TOP_IJAVA_FRAME_ABI] // [outgoing Java arguments] // [ENTRY_FRAME_LOCALS] // F1 [C_FRAME] // ... //
// Do a light-weight C-call here, r_new_arg_entry holds the address // of the interpreter entry point (frame manager or native entry) // and save runtime-value of return_pc in return_address // (call by reference argument).
return_address = __ call_stub(r_new_arg_entry);
}
BLOCK_COMMENT("} call");
{
BLOCK_COMMENT("restore registers {"); // Returned from frame manager or native entry. // Now pop frame, process result, and return to caller.
// // Stack on exit from frame manager / native entry: // // F0 [ABI] // ... // [ENTRY_FRAME_LOCALS] // F1 [C_FRAME] // ... // // Just pop the topmost frame ... //
// Restore frame pointer.
__ z_lg(r_entryframe_fp, _z_abi(callers_sp), Z_SP); // Pop frame. Done here to minimize stalls.
__ pop_frame();
// Reload some volatile registers which we've spilled before the call // to frame manager / native entry. // Access all locals via frame pointer, because we know nothing about // the topmost frame's size.
__ z_lg(r_arg_result_addr, result_address_offset, r_entryframe_fp);
__ z_lg(r_arg_result_type, result_type_offset, r_entryframe_fp);
// // Stack on exit from call_stub: // // 0 [C_FRAME] // ... // // No call_stub frames left. //
// All non-volatiles have been restored at this point!!
//------------------------------------------------------------------------ // The following code makes some assumptions on the T_<type> enum values. // The enum is defined in globalDefinitions.hpp. // The validity of the assumptions is tested as far as possible. // The assigned values should not be shuffled // T_BOOLEAN==4 - lowest used enum value // T_NARROWOOP==16 - largest used enum value //------------------------------------------------------------------------
BLOCK_COMMENT("process result {");
Label firstHandler; int handlerLen= 8; #ifdef ASSERT char assertMsg[] = "check BasicType definition in globalDefinitions.hpp";
__ z_chi(r_arg_result_type, T_BOOLEAN);
__ asm_assert_low(assertMsg, 0x0234);
__ z_chi(r_arg_result_type, T_NARROWOOP);
__ asm_assert_high(assertMsg, 0x0235); #endif
__ add2reg(r_arg_result_type, -T_BOOLEAN); // Remove offset.
__ z_larl(Z_R1, firstHandler); // location of first handler
__ z_sllg(r_arg_result_type, r_arg_result_type, 3); // Each handler is 8 bytes long.
__ z_bc(MacroAssembler::bcondAlways, 0, r_arg_result_type, Z_R1);
// Return point for a Java call if there's an exception thrown in // Java code. The exception is caught and transformed into a // pending exception stored in JavaThread that can be tested from // within the VM.
address generate_catch_exception() {
StubCodeMark mark(this, "StubRoutines", "catch_exception");
__ z_stg(Z_ARG1, thread_(pending_exception)); // Store into `char *'.
__ z_stg(exception_file, thread_(exception_file)); // Store into `int'.
__ z_st(exception_line, thread_(exception_line));
// Complete return to VM.
assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
// Continue in call stub.
__ z_br(Z_ARG2);
return start;
}
// Continuation point for runtime calls returning with a pending // exception. The pending exception check happened in the runtime // or native call stub. The pending exception in Thread is // converted into a Java-level exception. // // Read: // Z_R14: pc the runtime library callee wants to return to. // Since the exception occurred in the callee, the return pc // from the point of view of Java is the exception pc. // // Invalidate: // Volatile registers (except below). // // Update: // Z_ARG1: exception // (Z_R14 is unchanged and is live out). //
address generate_forward_exception() {
StubCodeMark mark(this, "StubRoutines", "forward_exception");
address start = __ pc();
// Make sure that this code is only executed if there is a pending exception.
{
Label L;
__ z_ltgr(Z_ARG1, Z_ARG1);
__ z_brne(L);
__ stop("StubRoutines::forward exception: no pending exception (1)");
__ bind(L);
}
__ verify_oop(Z_ARG1, "StubRoutines::forward exception: not an oop"); #endif
// The exception pc is the return address in the caller, // must load it into Z_ARG2
__ z_lgr(Z_ARG2, Z_R14);
#ifdef ASSERT // Make sure exception is set.
{ Label L;
__ z_ltgr(Z_ARG1, Z_ARG1);
__ z_brne(L);
__ stop("StubRoutines::forward exception: no pending exception (2)");
__ bind(L);
} #endif // Clear the pending exception.
__ clear_mem(Address(Z_thread, pending_exception_offset), sizeof(void *)); // Jump to exception handler
__ z_br(Z_R1 /*handler address*/);
return start;
#undef pending_exception_offset
}
// Continuation point for throwing of implicit exceptions that are // not handled in the current activation. Fabricates an exception // oop and initiates normal exception dispatching in this // frame. Only callee-saved registers are preserved (through the // normal RegisterMap handling). If the compiler // needs all registers to be preserved between the fault point and // the exception handler then it must assume responsibility for that // in AbstractCompiler::continuation_for_implicit_null_exception or // continuation_for_implicit_division_by_zero_exception. All other // implicit exceptions (e.g., NullPointerException or // AbstractMethodError on entry) are either at call sites or // otherwise assume that stack unwinding will be initiated, so // caller saved registers were assumed volatile in the compiler.
// Note that we generate only this stub into a RuntimeStub, because // it needs to be properly traversed and ignored during GC, so we // change the meaning of the "__" macro within this method.
// Note: the routine set_pc_not_at_call_for_caller in // SharedRuntime.cpp requires that this code be generated into a // RuntimeStub. #undef __ #define __ masm->
address generate_throw_exception(constchar* name, address runtime_entry, bool restore_saved_exception_pc, Register arg1 = noreg, Register arg2 = noreg) {
assert_different_registers(arg1, Z_R0_scratch); // would be destroyed by push_frame()
assert_different_registers(arg2, Z_R0_scratch); // would be destroyed by push_frame()
int insts_size = 256; int locs_size = 0;
CodeBuffer code(name, insts_size, locs_size);
MacroAssembler* masm = new MacroAssembler(&code); int framesize_in_bytes;
address start = __ pc();
// Note that we always have a runtime stub frame on the top of stack at this point.
__ get_PC(Z_R1);
__ set_last_Java_frame(/*sp*/Z_SP, /*pc*/Z_R1);
// Do the call.
BLOCK_COMMENT("call runtime_entry");
__ call_VM_leaf(runtime_entry, Z_thread, arg1, arg2);
__ reset_last_Java_frame();
#ifdef ASSERT // Make sure that this code is only executed if there is a pending exception.
{ Label L;
__ z_lg(Z_R0,
in_bytes(Thread::pending_exception_offset()),
Z_thread);
__ z_ltgr(Z_R0, Z_R0);
__ z_brne(L);
__ stop("StubRoutines::throw_exception: no pending exception");
__ bind(L);
} #endif
// No args, but tmp registers that are killed. constRegister Rlength = Z_ARG4; // cache array length constRegister Rarray_ptr = Z_ARG5; // Current value from cache array.
if (UseCompressedOops) {
assert(Universe::heap() != NULL, "java heap must be initialized to generate partial_subtype_check stub");
}
// Always take the slow path.
__ check_klass_subtype_slow_path(Rsubklass, Rsuperklass,
Rarray_ptr, Rlength, NULL, &miss);
// Match falls through here.
__ clear_reg(Z_RET); // Zero indicates a match. Set EQ flag in CC.
__ z_br(Z_R14);
__ BIND(miss);
__ load_const_optimized(Z_RET, 1); // One indicates a miss.
__ z_ltgr(Z_RET, Z_RET); // Set NE flag in CR.
__ z_br(Z_R14);
return start;
}
#if !defined(PRODUCT) // Wrapper which calls oopDesc::is_oop_or_null() // Only called by MacroAssembler::verify_oop staticvoid verify_oop_helper(constchar* message, oopDesc* o) { if (!oopDesc::is_oop_or_null(o)) {
fatal("%s. oop: " PTR_FORMAT, message, p2i(o));
}
++ StubRoutines::_verify_oop_count;
} #endif
// Return address of code to be called from code generated by // MacroAssembler::verify_oop. // // Don't generate, rather use C++ code.
address generate_verify_oop_subroutine() { // Don't generate a StubCodeMark, because no code is generated! // Generating the mark triggers notifying the oprofile jvmti agent // about the dynamic code generation, but the stub without // code (code_size == 0) confuses opjitconv // StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
// This is to test that the count register contains a positive int value. // Required because C2 does not respect int to long conversion for stub calls. void assert_positive_int(Register count) { #ifdef ASSERT
__ z_srag(Z_R0, count, 31); // Just leave the sign (must be zero) in Z_R0.
__ asm_assert_eq("missing zero extend", 0xAFFE); #endif
}
// Generate overlap test for array copy stubs. // If no actual overlap is detected, control is transferred to the // "normal" copy stub (entry address passed in disjoint_copy_target). // Otherwise, execution continues with the code generated by the // caller of array_overlap_test. // // Input: // Z_ARG1 - from // Z_ARG2 - to // Z_ARG3 - element count void array_overlap_test(address disjoint_copy_target, int log2_elem_size) {
__ MacroAssembler::compare_and_branch_optimized(Z_ARG2, Z_ARG1, Assembler::bcondNotHigh,
disjoint_copy_target, /*len64=*/true, /*has_sign=*/false);
Register index = Z_ARG3; if (log2_elem_size > 0) {
__ z_sllg(Z_R1, Z_ARG3, log2_elem_size); // byte count
index = Z_R1;
}
__ add2reg_with_index(Z_R1, 0, index, Z_ARG1); // First byte after "from" range.
// Destructive overlap: let caller generate code for that.
}
// Generate stub for disjoint array copy. If "aligned" is true, the // "from" and "to" addresses are assumed to be heapword aligned. // // Arguments for generated stub: // from: Z_ARG1 // to: Z_ARG2 // count: Z_ARG3 treated as signed void generate_disjoint_copy(bool aligned, int element_size, bool branchToEnd, bool restoreArgs) { // This is the zarch specific stub generator for general array copy tasks. // It has the following prereqs and features: // // - No destructive overlap allowed (else unpredictable results). // - Destructive overlap does not exist if the leftmost byte of the target // does not coincide with any of the source bytes (except the leftmost). // // Register usage upon entry: // Z_ARG1 == Z_R2 : address of source array // Z_ARG2 == Z_R3 : address of target array // Z_ARG3 == Z_R4 : length of operands (# of elements on entry) // // Register usage within the generator: // - Z_R0 and Z_R1 are KILLed by the stub routine (target addr/len). // Used as pair register operand in complex moves, scratch registers anyway. // - Z_R5 is KILLed by the stub routine (source register pair addr/len) (even/odd reg). // Same as R0/R1, but no scratch register. // - Z_ARG1, Z_ARG2, Z_ARG3 are USEd but preserved by the stub routine, // but they might get temporarily overwritten.
Register save_reg = Z_ARG4; // (= Z_R5), holds original target operand address for restore.
{ Register llen_reg = Z_R1; // Holds left operand len (odd reg). Register laddr_reg = Z_R0; // Holds left operand addr (even reg), overlaps with data_reg. Register rlen_reg = Z_R5; // Holds right operand len (odd reg), overlaps with save_reg. Register raddr_reg = Z_R4; // Holds right operand addr (even reg), overlaps with len_reg.
Register data_reg = Z_R0; // Holds copied data chunk in alignment process and copy loop. Register len_reg = Z_ARG3; // Holds operand len (#elements at entry, #bytes shortly after). Register dst_reg = Z_ARG2; // Holds left (target) operand addr. Register src_reg = Z_ARG1; // Holds right (source) operand addr.
assert((element_size<=256) && (256%element_size == 0), "element size must be <= 256, power of 2"); unsignedint log2_size = exact_log2(element_size);
switch (element_size) { case 1: BLOCK_COMMENT("ARRAYCOPY DISJOINT byte {"); break; case 2: BLOCK_COMMENT("ARRAYCOPY DISJOINT short {"); break; case 4: BLOCK_COMMENT("ARRAYCOPY DISJOINT int {"); break; case 8: BLOCK_COMMENT("ARRAYCOPY DISJOINT long {"); break; default: BLOCK_COMMENT("ARRAYCOPY DISJOINT {"); break;
}
assert_positive_int(len_reg);
BLOCK_COMMENT("preparation {");
// No copying if len <= 0. if (branchToEnd) {
__ compare64_and_branch(len_reg, (intptr_t) 0, Assembler::bcondNotHigh, done);
} else { if (VM_Version::has_CompareBranch()) {
__ z_cgib(len_reg, 0, Assembler::bcondNotHigh, 0, Z_R14);
} else {
__ z_ltgr(len_reg, len_reg);
__ z_bcr(Assembler::bcondNotPositive, Z_R14);
}
}
// Prefetch just one cache line. Speculative opt for short arrays. // Do not use Z_R1 in prefetch. Is undefined here. if (VM_Version::has_Prefetch()) {
__ z_pfd(0x01, 0, Z_R0, src_reg); // Fetch access.
__ z_pfd(0x02, 0, Z_R0, dst_reg); // Store access.
}
BLOCK_COMMENT("} preparation");
// Save args only if really needed. // Keep len test local to branch. Is generated only once.
BLOCK_COMMENT("mode selection {");
// Special handling for arrays with only a few elements. // Nothing fancy: just an executed MVC. if (log2_size > 0) {
__ z_sllg(Z_R1, len_reg, log2_size); // Remember #bytes in Z_R1.
} if (element_size != 8) {
__ z_cghi(len_reg, 256/element_size);
__ z_brnh(doMVC);
usedMVC = true;
} if (element_size == 8) { // Long and oop arrays are always aligned.
__ z_cghi(len_reg, 256/element_size);
__ z_brnh(doMVCUnrolled);
usedMVCUnrolled = true;
}
// Prefetch another cache line. We, for sure, have more than one line to copy. if (VM_Version::has_Prefetch()) {
__ z_pfd(0x01, 256, Z_R0, src_reg); // Fetch access.
__ z_pfd(0x02, 256, Z_R0, dst_reg); // Store access.
}
if (restoreArgs) { // Remember entry value of ARG2 to restore all arguments later from that knowledge.
__ z_lgr(save_reg, dst_reg);
}
__ z_cghi(len_reg, 4096/element_size); if (log2_size == 0) {
__ z_lgr(Z_R1, len_reg); // Init Z_R1 with #bytes
}
__ z_brnh(doMVCLOOP);
// Fall through to MVCLE case.
BLOCK_COMMENT("} mode selection");
// MVCLE: for long arrays // DW aligned: Best performance for sizes > 4kBytes. // unaligned: Least complex for sizes > 256 bytes. if (usedMVCLE) {
BLOCK_COMMENT("mode MVCLE {");
__ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0xb0); // special: bypass cache // __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0xb8); // special: Hold data in cache. // __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0);
if (restoreArgs) { // MVCLE updates the source (Z_R4,Z_R5) and target (Z_R0,Z_R1) register pairs. // Dst_reg (Z_ARG2) and src_reg (Z_ARG1) are left untouched. No restore required. // Len_reg (Z_ARG3) is destroyed and must be restored.
__ z_slgr(laddr_reg, dst_reg); // copied #bytes if (log2_size > 0) {
__ z_srag(Z_ARG3, laddr_reg, log2_size); // Convert back to #elements.
} else {
__ z_lgr(Z_ARG3, laddr_reg);
}
} if (branchToEnd) {
__ z_bru(done);
} else {
__ z_br(Z_R14);
}
BLOCK_COMMENT("} mode MVCLE");
} // No fallthru possible here.
// MVCUnrolled: for short, aligned arrays.
if (usedMVCUnrolled) {
BLOCK_COMMENT("mode MVC unrolled {");
stride = 8;
// Generate unrolled MVC instructions. for (int ii = 32; ii > 1; ii--) {
__ z_mvc(0, ii * stride-1, dst_reg, 0, src_reg); // ii*8 byte copy if (branchToEnd) {
__ z_bru(done);
} else {
__ z_br(Z_R14);
}
}
// This is an absolute fast path: // - Array len in bytes must be not greater than 256. // - Array len in bytes must be an integer mult of DW // to save expensive handling of trailing bytes. // - Argument restore is not done, // i.e. previous code must not alter arguments (this code doesn't either).
__ bind(doMVCUnrolled);
// Avoid mul, prefer shift where possible. // Combine shift right (for #DW) with shift left (for block size). // Set CC for zero test below (asm_assert). // Note: #bytes comes in Z_R1, #DW in len_reg. unsignedint MVCblocksize = pcMVCblock_e - pcMVCblock_b; unsignedint logMVCblocksize = 0xffffffffU; // Pacify compiler ("used uninitialized" warning).
if (log2_size > 0) { // Len was scaled into Z_R1. switch (MVCblocksize) {
case 8: logMVCblocksize = 3;
__ z_ltgr(Z_R0, Z_R1); // #bytes is index break; // reasonable size, use shift
case 16: logMVCblocksize = 4;
__ z_slag(Z_R0, Z_R1, logMVCblocksize-log2_size); break; // reasonable size, use shift
default: logMVCblocksize = 0;
__ z_ltgr(Z_R0, len_reg); // #DW for mul break; // all other sizes: use mul
}
} else {
guarantee(log2_size, "doMVCUnrolled: only for DW entities");
}
// This test (and branch) is redundant. Previous code makes sure that // - element count > 0 // - element size == 8. // Thus, len reg should never be zero here. We insert an asm_assert() here, // just to double-check and to be on the safe side.
__ asm_assert(false, "zero len cannot occur", 99);
__ z_larl(Z_R1, MVC_ListEnd); // Get addr of last instr block. // Avoid mul, prefer shift where possible. if (logMVCblocksize == 0) {
__ z_mghi(Z_R0, MVCblocksize);
}
__ z_slgr(Z_R1, Z_R0);
__ z_br(Z_R1);
BLOCK_COMMENT("} mode MVC unrolled");
} // No fallthru possible here.
// MVC execute template // Must always generate. Usage may be switched on below. // There is no suitable place after here to put the template.
__ bind(MVC_template);
__ z_mvc(0,0,dst_reg,0,src_reg); // Instr template, never exec directly!
// MVC Loop: for medium-sized arrays
// Only for DW aligned arrays (src and dst). // #bytes to copy must be at least 256!!! // Non-aligned cases handled separately.
stride = 256;
stride_reg = Z_R1; // Holds #bytes when control arrives here.
ix_reg = Z_ARG3; // Alias for len_reg.
if (usedMVCLOOP) {
BLOCK_COMMENT("mode MVC loop {");
__ bind(doMVCLOOP);
__ z_lcgr(ix_reg, Z_R1); // Ix runs from -(n-2)*stride to 1*stride (inclusive).
__ z_llill(stride_reg, stride);
__ add2reg(ix_reg, 2*stride); // Thus: increment ix by 2*stride.
// Don 't use add2reg() here, since we must set the condition code!
__ z_aghi(ix_reg, -2*stride); // Compensate incr from above: zero diff means "all copied".
if (restoreArgs) {
__ z_lcgr(Z_R1, ix_reg); // Prepare ix_reg for copy loop, #bytes expected in Z_R1.
__ z_brnz(doMVCgeneral); // We're not done yet, ix_reg is not zero.
// ARG1, ARG2, and ARG3 were altered by the code above, so restore them building on save_reg.
__ z_slgr(dst_reg, save_reg); // copied #bytes
__ z_slgr(src_reg, dst_reg); // = ARG1 (now restored) if (log2_size) {
__ z_srag(Z_ARG3, dst_reg, log2_size); // Convert back to #elements to restore ARG3.
} else {
__ z_lgr(Z_ARG3, dst_reg);
}
__ z_lgr(Z_ARG2, save_reg); // ARG2 now restored.
// MVCgeneral: for short, unaligned arrays, after other copy operations
// Somewhat expensive due to use of EX instruction, but simple. if (usedMVCgeneral) {
BLOCK_COMMENT("mode MVC general {");
__ bind(doMVCgeneral);
__ add2reg(len_reg, -1, Z_R1); // Get #bytes-1 for EXECUTE. if (VM_Version::has_ExecuteExtensions()) {
__ z_exrl(len_reg, MVC_template); // Execute MVC with variable length.
} else {
__ z_larl(Z_R1, MVC_template); // Get addr of instr template.
__ z_ex(len_reg, 0, Z_R0, Z_R1); // Execute MVC with variable length.
} // penalty: 9 ticks
if (restoreArgs) { // ARG1, ARG2, and ARG3 were altered by code executed before, so restore them building on save_reg
__ z_slgr(dst_reg, save_reg); // Copied #bytes without the "doMVCgeneral" chunk
__ z_slgr(src_reg, dst_reg); // = ARG1 (now restored), was not advanced for "doMVCgeneral" chunk
__ add2reg_with_index(dst_reg, 1, len_reg, dst_reg); // Len of executed MVC was not accounted for, yet. if (log2_size) {
__ z_srag(Z_ARG3, dst_reg, log2_size); // Convert back to #elements to restore ARG3
} else {
__ z_lgr(Z_ARG3, dst_reg);
}
__ z_lgr(Z_ARG2, save_reg); // ARG2 now restored.
}
if (usedMVC) { if (branchToEnd) {
__ z_bru(done);
} else {
__ z_br(Z_R14);
}
} else { if (!branchToEnd) __ z_br(Z_R14);
}
BLOCK_COMMENT("} mode MVC general");
} // Fallthru possible if following block not generated.
// MVC: for short, unaligned arrays
// Somewhat expensive due to use of EX instruction, but simple. penalty: 9 ticks. // Differs from doMVCgeneral in reconstruction of ARG2, ARG3, and ARG4. if (usedMVC) {
BLOCK_COMMENT("mode MVC {");
__ bind(doMVC);
// get #bytes-1 for EXECUTE if (log2_size) {
__ add2reg(Z_R1, -1); // Length was scaled into Z_R1.
} else {
__ add2reg(Z_R1, -1, len_reg); // Length was not scaled.
}
if (VM_Version::has_ExecuteExtensions()) {
__ z_exrl(Z_R1, MVC_template); // Execute MVC with variable length.
} else {
__ z_lgr(Z_R0, Z_R5); // Save ARG4, may be unnecessary.
__ z_larl(Z_R5, MVC_template); // Get addr of instr template.
__ z_ex(Z_R1, 0, Z_R0, Z_R5); // Execute MVC with variable length.
__ z_lgr(Z_R5, Z_R0); // Restore ARG4, may be unnecessary.
}
if (!branchToEnd) {
__ z_br(Z_R14);
}
BLOCK_COMMENT("} mode MVC");
}
__ bind(done);
switch (element_size) { case 1: BLOCK_COMMENT("} ARRAYCOPY DISJOINT byte "); break; case 2: BLOCK_COMMENT("} ARRAYCOPY DISJOINT short"); break; case 4: BLOCK_COMMENT("} ARRAYCOPY DISJOINT int "); break; case 8: BLOCK_COMMENT("} ARRAYCOPY DISJOINT long "); break; default: BLOCK_COMMENT("} ARRAYCOPY DISJOINT "); break;
}
}
}
// Generate stub for conjoint array copy. If "aligned" is true, the // "from" and "to" addresses are assumed to be heapword aligned. // // Arguments for generated stub: // from: Z_ARG1 // to: Z_ARG2 // count: Z_ARG3 treated as signed void generate_conjoint_copy(bool aligned, int element_size, bool branchToEnd) {
// This is the zarch specific stub generator for general array copy tasks. // It has the following prereqs and features: // // - Destructive overlap exists and is handled by reverse copy. // - Destructive overlap exists if the leftmost byte of the target // does coincide with any of the source bytes (except the leftmost). // - Z_R0 and Z_R1 are KILLed by the stub routine (data and stride) // - Z_ARG1 and Z_ARG2 are USEd but preserved by the stub routine. // - Z_ARG3 is USED but preserved by the stub routine. // - Z_ARG4 is used as index register and is thus KILLed. //
{ Register stride_reg = Z_R1; // Stride & compare value in loop (negative element_size). Register data_reg = Z_R0; // Holds value of currently processed element. Register ix_reg = Z_ARG4; // Holds byte index of currently processed element. Register len_reg = Z_ARG3; // Holds length (in #elements) of arrays. Register dst_reg = Z_ARG2; // Holds left operand addr. Register src_reg = Z_ARG1; // Holds right operand addr.
assert(256%element_size == 0, "Element size must be power of 2.");
assert(element_size <= 8, "Can't handle more than DW units.");
switch (element_size) { case 1: BLOCK_COMMENT("ARRAYCOPY CONJOINT byte {"); break; case 2: BLOCK_COMMENT("ARRAYCOPY CONJOINT short {"); break; case 4: BLOCK_COMMENT("ARRAYCOPY CONJOINT int {"); break; case 8: BLOCK_COMMENT("ARRAYCOPY CONJOINT long {"); break; default: BLOCK_COMMENT("ARRAYCOPY CONJOINT {"); break;
}
assert_positive_int(len_reg);
if (VM_Version::has_Prefetch()) {
__ z_pfd(0x01, 0, Z_R0, src_reg); // Fetch access.
__ z_pfd(0x02, 0, Z_R0, dst_reg); // Store access.
}
// Optimize reverse copy loop. // Main loop copies DW units which may be unaligned. Unaligned access adds some penalty ticks. // Unaligned DW access (neither fetch nor store) is DW-atomic, but should be alignment-atomic. // Preceding the main loop, some bytes are copied to obtain a DW-multiple remaining length.
__ load_const_optimized(stride_reg, stride); // Prepare for DW copy loop.
if (element_size == 8) // Nothing to do here.
__ z_bru(countLoop1); else { // Do not generate dead code.
__ z_tmll(ix_reg, 7); // Check the "odd" bits.
__ z_bre(countLoop1); // There are none, very good!
}
if (log2_size == 0) { // Handle leftover Byte.
__ z_tmll(ix_reg, 1);
__ z_bre(skipBY);
__ z_lb(data_reg, -1, ix_reg, src_reg);
__ z_stcy(data_reg, -1, ix_reg, dst_reg);
__ add2reg(ix_reg, -1); // Decrement delayed to avoid AGI.
__ bind(skipBY); // fallthru
} if (log2_size <= 1) { // Handle leftover HW.
__ z_tmll(ix_reg, 2);
__ z_bre(skipHW);
__ z_lhy(data_reg, -2, ix_reg, src_reg);
__ z_sthy(data_reg, -2, ix_reg, dst_reg);
__ add2reg(ix_reg, -2); // Decrement delayed to avoid AGI.
__ bind(skipHW);
__ z_tmll(ix_reg, 4);
__ z_bre(countLoop1); // fallthru
} if (log2_size <= 2) { // There are just 4 bytes (left) that need to be copied.
__ z_ly(data_reg, -4, ix_reg, src_reg);
__ z_sty(data_reg, -4, ix_reg, dst_reg);
__ add2reg(ix_reg, -4); // Decrement delayed to avoid AGI.
__ z_bru(countLoop1);
}
// Control can never get to here. Never! Never ever!
__ z_illtrap(0x99);
__ bind(copyLoop1);
__ z_lg(data_reg, 0, ix_reg, src_reg);
__ z_stg(data_reg, 0, ix_reg, dst_reg);
__ bind(countLoop1);
__ z_brxhg(ix_reg, stride_reg, copyLoop1);
if (!branchToEnd)
__ z_br(Z_R14);
switch (element_size) { case 1: BLOCK_COMMENT("} ARRAYCOPY CONJOINT byte "); break; case 2: BLOCK_COMMENT("} ARRAYCOPY CONJOINT short"); break; case 4: BLOCK_COMMENT("} ARRAYCOPY CONJOINT int "); break; case 8: BLOCK_COMMENT("} ARRAYCOPY CONJOINT long "); break; default: BLOCK_COMMENT("} ARRAYCOPY CONJOINT "); break;
}
}
}
// Generate stub for disjoint byte copy. If "aligned" is true, the // "from" and "to" addresses are assumed to be heapword aligned.
address generate_disjoint_byte_copy(bool aligned, constchar * name) {
StubCodeMark mark(this, "StubRoutines", name);
// This is the zarch specific stub generator for byte array copy. // Refer to generate_disjoint_copy for a list of prereqs and features: unsignedint start_off = __ offset(); // Remember stub start address (is rtn value).
generate_disjoint_copy(aligned, 1, false, false); return __ addr_at(start_off);
}
address generate_disjoint_short_copy(bool aligned, constchar * name) {
StubCodeMark mark(this, "StubRoutines", name); // This is the zarch specific stub generator for short array copy. // Refer to generate_disjoint_copy for a list of prereqs and features: unsignedint start_off = __ offset(); // Remember stub start address (is rtn value).
generate_disjoint_copy(aligned, 2, false, false); return __ addr_at(start_off);
}
address generate_disjoint_int_copy(bool aligned, constchar * name) {
StubCodeMark mark(this, "StubRoutines", name); // This is the zarch specific stub generator for int array copy. // Refer to generate_disjoint_copy for a list of prereqs and features: unsignedint start_off = __ offset(); // Remember stub start address (is rtn value).
generate_disjoint_copy(aligned, 4, false, false); return __ addr_at(start_off);
}
address generate_disjoint_long_copy(bool aligned, constchar * name) {
StubCodeMark mark(this, "StubRoutines", name); // This is the zarch specific stub generator for long array copy. // Refer to generate_disjoint_copy for a list of prereqs and features: unsignedint start_off = __ offset(); // Remember stub start address (is rtn value).
generate_disjoint_copy(aligned, 8, false, false); return __ addr_at(start_off);
}
address generate_disjoint_oop_copy(bool aligned, constchar * name, bool dest_uninitialized) {
StubCodeMark mark(this, "StubRoutines", name); // This is the zarch specific stub generator for oop array copy. // Refer to generate_disjoint_copy for a list of prereqs and features. unsignedint start_off = __ offset(); // Remember stub start address (is rtn value). unsignedint size = UseCompressedOops ? 4 : 8;
address generate_conjoint_byte_copy(bool aligned, constchar * name) {
StubCodeMark mark(this, "StubRoutines", name); // This is the zarch specific stub generator for overlapping byte array copy. // Refer to generate_conjoint_copy for a list of prereqs and features: unsignedint start_off = __ offset(); // Remember stub start address (is rtn value).
address nooverlap_target = aligned ? StubRoutines::arrayof_jbyte_disjoint_arraycopy()
: StubRoutines::jbyte_disjoint_arraycopy();
array_overlap_test(nooverlap_target, 0); // Branch away to nooverlap_target if disjoint.
generate_conjoint_copy(aligned, 1, false);
return __ addr_at(start_off);
}
address generate_conjoint_short_copy(bool aligned, constchar * name) {
StubCodeMark mark(this, "StubRoutines", name); // This is the zarch specific stub generator for overlapping short array copy. // Refer to generate_conjoint_copy for a list of prereqs and features: unsignedint start_off = __ offset(); // Remember stub start address (is rtn value).
address nooverlap_target = aligned ? StubRoutines::arrayof_jshort_disjoint_arraycopy()
: StubRoutines::jshort_disjoint_arraycopy();
array_overlap_test(nooverlap_target, 1); // Branch away to nooverlap_target if disjoint.
generate_conjoint_copy(aligned, 2, false);
return __ addr_at(start_off);
}
address generate_conjoint_int_copy(bool aligned, constchar * name) {
StubCodeMark mark(this, "StubRoutines", name); // This is the zarch specific stub generator for overlapping int array copy. // Refer to generate_conjoint_copy for a list of prereqs and features:
array_overlap_test(nooverlap_target, 2); // Branch away to nooverlap_target if disjoint.
generate_conjoint_copy(aligned, 4, false);
return __ addr_at(start_off);
}
address generate_conjoint_long_copy(bool aligned, constchar * name) {
StubCodeMark mark(this, "StubRoutines", name); // This is the zarch specific stub generator for overlapping long array copy. // Refer to generate_conjoint_copy for a list of prereqs and features:
// Branch to disjoint_copy (if applicable) before pre_barrier to avoid double pre_barrier.
array_overlap_test(nooverlap_target, shift); // Branch away to nooverlap_target if disjoint.
DecoratorSet decorators = IN_HEAP | IS_ARRAY; if (dest_uninitialized) {
decorators |= IS_DEST_UNINITIALIZED;
} if (aligned) {
decorators |= ARRAYCOPY_ALIGNED;
}
// Call interface for AES_encryptBlock, AES_decryptBlock stubs. // // Z_ARG1 - source data block. Ptr to leftmost byte to be processed. // Z_ARG2 - destination data block. Ptr to leftmost byte to be stored. // For in-place encryption/decryption, ARG1 and ARG2 can point // to the same piece of storage. // Z_ARG3 - Crypto key address (expanded key). The first n bits of // the expanded key constitute the original AES-<n> key (see below). // // Z_RET - return value. First unprocessed byte offset in src buffer. // // Some remarks: // The crypto key, as passed from the caller to these encryption stubs, // is a so-called expanded key. It is derived from the original key // by the Rijndael key schedule, see http://en.wikipedia.org/wiki/Rijndael_key_schedule // With the expanded key, the cipher/decipher task is decomposed in // multiple, less complex steps, called rounds. Sun SPARC and Intel // processors obviously implement support for those less complex steps. // z/Architecture provides instructions for full cipher/decipher complexity. // Therefore, we need the original, not the expanded key here. // Luckily, the first n bits of an AES-<n> expanded key are formed // by the original key itself. That takes us out of trouble. :-) // The key length (in bytes) relation is as follows: // original expanded rounds key bit keylen // key bytes key bytes length in words // 16 176 11 128 44 // 24 208 13 192 52 // 32 240 15 256 60 // // The crypto instructions used in the AES* stubs have some specific register requirements. // Z_R0 holds the crypto function code. Please refer to the KM/KMC instruction // description in the "z/Architecture Principles of Operation" manual for details. // Z_R1 holds the parameter block address. The parameter block contains the cryptographic key // (KM instruction) and the chaining value (KMC instruction). // dst must designate an even-numbered register, holding the address of the output message. // src must designate an even/odd register pair, holding the address/length of the original message
// Helper function which generates code to // - load the function code in register fCode (== Z_R0). // - load the data block length (depends on cipher function) into register srclen if requested. // - is_decipher switches between cipher/decipher function codes // - set_len requests (if true) loading the data block length in register srclen void generate_load_AES_fCode(Register keylen, Register fCode, Register srclen, bool is_decipher) {
BLOCK_COMMENT("Set fCode {"); {
Label fCode_set; int mode = is_decipher ? VM_Version::CipherMode::decipher : VM_Version::CipherMode::cipher; bool identical_dataBlk_len = (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES192_dataBlk)
&& (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES256_dataBlk); // Expanded key length is 44/52/60 * 4 bytes for AES-128/AES-192/AES-256.
__ z_cghi(keylen, 52); // Check only once at the beginning. keylen and fCode may share the same register.
__ bind(fCode_set); if (identical_dataBlk_len) {
__ z_lghi(srclen, VM_Version::Cipher::_AES128_dataBlk);
}
}
BLOCK_COMMENT("} Set fCode");
}
// Push a parameter block for the cipher/decipher instruction on the stack. // Layout of the additional stack space allocated for AES_cipherBlockChaining: // // | | // +--------+ <-- SP before expansion // | | // : : alignment loss (part 2), 0..(AES_parmBlk_align-1) bytes // | | // +--------+ // | | // : : space for parameter block, size VM_Version::Cipher::_AES*_parmBlk_C // | | // +--------+ <-- parmBlk, octoword-aligned, start of parameter block // | | // : : additional stack space for spills etc., size AES_parmBlk_addspace, DW @ Z_SP not usable!!! // | | // +--------+ <-- Z_SP + alignment loss, octoword-aligned // | | // : : alignment loss (part 1), 0..(AES_parmBlk_align-1) bytes. DW @ Z_SP not usable!!! // | | // +--------+ <-- Z_SP after expansion
void generate_push_Block(int dataBlk_len, int parmBlk_len, int crypto_fCode, Register parmBlk, Register keylen, Register fCode, Register cv, Register key) {
AES_parmBlk_addspace = AES_parmBlk_align; // Must be multiple of AES_parmblk_align. // spill space for regs etc., don't use DW @SP! constint cv_len = dataBlk_len; constint key_len = parmBlk_len - cv_len; // This len must be known at JIT compile time. Only then are we able to recalc the SP before resize. // We buy this knowledge by wasting some (up to AES_parmBlk_align) bytes of stack space. constint resize_len = cv_len + key_len + AES_parmBlk_align + AES_parmBlk_addspace;
// Use parmBlk as temp reg here to hold the frame pointer.
__ resize_frame(-resize_len, parmBlk, true);
// NOTE: // Before returning, the stub has to copy the chaining value from // the parmBlk, where it was updated by the crypto instruction, back // to the chaining value array the address of which was passed in the cv argument. // As all the available registers are used and modified by KMC, we need to save // the key length across the KMC instruction. We do so by spilling it to the stack, // just preceding the parmBlk (at (parmBlk - 8)). void generate_push_parmBlk(Register keylen, Register fCode, Register parmBlk, Register key, Register cv, bool is_decipher) { int mode = is_decipher ? VM_Version::CipherMode::decipher : VM_Version::CipherMode::cipher;
Label parmBlk_128, parmBlk_192, parmBlk_256, parmBlk_set;
BLOCK_COMMENT("push parmBlk {"); // We have just three cipher strengths which translates into three // possible extended key lengths: 44, 52, and 60 bytes. // We therefore can compare the actual length against the "middle" length // and get: lt -> len=44, eq -> len=52, gt -> len=60.
__ z_cghi(keylen, 52); if (VM_Version::has_Crypto_AES128()) { __ z_brl(parmBlk_128); } // keyLen < 52: AES128 if (VM_Version::has_Crypto_AES192()) { __ z_bre(parmBlk_192); } // keyLen == 52: AES192 if (VM_Version::has_Crypto_AES256()) { __ z_brh(parmBlk_256); } // keyLen > 52: AES256
// Security net: requested AES function not available on this CPU. // NOTE: // As of now (March 2015), this safety net is not required. JCE policy files limit the // cryptographic strength of the keys used to 128 bit. If we have AES hardware support // at all, we have at least AES-128.
__ stop_static("AES key strength not supported by CPU. Use -XX:-UseAES as remedy.", 0);
// Pop a parameter block from the stack. The chaining value portion of the parameter block // is copied back to the cv array as it is needed for subsequent cipher steps. // The keylen value as well as the original SP (before resizing) was pushed to the stack // when pushing the parameter block. void generate_pop_parmBlk(Register keylen, Register parmBlk, Register key, Register cv) {
// Security net: there is no one here. If we would need it, we should have // fallen into it already when pushing the parameter block. if (VM_Version::has_Crypto_AES128()) {
__ bind(parmBlk_128);
cv_len = VM_Version::Cipher::_AES128_dataBlk;
__ z_mvc(0, cv_len-1, cv, 0, parmBlk); // Copy cv. if (VM_Version::has_Crypto_AES192() || VM_Version::has_Crypto_AES256()) {
__ z_bru(parmBlk_set);
}
}
constRegister keylen = Z_R0; // Temporarily (until fCode is set) holds the expanded key array length.
// Register definitions as required by KM instruction. constRegister fCode = Z_R0; // crypto function code constRegister parmBlk = Z_R1; // parameter block address (points to crypto key) constRegister src = Z_ARG1; // Must be even reg (KM requirement). constRegister srclen = Z_ARG2; // Must be odd reg and pair with src. Overwrites destination address. constRegister dst = Z_ARG3; // Must be even reg (KM requirement). Overwrites expanded key address.
// Read key len of expanded key (in 4-byte words).
__ z_lgf(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
// Copy arguments to registers as required by crypto instruction.
__ z_lgr(parmBlk, key); // crypto key (in T_INT array).
__ lgr_if_needed(src, from); // Copy src address. Will not emit, src/from are identical.
__ z_lgr(dst, to); // Copy dst address, even register required.
// Construct function code into fCode(Z_R0), data block length into srclen(Z_ARG2).
generate_load_AES_fCode(keylen, fCode, srclen, is_decipher);
// These stubs receive the addresses of the cryptographic key and of the chaining value as two separate // arguments (registers "key" and "cv", respectively). The KMC instruction, on the other hand, requires // chaining value and key to be, in this sequence, adjacent in storage. Thus, we need to allocate some // thread-local working storage. Using heap memory incurs all the hassles of allocating/freeing. // Stack space, on the contrary, is deallocated automatically when we return from the stub to the caller. // *** WARNING *** // Please note that we do not formally allocate stack space, nor do we // update the stack pointer. Therefore, no function calls are allowed // and nobody else must use the stack range where the parameter block // is located. // We align the parameter block to the next available octoword. // // Compute chained AES encrypt function. void generate_AES_cipherBlockChaining(bool is_decipher) {
Register from = Z_ARG1; // source byte array (clear text) Register to = Z_ARG2; // destination byte array (ciphered) Register key = Z_ARG3; // expanded key array. Register cv = Z_ARG4; // chaining value constRegister msglen = Z_ARG5; // Total length of the msg to be encrypted. Value must be returned // in Z_RET upon completion of this stub. Is 32-bit integer.
// Read key len of expanded key (in 4-byte words).
__ z_lgf(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
// Construct parm block address in parmBlk (== Z_R1), copy cv and key to parm block. // Construct function code in fCode (Z_R0).
generate_push_parmBlk(keylen, fCode, parmBlk, key, cv, is_decipher);
// Prepare other registers for instruction.
__ lgr_if_needed(src, from); // Copy src address. Will not emit, src/from are identical.
__ z_lgr(dst, to);
__ z_llgfr(srclen, msglen); // We pass the offsets as ints, not as longs as required.
__ kmc(dst, src); // Cipher the message.
generate_pop_parmBlk(keylen, parmBlk, key, cv);
__ z_llgfr(Z_RET, msglen); // We pass the offsets as ints, not as longs as required.
__ z_br(Z_R14);
}
// AES CounterMode // Push a parameter block for the cipher/decipher instruction on the stack. // Layout of the additional stack space allocated for counterMode_AES_cipherBlock // // | | // +--------+ <-- SP before expansion // | | JIT_TIMER timestamp buffer, only if JIT_TIMER is defined. // +--------+ // | | // : : alignment loss (part 2), 0..(AES_parmBlk_align-1) bytes. // | | // +--------+ <-- gap = parmBlk + parmBlk_len + ctrArea_len // | | // : : byte[] ctr - kmctr expects a counter vector the size of the input vector. // : : The interface only provides byte[16] iv, the init vector. // : : The size of this area is a tradeoff between stack space, init effort, and speed. // | | Each counter is a 128bit int. Vector element i is formed by incrementing element (i-1). // +--------+ <-- ctr = parmBlk + parmBlk_len // | | // : : space for parameter block, size VM_Version::Cipher::_AES*_parmBlk_G // | | // +--------+ <-- parmBlk = Z_SP + (alignment loss (part 1+2)) + AES_dataBlk_space + AES_parmBlk_addSpace, octoword-aligned, start of parameter block // | | // : : additional stack space for spills etc., min. size AES_parmBlk_addspace, all bytes usable. // | | // +--------+ <-- Z_SP + alignment loss (part 1+2) + AES_dataBlk_space, octoword-aligned // | | // : : space for one source data block and one dest data block. // | | // +--------+ <-- Z_SP + alignment loss (part 1+2), octoword-aligned // | | // : : additional alignment loss. Blocks above can't tolerate unusable DW @SP. // | | // +--------+ <-- Z_SP + alignment loss (part 1), octoword-aligned // | | // : : alignment loss (part 1), 0..(AES_parmBlk_align-1) bytes. DW @ Z_SP holds frame ptr. // | | // +--------+ <-- Z_SP after expansion // // additional space allocation (per DW): // spillSpace = parmBlk - AES_parmBlk_addspace // dataBlocks = spillSpace - AES_dataBlk_space // // parmBlk-8 various fields of various lengths // parmBlk-1: key_len (only one byte is stored at parmBlk-1) // parmBlk-2: fCode (only one byte is stored at parmBlk-2) // parmBlk-4: ctrVal_len (as retrieved from iv array), in bytes, as HW // parmBlk-8: msglen length (in bytes) of crypto msg, as passed in by caller // return value is calculated from this: rv = msglen - processed. // parmBlk-16 old_SP (SP before resize) // parmBlk-24 temp values // up to and including main loop in generate_counterMode_AES // - parmBlk-20: remmsg_len remaining msg len (aka unprocessed msg bytes) // after main loop in generate_counterMode_AES // - parmBlk-24: spill slot for various address values // // parmBlk-40 free spill slot, used for local spills. // parmBlk-64 ARG2(dst) ptr spill slot // parmBlk-56 ARG3(crypto key) ptr spill slot // parmBlk-48 ARG4(counter value) ptr spill slot // // // Layout of the parameter block (instruction KMCTR, function KMCTR-AES* // // +--------+ key_len: +16 (AES-128), +24 (AES-192), +32 (AES-256) // | | // | | cryptographic key // | | // +--------+ <-- parmBlk // // On exit: // Z_SP points to resized frame // Z_SP before resize available from -16(parmBlk) // parmBlk points to crypto instruction parameter block // parameter block is filled with crypto key. // msglen unchanged, saved for later at -24(parmBlk) // fCode contains function code for instruction // key unchanged // void generate_counterMode_prepare_Stack(Register parmBlk, Register ctr, Register counter, Register scratch) {
// save argument registers. // ARG1(from) is Z_RET as well. Not saved or restored. // ARG5(msglen) is restored by other means.
__ z_stmg(Z_ARG2, Z_ARG4, argsave_offset, parmBlk);
assert(AES_ctrVec_len > 0, "sanity. We need a counter vector");
__ add2reg(counter, AES_parmBlk_align, parmBlk); // counter array is located behind crypto key. Available range is disp12 only.
__ z_mvc(0, AES_ctrVal_len-1, counter, 0, ctr); // move first copy of iv for (int j = 1; j < AES_ctrVec_len; j+=j) { // j (and amount of moved data) doubles with every iteration int offset = j * AES_ctrVal_len; if (offset <= 256) {
__ z_mvc(offset, offset-1, counter, 0, counter); // move iv
} else { for (int k = 0; k < offset; k += 256) {
__ z_mvc(offset+k, 255, counter, 0, counter);
}
}
}
Label noCarry, done;
__ z_lg(scratch, Address(ctr, 8)); // get low-order DW of initial counter.
__ z_algfi(scratch, AES_ctrVec_len); // check if we will overflow during init.
__ z_brc(Assembler::bcondLogNoCarry, noCarry); // No, 64-bit increment is sufficient.
for (int j = 1; j < AES_ctrVec_len; j++) { // start with j = 1; no need to add 0 to the first counter value. int offset = j * AES_ctrVal_len;
generate_increment128(counter, offset, j, scratch); // increment iv by index value
}
__ z_bru(done);
__ bind(noCarry); for (int j = 1; j < AES_ctrVec_len; j++) { // start with j = 1; no need to add 0 to the first counter value. int offset = j * AES_ctrVal_len;
generate_increment64(counter, offset, j); // increment iv by index value
}
__ add2reg(counter, AES_parmBlk_align, parmBlk); // ptr to counter array needs to be restored for (int j = 0; j < AES_ctrVec_len; j++) { int offset = j * AES_ctrVal_len;
generate_increment128(counter, offset, AES_ctrVec_len, scratch); // increment iv by # vector elements if (v0_only) break;
}
// IBM s390 (IBM z/Architecture, to be more exact) uses Big-Endian number representation. // Therefore, the bits are ordered from most significant to least significant. The address // of a number in memory points to its lowest location where the most significant bit is stored. void generate_increment64(Register counter, int offset, int increment) {
__ z_algsi(offset + 8, counter, increment); // increment, no overflow check
}
void generate_increment128(Register counter, int offset, int increment, Register scratch) {
__ clear_reg(scratch); // prepare to add carry to high-order DW
__ z_algsi(offset + 8, counter, increment); // increment low order DW
__ z_alcg(scratch, Address(counter, offset)); // add carry to high-order DW
__ z_stg(scratch, Address(counter, offset)); // store back
}
void generate_counterMode_push_Block(int dataBlk_len, int parmBlk_len, int crypto_fCode, Register parmBlk, Register msglen, Register fCode, Register key) {
// space for data blocks (src and dst, one each) for partial block processing)
AES_dataBlk_space = roundup(2*dataBlk_len, AES_parmBlk_align);
AES_parmBlk_addspace = AES_parmBlk_align // spill space (temp data)
+ AES_parmBlk_align // for argument save/restore
; constint key_len = parmBlk_len; // The length of the unextended key (16, 24, 32)
assert((AES_ctrVal_len == 0) || (AES_ctrVal_len == dataBlk_len), "varying dataBlk_len is not supported.");
AES_ctrVal_len = dataBlk_len; // ctr init value len (in bytes)
AES_ctrArea_len = AES_ctrVec_len * AES_ctrVal_len; // space required on stack for ctr vector
// This len must be known at JIT compile time. Only then are we able to recalc the SP before resize. // We buy this knowledge by wasting some (up to AES_parmBlk_align) bytes of stack space. constint resize_len = JIT_TIMER_space // timestamp storage for JIT_TIMER
+ AES_parmBlk_align // room for alignment of parmBlk
+ AES_parmBlk_align // extra room for alignment
+ AES_dataBlk_space // one src and one dst data blk
+ AES_parmBlk_addspace // spill space for local data
+ roundup(parmBlk_len, AES_parmBlk_align) // aligned length of parmBlk
+ AES_ctrArea_len // stack space for ctr vector
; Register scratch = fCode; // We can use fCode as a scratch register. It's contents on entry // is irrelevant and it is set at the very end of this code block.
// After the frame is resized, the parmBlk is positioned such // that it is octoword-aligned. This potentially creates some // alignment waste in addspace and/or in the gap area. // After resize_frame, scratch contains the frame pointer.
__ resize_frame(-resize_len, scratch, true);
// There is room to spill stuff in the range [parmBlk-AES_parmBlk_addspace+8, parmBlk).
__ z_mviy(keylen_offset, parmBlk, key_len - 1); // Spill crypto key length for later use. Decrement by one for direct use with xc template.
__ z_mviy(fCode_offset, parmBlk, crypto_fCode); // Crypto function code, will be loaded into Z_R0 later.
__ z_sty(msglen, msglen_offset, parmBlk); // full plaintext/ciphertext len.
__ z_sra(msglen, exact_log2(dataBlk_len)); // # full cipher blocks that can be formed from input text.
__ z_sty(msglen, remmsg_len_offset, parmBlk);
__ add2reg(scratch, resize_len, Z_SP); // calculate (SP before resize) from resized SP.
__ z_stg(scratch, unextSP_offset, parmBlk); // Spill unextended SP for easy revert.
// Fill parmBlk with all required data
__ z_mvc(0, key_len-1, parmBlk, 0, key); // Copy key. Need to do it here - key_len is only known here.
BLOCK_COMMENT(err_msg("} push_Block (%d bytes) counterMode_AESCrypt%d", resize_len, parmBlk_len*8));
}
void generate_counterMode_pop_Block(Register parmBlk, Register msglen, Label& eraser) { // For added safety, clear the stack area where the crypto key was stored. Register scratch = msglen;
assert_different_registers(scratch, Z_R0); // can't use Z_R0 for exrl.
// wipe out key on stack
__ z_llgc(scratch, keylen_offset, parmBlk); // get saved (key_len-1) value (we saved just one byte!)
__ z_exrl(scratch, eraser); // template relies on parmBlk still pointing to key on stack
// restore argument registers. // ARG1(from) is Z_RET as well. Not restored - will hold return value anyway. // ARG5(msglen) is restored further down.
__ z_lmg(Z_ARG2, Z_ARG4, argsave_offset, parmBlk);
__ z_lgf(msglen, msglen_offset, parmBlk); // Restore msglen, only low order FW is valid
__ z_lg(Z_SP, unextSP_offset, parmBlk); // trim stack back to unextended size
}
void generate_counterMode_push_parmBlk(Register parmBlk, Register msglen, Register fCode, Register key, bool is_decipher) { int mode = is_decipher ? VM_Version::CipherMode::decipher : VM_Version::CipherMode::cipher;
Label parmBlk_128, parmBlk_192, parmBlk_256, parmBlk_set; Register keylen = fCode; // Expanded key length, as read from key array, Temp only. // use fCode as scratch; fCode receives its final value later.
// Read key len of expanded key (in 4-byte words).
__ z_lgf(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
__ z_cghi(keylen, 52); if (VM_Version::has_Crypto_AES_CTR256()) { __ z_brh(parmBlk_256); } // keyLen > 52: AES256. Assume: most frequent if (VM_Version::has_Crypto_AES_CTR128()) { __ z_brl(parmBlk_128); } // keyLen < 52: AES128. if (VM_Version::has_Crypto_AES_CTR192()) { __ z_bre(parmBlk_192); } // keyLen == 52: AES192. Assume: least frequent
// Safety net: requested AES_CTR function for requested keylen not available on this CPU.
__ stop_static("AES key strength not supported by CPU. Use -XX:-UseAESCTRIntrinsics as remedy.", 0);
BLOCK_COMMENT("} pop parmBlk counterMode_AESCrypt");
}
// Resize current stack frame to make room for some register data which needs // to be spilled temporarily. All registers in the range [from..to] are spilled // automatically. The actual length of the allocated aux block is returned. // The extra spill space (if requested) is located at // [Z_SP+stackSpace-spillSpace, Z_SP+stackSpace) // Kills Z__R0 (contains fp afterwards) and Z_R1 (contains old SP afterwards). // All space in the range [SP..SP+regSpace) is reserved. // As always (here): 0(SP) - stack linkage, 8(SP) - SP before resize for easy pop. int generate_push_aux_block(Register from, Register to, unsignedint spillSpace) { int n_regs = to->encoding() - from->encoding() + 1; int linkSpace = 2*wordSize; int regSpace = n_regs*wordSize; int stackSpace = roundup(linkSpace + regSpace + spillSpace, AES_parmBlk_align);
BLOCK_COMMENT(err_msg("push aux_block (%d bytes) counterMode_AESCrypt {", stackSpace));
__ z_lgr(Z_R1, Z_SP);
__ resize_frame(-stackSpace, Z_R0, true);
__ z_stg(Z_R1, 8, Z_SP);
__ z_stmg(from, to, linkSpace, Z_SP);
BLOCK_COMMENT(err_msg("} push aux_block (%d bytes) counterMode_AESCrypt", stackSpace)); return stackSpace;
} // Reverts everything done by generate_push_aux_block(). void generate_pop_aux_block(Register from, Register to) {
BLOCK_COMMENT("pop aux_block counterMode_AESCrypt {");
__ z_lmg(from, to, 16, Z_SP);
__ z_lg(Z_SP, 8, Z_SP);
BLOCK_COMMENT("} pop aux_block counterMode_AESCrypt");
}
Register from = Z_ARG1; // byte[], source byte array (clear text) Register to = Z_ARG2; // byte[], destination byte array (ciphered) Register key = Z_ARG3; // byte[], expanded key array. Register ctr = Z_ARG4; // byte[], counter byte array. constRegister msglen = Z_ARG5; // int, Total length of the msg to be encrypted. Value must be // returned in Z_RET upon completion of this stub. // This is a jint. Negative values are illegal, but technically possible. // Do not rely on high word. Contents is undefined.
constRegister fCode = Z_R0; // crypto function code constRegister parmBlk = Z_R1; // parameter block address (points to crypto key) constRegister src = Z_ARG1; // is Z_R2 constRegister srclen = Z_ARG2; // Overwrites destination address. constRegister dst = Z_ARG3; // Overwrites key address. constRegister counter = Z_ARG5; // Overwrites msglen. Must have counter array in an even register.
// Check if there is a leftover, partially used encrypted counter from last invocation. // If so, use those leftover counter bytes first before starting the "normal" encryption.
{ Register cvIndex = Z_R10; // # unused bytes of last encrypted counter value Register cvUnused = Z_R11; // # unused bytes of last encrypted counter value Register encCtr = Z_R12; // encrypted counter value, points to first ununsed byte.
Label no_preLoop, preLoop_end;
// Before pushing an aux block, check if it's necessary at all (saves some cycles).
__ z_lt(Z_R0, _z_abi(remaining_cargs) + 8 + 4, Z_R0, Z_SP); // arg7: # unused bytes in encCTR.
__ z_brnp(no_preLoop); // no unused bytes, nothing special to do.
int oldSP_Offset = generate_push_aux_block(Z_R10, Z_R12, 16); int arg6_Offset = oldSP_Offset + _z_abi(remaining_cargs); int arg7_Offset = oldSP_Offset + _z_abi(remaining_cargs) + 8;
__ z_ltgf(cvUnused, arg7_Offset+4, Z_R0, Z_SP); // arg7: # unused bytes in encCTR. (16-arg7) is index of first unused byte.
__ z_brnp(preLoop_end); // "not positive" means no unused bytes left
__ z_aghik(cvIndex, cvUnused, -16); // calculate index of first unused byte. AES_ctrVal_len undefined at this point.
__ z_brnl(preLoop_end); // NotLow(=NotNegative): unused bytes >= 16? How that?
__ z_lcgr(cvIndex, cvIndex);
__ z_lg(encCtr, arg6_Offset, Z_SP); // arg6: encrypted counter byte array.
__ z_agr(encCtr, cvIndex); // first unused byte of encrypted ctr. Used in ctrXOR.
__ z_cr(cvUnused, msglen); // check if msg is long enough
__ z_locr(cvUnused, msglen, Assembler::bcondHigh); // take the shorter length
__ z_aghi(cvUnused, -1); // decrement # unused bytes by 1 for exrl instruction
__ z_brl(preLoop_end); // negative result means nothing to do (msglen is zero)
__ add2reg(cvUnused, 1, cvUnused);
__ z_agr(from, cvUnused);
__ z_agr(to, cvUnused);
__ z_sr(msglen, cvUnused);
__ z_brnz(preLoop_end); // there is still work to do
// Remaining msglen is zero, i.e. all msg bytes were processed in preLoop. // Take an early exit.
generate_pop_aux_block(Z_R10, Z_R12);
__ z_bru(Exit);
//------------------------------------------- //---< execution templates for preLoop >--- //-------------------------------------------
__ bind(fromMover);
__ z_mvc(0, 0, to, 0, from); // Template instruction to move input data to dst.
__ bind(ctrXOR);
__ z_xc(0, 0, to, 0, encCtr); // Template instruction to XOR input data (now in to) with encrypted counter.
// Expand stack, load parm block address into parmBlk (== Z_R1), copy crypto key to parm block.
generate_counterMode_push_parmBlk(parmBlk, msglen, fCode, key, is_decipher); // Create count vector on stack to accommodate up to AES_ctrVec_len blocks.
generate_counterMode_prepare_Stack(parmBlk, ctr, counter, fCode);
// Prepare other registers for instruction.
__ lgr_if_needed(src, from); // Copy src address. Will not emit, src/from are identical.
__ z_lgr(dst, to);
__ z_llgc(fCode, fCode_offset, Z_R0, parmBlk);
__ bind(CryptoLoop);
__ z_lghi(srclen, AES_ctrArea_len); // preset len (#bytes) for next iteration: max possible.
__ z_asi(remmsg_len_offset, parmBlk, -AES_ctrVec_len); // decrement #remaining blocks (16 bytes each). Range: [+127..-128]
__ z_brl(CryptoLoop_setupAndDoLast); // Handling the last iteration out-of-line
__ bind(CryptoLoop_doit);
__ kmctr(dst, counter, src); // Cipher the message.
__ z_lt(srclen, remmsg_len_offset, Z_R0, parmBlk); // check if this was the last iteration
__ z_brz(CryptoLoop_ctrVal_inc); // == 0: ctrVector fully used. Need to increment the first // vector element to encrypt remaining unprocessed bytes. // __ z_brl(CryptoLoop_end); // < 0: this was detected before and handled at CryptoLoop_setupAndDoLast
generate_counterMode_increment_ctrVector(parmBlk, counter, srclen, false); // srclen unused here (serves as scratch)
__ z_bru(CryptoLoop);
__ bind(CryptoLoop_end);
// OK, when we arrive here, we have encrypted all of the "from" byte stream // except for the last few [0..dataBlk_len) bytes. To encrypt these few bytes // we need to form an extra src and dst data block of dataBlk_len each. This // is because we can only process full blocks but we must not read or write // beyond the boundaries of the argument arrays. Here is what we do: // - The src data block is filled with the remaining "from" bytes, padded with 0x00's. // - The single src data block is encrypted into the dst data block. // - The dst data block is copied into the "to" array, but only the leftmost few bytes // (as many as were left in the source byte stream). // - The counter value to be used is is pointed at by the counter register. // - Fortunately, the crypto instruction (kmctr) updates all related addresses such that we // know where to continue with "from" and "to" and which counter value to use next.
// Use speaking alias for temp register Register dataBlk = counter;
__ z_stg(counter, -24, parmBlk); // spill address of counter array
__ add2reg(dataBlk, -(AES_parmBlk_addspace + AES_dataBlk_space), parmBlk);
__ z_lgf(srclen, msglen_offset, parmBlk); // full plaintext/ciphertext len.
__ z_nilf(srclen, AES_ctrVal_len - 1); // those rightmost bits indicate the unprocessed #bytes
__ z_braz(allDone); // no unprocessed bytes? Then we are done.
__ add2reg(srclen, -1); // decrement for exrl
__ z_stg(srclen, localSpill_offset, parmBlk); // save for later reuse
__ z_xc(0, AES_ctrVal_len - 1, dataBlk, 0, dataBlk); // clear src block (zero padding)
__ z_exrl(srclen, srcMover); // copy src byte stream (remaining bytes)
__ load_const_optimized(srclen, AES_ctrVal_len); // kmctr processes only complete blocks
__ z_lgr(src, dataBlk); // tmp src address for kmctr
__ z_lg(counter, -24, parmBlk); // restore counter
__ z_stg(dst, -24, parmBlk); // save current dst
__ add2reg(dst, AES_ctrVal_len, src); // tmp dst is right after tmp src
__ kmctr(dst, counter, src); // Cipher the remaining bytes.
//---------------------------- //---< out-of-line code >--- //----------------------------
__ bind(CryptoLoop_setupAndDoLast);
__ z_lgf(srclen, remmsg_len_offset, parmBlk); // remaining #blocks in memory is < 0
__ z_aghi(srclen, AES_ctrVec_len); // recalculate the actually remaining #blocks
__ z_sllg(srclen, srclen, exact_log2(AES_ctrVal_len)); // convert to #bytes. Counter value is same length as data block
__ kmctr(dst, counter, src); // Cipher the last integral blocks of the message.
__ z_bru(CryptoLoop_end);
__ bind(CryptoLoop_ctrVal_inc);
generate_counterMode_increment_ctrVector(parmBlk, counter, srclen, true); // srclen unused here (serves as scratch)
__ z_bru(CryptoLoop_end);
//------------------------------- //---< execution templates >--- //-------------------------------
__ bind(dataEraser);
__ z_xc(0, 0, parmBlk, 0, parmBlk); // Template instruction to erase crypto key on stack.
__ bind(dstMover);
__ z_mvc(0, 0, dst, 0, dataBlk); // Template instruction to move encrypted reminder from stack to dst.
__ bind(srcMover);
__ z_mvc(0, 0, dataBlk, 0, src); // Template instruction to move reminder of source byte stream to stack.
}
// Create two intrinsic variants, optimized for short and long plaintexts. // void generate_counterMode_AES(bool is_decipher) {
constRegister msglen = Z_ARG5; // int, Total length of the msg to be encrypted. Value must be // returned in Z_RET upon completion of this stub. constint threshold = 256; // above this length (in bytes), text is considered long. constint vec_short = threshold>>6; // that many blocks (16 bytes each) per iteration, max 4 loop iterations constint vec_long = threshold>>2; // that many blocks (16 bytes each) per iteration.
// Copy back result and free parameter block.
__ z_mvc(Address(state), Address(Z_R1), 16);
__ z_xc(Address(Z_R1), param_block_size, Address(Z_R1));
__ z_aghi(Z_SP, frame_resize);
__ z_br(Z_R14);
return __ addr_at(start_off);
}
// Call interface for all SHA* stubs. // // Z_ARG1 - source data block. Ptr to leftmost byte to be processed. // Z_ARG2 - current SHA state. Ptr to state area. This area serves as // parameter block as required by the crypto instruction. // Z_ARG3 - current byte offset in source data block. // Z_ARG4 - last byte offset in source data block. // (Z_ARG4 - Z_ARG3) gives the #bytes remaining to be processed. // // Z_RET - return value. First unprocessed byte offset in src buffer. // // A few notes on the call interface: // - All stubs, whether they are single-block or multi-block, are assumed to // digest an integer multiple of the data block length of data. All data // blocks are digested using the intermediate message digest (KIMD) instruction. // Special end processing, as done by the KLMD instruction, seems to be // emulated by the calling code. // // - Z_ARG1 addresses the first byte of source data. The offset (Z_ARG3) is // already accounted for. // // - The current SHA state (the intermediate message digest value) is contained // in an area addressed by Z_ARG2. The area size depends on the SHA variant // and is accessible via the enum VM_Version::MsgDigest::_SHA<n>_parmBlk_I // // - The single-block stub is expected to digest exactly one data block, starting // at the address passed in Z_ARG1. // // - The multi-block stub is expected to digest all data blocks which start in // the offset interval [srcOff(Z_ARG3), srcLimit(Z_ARG4)). The exact difference // (srcLimit-srcOff), rounded up to the next multiple of the data block length, // gives the number of blocks to digest. It must be assumed that the calling code // provides for a large enough source data buffer. // // Compute SHA-1 function.
address generate_SHA1_stub(bool multiBlock, constchar* name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name); unsignedint start_off = __ offset(); // Remember stub start address (is rtn value).
constRegister srcBuff = Z_ARG1; // Points to first block to process (offset already added). constRegister SHAState = Z_ARG2; // Only on entry. Reused soon thereafter for kimd register pairs. constRegister srcOff = Z_ARG3; // int constRegister srcLimit = Z_ARG4; // Only passed in multiBlock case. int
constRegister SHAState_local = Z_R1; constRegister SHAState_save = Z_ARG3; constRegister srcBufLen = Z_ARG2; // Destroys state address, must be copied before.
Label useKLMD, rtn;
if (multiBlock) { // Process everything from offset to limit.
// The following description is valid if we get a raw (unpimped) source data buffer, // spanning the range between [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above, // the calling convention for these stubs is different. We leave the description in // to inform the reader what must be happening hidden in the calling code. // // The data block to be processed can have arbitrary length, i.e. its length does not // need to be an integer multiple of SHA<n>_datablk. Therefore, we need to implement // two different paths. If the length is an integer multiple, we use KIMD, saving us // to copy the SHA state back and forth. If the length is odd, we copy the SHA state // to the stack, execute a KLMD instruction on it and copy the result back to the // caller's SHA state location.
// Total #srcBuff blocks to process. if (VM_Version::has_DistinctOpnds()) {
__ z_srk(srcBufLen, srcLimit, srcOff); // exact difference
__ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1); // round up
__ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA1_dataBlk-1)) & 0xffff);
__ z_ark(srcLimit, srcOff, srcBufLen); // Srclimit temporarily holds return value.
__ z_llgfr(srcBufLen, srcBufLen); // Cast to 64-bit.
} else {
__ z_lgfr(srcBufLen, srcLimit); // Exact difference. srcLimit passed as int.
__ z_sgfr(srcBufLen, srcOff); // SrcOff passed as int, now properly casted to long.
__ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1); // round up
__ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA1_dataBlk-1)) & 0xffff);
__ z_lgr(srcLimit, srcOff); // SrcLimit temporarily holds return value.
__ z_agr(srcLimit, srcBufLen);
}
// Integral #blocks to digest? // As a result of the calculations above, srcBufLen MUST be an integer // multiple of _SHA1_dataBlk, or else we are in big trouble. // We insert an asm_assert into the KLMD case to guard against that.
__ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1);
__ z_brc(Assembler::bcondNotAllZero, useKLMD);
// Process all full blocks.
__ kimd(srcBuff);
__ z_lgr(Z_RET, srcLimit); // Offset of first unprocessed byte in buffer.
} else { // Process one data block only.
__ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA1_dataBlk); // #srcBuff bytes to process
__ kimd(srcBuff);
__ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA1_dataBlk, srcOff); // Offset of first unprocessed byte in buffer. No 32 to 64 bit extension needed.
}
__ bind(rtn);
__ z_br(Z_R14);
if (multiBlock) {
__ bind(useKLMD);
#if 1 // Security net: this stub is believed to be called for full-sized data blocks only // NOTE: The following code is believed to be correct, but is is not tested.
__ stop_static("SHA128 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0); #endif
}
if (multiBlock) { // Process everything from offset to limit. // The following description is valid if we get a raw (unpimped) source data buffer, // spanning the range between [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above, // the calling convention for these stubs is different. We leave the description in // to inform the reader what must be happening hidden in the calling code. // // The data block to be processed can have arbitrary length, i.e. its length does not // need to be an integer multiple of SHA<n>_datablk. Therefore, we need to implement // two different paths. If the length is an integer multiple, we use KIMD, saving us // to copy the SHA state back and forth. If the length is odd, we copy the SHA state // to the stack, execute a KLMD instruction on it and copy the result back to the // caller's SHA state location.
// Integral #blocks to digest? // As a result of the calculations above, srcBufLen MUST be an integer // multiple of _SHA1_dataBlk, or else we are in big trouble. // We insert an asm_assert into the KLMD case to guard against that.
__ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1);
__ z_brc(Assembler::bcondNotAllZero, useKLMD);
// Process all full blocks.
__ kimd(srcBuff);
__ z_lgr(Z_RET, srcLimit); // Offset of first unprocessed byte in buffer.
} else { // Process one data block only.
__ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA256_dataBlk); // #srcBuff bytes to process
__ kimd(srcBuff);
__ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA256_dataBlk, srcOff); // Offset of first unprocessed byte in buffer.
}
__ bind(rtn);
__ z_br(Z_R14);
if (multiBlock) {
__ bind(useKLMD); #if 1 // Security net: this stub is believed to be called for full-sized data blocks only. // NOTE: // The following code is believed to be correct, but is is not tested.
__ stop_static("SHA256 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0); #endif
}
if (multiBlock) { // Process everything from offset to limit. // The following description is valid if we get a raw (unpimped) source data buffer, // spanning the range between [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above, // the calling convention for these stubs is different. We leave the description in // to inform the reader what must be happening hidden in the calling code. // // The data block to be processed can have arbitrary length, i.e. its length does not // need to be an integer multiple of SHA<n>_datablk. Therefore, we need to implement // two different paths. If the length is an integer multiple, we use KIMD, saving us // to copy the SHA state back and forth. If the length is odd, we copy the SHA state // to the stack, execute a KLMD instruction on it and copy the result back to the // caller's SHA state location.
// integral #blocks to digest? // As a result of the calculations above, srcBufLen MUST be an integer // multiple of _SHA1_dataBlk, or else we are in big trouble. // We insert an asm_assert into the KLMD case to guard against that.
__ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1);
__ z_brc(Assembler::bcondNotAllZero, useKLMD);
// Process all full blocks.
__ kimd(srcBuff);
__ z_lgr(Z_RET, srcLimit); // Offset of first unprocessed byte in buffer.
} else { // Process one data block only.
__ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA512_dataBlk); // #srcBuff bytes to process
__ kimd(srcBuff);
__ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA512_dataBlk, srcOff); // Offset of first unprocessed byte in buffer.
}
__ bind(rtn);
__ z_br(Z_R14);
if (multiBlock) {
__ bind(useKLMD); #if 1 // Security net: this stub is believed to be called for full-sized data blocks only // NOTE: // The following code is believed to be correct, but is is not tested.
__ stop_static("SHA512 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0); #endif
}
return __ addr_at(start_off);
}
/** * Arguments: * * Inputs: * Z_ARG1 - int crc * Z_ARG2 - byte* buf * Z_ARG3 - int length (of buffer) * * Result: * Z_RET - int crc result
**/ // Compute CRC function (generic, for all polynomials). void generate_CRC_updateBytes(constchar* name, Register table, bool invertCRC) {
// arguments to kernel_crc32: Register crc = Z_ARG1; // Current checksum, preset by caller or result from previous call, int. Register data = Z_ARG2; // source byte array Register dataLen = Z_ARG3; // #bytes to process, int // Register table = Z_ARG4; // crc table address. Preloaded and passed in by caller. constRegister t0 = Z_R10; // work reg for kernel* emitters constRegister t1 = Z_R11; // work reg for kernel* emitters constRegister t2 = Z_R12; // work reg for kernel* emitters constRegister t3 = Z_R13; // work reg for kernel* emitters
// We pass these values as ints, not as longs as required by C calling convention. // Crc used as int.
__ z_llgfr(dataLen, dataLen);
__ resize_frame(-(6*8), Z_R0, true); // Resize frame to provide add'l space to spill 5 registers.
__ z_stmg(Z_R10, Z_R13, 1*8, Z_SP); // Spill regs 10..11 to make them available as work registers.
__ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, invertCRC);
__ z_lmg(Z_R10, Z_R13, 1*8, Z_SP); // Spill regs 10..11 back from stack.
__ resize_frame(+(6*8), Z_R0, true); // Resize frame to provide add'l space to spill 5 registers.
__ z_llgfr(Z_RET, crc); // Updated crc is function result. No copying required, just zero upper 32 bits.
__ z_br(Z_R14); // Result already in Z_RET == Z_ARG1.
}
void generate_initial() { // Generates all stubs and initializes the entry points.
// Entry points that exist in all platforms. // Note: This is code that could be shared among different // platforms - however the benefit seems to be smaller than the // disadvantage of having a much more complicated generator // structure. See also comment in stubRoutines.hpp.
StubRoutines::_forward_exception_entry = generate_forward_exception();
// Build this early so it's available for the interpreter.
StubRoutines::_throw_StackOverflowError_entry =
generate_throw_exception("StackOverflowError throw_exception",
CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
StubRoutines::_throw_delayed_StackOverflowError_entry =
generate_throw_exception("delayed StackOverflowError throw_exception",
CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError), false);
//---------------------------------------------------------------------- // Entry points that are platform specific.
if (UseCRC32Intrinsics) {
StubRoutines::_crc_table_adr = (address)StubRoutines::zarch::_crc_table;
StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes("CRC32_updateBytes");
}
if (UseCRC32CIntrinsics) {
StubRoutines::_crc32c_table_addr = (address)StubRoutines::zarch::_crc32c_table;
StubRoutines::_updateBytesCRC32C = generate_CRC32C_updateBytes("CRC32C_updateBytes");
}
// Comapct string intrinsics: Translate table for string inflate intrinsic. Used by trot instruction.
StubRoutines::zarch::_trot_table_addr = (address)StubRoutines::zarch::_trot_table;
}
void generate_phase1() { if (!Continuations::enabled()) return;
// These entry points require SharedInfo::stack0 to be set up in non-core builds.
StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError), false);
StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError), false);
StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
// Support for verify_oop (must happen after universe_init).
StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop_subroutine();
// Arraycopy stubs used by compilers.
generate_arraycopy_stubs();
// Generate AES intrinsics code. if (UseAESIntrinsics) { if (VM_Version::has_Crypto_AES()) {
StubRoutines::_aescrypt_encryptBlock = generate_AES_encryptBlock("AES_encryptBlock");
StubRoutines::_aescrypt_decryptBlock = generate_AES_decryptBlock("AES_decryptBlock");
StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_AES_encrypt("AES_encryptBlock_chaining");
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_AES_decrypt("AES_decryptBlock_chaining");
} else { // In PRODUCT builds, the function pointers will keep their initial (NULL) value. // LibraryCallKit::try_to_inline() will return false then, preventing the intrinsic to be called.
assert(VM_Version::has_Crypto_AES(), "Inconsistent settings. Check vm_version_s390.cpp");
}
}
if (UseAESCTRIntrinsics) { if (VM_Version::has_Crypto_AES_CTR()) {
StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt("counterMode_AESCrypt");
} else { // In PRODUCT builds, the function pointers will keep their initial (NULL) value. // LibraryCallKit::try_to_inline() will return false then, preventing the intrinsic to be called.
assert(VM_Version::has_Crypto_AES_CTR(), "Inconsistent settings. Check vm_version_s390.cpp");
}
}
// nmethod entry barriers for concurrent class unloading
BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); if (bs_nm != NULL) {
StubRoutines::zarch::_nmethod_entry_barrier = generate_nmethod_entry_barrier();
}
#ifdef COMPILER2 if (UseMultiplyToLenIntrinsic) {
StubRoutines::_multiplyToLen = generate_multiplyToLen();
} if (UseMontgomeryMultiplyIntrinsic) {
StubRoutines::_montgomeryMultiply
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
} if (UseMontgomerySquareIntrinsic) {
StubRoutines::_montgomerySquare
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
} #endif
}
public:
StubGenerator(CodeBuffer* code, int phase) : StubCodeGenerator(code) {
_stub_count = (phase == 0) ? 0x100 : 0x200; if (phase == 0) {
generate_initial();
} elseif (phase == 1) {
generate_phase1(); // stubs that must be available for the interpreter
} else {
generate_all();
}
}
private: int _stub_count; void stub_prolog(StubCodeDesc* cdesc) { #ifdef ASSERT // Put extra information in the stub code, to make it more readable. // Write the high part of the address. // [RGV] Check if there is a dependency on the size of this prolog.
__ emit_data((intptr_t)cdesc >> 32);
__ emit_data((intptr_t)cdesc);
__ emit_data(++_stub_count); #endif
align(true);
}
void align(bool at_header = false) { // z/Architecture cache line size is 256 bytes. // There is no obvious benefit in aligning stub // code to cache lines. Use CodeEntryAlignment instead. constunsignedint icache_line_size = CodeEntryAlignment; constunsignedint icache_half_line_size = MIN2<unsignedint>(32, CodeEntryAlignment);
if (at_header) { while ((intptr_t)(__ pc()) % icache_line_size != 0) {
__ z_illtrap();
}
} else { while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
__ z_nop();
}
}
}
};
void StubGenerator_generate(CodeBuffer* code, int phase) {
StubGenerator g(code, phase);
}
Messung V0.5 in Prozent
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.0.107Bemerkung:
(Wie Sie bei der Firma Beratungs- und Dienstleistungen beauftragen können 2026-04-26)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.