/* * Copyright (c) 1999, 2022, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2015, 2022 SAP SE. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. *
*/
// put OS-includes here # include <sys/types.h> # include <sys/mman.h> # include <sys/stat.h> # include <sys/select.h> # include <pthread.h> # include <signal.h> # include <endian.h> # include <errno.h> # include <dlfcn.h> # include <stdio.h> # include <unistd.h> # include <sys/resource.h> # include <pthread.h> # include <sys/stat.h> # include <sys/time.h> # include <sys/times.h> # include <sys/utsname.h> # include <sys/socket.h> # include <pwd.h> # include <poll.h> # include <fcntl.h> # include <string.h> # include <syscall.h> # include <sys/sysinfo.h> # include <sys/ipc.h> # include <sys/shm.h> # include <link.h> # include <stdint.h> # include <inttypes.h> # include <sys/ioctl.h> # include <linux/elf-em.h> #ifdef __GLIBC__ # include <malloc.h> #endif
// if RUSAGE_THREAD for getrusage() has not been defined, do it here. The code calling // getrusage() is prepared to handle the associated failure. #ifndef RUSAGE_THREAD #define RUSAGE_THREAD (1) /* only the calling thread */ #endif
#define MAX_PATH (2 * K)
#define MAX_SECS 100000000
// for timer info max values which include all bits #define ALL_64_BITS CONST64(0xFFFFFFFFFFFFFFFF)
#ifdef MUSL_LIBC // dlvsym is not a part of POSIX // and musl libc doesn't implement it. staticvoid *dlvsym(void *handle, constchar *symbol, constchar *version) { // load the latest version of symbol return dlsym(handle, symbol);
} #endif
#ifdef __GLIBC__ // We want to be buildable and runnable on older and newer glibcs, so resolve both // mallinfo and mallinfo2 dynamically. struct old_mallinfo { int arena; int ordblks; int smblks; int hblks; int hblkhd; int usmblks; int fsmblks; int uordblks; int fordblks; int keepcost;
}; typedefstruct old_mallinfo (*mallinfo_func_t)(void); static mallinfo_func_t g_mallinfo = NULL;
// If the VM might have been created on the primordial thread, we need to resolve the // primordial thread stack bounds and check if the current thread might be the // primordial thread in places. If we know that the primordial thread is never used, // such as when the VM was created by one of the standard java launchers, we can // avoid this staticbool suppress_primordial_thread_resolution = false;
staticvoid next_line(FILE *f) { int c; do {
c = fgetc(f);
} while (c != '\n' && c != EOF);
}
bool os::Linux::get_tick_information(CPUPerfTicks* pticks, int which_logical_cpu) {
FILE* fh;
uint64_t userTicks, niceTicks, systemTicks, idleTicks; // since at least kernel 2.6 : iowait: time waiting for I/O to complete // irq: time servicing interrupts; softirq: time servicing softirqs
uint64_t iowTicks = 0, irqTicks = 0, sirqTicks= 0; // steal (since kernel 2.6.11): time spent in other OS when running in a virtualized environment
uint64_t stealTicks = 0; // guest (since kernel 2.6.24): time spent running a virtual CPU for guest OS under the // control of the Linux kernel
uint64_t guestNiceTicks = 0; int logical_cpu = -1; constint required_tickinfo_count = (which_logical_cpu == -1) ? 4 : 5; int n;
memset(pticks, 0, sizeof(CPUPerfTicks));
if ((fh = os::fopen("/proc/stat", "r")) == NULL) { returnfalse;
}
// pid_t gettid() // // Returns the kernel thread id of the currently running thread. Kernel // thread id is used to access /proc.
pid_t os::Linux::gettid() { int rslt = syscall(SYS_gettid);
assert(rslt != -1, "must be."); // old linuxthreads implementation? return (pid_t)rslt;
}
// Returns the amount of swap currently configured, in bytes. // This can change at any time.
julong os::Linux::host_swap() { struct sysinfo si;
sysinfo(&si); return (julong)si.totalswap;
}
// Most versions of linux have a bug where the number of processors are // determined by looking at the /proc file system. In a chroot environment, // the system call returns 1. staticbool unsafe_chroot_detected = false; staticconstchar *unstable_chroot_error = "/proc file system not found.\n" "Java may be unstable running multithreaded in a chroot " "environment on Linux when /proc filesystem is not mounted.";
void os::init_system_properties_values() { // The next steps are taken in the product version: // // Obtain the JAVA_HOME value from the location of libjvm.so. // This library should be located at: // <JAVA_HOME>/lib/{client|server}/libjvm.so. // // If "/jre/lib/" appears at the right place in the path, then we // assume libjvm.so is installed in a JDK and we use this path. // // Otherwise exit with message: "Could not create the Java virtual machine." // // The following extra steps are taken in the debugging version: // // If "/jre/lib/" does NOT appear at the right place in the path // instead of exit check for $JAVA_HOME environment variable. // // If it is defined and we are able to locate $JAVA_HOME/jre/lib/<arch>, // then we append a fake suffix "hotspot/libjvm.so" to this path so // it looks like libjvm.so is installed there // <JAVA_HOME>/jre/lib/<arch>/hotspot/libjvm.so. // // Otherwise exit. // // Important note: if the location of libjvm.so changes this // code needs to be changed accordingly.
// See ld(1): // The linker uses the following search paths to locate required // shared libraries: // 1: ... // ... // 7: The default directories, normally /lib and /usr/lib. #ifndef OVERRIDE_LIBPATH #ifdefined(_LP64) #define DEFAULT_LIBPATH "/usr/lib64:/lib64:/lib:/usr/lib" #else #define DEFAULT_LIBPATH "/lib:/usr/lib" #endif #else #define DEFAULT_LIBPATH OVERRIDE_LIBPATH #endif
// Base path of extensions installed on the system. #define SYS_EXT_DIR "/usr/java/packages" #define EXTENSIONS_DIR "/lib/ext"
// Buffer that fits several sprintfs. // Note that the space for the colon and the trailing null are provided // by the nulls included by the sizeof operator. const size_t bufsize =
MAX2((size_t)MAXPATHLEN, // For dll_dir & friends.
(size_t)MAXPATHLEN + sizeof(EXTENSIONS_DIR) + sizeof(SYS_EXT_DIR) + sizeof(EXTENSIONS_DIR)); // extensions dir char *buf = NEW_C_HEAP_ARRAY(char, bufsize, mtInternal);
// Found the full path to libjvm.so. // Now cut the path to <java_home>/jre if we can.
pslash = strrchr(buf, '/'); if (pslash != NULL) {
*pslash = '\0'; // Get rid of /libjvm.so.
}
pslash = strrchr(buf, '/'); if (pslash != NULL) {
*pslash = '\0'; // Get rid of /{client|server|hotspot}.
}
Arguments::set_dll_dir(buf);
if (pslash != NULL) {
pslash = strrchr(buf, '/'); if (pslash != NULL) {
*pslash = '\0'; // Get rid of /lib.
}
}
Arguments::set_java_home(buf); if (!set_boot_path('/', ':')) {
vm_exit_during_initialization("Failed setting boot class path.", NULL);
}
}
// Where to look for native libraries. // // Note: Due to a legacy implementation, most of the library path // is set in the launcher. This was to accommodate linking restrictions // on legacy Linux implementations (which are no longer supported). // Eventually, all the library path setting will be done here. // // However, to prevent the proliferation of improperly built native // libraries, the new path component /usr/java/packages is added here. // Eventually, all the library path setting will be done here.
{ // Get the user setting of LD_LIBRARY_PATH, and prepended it. It // should always exist (until the legacy problem cited above is // addressed). constchar *v = ::getenv("LD_LIBRARY_PATH"); constchar *v_colon = ":"; if (v == NULL) { v = ""; v_colon = ""; } // That's +1 for the colon and +1 for the trailing '\0'. char *ld_library_path = NEW_C_HEAP_ARRAY(char,
strlen(v) + 1 + sizeof(SYS_EXT_DIR) + sizeof("/lib/") + sizeof(DEFAULT_LIBPATH) + 1,
mtInternal);
sprintf(ld_library_path, "%s%s" SYS_EXT_DIR "/lib:" DEFAULT_LIBPATH, v, v_colon);
Arguments::set_library_path(ld_library_path);
FREE_C_HEAP_ARRAY(char, ld_library_path);
}
void os::Linux::libpthread_init() { // Save glibc and pthread version strings. #if !defined(_CS_GNU_LIBC_VERSION) || \
!defined(_CS_GNU_LIBPTHREAD_VERSION) #error"glibc too old (< 2.3.2)" #endif
// os::Linux::manually_expand_stack() takes care of expanding the thread // stack. Note that this is normally not needed: pthread stacks allocate // thread stack using mmap() without MAP_NORESERVE, so the stack is already // committed. Therefore it is not necessary to expand the stack manually. // // Manually expanding the stack was historically needed on LinuxThreads // thread stacks, which were allocated with mmap(MAP_GROWSDOWN). Nowadays // it is kept to deal with very rare corner cases: // // For one, user may run the VM on an own implementation of threads // whose stacks are - like the old LinuxThreads - implemented using // mmap(MAP_GROWSDOWN). // // Also, this coding may be needed if the VM is running on the primordial // thread. Normally we avoid running on the primordial thread; however, // user may still invoke the VM on the primordial thread. // // The following historical comment describes the details about running // on a thread stack allocated with mmap(MAP_GROWSDOWN):
// Force Linux kernel to expand current thread stack. If "bottom" is close // to the stack guard, caller should block all signals. // // MAP_GROWSDOWN: // A special mmap() flag that is used to implement thread stacks. It tells // kernel that the memory region should extend downwards when needed. This // allows early versions of LinuxThreads to only mmap the first few pages // when creating a new thread. Linux kernel will automatically expand thread // stack as needed (on page faults). // // However, because the memory region of a MAP_GROWSDOWN stack can grow on // demand, if a page fault happens outside an already mapped MAP_GROWSDOWN // region, it's hard to tell if the fault is due to a legitimate stack // access or because of reading/writing non-exist memory (e.g. buffer // overrun). As a rule, if the fault happens below current stack pointer, // Linux kernel does not expand stack, instead a SIGSEGV is sent to the // application (see Linux kernel fault.c). // // This Linux feature can cause SIGSEGV when VM bangs thread stack for // stack overflow detection. // // Newer version of LinuxThreads (since glibc-2.2, or, RH-7.x) and NPTL do // not use MAP_GROWSDOWN. // // To get around the problem and allow stack banging on Linux, we need to // manually expand thread stack after receiving the SIGSEGV. // // There are two ways to expand thread stack to address "bottom", we used // both of them in JVM before 1.5: // 1. adjust stack pointer first so that it is below "bottom", and then // touch "bottom" // 2. mmap() the page in question // // Now alternate signal stack is gone, it's harder to use 2. For instance, // if current sp is already near the lower end of page 101, and we need to // call mmap() to map page 100, it is possible that part of the mmap() frame // will be placed in page 100. When page 100 is mapped, it is zero-filled. // That will destroy the mmap() frame and cause VM to crash. // // The following code works by adjusting sp first, then accessing the "bottom" // page to force a page fault. Linux kernel will then automatically expand the // stack mapping. // // _expand_stack_to() assumes its frame size is less than page size, which // should always be true if the function is not inlined.
// Adjust bottom to point to the largest address within the same page, it // gives us a one-page buffer if alloca() allocates slightly more memory.
bottom = (address)align_down((uintptr_t)bottom, os::vm_page_size());
bottom += os::vm_page_size() - 1;
// sp might be slightly above current stack pointer; if that's the case, we // will alloca() a little more space than necessary, which is OK. Don't use // os::current_stack_pointer(), as its result can be slightly below current // stack pointer, causing us to not alloca enough to reach "bottom".
sp = (address)&sp;
if (sp > bottom) {
size = sp - bottom;
p = (volatilechar *)alloca(size);
assert(p != NULL && p <= (volatilechar *)bottom, "alloca problem?");
p[0] = '\0';
}
}
////////////////////////////////////////////////////////////////////////////// // create new thread
// Thread start routine for all newly created threads staticvoid *thread_native_entry(Thread *thread) {
thread->record_stack_base_and_size();
#ifndef __GLIBC__ // Try to randomize the cache line index of hot stack frames. // This helps when threads of the same stack traces evict each other's // cache lines. The threads can be either from the same JVM instance, or // from different JVM instances. The benefit is especially true for // processors with hyperthreading technology. // This code is not needed anymore in glibc because it has MULTI_PAGE_ALIASING // and we did not see any degradation in performance without `alloca()`. staticint counter = 0; int pid = os::current_process_id(); int random = ((pid ^ counter++) & 7) * 128; void *stackmem = alloca(random != 0 ? random : 1); // ensure we allocate > 0 // Ensure the alloca result is used in a way that prevents the compiler from eliding it.
*(char *)stackmem = 1; #endif
if (UseNUMA) { int lgrp_id = os::numa_get_group_id(); if (lgrp_id != -1) {
thread->set_lgrp_id(lgrp_id);
}
} // initialize signal mask for this thread
PosixSignals::hotspot_sigmask(thread);
// initialize floating point control register
os::Linux::init_thread_fpu_state();
// handshaking with parent thread
{
MutexLocker ml(sync, Mutex::_no_safepoint_check_flag);
// On Linux, glibc places static TLS blocks (for __thread variables) on // the thread stack. This decreases the stack size actually available // to threads. // // For large static TLS sizes, this may cause threads to malfunction due // to insufficient stack space. This is a well-known issue in glibc: // http://sourceware.org/bugzilla/show_bug.cgi?id=11787. // // As a workaround, we call a private but assumed-stable glibc function, // __pthread_get_minstack() to obtain the minstack size and derive the // static TLS size from it. We then increase the user requested stack // size by this TLS size. // // Due to compatibility concerns, this size adjustment is opt-in and // controlled via AdjustStackSizeForTLS. typedef size_t (*GetMinStack)(const pthread_attr_t *attr);
// Returns the size of the static TLS area glibc puts on thread stacks. // The value is cached on first use, which occurs when the first thread // is created during VM initialization. static size_t get_static_tls_area_size(const pthread_attr_t *attr) {
size_t tls_size = 0; if (_get_minstack_func != NULL) { // Obtain the pthread minstack size by calling __pthread_get_minstack.
size_t minstack_size = _get_minstack_func(attr);
// Remove non-TLS area size included in minstack size returned // by __pthread_get_minstack() to get the static TLS size. // In glibc before 2.27, minstack size includes guard_size. // In glibc 2.27 and later, guard_size is automatically added // to the stack size by pthread_create and is no longer included // in minstack size. In both cases, the guard_size is taken into // account, so there is no need to adjust the result for that. // // Although __pthread_get_minstack() is a private glibc function, // it is expected to have a stable behavior across future glibc // versions while glibc still allocates the static TLS blocks off // the stack. Following is glibc 2.28 __pthread_get_minstack(): // // size_t // __pthread_get_minstack (const pthread_attr_t *attr) // { // return GLRO(dl_pagesize) + __static_tls_size + PTHREAD_STACK_MIN; // } // // // The following 'minstack_size > os::vm_page_size() + PTHREAD_STACK_MIN' // if check is done for precaution. if (minstack_size > (size_t)os::vm_page_size() + PTHREAD_STACK_MIN) {
tls_size = minstack_size - os::vm_page_size() - PTHREAD_STACK_MIN;
}
}
log_info(os, thread)("Stack size adjustment for TLS is " SIZE_FORMAT,
tls_size); return tls_size;
}
// Calculate stack size if it's not specified by caller.
size_t stack_size = os::Posix::get_initial_stack_size(thr_type, req_stack_size); // In glibc versions prior to 2.27 the guard size mechanism // is not implemented properly. The posix standard requires adding // the size of the guard pages to the stack size, instead Linux // takes the space out of 'stacksize'. Thus we adapt the requested // stack_size by the size of the guard pages to mimic proper // behaviour. However, be careful not to end up with a size // of zero due to overflow. Don't add the guard page in that case.
size_t guard_size = os::Linux::default_guard_size(thr_type); // Configure glibc guard page. Must happen before calling // get_static_tls_area_size(), which uses the guard_size.
pthread_attr_setguardsize(&attr, guard_size);
size_t stack_adjust_size = 0; if (AdjustStackSizeForTLS) { // Adjust the stack_size for on-stack TLS - see get_static_tls_area_size().
stack_adjust_size += get_static_tls_area_size(&attr);
} else {
stack_adjust_size += guard_size;
}
stack_adjust_size = align_up(stack_adjust_size, os::vm_page_size()); if (stack_size <= SIZE_MAX - stack_adjust_size) {
stack_size += stack_adjust_size;
}
assert(is_aligned(stack_size, os::vm_page_size()), "stack_size not aligned");
int status = pthread_attr_setstacksize(&attr, stack_size); if (status != 0) { // pthread_attr_setstacksize() function can fail // if the stack size exceeds a system-imposed limit.
assert_status(status == EINVAL, status, "pthread_attr_setstacksize");
log_warning(os, thread)("The %sthread stack size specified is invalid: " SIZE_FORMAT "k",
(thr_type == compiler_thread) ? "compiler " : ((thr_type == java_thread) ? "" : "VM "),
stack_size / K);
thread->set_osthread(NULL); delete osthread; returnfalse;
}
ThreadState state;
{
ResourceMark rm;
pthread_t tid; int ret = 0; int limit = 3; do {
ret = pthread_create(&tid, &attr, (void* (*)(void*)) thread_native_entry, thread);
} while (ret == EAGAIN && limit-- > 0);
char buf[64]; if (ret == 0) {
log_info(os, thread)("Thread \"%s\" started (pthread id: " UINTX_FORMAT ", attributes: %s). ",
thread->name(), (uintx) tid, os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
} else {
log_warning(os, thread)("Failed to start thread \"%s\" - pthread_create failed (%s) for attributes: %s.",
thread->name(), os::errno_name(ret), os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr)); // Log some OS information which might explain why creating the thread failed.
log_info(os, thread)("Number of threads approx. running in the VM: %d", Threads::number_of_threads());
LogStream st(Log(os, thread)::info());
os::Posix::print_rlimit_info(&st);
os::print_memory_info(&st);
os::Linux::print_proc_sys_info(&st);
os::Linux::print_container_info(&st);
}
pthread_attr_destroy(&attr);
if (ret != 0) { // Need to clean up stuff we've allocated so far
thread->set_osthread(NULL); delete osthread; returnfalse;
}
// Store pthread info into the OSThread
osthread->set_pthread_id(tid);
// Wait until child thread is either initialized or aborted
{
Monitor* sync_with_child = osthread->startThread_lock();
MutexLocker ml(sync_with_child, Mutex::_no_safepoint_check_flag); while ((state = osthread->get_state()) == ALLOCATED) {
sync_with_child->wait_without_safepoint_check();
}
}
}
// The thread is returned suspended (in state INITIALIZED), // and is started higher up in the call chain
assert(state == INITIALIZED, "race condition"); returntrue;
}
// bootstrap the main thread bool os::create_main_thread(JavaThread* thread) {
assert(os::Linux::_main_thread == pthread_self(), "should be called inside main thread"); return create_attached_thread(thread);
}
// Allocate the OSThread object
OSThread* osthread = new OSThread();
if (osthread == NULL) { returnfalse;
}
// Store pthread info into the OSThread
osthread->set_thread_id(os::Linux::gettid());
osthread->set_pthread_id(::pthread_self());
// initialize floating point control register
os::Linux::init_thread_fpu_state();
// Initial thread state is RUNNABLE
osthread->set_state(RUNNABLE);
thread->set_osthread(osthread);
if (UseNUMA) { int lgrp_id = os::numa_get_group_id(); if (lgrp_id != -1) {
thread->set_lgrp_id(lgrp_id);
}
}
if (os::is_primordial_thread()) { // If current thread is primordial thread, its stack is mapped on demand, // see notes about MAP_GROWSDOWN. Here we try to force kernel to map // the entire stack region to avoid SEGV in stack banging. // It is also useful to get around the heap-stack-gap problem on SuSE // kernel (see 4821821 for details). We first expand stack to the top // of yellow zone, then enable stack yellow zone (order is significant, // enabling yellow zone first will crash JVM on SuSE Linux), so there // is no gap between the last two virtual memory regions.
StackOverflow* overflow_state = thread->stack_overflow_state();
address addr = overflow_state->stack_reserved_zone_base();
assert(addr != NULL, "initialization problem?");
assert(overflow_state->stack_available(addr) > 0, "stack guard should not be enabled");
// Free Linux resources related to the OSThread void os::free_thread(OSThread* osthread) {
assert(osthread != NULL, "osthread not set");
// We are told to free resources of the argument thread, // but we can only really operate on the current thread.
assert(Thread::current()->osthread() == osthread, "os::free_thread but not current thread");
#ifdef ASSERT
sigset_t current;
sigemptyset(¤t);
pthread_sigmask(SIG_SETMASK, NULL, ¤t);
assert(!sigismember(¤t, PosixSignals::SR_signum), "SR signal should not be blocked!"); #endif
// Check if current thread is the primordial thread, similar to Solaris thr_main. bool os::is_primordial_thread(void) { if (suppress_primordial_thread_resolution) { returnfalse;
} char dummy; // If called before init complete, thread stack bottom will be null. // Can be called if fatal error occurs before initialization. if (os::Linux::initial_thread_stack_bottom() == NULL) returnfalse;
assert(os::Linux::initial_thread_stack_bottom() != NULL &&
os::Linux::initial_thread_stack_size() != 0, "os::init did not locate primordial thread's stack region"); if ((address)&dummy >= os::Linux::initial_thread_stack_bottom() &&
(address)&dummy < os::Linux::initial_thread_stack_bottom() +
os::Linux::initial_thread_stack_size()) { returntrue;
} else { returnfalse;
}
}
// Find the virtual memory area that contains addr staticbool find_vma(address addr, address* vma_low, address* vma_high) {
FILE *fp = os::fopen("/proc/self/maps", "r"); if (fp) {
address low, high; while (!feof(fp)) { if (fscanf(fp, "%p-%p", &low, &high) == 2) { if (low <= addr && addr < high) { if (vma_low) *vma_low = low; if (vma_high) *vma_high = high;
fclose(fp); returntrue;
}
} for (;;) { int ch = fgetc(fp); if (ch == EOF || ch == (int)'\n') break;
}
}
fclose(fp);
} returnfalse;
}
// Locate primordial thread stack. This special handling of primordial thread stack // is needed because pthread_getattr_np() on most (all?) Linux distros returns // bogus value for the primordial process thread. While the launcher has created // the VM in a new thread since JDK 6, we still have to allow for the use of the // JNI invocation API from a primordial thread. void os::Linux::capture_initial_stack(size_t max_size) {
// max_size is either 0 (which means accept OS default for thread stacks) or // a user-specified value known to be at least the minimum needed. If we // are actually on the primordial thread we can make it appear that we have a // smaller max_size stack by inserting the guard pages at that location. But we // cannot do anything to emulate a larger stack than what has been provided by // the OS or threading library. In fact if we try to use a stack greater than // what is set by rlimit then we will crash the hosting process.
// Maximum stack size is the easy part, get it from RLIMIT_STACK. // If this is "unlimited" then it will be a huge value. struct rlimit rlim;
getrlimit(RLIMIT_STACK, &rlim);
size_t stack_size = rlim.rlim_cur;
// 6308388: a bug in ld.so will relocate its own .data section to the // lower end of primordial stack; reduce ulimit -s value a little bit // so we won't install guard page on ld.so's data section. // But ensure we don't underflow the stack size - allow 1 page spare if (stack_size >= (size_t)(3 * os::vm_page_size())) {
stack_size -= 2 * os::vm_page_size();
}
// Try to figure out where the stack base (top) is. This is harder. // // When an application is started, glibc saves the initial stack pointer in // a global variable "__libc_stack_end", which is then used by system // libraries. __libc_stack_end should be pretty close to stack top. The // variable is available since the very early days. However, because it is // a private interface, it could disappear in the future. // // Linux kernel saves start_stack information in /proc/<pid>/stat. Similar // to __libc_stack_end, it is very close to stack top, but isn't the real // stack top. Note that /proc may not exist if VM is running as a chroot // program, so reading /proc/<pid>/stat could fail. Also the contents of // /proc/<pid>/stat could change in the future (though unlikely). // // We try __libc_stack_end first. If that doesn't work, look for // /proc/<pid>/stat. If neither of them works, we use current stack pointer // as a hint, which should work well in most cases.
uintptr_t stack_start;
// try __libc_stack_end first
uintptr_t *p = (uintptr_t *)dlsym(RTLD_DEFAULT, "__libc_stack_end"); if (p && *p) {
stack_start = *p;
} else { // see if we can get the start_stack field from /proc/self/stat
FILE *fp; int pid; char state; int ppid; int pgrp; int session; int nr; int tpgrp; unsignedlong flags; unsignedlong minflt; unsignedlong cminflt; unsignedlong majflt; unsignedlong cmajflt; unsignedlong utime; unsignedlong stime; long cutime; long cstime; long prio; long nice; long junk; long it_real;
uintptr_t start;
uintptr_t vsize;
intptr_t rss;
uintptr_t rsslim;
uintptr_t scodes;
uintptr_t ecode; int i;
// Figure what the primordial thread stack base is. Code is inspired // by email from Hans Boehm. /proc/self/stat begins with current pid, // followed by command name surrounded by parentheses, state, etc. char stat[2048]; int statlen;
// Skip pid and the command string. Note that we could be dealing with // weird command names, e.g. user could decide to rename java launcher // to "java 1.4.2 :)", then the stat file would look like // 1234 (java 1.4.2 :)) R ... ... // We don't really need to know the command string, just find the last // occurrence of ")" and then start parsing from there. See bug 4726580. char * s = strrchr(stat, ')');
i = 0; if (s) { // Skip blank chars do { s++; } while (s && isspace(*s));
if (i != 28 - 2) {
assert(false, "Bad conversion from /proc/self/stat"); // product mode - assume we are the primordial thread, good luck in the // embedded case.
warning("Can't detect primordial thread stack location - bad conversion");
stack_start = (uintptr_t) &rlim;
}
} else { // For some reason we can't open /proc/self/stat (for example, running on // FreeBSD with a Linux emulator, or inside chroot), this should work for // most cases, so don't abort:
warning("Can't detect primordial thread stack location - no /proc/self/stat");
stack_start = (uintptr_t) &rlim;
}
}
// Now we have a pointer (stack_start) very close to the stack top, the // next thing to do is to figure out the exact location of stack top. We // can find out the virtual memory area that contains stack_start by // reading /proc/self/maps, it should be the last vma in /proc/self/maps, // and its upper limit is the real stack top. (again, this would fail if // running inside chroot, because /proc may not exist.)
uintptr_t stack_top;
address low, high; if (find_vma((address)stack_start, &low, &high)) { // success, "high" is the true stack top. (ignore "low", because initial // thread stack grows on demand, its real bottom is high - RLIMIT_STACK.)
stack_top = (uintptr_t)high;
} else { // failed, likely because /proc/self/maps does not exist
warning("Can't detect primordial thread stack location - find_vma failed"); // best effort: stack_start is normally within a few pages below the real // stack top, use it as stack top, and reduce stack size so we won't put // guard page outside stack.
stack_top = stack_start;
stack_size -= 16 * os::vm_page_size();
}
// stack_top could be partially down the page so align it
stack_top = align_up(stack_top, os::vm_page_size());
// Allowed stack value is minimum of max_size and what we derived from rlimit if (max_size > 0) {
_initial_thread_stack_size = MIN2(max_size, stack_size);
} else { // Accept the rlimit max, but if stack is unlimited then it will be huge, so // clamp it at 8MB as we do on Solaris
_initial_thread_stack_size = MIN2(stack_size, 8*M);
}
_initial_thread_stack_size = align_down(_initial_thread_stack_size, os::vm_page_size());
_initial_thread_stack_bottom = (address)stack_top - _initial_thread_stack_size;
if (log_is_enabled(Info, os, thread)) { // See if we seem to be on primordial process thread bool primordial = uintptr_t(&rlim) > uintptr_t(_initial_thread_stack_bottom) &&
uintptr_t(&rlim) < stack_top;
// Switch to using fast clocks for thread cpu time if // the clock_getres() returns 0 error code. // Note, that some kernels may support the current thread // clock (CLOCK_THREAD_CPUTIME_ID) but not the clocks // returned by the pthread_getcpuclockid(). // If the fast Posix clocks are supported then the clock_getres() // must return at least tp.tv_sec == 0 which means a resolution // better than 1 sec. This is extra check for reliability.
// thread_id is kernel thread id (similar to Solaris LWP id)
intx os::current_thread_id() { return os::Linux::gettid(); } int os::current_process_id() { return ::getpid();
}
// DLL functions
// This must be hard coded because it's the system's temporary // directory not the java application's temp directory, ala java.io.tmpdir. constchar* os::get_temp_directory() { return"/tmp"; }
// check if addr is inside libjvm.so bool os::address_is_in_vm(address addr) { static address libjvm_base_addr;
Dl_info dlinfo;
if (libjvm_base_addr == NULL) { if (dladdr(CAST_FROM_FN_PTR(void *, os::address_is_in_vm), &dlinfo) != 0) {
libjvm_base_addr = (address)dlinfo.dli_fbase;
}
assert(libjvm_base_addr !=NULL, "Cannot obtain base address for libjvm");
}
if (dladdr((void *)addr, &dlinfo) != 0) { if (libjvm_base_addr == (address)dlinfo.dli_fbase) returntrue;
}
returnfalse;
}
bool os::dll_address_to_function_name(address addr, char *buf, int buflen, int *offset, bool demangle) { // buf is not optional, but offset is optional
assert(buf != NULL, "sanity check");
Dl_info dlinfo;
if (dladdr((void*)addr, &dlinfo) != 0) { // see if we have a matching symbol if (dlinfo.dli_saddr != NULL && dlinfo.dli_sname != NULL) { if (!(demangle && Decoder::demangle(dlinfo.dli_sname, buf, buflen))) {
jio_snprintf(buf, buflen, "%s", dlinfo.dli_sname);
} if (offset != NULL) *offset = addr - (address)dlinfo.dli_saddr; returntrue;
} // no matching symbol so try for just file info if (dlinfo.dli_fname != NULL && dlinfo.dli_fbase != NULL) { if (Decoder::decode((address)(addr - (address)dlinfo.dli_fbase),
buf, buflen, offset, dlinfo.dli_fname, demangle)) { returntrue;
}
}
}
bool os::dll_address_to_library_name(address addr, char* buf, int buflen, int* offset) { // buf is not optional, but offset is optional
assert(buf != nullptr, "sanity check");
// Remember the stack's state. The Linux dynamic linker will change // the stack to 'executable' at most once, so we must safepoint only once. bool os::Linux::_stack_is_executable = false;
// VM operation that loads a library. This is necessary if stack protection // of the Java stacks can be lost during loading the library. If we // do not stop the Java threads, they can stack overflow before the stacks // are protected again. class VM_LinuxDllLoad: public VM_Operation { private: constchar *_filename; char *_ebuf; int _ebuflen; void *_lib; public:
VM_LinuxDllLoad(constchar *fn, char *ebuf, int ebuflen) :
_filename(fn), _ebuf(ebuf), _ebuflen(ebuflen), _lib(NULL) {}
VMOp_Type type() const { return VMOp_LinuxDllLoad; } void doit() {
_lib = os::Linux::dll_load_in_vmthread(_filename, _ebuf, _ebuflen);
os::Linux::_stack_is_executable = true;
} void* loaded_library() { return _lib; }
};
void * os::dll_load(constchar *filename, char *ebuf, int ebuflen) { void * result = NULL; bool load_attempted = false;
log_info(os)("attempting shared library load of %s", filename);
// Check whether the library to load might change execution rights // of the stack. If they are changed, the protection of the stack // guard pages will be lost. We need a safepoint to fix this. // // See Linux man page execstack(8) for more info. if (os::uses_stack_guard_pages() && !os::Linux::_stack_is_executable) { if (!ElfFile::specifies_noexecstack(filename)) { if (!is_init_completed()) {
os::Linux::_stack_is_executable = true; // This is OK - No Java threads have been created yet, and hence no // stack guard pages to fix. // // Dynamic loader will make all stacks executable after // this function returns, and will not do that again.
assert(Threads::number_of_threads() == 0, "no Java threads should exist yet.");
} else {
warning("You have loaded library %s which might have disabled stack guard. " "The VM will try to fix the stack guard now.\n" "It's highly recommended that you fix the library with " "'execstack -c <libfile>', or link it with '-z noexecstack'.",
filename);
JavaThread *jt = JavaThread::current(); if (jt->thread_state() != _thread_in_native) { // This happens when a compiler thread tries to load a hsdis-<arch>.so file // that requires ExecStack. Cannot enter safe point. Let's give up.
warning("Unable to fix stack guard. Giving up.");
} else { if (!LoadExecStackDllInVMThread) { // This is for the case where the DLL has an static // constructor function that executes JNI code. We cannot // load such DLLs in the VMThread.
result = os::Linux::dlopen_helper(filename, ebuf, ebuflen);
}
if (elf_head.e_ident[EI_DATA] != LITTLE_ENDIAN_ONLY(ELFDATA2LSB) BIG_ENDIAN_ONLY(ELFDATA2MSB)) { // handle invalid/out of range endianness values if (elf_head.e_ident[EI_DATA] == 0 || elf_head.e_ident[EI_DATA] > 2) { return NULL;
}
#ifdefined(VM_LITTLE_ENDIAN) // VM is LE, shared object BE
elf_head.e_machine = be16toh(elf_head.e_machine); #else // VM is BE, shared object LE
elf_head.e_machine = le16toh(elf_head.e_machine); #endif
}
typedefstruct {
Elf32_Half code; // Actual value as defined in elf.h
Elf32_Half compat_class; // Compatibility of archs at VM's sense unsignedchar elf_class; // 32 or 64 bit unsignedchar endianness; // MSB or LSB char* name; // String representation
} arch_t;
// Identify compatibility class for VM's architecture and library's architecture // Obtain string descriptions for architectures
arch_t lib_arch={elf_head.e_machine,0,elf_head.e_ident[EI_CLASS], elf_head.e_ident[EI_DATA], NULL}; int running_arch_index=-1;
for (unsignedint i=0; i < ARRAY_SIZE(arch_array); i++) { if (running_arch_code == arch_array[i].code) {
running_arch_index = i;
} if (lib_arch.code == arch_array[i].code) {
lib_arch.compat_class = arch_array[i].compat_class;
lib_arch.name = arch_array[i].name;
}
}
assert(running_arch_index != -1, "Didn't find running architecture code (running_arch_code) in arch_array"); if (running_arch_index == -1) { // Even though running architecture detection failed // we may still continue with reporting dlerror() message return NULL;
}
if (lib_arch.compat_class != arch_array[running_arch_index].compat_class) { if (lib_arch.name != NULL) {
::snprintf(diag_msg_buf, diag_msg_max_length-1, " (Possible cause: can't load %s .so on a %s platform)",
lib_arch.name, arch_array[running_arch_index].name);
} else {
::snprintf(diag_msg_buf, diag_msg_max_length-1, " (Possible cause: can't load this .so (machine code=0x%x) on a %s platform)",
lib_arch.code, arch_array[running_arch_index].name);
} return NULL;
}
if (lib_arch.elf_class != arch_array[running_arch_index].elf_class) {
::snprintf(diag_msg_buf, diag_msg_max_length-1, " (Possible cause: architecture word width mismatch, can't load %d-bit .so on a %d-bit platform)",
(int) lib_arch.elf_class * 32, arch_array[running_arch_index].elf_class * 32); return NULL;
}
return NULL;
}
void * os::Linux::dlopen_helper(constchar *filename, char *ebuf, int ebuflen) { void * result = ::dlopen(filename, RTLD_LAZY); if (result == NULL) { constchar* error_report = ::dlerror(); if (error_report == NULL) {
error_report = "dlerror returned no error description";
} if (ebuf != NULL && ebuflen > 0) {
::strncpy(ebuf, error_report, ebuflen-1);
ebuf[ebuflen-1]='\0';
}
Events::log_dll_message(NULL, "Loading shared library %s failed, %s", filename, error_report);
log_info(os)("shared library load of %s failed, %s", filename, error_report);
} else {
Events::log_dll_message(NULL, "Loaded shared library %s", filename);
log_info(os)("shared library load of %s was successful", filename);
} return result;
}
void * os::Linux::dll_load_in_vmthread(constchar *filename, char *ebuf, int ebuflen) { void * result = NULL; if (LoadExecStackDllInVMThread) {
result = dlopen_helper(filename, ebuf, ebuflen);
}
// Since 7019808, libjvm.so is linked with -noexecstack. If the VM loads a // library that requires an executable stack, or which does not have this // stack attribute set, dlopen changes the stack attribute to executable. The // read protection of the guard pages gets lost. // // Need to check _stack_is_executable again as multiple VM_LinuxDllLoad // may have been queued at the same time.
if (!_stack_is_executable) { for (JavaThreadIteratorWithHandle jtiwh; JavaThread *jt = jtiwh.next(); ) {
StackOverflow* overflow_state = jt->stack_overflow_state(); if (!overflow_state->stack_guard_zone_unused() && // Stack not yet fully initialized
overflow_state->stack_guards_enabled()) { // No pending stack overflow exceptions if (!os::guard_memory((char *)jt->stack_end(), StackOverflow::stack_guard_zone_size())) {
warning("Attempt to reguard stack yellow zone failed.");
}
}
}
}
return result;
}
constchar* os::Linux::dll_path(void* lib) { struct link_map *lmap; constchar* l_path = NULL;
assert(lib != NULL, "dll_path parameter must not be NULL");
int res_dli = ::dlinfo(lib, RTLD_DI_LINKMAP, &lmap); if (res_dli == 0) {
l_path = lmap->l_name;
} return l_path;
}
// Try to identify popular distros. // Most Linux distributions have a /etc/XXX-release file, which contains // the OS version string. Newer Linux distributions have a /etc/lsb-release // file that also contains the OS version string. Some have more than one // /etc/XXX-release file (e.g. Mandrake has both /etc/mandrake-release and // /etc/redhat-release.), so the order is important. // Any Linux that is based on Redhat (i.e. Oracle, Mandrake, Sun JDS...) have // their own specific XXX-release file as well as a redhat-release file. // Because of this the XXX-release file needs to be searched for before the // redhat-release file. // Since Red Hat and SuSE have an lsb-release file that is not very descriptive the // search for redhat-release / SuSE-release needs to be before lsb-release. // Since the lsb-release file is the new standard it needs to be searched // before the older style release files. // Searching system-release (Red Hat) and os-release (other Linuxes) are a // next to last resort. The os-release file is a new standard that contains // distribution information and the system-release file seems to be an old // standard that has been replaced by the lsb-release and os-release files. // Searching for the debian_version file is the last resort. It contains // an informative string like "6.0.6" or "wheezy/sid". Because of this // "Debian " is printed before the contents of the debian_version file.
void os::Linux::print_distro_info(outputStream* st) { for (int i = 0;; i++) { constchar* file = distro_files[i]; if (file == NULL) { break; // done
} // If file prints, we found it. if (_print_ascii_file(file, st)) { return;
}
}
void os::Linux::print_proc_sys_info(outputStream* st) {
_print_ascii_file_h("/proc/sys/kernel/threads-max (system-wide limit on the number of threads)", "/proc/sys/kernel/threads-max", st);
_print_ascii_file_h("/proc/sys/vm/max_map_count (maximum number of memory map areas a process may have)", "/proc/sys/vm/max_map_count", st);
_print_ascii_file_h("/proc/sys/kernel/pid_max (system-wide limit on number of process identifiers)", "/proc/sys/kernel/pid_max", st);
}
// some information regarding THPs; for details see // https://www.kernel.org/doc/Documentation/vm/transhuge.txt
_print_ascii_file_h("/sys/kernel/mm/transparent_hugepage/enabled", "/sys/kernel/mm/transparent_hugepage/enabled", st);
_print_ascii_file_h("/sys/kernel/mm/transparent_hugepage/defrag (defrag/compaction efforts parameter)", "/sys/kernel/mm/transparent_hugepage/defrag", st);
}
#ifdef __GLIBC__ // For Glibc, print a one-liner with the malloc tunables. // Most important and popular is MALLOC_ARENA_MAX, but we are // thorough and print them all. staticvoid print_glibc_malloc_tunables(outputStream* st) { staticconstchar* var[] = { // the new variant "GLIBC_TUNABLES", // legacy variants "MALLOC_CHECK_", "MALLOC_TOP_PAD_", "MALLOC_PERTURB_", "MALLOC_MMAP_THRESHOLD_", "MALLOC_TRIM_THRESHOLD_", "MALLOC_MMAP_MAX_", "MALLOC_ARENA_TEST", "MALLOC_ARENA_MAX",
NULL};
st->print("glibc malloc tunables: "); bool printed = false; for (int i = 0; var[i] != NULL; i ++) { constchar* const val = ::getenv(var[i]); if (val != NULL) {
st->print("%s%s=%s", (printed ? ", " : ""), var[i], val);
printed = true;
}
} if (!printed) {
st->print("(default)");
}
} #endif// __GLIBC__
void os::Linux::print_uptime_info(outputStream* st) { struct sysinfo sinfo; int ret = sysinfo(&sinfo); if (ret == 0) {
os::print_dhm(st, "OS uptime:", (long) sinfo.uptime);
}
}
bool os::Linux::print_container_info(outputStream* st) { if (!OSContainer::is_containerized()) {
st->print_cr("container information not found."); returnfalse;
}
char *p = OSContainer::cpu_cpuset_cpus();
st->print_cr("cpu_cpuset_cpus: %s", p != NULL ? p : "not supported");
free(p);
p = OSContainer::cpu_cpuset_memory_nodes();
st->print_cr("cpu_memory_nodes: %s", p != NULL ? p : "not supported");
free(p);
int i = OSContainer::active_processor_count();
st->print("active_processor_count: "); if (i > 0) { if (ActiveProcessorCount > 0) {
st->print_cr("%d, but overridden by -XX:ActiveProcessorCount %d", i, ActiveProcessorCount);
} else {
st->print_cr("%d", i);
}
} else {
st->print_cr("not supported");
}
i = OSContainer::cpu_quota();
st->print("cpu_quota: "); if (i > 0) {
st->print_cr("%d", i);
} else {
st->print_cr("%s", i == OSCONTAINER_ERROR ? "not supported" : "no quota");
}
i = OSContainer::cpu_period();
st->print("cpu_period: "); if (i > 0) {
st->print_cr("%d", i);
} else {
st->print_cr("%s", i == OSCONTAINER_ERROR ? "not supported" : "no period");
}
i = OSContainer::cpu_shares();
st->print("cpu_shares: "); if (i > 0) {
st->print_cr("%d", i);
} else {
st->print_cr("%s", i == OSCONTAINER_ERROR ? "not supported" : "no shares");
}
// Print the first "model name" line and the first "flags" line // that we find and nothing more. We assume "model name" comes // before "flags" so if we find a second "model name", then the // "flags" field is considered missing. staticbool print_model_name_and_flags(outputStream* st, char* buf, size_t buflen) { #ifdefined(IA32) || defined(AMD64) // Other platforms have less repetitive cpuinfo files
FILE *fp = os::fopen("/proc/cpuinfo", "r"); if (fp) { bool model_name_printed = false; while (!feof(fp)) { if (fgets(buf, buflen, fp)) { // Assume model name comes before flags if (strstr(buf, "model name") != NULL) { if (!model_name_printed) {
st->print_raw("CPU Model and flags from /proc/cpuinfo:\n");
st->print_raw(buf);
model_name_printed = true;
} else { // model name printed but not flags? Odd, just return
fclose(fp); returntrue;
}
} // print the flags line too if (strstr(buf, "flags") != NULL) {
st->print_raw(buf);
fclose(fp); returntrue;
}
}
}
fclose(fp);
} #endif// x86 platforms returnfalse;
}
// additional information about CPU e.g. available frequency ranges staticvoid print_sys_devices_cpu_info(outputStream* st) {
_print_ascii_file_h("Online cpus", "/sys/devices/system/cpu/online", st);
_print_ascii_file_h("Offline cpus", "/sys/devices/system/cpu/offline", st);
if (ExtensiveErrorReports) { // cache related info (cpu 0, should be similar for other CPUs) for (unsignedint i=0; i < 10; i++) { // handle max. 10 cache entries char hbuf_level[60]; char hbuf_type[60]; char hbuf_size[60]; char hbuf_coherency_line_size[80];
snprintf(hbuf_level, 60, "/sys/devices/system/cpu/cpu0/cache/index%u/level", i);
snprintf(hbuf_type, 60, "/sys/devices/system/cpu/cpu0/cache/index%u/type", i);
snprintf(hbuf_size, 60, "/sys/devices/system/cpu/cpu0/cache/index%u/size", i);
snprintf(hbuf_coherency_line_size, 80, "/sys/devices/system/cpu/cpu0/cache/index%u/coherency_line_size", i); if (os::file_exists(hbuf_level)) {
_print_ascii_file_h("cache level", hbuf_level, st);
_print_ascii_file_h("cache type", hbuf_type, st);
_print_ascii_file_h("cache size", hbuf_size, st);
_print_ascii_file_h("cache coherency line size", hbuf_coherency_line_size, st);
}
}
}
// we miss the cpufreq entries on Power and s390x #ifdefined(IA32) || defined(AMD64)
_print_ascii_file_h("BIOS frequency limitation", "/sys/devices/system/cpu/cpu0/cpufreq/bios_limit", st);
_print_ascii_file_h("Frequency switch latency (ns)", "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_transition_latency", st);
_print_ascii_file_h("Available cpu frequencies", "/sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies", st); // min and max should be in the Available range but still print them (not all info might be available for all kernels) if (ExtensiveErrorReports) {
_print_ascii_file_h("Maximum cpu frequency", "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", st);
_print_ascii_file_h("Minimum cpu frequency", "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq", st);
_print_ascii_file_h("Current cpu frequency", "/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq", st);
} // governors are power schemes, see https://wiki.archlinux.org/index.php/CPU_frequency_scaling if (ExtensiveErrorReports) {
_print_ascii_file_h("Available governors", "/sys/devices/system/cpu/cpu0/cpufreq/scaling_available_governors", st);
}
_print_ascii_file_h("Current governor", "/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor", st); // Core performance boost, see https://www.kernel.org/doc/Documentation/cpu-freq/boost.txt // Raise operating frequency of some cores in a multi-core package if certain conditions apply, e.g. // whole chip is not fully utilized
_print_ascii_file_h("Core performance/turbo boost", "/sys/devices/system/cpu/cpufreq/boost", st); #endif
}
void os::pd_print_cpu_info(outputStream* st, char* buf, size_t buflen) { // Only print the model name if the platform provides this as a summary if (!print_model_name_and_flags(st, buf, buflen)) {
_print_ascii_file_h("/proc/cpuinfo", "/proc/cpuinfo", st, false);
}
st->cr();
print_sys_devices_cpu_info(st);
}
// Parses the cpuinfo file for string representing the model name. void os::get_summary_cpu_info(char* cpuinfo, size_t length) {
FILE* fp = os::fopen("/proc/cpuinfo", "r"); if (fp != NULL) { while (!feof(fp)) { char buf[256]; if (fgets(buf, sizeof(buf), fp)) { char* start = strstr(buf, search_string); if (start != NULL) { char *ptr = start + strlen(search_string); char *end = buf + strlen(buf); while (ptr != end) { // skip whitespace and colon for the rest of the name. if (*ptr != ' ' && *ptr != '\t' && *ptr != ':') { break;
}
ptr++;
} if (ptr != end) { // reasonable string, get rid of newline and keep the rest char* nl = strchr(buf, '\n'); if (nl != NULL) *nl = '\0';
strncpy(cpuinfo, ptr, length);
fclose(fp); return;
}
}
}
}
fclose(fp);
} // cpuinfo not found or parsing failed, just print generic string. The entire // /proc/cpuinfo file will be printed later in the file (or enough of it for x86) #ifdefined(AARCH64)
strncpy(cpuinfo, "AArch64", length); #elifdefined(AMD64)
strncpy(cpuinfo, "x86_64", length); #elifdefined(ARM) // Order wrt. AARCH64 is relevant!
strncpy(cpuinfo, "ARM", length); #elifdefined(IA32)
strncpy(cpuinfo, "x86_32", length); #elifdefined(IA64)
strncpy(cpuinfo, "IA64", length); #elifdefined(PPC)
strncpy(cpuinfo, "PPC64", length); #elifdefined(RISCV)
strncpy(cpuinfo, LP64_ONLY("RISCV64") NOT_LP64("RISCV32"), length); #elifdefined(S390)
strncpy(cpuinfo, "S390", length); #elifdefined(SPARC)
strncpy(cpuinfo, "sparcv9", length); #elifdefined(ZERO_LIBARCH)
strncpy(cpuinfo, ZERO_LIBARCH, length); #else
strncpy(cpuinfo, "unknown", length); #endif
}
staticchar saved_jvm_path[MAXPATHLEN] = {0};
// Find the full path to the current module, libjvm.so void os::jvm_path(char *buf, jint buflen) { // Error checking. if (buflen < MAXPATHLEN) {
assert(false, "must use a large-enough buffer");
buf[0] = '\0'; return;
} // Lazy resolve the path to current module. if (saved_jvm_path[0] != 0) {
strcpy(buf, saved_jvm_path); return;
}
if (Arguments::sun_java_launcher_is_altjvm()) { // Support for the java launcher's '-XXaltjvm=<path>' option. Typical // value for buf is "<JAVA_HOME>/jre/lib/<vmtype>/libjvm.so". // If "/jre/lib/" appears at the right place in the string, then // assume we are installed in a JDK and we're done. Otherwise, check // for a JAVA_HOME environment variable and fix up the path so it // looks like libjvm.so is installed there (append a fake suffix // hotspot/libjvm.so). constchar *p = buf + strlen(buf) - 1; for (int count = 0; p > buf && count < 5; ++count) { for (--p; p > buf && *p != '/'; --p) /* empty */ ;
}
if (strncmp(p, "/jre/lib/", 9) != 0) { // Look for JAVA_HOME in the environment. char* java_home_var = ::getenv("JAVA_HOME"); if (java_home_var != NULL && java_home_var[0] != 0) { char* jrelib_p; int len;
// Check the current module name "libjvm.so".
p = strrchr(buf, '/'); if (p == NULL) { return;
}
assert(strstr(p, "/libjvm") == p, "invalid library name");
rp = os::Posix::realpath(java_home_var, buf, buflen); if (rp == NULL) { return;
}
// determine if this is a legacy image or modules image // modules image doesn't have "jre" subdirectory
len = strlen(buf);
assert(len < buflen, "Ran out of buffer room");
jrelib_p = buf + len;
snprintf(jrelib_p, buflen-len, "/jre/lib"); if (0 != access(buf, F_OK)) {
snprintf(jrelib_p, buflen-len, "/lib");
}
if (0 == access(buf, F_OK)) { // Use current module name "libjvm.so"
len = strlen(buf);
snprintf(buf + len, buflen-len, "/hotspot/libjvm.so");
} else { // Go back to path of .so
rp = os::Posix::realpath(dli_fname, buf, buflen); if (rp == NULL) { return;
}
}
}
}
}
// Rationale behind this function: // current (Mon Apr 25 20:12:18 MSD 2005) oprofile drops samples without executable // mapping for address (see lookup_dcookie() in the kernel module), thus we cannot get // samples for JITted code. Here we create private executable mapping over the code cache // and then we can use standard (well, almost, as mapping can change) way to provide // info for the reporting script by storing timestamp and location of symbol void linux_wrap_code(char* base, size_t size) { staticvolatile jint cnt = 0;
if (!UseOprofile) { return;
}
char buf[PATH_MAX+1]; int num = Atomic::add(&cnt, 1);
staticbool recoverable_mmap_error(int err) { // See if the error is one we can let the caller handle. This // list of errno values comes from JBS-6843484. I can't find a // Linux man page that documents this specific set of errno // values so while this list currently matches Solaris, it may // change as we gain experience with this failure mode. switch (err) { case EBADF: case EINVAL: case ENOTSUP: // let the caller deal with these errors returntrue;
default: // Any remaining errors on this OS can cause our reserved mapping // to be lost. That can cause confusion where different data // structures think they have the same memory mapped. The worst // scenario is if both the VM and a library think they have the // same memory mapped. returnfalse;
}
}
// NOTE: Linux kernel does not really reserve the pages for us. // All it does is to check if there are enough free pages // left at the time of mmap(). This could be a potential // problem. int os::Linux::commit_memory_impl(char* addr, size_t size, bool exec) { int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
uintptr_t res = (uintptr_t) ::mmap(addr, size, prot,
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0); if (res != (uintptr_t) MAP_FAILED) { if (UseNUMAInterleaving) {
numa_make_global(addr, size);
} return 0;
}
int err = errno; // save errno from mmap() call above
void os::pd_commit_memory_or_exit(char* addr, size_t size, bool exec, constchar* mesg) {
assert(mesg != NULL, "mesg must be specified"); int err = os::Linux::commit_memory_impl(addr, size, exec); if (err != 0) { // the caller wants all commit errors to exit with the specified mesg:
warn_fail_commit_memory(addr, size, exec, err);
vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "%s", mesg);
}
}
// Define MAP_HUGETLB here so we can build HotSpot on old systems. #ifndef MAP_HUGETLB #define MAP_HUGETLB 0x40000 #endif
// If mmap flags are set with MAP_HUGETLB and the system supports multiple // huge page sizes, flag bits [26:31] can be used to encode the log2 of the // desired huge page size. Otherwise, the system's default huge page size will be used. // See mmap(2) man page for more info (since Linux 3.8). // https://lwn.net/Articles/533499/ #ifndef MAP_HUGE_SHIFT #define MAP_HUGE_SHIFT 26 #endif
// Define MADV_HUGEPAGE here so we can build HotSpot on old systems. #ifndef MADV_HUGEPAGE #define MADV_HUGEPAGE 14 #endif
int os::Linux::commit_memory_impl(char* addr, size_t size,
size_t alignment_hint, bool exec) { int err = os::Linux::commit_memory_impl(addr, size, exec); if (err == 0) {
realign_memory(addr, size, alignment_hint);
} return err;
}
void os::pd_commit_memory_or_exit(char* addr, size_t size,
size_t alignment_hint, bool exec, constchar* mesg) {
assert(mesg != NULL, "mesg must be specified"); int err = os::Linux::commit_memory_impl(addr, size, alignment_hint, exec); if (err != 0) { // the caller wants all commit errors to exit with the specified mesg:
warn_fail_commit_memory(addr, size, alignment_hint, exec, err);
vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "%s", mesg);
}
}
void os::pd_realign_memory(char *addr, size_t bytes, size_t alignment_hint) { if (UseTransparentHugePages && alignment_hint > (size_t)vm_page_size()) { // We don't check the return value: madvise(MADV_HUGEPAGE) may not // be supported or the memory may already be backed by huge pages.
::madvise(addr, bytes, MADV_HUGEPAGE);
}
}
void os::pd_free_memory(char *addr, size_t bytes, size_t alignment_hint) { // This method works by doing an mmap over an existing mmaping and effectively discarding // the existing pages. However it won't work for SHM-based large pages that cannot be // uncommitted at all. We don't do anything in this case to avoid creating a segment with // small pages on top of the SHM segment. This method always works for small pages, so we // allow that in any case. if (alignment_hint <= (size_t)os::vm_page_size() || can_commit_large_page_memory()) {
commit_memory(addr, bytes, alignment_hint, !ExecMem);
}
}
// Define for numa_set_bind_policy(int). Setting the argument to 0 will set the // bind policy to MPOL_PREFERRED for the current thread. #define USE_MPOL_PREFERRED 0
void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) { // To make NUMA and large pages more robust when both enabled, we need to ease // the requirements on where the memory should be allocated. MPOL_BIND is the // default policy and it will force memory to be allocated on the specified // node. Changing this to MPOL_PREFERRED will prefer to allocate the memory on // the specified node, but will not force it. Using this policy will prevent // getting SIGBUS when trying to allocate large pages on NUMA nodes with no // free large pages.
Linux::numa_set_bind_policy(USE_MPOL_PREFERRED);
Linux::numa_tonode_memory(addr, bytes, lgrp_hint);
}
bool os::numa_topology_changed() { returnfalse; }
size_t os::numa_get_groups_num() { // Return just the number of nodes in which it's possible to allocate memory // (in numa terminology, configured nodes). return Linux::numa_num_configured_nodes();
}
int os::numa_get_group_id() { int cpu_id = Linux::sched_getcpu(); if (cpu_id != -1) { int lgrp_id = Linux::get_node_by_cpu(cpu_id); if (lgrp_id != -1) { return lgrp_id;
}
} return 0;
}
int os::numa_get_group_id_for_address(constvoid* address) { void** pages = const_cast<void**>(&address); int id = -1;
if (os::Linux::numa_move_pages(0, 1, pages, NULL, &id, 0) == -1) { return -1;
} if (id < 0) { return -1;
} return id;
}
int os::Linux::get_existing_num_nodes() { int node; int highest_node_number = Linux::numa_max_node(); int num_nodes = 0;
// Get the total number of nodes in the system including nodes without memory. for (node = 0; node <= highest_node_number; node++) { if (is_node_in_existing_nodes(node)) {
num_nodes++;
}
} return num_nodes;
}
size_t os::numa_get_leaf_groups(int *ids, size_t size) { int highest_node_number = Linux::numa_max_node();
size_t i = 0;
// Map all node ids in which it is possible to allocate memory. Also nodes are // not always consecutively available, i.e. available from 0 to the highest // node number. If the nodes have been bound explicitly using numactl membind, // then allocate memory from those nodes only. for (int node = 0; node <= highest_node_number; node++) { if (Linux::is_node_in_bound_nodes((unsignedint)node)) {
ids[i++] = node;
}
} return i;
}
int os::Linux::sched_getcpu_syscall(void) { unsignedint cpu = 0; int retval = -1;
#ifdefined(IA32) #ifndef SYS_getcpu #define SYS_getcpu 318 #endif
retval = syscall(SYS_getcpu, &cpu, NULL, NULL); #elifdefined(AMD64) // Unfortunately we have to bring all these macros here from vsyscall.h // to be able to compile on old linuxes. #define __NR_vgetcpu 2 #define VSYSCALL_START (-10UL << 20) #define VSYSCALL_SIZE 1024 #define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr)) typedeflong (*vgetcpu_t)(unsignedint *cpu, unsignedint *node, unsignedlong *tcache);
vgetcpu_t vgetcpu = (vgetcpu_t)VSYSCALL_ADDR(__NR_vgetcpu);
retval = vgetcpu(&cpu, NULL, NULL); #endif
return (retval == -1) ? retval : cpu;
}
void os::Linux::sched_getcpu_init() { // sched_getcpu() should be in libc.
set_sched_getcpu(CAST_TO_FN_PTR(sched_getcpu_func_t,
dlsym(RTLD_DEFAULT, "sched_getcpu")));
// If it's not, try a direct syscall. if (sched_getcpu() == -1) {
set_sched_getcpu(CAST_TO_FN_PTR(sched_getcpu_func_t,
(void*)&sched_getcpu_syscall));
}
if (sched_getcpu() == -1) {
vm_exit_during_initialization("getcpu(2) system call not supported by kernel");
}
}
// Something to do with the numa-aware allocator needs these symbols extern"C" JNIEXPORT void numa_warn(int number, char *where, ...) { } extern"C" JNIEXPORT void numa_error(char *where) { }
// Handle request to load libnuma symbol version 1.1 (API v1). If it fails // load symbol from base version instead. void* os::Linux::libnuma_dlsym(void* handle, constchar *name) { void *f = dlvsym(handle, name, "libnuma_1.1"); if (f == NULL) {
f = dlsym(handle, name);
} return f;
}
// Handle request to load libnuma symbol version 1.2 (API v2) only. // Return NULL if the symbol is not defined in this particular version. void* os::Linux::libnuma_v2_dlsym(void* handle, constchar* name) { return dlvsym(handle, name, "libnuma_1.2");
}
// Check numa dependent syscalls staticbool numa_syscall_check() { // NUMA APIs depend on several syscalls. E.g., get_mempolicy is required for numa_get_membind and // numa_get_interleave_mask. But these dependent syscalls can be unsupported for various reasons. // Especially in dockers, get_mempolicy is not allowed with the default configuration. So it's necessary // to check whether the syscalls are available. Currently, only get_mempolicy is checked since checking // others like mbind would cause unexpected side effects. #ifdef SYS_get_mempolicy int dummy = 0; if (syscall(SYS_get_mempolicy, &dummy, NULL, 0, (void*)&dummy, 3) == -1) { returnfalse;
} #endif
if (numa_available() != -1) {
set_numa_all_nodes((unsignedlong*)libnuma_dlsym(handle, "numa_all_nodes"));
set_numa_all_nodes_ptr((struct bitmask **)libnuma_dlsym(handle, "numa_all_nodes_ptr"));
set_numa_nodes_ptr((struct bitmask **)libnuma_dlsym(handle, "numa_nodes_ptr"));
set_numa_interleave_bitmask(_numa_get_interleave_mask());
set_numa_membind_bitmask(_numa_get_membind()); // Create an index -> node mapping, since nodes are not always consecutive
_nindex_to_node = new (mtInternal) GrowableArray<int>(0, mtInternal);
rebuild_nindex_to_node_map(); // Create a cpu -> node mapping
_cpu_to_node = new (mtInternal) GrowableArray<int>(0, mtInternal);
rebuild_cpu_to_node_map(); returntrue;
}
}
} returnfalse;
}
size_t os::Linux::default_guard_size(os::ThreadType thr_type) { // Creating guard page is very expensive. Java thread has HotSpot // guard pages, only enable glibc guard page for non-Java threads. // (Remember: compiler thread is a Java thread, too!) return ((thr_type == java_thread || thr_type == compiler_thread) ? 0 : os::vm_page_size());
}
void os::Linux::rebuild_nindex_to_node_map() { int highest_node_number = Linux::numa_max_node();
nindex_to_node()->clear(); for (int node = 0; node <= highest_node_number; node++) { if (Linux::is_node_in_existing_nodes(node)) {
nindex_to_node()->append(node);
}
}
}
// rebuild_cpu_to_node_map() constructs a table mapping cpud id to node id. // The table is later used in get_node_by_cpu(). void os::Linux::rebuild_cpu_to_node_map() { const size_t NCPUS = 32768; // Since the buffer size computation is very obscure // in libnuma (possible values are starting from 16, // and continuing up with every other power of 2, but less // than the maximum number of CPUs supported by kernel), and // is a subject to change (in libnuma version 2 the requirements // are more reasonable) we'll just hardcode the number they use // in the library. const size_t BitsPerCLong = sizeof(long) * CHAR_BIT;
int distance = 0; int closest_distance = INT_MAX; int closest_node = 0; unsignedlong *cpu_map = NEW_C_HEAP_ARRAY(unsignedlong, cpu_map_size, mtInternal); for (size_t i = 0; i < node_num; i++) { // Check if node is configured (not a memory-less node). If it is not, find // the closest configured node. Check also if node is bound, i.e. it's allowed // to allocate memory from the node. If it's not allowed, map cpus in that node // to the closest node from which memory allocation is allowed. if (!is_node_in_configured_nodes(nindex_to_node()->at(i)) ||
!is_node_in_bound_nodes(nindex_to_node()->at(i))) {
closest_distance = INT_MAX; // Check distance from all remaining nodes in the system. Ignore distance // from itself, from another non-configured node, and from another non-bound // node. for (size_t m = 0; m < node_num; m++) { if (m != i &&
is_node_in_configured_nodes(nindex_to_node()->at(m)) &&
is_node_in_bound_nodes(nindex_to_node()->at(m))) {
distance = numa_distance(nindex_to_node()->at(i), nindex_to_node()->at(m)); // If a closest node is found, update. There is always at least one // configured and bound node in the system so there is always at least // one node close. if (distance != 0 && distance < closest_distance) {
closest_distance = distance;
closest_node = nindex_to_node()->at(m);
}
}
}
} else { // Current node is already a configured node.
closest_node = nindex_to_node()->at(i);
}
// Get cpus from the original node and map them to the closest node. If node // is a configured node (not a memory-less node), then original node and // closest node are the same. if (numa_node_to_cpus(nindex_to_node()->at(i), cpu_map, cpu_map_size * sizeof(unsignedlong)) != -1) { for (size_t j = 0; j < cpu_map_valid_size; j++) { if (cpu_map[j] != 0) { for (size_t k = 0; k < BitsPerCLong; k++) { if (cpu_map[j] & (1UL << k)) { int cpu_index = j * BitsPerCLong + k;
#ifndef PRODUCT if (UseDebuggerErgo1 && cpu_index >= (int)cpu_num) { // Some debuggers limit the processor count without // intercepting the NUMA APIs. Just fake the values.
cpu_index = 0;
} #endif
int os::Linux::numa_node_to_cpus(int node, unsignedlong *buffer, int bufferlen) { // use the latest version of numa_node_to_cpus if available if (_numa_node_to_cpus_v2 != NULL) {
// libnuma bitmask struct struct bitmask { unsignedlong size; /* number of bits in the map */ unsignedlong *maskp;
};
// Use a trick with mincore to check whether the page is mapped or not. // mincore sets vec to 1 if page resides in memory and to 0 if page // is swapped output but if page we are asking for is unmapped // it returns -1,ENOMEM
mincore_return_value = mincore(nbot, page_sz, vec);
if (mincore_return_value == -1) { // Page is not mapped go up // to find first mapped page if (errno != EAGAIN) {
assert(errno == ENOMEM, "Unexpected mincore errno");
imax = imid;
}
} else { // Page is mapped go down // to find first not mapped page
imin = imid + 1;
}
}
nbot = nbot + page_sz;
// Adjust stack bottom one page up if last checked page is not mapped if (mincore_return_value == -1) {
nbot = nbot + page_sz;
}
return nbot;
}
bool os::committed_in_range(address start, size_t size, address& committed_start, size_t& committed_size) { int mincore_return_value; const size_t stripe = 1024; // query this many pages each time unsignedchar vec[stripe + 1]; // set a guard
vec[stripe] = 'X';
for (int index = 0; index < loops && !found_range; index ++) {
assert(pages > 0, "Nothing to do"); int pages_to_query = (pages >= stripe) ? stripe : pages;
pages -= pages_to_query;
// Get stable read while ((mincore_return_value = mincore(loop_base, pages_to_query * page_sz, vec)) == -1 && errno == EAGAIN);
// During shutdown, some memory goes away without properly notifying NMT, // E.g. ConcurrentGCThread/WatcherThread can exit without deleting thread object. // Bailout and return as not committed for now. if (mincore_return_value == -1 && errno == ENOMEM) { returnfalse;
}
assert(vec[stripe] == 'X', "overflow guard");
assert(mincore_return_value == 0, "Range must be valid"); // Process this stripe for (int vecIdx = 0; vecIdx < pages_to_query; vecIdx ++) { if ((vec[vecIdx] & 0x01) == 0) { // not committed // End of current contiguous region if (committed_start != NULL) {
found_range = true; break;
}
} else { // committed // Start of region if (committed_start == NULL) {
committed_start = loop_base + page_sz * vecIdx;
}
committed_pages ++;
}
}
loop_base += pages_to_query * page_sz;
}
if (committed_start != NULL) {
assert(committed_pages > 0, "Must have committed region");
assert(committed_pages <= int(size / page_sz), "Can not commit more than it has");
assert(committed_start >= start && committed_start < start + size, "Out of range");
committed_size = page_sz * committed_pages; returntrue;
} else {
assert(committed_pages == 0, "Should not have committed region"); returnfalse;
}
}
// Linux uses a growable mapping for the stack, and if the mapping for // the stack guard pages is not removed when we detach a thread the // stack cannot grow beyond the pages where the stack guard was // mapped. If at some point later in the process the stack expands to // that point, the Linux kernel cannot expand the stack any further // because the guard pages are in the way, and a segfault occurs. // // However, it's essential not to split the stack region by unmapping // a region (leaving a hole) that's already part of the stack mapping, // so if the stack mapping has already grown beyond the guard pages at // the time we create them, we have to truncate the stack mapping. // So, we need to know the extent of the stack mapping when // create_stack_guard_pages() is called.
// We only need this for stacks that are growable: at the time of // writing thread stacks don't use growable mappings (i.e. those // creeated with MAP_GROWSDOWN), and aren't marked "[stack]", so this // only applies to the main thread.
// If the (growable) stack mapping already extends beyond the point // where we're going to put our guard pages, truncate the mapping at // that point by munmap()ping it. This ensures that when we later // munmap() the guard pages we don't leave a hole in the stack // mapping. This only affects the main/primordial thread
bool os::pd_create_stack_guard_pages(char* addr, size_t size) { if (os::is_primordial_thread()) { // As we manually grow stack up to bottom inside create_attached_thread(), // it's likely that os::Linux::initial_thread_stack_bottom is mapped and // we don't need to do anything special. // Check it first, before calling heavy function.
uintptr_t stack_extent = (uintptr_t) os::Linux::initial_thread_stack_bottom(); unsignedchar vec[1];
if (mincore((address)stack_extent, os::vm_page_size(), vec) == -1) { // Fallback to slow path on all errors, including EAGAIN
assert((uintptr_t)addr >= stack_extent, "Sanity: addr should be larger than extent, " PTR_FORMAT " >= " PTR_FORMAT,
p2i(addr), stack_extent);
stack_extent = (uintptr_t) get_stack_commited_bottom(
os::Linux::initial_thread_stack_bottom(),
(size_t)addr - stack_extent);
}
// If this is a growable mapping, remove the guard pages entirely by // munmap()ping them. If not, just call uncommit_memory(). This only // affects the main/primordial thread, but guard against future OS changes. // It's safe to always unmap guard pages for primordial thread because we // always place it right after end of the mapped region.
if (os::is_primordial_thread()) { return ::munmap(addr, size) == 0;
}
return os::uncommit_memory(addr, size);
}
// 'requested_addr' is only treated as a hint, the return value may or // may not start from the requested address. Unlike Linux mmap(), this // function returns NULL to indicate failure. staticchar* anon_mmap(char* requested_addr, size_t bytes) { // MAP_FIXED is intentionally left out, to leave existing mappings intact. constint flags = MAP_PRIVATE | MAP_NORESERVE | MAP_ANONYMOUS;
// Map reserved/uncommitted pages PROT_NONE so we fail early if we // touch an uncommitted page. Otherwise, the read/write might // succeed if we have enough swap space to back the physical page. char* addr = (char*)::mmap(requested_addr, bytes, PROT_NONE, flags, -1, 0);
return addr == MAP_FAILED ? NULL : addr;
}
// Allocate (using mmap, NO_RESERVE, with small pages) at either a given request address // (req_addr != NULL) or with a given alignment. // - bytes shall be a multiple of alignment. // - req_addr can be NULL. If not NULL, it must be a multiple of alignment. // - alignment sets the alignment at which memory shall be allocated. // It must be a multiple of allocation granularity. // Returns address of memory or NULL. If req_addr was not NULL, will only return // req_addr or NULL. staticchar* anon_mmap_aligned(char* req_addr, size_t bytes, size_t alignment) {
size_t extra_size = bytes; if (req_addr == NULL && alignment > 0) {
extra_size += alignment;
}
staticbool linux_mprotect(char* addr, size_t size, int prot) { // Linux wants the mprotect address argument to be page aligned. char* bottom = (char*)align_down((intptr_t)addr, os::vm_page_size());
// According to SUSv3, mprotect() should only be used with mappings // established by mmap(), and mmap() always maps whole pages. Unaligned // 'addr' likely indicates problem in the VM (e.g. trying to change // protection of malloc'ed or statically allocated memory). Check the // caller if you hit this assert.
assert(addr == bottom, "sanity check");
size = align_up(pointer_delta(addr, bottom, 1) + size, os::vm_page_size()); // Don't log anything if we're executing in the poison page signal handling // context. It can lead to reentrant use of other parts of the VM code. #ifdef CAN_SHOW_REGISTERS_ON_ASSERT if (addr != g_assert_poison) #endif
Events::log(NULL, "Protecting memory [" INTPTR_FORMAT "," INTPTR_FORMAT "] with protection modes %x", p2i(bottom), p2i(bottom+size), prot); return ::mprotect(bottom, size, prot) == 0;
}
// Set protections specified bool os::protect_memory(char* addr, size_t bytes, ProtType prot, bool is_committed) { unsignedint p = 0; switch (prot) { case MEM_PROT_NONE: p = PROT_NONE; break; case MEM_PROT_READ: p = PROT_READ; break; case MEM_PROT_RW: p = PROT_READ|PROT_WRITE; break; case MEM_PROT_RWX: p = PROT_READ|PROT_WRITE|PROT_EXEC; break; default:
ShouldNotReachHere();
} // is_committed is unused. return linux_mprotect(addr, bytes, p);
}
if (warn) {
warning("HugeTLBFS is not configured or not supported by the operating system.");
}
returnfalse;
}
bool os::Linux::shm_hugetlbfs_sanity_check(bool warn, size_t page_size) { // Try to create a large shared memory segment. int shmid = shmget(IPC_PRIVATE, page_size, SHM_HUGETLB|IPC_CREAT|SHM_R|SHM_W); if (shmid == -1) { // Possible reasons for shmget failure: // 1. shmmax is too small for the request. // > check shmmax value: cat /proc/sys/kernel/shmmax // > increase shmmax value: echo "new_value" > /proc/sys/kernel/shmmax // 2. not enough large page memory. // > check available large pages: cat /proc/meminfo // > increase amount of large pages: // sysctl -w vm.nr_hugepages=new_value // > For more information regarding large pages please refer to: // https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt if (warn) {
warning("Large pages using UseSHM are not configured on this system.");
} returnfalse;
} // Managed to create a segment, now delete it.
shmctl(shmid, IPC_RMID, NULL); returntrue;
}
// From the coredump_filter documentation: // // - (bit 0) anonymous private memory // - (bit 1) anonymous shared memory // - (bit 2) file-backed private memory // - (bit 3) file-backed shared memory // - (bit 4) ELF header pages in file-backed private memory areas (it is // effective only if the bit 2 is cleared) // - (bit 5) hugetlb private memory // - (bit 6) hugetlb shared memory // - (bit 7) dax private memory // - (bit 8) dax shared memory // staticvoid set_coredump_filter(CoredumpFilterBit bit) {
FILE *f; long cdm;
if ((f = os::fopen("/proc/self/coredump_filter", "r+")) == NULL) { return;
}
if (fscanf(f, "%lx", &cdm) != 1) {
fclose(f); return;
}
long saved_cdm = cdm;
rewind(f);
cdm |= bit;
if (cdm != saved_cdm) {
fprintf(f, "%#lx", cdm);
}
// large_page_size on Linux is used to round up heap size. x86 uses either // 2M or 4M page, depending on whether PAE (Physical Address Extensions) // mode is enabled. AMD64/EM64T uses 2M page in 64bit mode. IA64 can use // page as large as 1G. // // Here we try to figure out page size by parsing /proc/meminfo and looking // for a line with the following format: // Hugepagesize: 2048 kB // // If we can't determine the value (e.g. /proc is not mounted, or the text // format has been changed), we'll set largest page size to 0
FILE *fp = os::fopen("/proc/meminfo", "r"); if (fp) { while (!feof(fp)) { int x = 0; char buf[16]; if (fscanf(fp, "Hugepagesize: %d", &x) == 1) { if (x && fgets(buf, sizeof(buf), fp) && strcmp(buf, " kB\n") == 0) {
default_large_page_size = x * K; break;
}
} else { // skip to next line for (;;) { int ch = fgetc(fp); if (ch == EOF || ch == (int)'\n') break;
}
}
}
fclose(fp);
}
return default_large_page_size;
}
static os::PageSizes scan_multiple_page_support() { // Scan /sys/kernel/mm/hugepages // to discover the available page sizes constchar* sys_hugepages = "/sys/kernel/mm/hugepages";
os::PageSizes page_sizes;
DIR *dir = opendir(sys_hugepages);
struct dirent *entry;
size_t page_size; while ((entry = readdir(dir)) != NULL) { if (entry->d_type == DT_DIR &&
sscanf(entry->d_name, "hugepages-%zukB", &page_size) == 1) { // The kernel is using kB, hotspot uses bytes // Add each found Large Page Size to page_sizes
page_sizes.add(page_size * K);
}
}
closedir(dir);
void warn_no_large_pages_configured() { if (!FLAG_IS_DEFAULT(UseLargePages)) {
log_warning(pagesize)("UseLargePages disabled, no large pages configured and available on the system.");
}
}
// The type of large pages has not been specified by the user.
// Try UseHugeTLBFS and then UseSHM.
UseHugeTLBFS = UseSHM = true;
// Don't try UseTransparentHugePages since there are known // performance issues with it turned on. This might change in the future.
UseTransparentHugePages = false;
}
if (UseHugeTLBFS) { bool warn_on_failure = !FLAG_IS_DEFAULT(UseHugeTLBFS); if (hugetlbfs_sanity_check(warn_on_failure, page_size)) {
UseSHM = false; returntrue;
}
UseHugeTLBFS = false;
}
if (UseSHM) { bool warn_on_failure = !FLAG_IS_DEFAULT(UseSHM); if (shm_hugetlbfs_sanity_check(warn_on_failure, page_size)) { returntrue;
}
UseSHM = false;
}
warn_no_large_pages_configured(); returnfalse;
}
void os::large_page_init() { // 1) Handle the case where we do not want to use huge pages and hence // there is no need to scan the OS for related info if (!UseLargePages &&
!UseTransparentHugePages &&
!UseHugeTLBFS &&
!UseSHM) { // Not using large pages. return;
}
if (!FLAG_IS_DEFAULT(UseLargePages) && !UseLargePages) { // The user explicitly turned off large pages. // Ignore the rest of the large pages flags.
UseTransparentHugePages = false;
UseHugeTLBFS = false;
UseSHM = false; return;
}
// 2) Scan OS info
size_t default_large_page_size = scan_default_large_page_size();
os::Linux::_default_large_page_size = default_large_page_size; if (default_large_page_size == 0) { // No large pages configured, return.
warn_no_large_pages_configured();
UseLargePages = false;
UseTransparentHugePages = false;
UseHugeTLBFS = false;
UseSHM = false; return;
}
os::PageSizes all_large_pages = scan_multiple_page_support();
// 3) Consistency check and post-processing
// It is unclear if /sys/kernel/mm/hugepages/ and /proc/meminfo could disagree. Manually // re-add the default page size to the list of page sizes to be sure.
all_large_pages.add(default_large_page_size);
// Check LargePageSizeInBytes matches an available page size and if so set _large_page_size // using LargePageSizeInBytes as the maximum allowed large page size. If LargePageSizeInBytes // doesn't match an available page size set _large_page_size to default_large_page_size // and use it as the maximum. if (FLAG_IS_DEFAULT(LargePageSizeInBytes) ||
LargePageSizeInBytes == 0 ||
LargePageSizeInBytes == default_large_page_size) {
_large_page_size = default_large_page_size;
log_info(pagesize)("Using the default large page size: " SIZE_FORMAT "%s",
byte_size_in_exact_unit(_large_page_size),
exact_unit_for_byte_size(_large_page_size));
} else { if (all_large_pages.contains(LargePageSizeInBytes)) {
_large_page_size = LargePageSizeInBytes;
log_info(pagesize)("Overriding default large page size (" SIZE_FORMAT "%s) " "using LargePageSizeInBytes: " SIZE_FORMAT "%s",
byte_size_in_exact_unit(default_large_page_size),
exact_unit_for_byte_size(default_large_page_size),
byte_size_in_exact_unit(_large_page_size),
exact_unit_for_byte_size(_large_page_size));
} else {
_large_page_size = default_large_page_size;
log_info(pagesize)("LargePageSizeInBytes is not a valid large page size (" SIZE_FORMAT "%s) " "using the default large page size: " SIZE_FORMAT "%s",
byte_size_in_exact_unit(LargePageSizeInBytes),
exact_unit_for_byte_size(LargePageSizeInBytes),
byte_size_in_exact_unit(_large_page_size),
exact_unit_for_byte_size(_large_page_size));
}
}
// Populate _page_sizes with large page sizes less than or equal to // _large_page_size. for (size_t page_size = _large_page_size; page_size != 0;
page_size = all_large_pages.next_smaller(page_size)) {
_page_sizes.add(page_size);
}
// Since we don't know if the kernel unmapped the pre-reserved memory area // we can't unmap it, since that would potentially unmap memory that was // mapped from other threads. return NULL;
}
return addr;
}
staticchar* shmat_at_address(int shmid, char* req_addr) { if (!is_aligned(req_addr, SHMLBA)) {
assert(false, "Requested address needs to be SHMLBA aligned"); return NULL;
}
char* addr = (char*)shmat(shmid, req_addr, 0);
if ((intptr_t)addr == -1) {
shm_warning_with_errno("Failed to attach shared memory."); return NULL;
}
return addr;
}
staticchar* shmat_large_pages(int shmid, size_t bytes, size_t alignment, char* req_addr) { // If a req_addr has been provided, we assume that the caller has already aligned the address. if (req_addr != NULL) {
assert(is_aligned(req_addr, os::large_page_size()), "Must be divisible by the large page size");
assert(is_aligned(req_addr, alignment), "Must be divisible by given alignment"); return shmat_at_address(shmid, req_addr);
}
// Since shmid has been setup with SHM_HUGETLB, shmat will automatically // return large page size aligned memory addresses when req_addr == NULL. // However, if the alignment is larger than the large page size, we have // to manually ensure that the memory returned is 'alignment' aligned. if (alignment > os::large_page_size()) {
assert(is_aligned(alignment, os::large_page_size()), "Must be divisible by the large page size"); return shmat_with_alignment(shmid, bytes, alignment);
} else { return shmat_at_address(shmid, NULL);
}
}
char* os::Linux::reserve_memory_special_shm(size_t bytes, size_t alignment, char* req_addr, bool exec) { // "exec" is passed in but not used. Creating the shared image for // the code cache doesn't have an SHM_X executable permission to check.
assert(UseLargePages && UseSHM, "only for SHM large pages");
assert(is_aligned(req_addr, os::large_page_size()), "Unaligned address");
assert(is_aligned(req_addr, alignment), "Unaligned address");
if (!is_aligned(bytes, os::large_page_size())) { return NULL; // Fallback to small pages.
}
// Create a large shared memory region to attach to based on size. // Currently, size is the total size of the heap. int shmid = shmget(IPC_PRIVATE, bytes, SHM_HUGETLB|IPC_CREAT|SHM_R|SHM_W); if (shmid == -1) { // Possible reasons for shmget failure: // 1. shmmax is too small for the request. // > check shmmax value: cat /proc/sys/kernel/shmmax // > increase shmmax value: echo "new_value" > /proc/sys/kernel/shmmax // 2. not enough large page memory. // > check available large pages: cat /proc/meminfo // > increase amount of large pages: // sysctl -w vm.nr_hugepages=new_value // > For more information regarding large pages please refer to: // https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt // Note 1: different Linux may use different name for this property, // e.g. on Redhat AS-3 it is "hugetlb_pool". // Note 2: it's possible there's enough physical memory available but // they are so fragmented after a long run that they can't // coalesce into large pages. Try to reserve large pages when // the system is still "fresh".
shm_warning_with_errno("Failed to reserve shared memory."); return NULL;
}
// Attach to the region. char* addr = shmat_large_pages(shmid, bytes, alignment, req_addr);
// Remove shmid. If shmat() is successful, the actual shared memory segment // will be deleted when it's detached by shmdt() or when the process // terminates. If shmat() is not successful this will remove the shared // segment immediately.
shmctl(shmid, IPC_RMID, NULL);
return addr;
}
staticvoid log_on_commit_special_failure(char* req_addr, size_t bytes,
size_t page_size, int error) {
assert(error == ENOMEM, "Only expect to fail if no memory is available");
log_info(pagesize)("Failed to reserve and commit memory with given page size. req_addr: " PTR_FORMAT " size: " SIZE_FORMAT "%s, page size: " SIZE_FORMAT "%s, (errno = %d)",
p2i(req_addr), byte_size_in_exact_unit(bytes), exact_unit_for_byte_size(bytes),
byte_size_in_exact_unit(page_size), exact_unit_for_byte_size(page_size), error);
}
bool os::Linux::commit_memory_special(size_t bytes,
size_t page_size, char* req_addr, bool exec) {
assert(UseLargePages && UseHugeTLBFS, "Should only get here when HugeTLBFS large pages are used");
assert(is_aligned(bytes, page_size), "Unaligned size");
assert(is_aligned(req_addr, page_size), "Unaligned address");
assert(req_addr != NULL, "Must have a requested address for special mappings");
int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE; int flags = MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED;
// For large pages additional flags are required. if (page_size > (size_t) os::vm_page_size()) {
flags |= MAP_HUGETLB | hugetlbfs_page_size_flag(page_size);
} char* addr = (char*)::mmap(req_addr, bytes, prot, flags, -1, 0);
char* os::Linux::reserve_memory_special_huge_tlbfs(size_t bytes,
size_t alignment,
size_t page_size, char* req_addr, bool exec) {
assert(UseLargePages && UseHugeTLBFS, "only for Huge TLBFS large pages");
assert(is_aligned(req_addr, alignment), "Must be");
assert(is_aligned(req_addr, page_size), "Must be");
assert(is_aligned(alignment, os::vm_allocation_granularity()), "Must be");
assert(_page_sizes.contains(page_size), "Must be a valid page size");
assert(page_size > (size_t)os::vm_page_size(), "Must be a large page size");
assert(bytes >= page_size, "Shouldn't allocate large pages for small sizes");
// We only end up here when at least 1 large page can be used. // If the size is not a multiple of the large page size, we // will mix the type of pages used, but in a descending order. // Start off by reserving a range of the given size that is // properly aligned. At this point no pages are committed. If // a requested address is given it will be used and it must be // aligned to both the large page size and the given alignment. // The larger of the two will be used.
size_t required_alignment = MAX(page_size, alignment); char* const aligned_start = anon_mmap_aligned(req_addr, bytes, required_alignment); if (aligned_start == NULL) { return NULL;
}
// First commit using large pages.
size_t large_bytes = align_down(bytes, page_size); bool large_committed = commit_memory_special(large_bytes, page_size, aligned_start, exec);
if (large_committed && bytes == large_bytes) { // The size was large page aligned so no additional work is // needed even if the commit failed. return aligned_start;
}
// The requested size requires some small pages as well. char* small_start = aligned_start + large_bytes;
size_t small_size = bytes - large_bytes; if (!large_committed) { // Failed to commit large pages, so we need to unmap the // reminder of the orinal reservation.
::munmap(small_start, small_size); return NULL;
}
// Commit the remaining bytes using small pages. bool small_committed = commit_memory_special(small_size, os::vm_page_size(), small_start, exec); if (!small_committed) { // Failed to commit the remaining size, need to unmap // the large pages part of the reservation.
::munmap(aligned_start, large_bytes); return NULL;
} return aligned_start;
}
char* os::pd_reserve_memory_special(size_t bytes, size_t alignment, size_t page_size, char* req_addr, bool exec) {
assert(UseLargePages, "only for large pages");
char* addr; if (UseSHM) { // No support for using specific page sizes with SHM.
addr = os::Linux::reserve_memory_special_shm(bytes, alignment, req_addr, exec);
} else {
assert(UseHugeTLBFS, "must be");
addr = os::Linux::reserve_memory_special_huge_tlbfs(bytes, alignment, page_size, req_addr, exec);
}
if (addr != NULL) { if (UseNUMAInterleaving) {
numa_make_global(addr, bytes);
}
}
return addr;
}
bool os::Linux::release_memory_special_shm(char* base, size_t bytes) { // detaching the SHM segment will also delete it, see reserve_memory_special_shm() return shmdt(base) == 0;
}
// With SysV SHM the entire memory region must be allocated as shared // memory. // HugeTLBFS allows application to commit large page memory on demand. // However, when committing memory with HugeTLBFS fails, the region // that was supposed to be committed will lose the old reservation // and allow other threads to steal that memory region. Because of this // behavior we can't commit HugeTLBFS memory. bool os::can_commit_large_page_memory() { return UseTransparentHugePages;
}
char* os::pd_attempt_map_memory_to_file_at(char* requested_addr, size_t bytes, int file_desc) {
assert(file_desc >= 0, "file_desc is not valid"); char* result = pd_attempt_reserve_memory_at(requested_addr, bytes, !ExecMem); if (result != NULL) { if (replace_existing_mapping_with_file_mapping(result, bytes, file_desc) == NULL) {
vm_exit_during_initialization(err_msg("Error in mapping Java heap at the given filesystem directory"));
}
} return result;
}
// Reserve memory at an arbitrary address, only if that area is // available (and not reserved for something else).
char* os::pd_attempt_reserve_memory_at(char* requested_addr, size_t bytes, bool exec) { // Assert only that the size is a multiple of the page size, since // that's all that mmap requires, and since that's all we really know // about at this low abstraction level. If we need higher alignment, // we can either pass an alignment to this method or verify alignment // in one of the methods further up the call chain. See bug 5044738.
assert(bytes % os::vm_page_size() == 0, "reserving unexpected size block");
// Repeatedly allocate blocks until the block is allocated at the // right spot.
// Linux mmap allows caller to pass an address as hint; give it a try first, // if kernel honors the hint then we can return immediately. char * addr = anon_mmap(requested_addr, bytes); if (addr == requested_addr) { return requested_addr;
}
if (addr != NULL) { // mmap() is successful but it fails to reserve at the requested address
anon_munmap(addr, bytes);
}
return NULL;
}
// Used to convert frequent JVM_Yield() to nops bool os::dont_yield() { return DontYieldALot;
}
// Linux CFS scheduler (since 2.6.23) does not guarantee sched_yield(2) will // actually give up the CPU. Since skip buddy (v2.6.28): // // * Sets the yielding task as skip buddy for current CPU's run queue. // * Picks next from run queue, if empty, picks a skip buddy (can be the yielding task). // * Clears skip buddies for this run queue (yielding task no longer a skip buddy). // // An alternative is calling os::naked_short_nanosleep with a small number to avoid // getting re-scheduled immediately. // void os::naked_yield() {
sched_yield();
}
//////////////////////////////////////////////////////////////////////////////// // thread priority support
// Note: Normal Linux applications are run with SCHED_OTHER policy. SCHED_OTHER // only supports dynamic priority, static priority must be zero. For real-time // applications, Linux supports SCHED_RR which allows static priority (1-99). // However, for large multi-threaded applications, SCHED_RR is not only slower // than SCHED_OTHER, but also very unstable (my volano tests hang hard 4 out // of 5 runs - Sep 2005). // // The following code actually changes the niceness of kernel-thread/LWP. It // has an assumption that setpriority() only modifies one kernel-thread/LWP, // not the entire user process, and user level threads are 1:1 mapped to kernel // threads. It has always been the case, but could change in the future. For // this reason, the code should not be used as default (ThreadPriorityPolicy=0). // It is only used when ThreadPriorityPolicy=1 and may require system level permission // (e.g., root privilege or CAP_SYS_NICE capability).
int os::java_to_os_priority[CriticalPriority + 1] = {
19, // 0 Entry should never be used
4, // 1 MinPriority
3, // 2
2, // 3
1, // 4
0, // 5 NormPriority
-1, // 6
-2, // 7
-3, // 8
-4, // 9 NearMaxPriority
-5, // 10 MaxPriority
-5 // 11 CriticalPriority
};
staticint prio_init() { if (ThreadPriorityPolicy == 1) { if (geteuid() != 0) { if (!FLAG_IS_DEFAULT(ThreadPriorityPolicy) && !FLAG_IS_JIMAGE_RESOURCE(ThreadPriorityPolicy)) {
warning("-XX:ThreadPriorityPolicy=1 may require system level permission, " \ "e.g., being the root user. If the necessary permission is not " \ "possessed, changes to priority will be silently ignored.");
}
}
} if (UseCriticalJavaThreadPriority) {
os::java_to_os_priority[MaxPriority] = os::java_to_os_priority[CriticalPriority];
} return 0;
}
OSReturn os::set_native_priority(Thread* thread, int newpri) { if (!UseThreadPriorities || ThreadPriorityPolicy == 0) return OS_OK;
int ret = setpriority(PRIO_PROCESS, thread->osthread()->thread_id(), newpri); return (ret == 0) ? OS_OK : OS_ERR;
}
// This is the fastest way to get thread cpu time on Linux. // Returns cpu time (user+sys) for any thread, not only for current. // POSIX compliant clocks are implemented in the kernels 2.6.16+. // It might work on 2.6.10+ with a special kernel/glibc patch. // For reference, please, see IEEE Std 1003.1-2004: // http://www.unix.org/single_unix_specification
// Determine if the vmid is the parent pid for a child in a PID namespace. // Return the namespace pid if so, otherwise -1. int os::Linux::get_namespace_pid(int vmid) { char fname[24]; int retpid = -1;
if (fp) { int pid, nspid; int ret; while (!feof(fp) && !ferror(fp)) {
ret = fscanf(fp, "NSpid: %d %d", &pid, &nspid); if (ret == 1) { break;
} if (ret == 2) {
retpid = nspid; break;
} for (;;) { int ch = fgetc(fp); if (ch == EOF || ch == (int)'\n') break;
}
}
fclose(fp);
} return retpid;
}
externvoid report_error(char* file_name, int line_no, char* title, char* format, ...);
// Some linux distributions (notably: Alpine Linux) include the // grsecurity in the kernel. Of particular interest from a JVM perspective // is PaX (https://pax.grsecurity.net/), which adds some security features // related to page attributes. Specifically, the MPROTECT PaX functionality // (https://pax.grsecurity.net/docs/mprotect.txt) prevents dynamic // code generation by disallowing a (previously) writable page to be // marked as executable. This is, of course, exactly what HotSpot does // for both JIT compiled method, as well as for stubs, adapters, etc. // // Instead of crashing "lazily" when trying to make a page executable, // this code probes for the presence of PaX and reports the failure // eagerly. staticvoid check_pax(void) { // Zero doesn't generate code dynamically, so no need to perform the PaX check #ifndef ZERO
size_t size = os::vm_page_size();
void* p = ::mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (p == MAP_FAILED) {
log_debug(os)("os_linux.cpp: check_pax: mmap failed (%s)" , os::strerror(errno));
vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "failed to allocate memory for PaX check.");
}
int res = ::mprotect(p, size, PROT_READ|PROT_WRITE|PROT_EXEC); if (res == -1) {
log_debug(os)("os_linux.cpp: check_pax: mprotect failed (%s)" , os::strerror(errno));
vm_exit_during_initialization( "Failed to mark memory page as executable - check if grsecurity/PaX is enabled");
}
::munmap(p, size); #endif
}
// this is called _before_ most of the global arguments have been parsed void os::init(void) { char dummy; // used to get a guess on initial stack address
clock_tics_per_sec = sysconf(_SC_CLK_TCK);
int page_size = sysconf(_SC_PAGESIZE);
OSInfo::set_vm_page_size(page_size);
OSInfo::set_vm_allocation_granularity(page_size); if (os::vm_page_size() <= 0) {
fatal("os_linux.cpp: os::init: sysconf failed (%s)",
os::strerror(errno));
}
_page_sizes.add(os::vm_page_size());
// Java can be invoked as // 1. Without numactl and heap will be allocated/configured on all nodes as // per the system policy. // 2. With numactl --interleave: // Use numa_get_interleave_mask(v2) API to get nodes bitmask. The same // API for membind case bitmask is reset. // Interleave is only hint and Kernel can fallback to other nodes if // no memory is available on the target nodes. // 3. With numactl --membind: // Use numa_get_membind(v2) API to get nodes bitmask. The same API for // interleave case returns bitmask of all nodes. // numa_all_nodes_ptr holds bitmask of all nodes. // numa_get_interleave_mask(v2) and numa_get_membind(v2) APIs returns correct // bitmask when externally configured to run on all or fewer nodes.
if (!Linux::libnuma_init()) {
FLAG_SET_ERGO(UseNUMA, false);
FLAG_SET_ERGO(UseNUMAInterleaving, false); // Also depends on libnuma.
} else { if ((Linux::numa_max_node() < 1) || Linux::is_bound_to_single_node()) { // If there's only one node (they start from 0) or if the process // is bound explicitly to a single node using membind, disable NUMA
UseNUMA = false;
} else {
LogTarget(Info,os) log;
LogStream ls(log);
// When NUMA requested, not-NUMA-aware allocations default to interleaving. if (UseNUMA && !UseNUMAInterleaving) {
FLAG_SET_ERGO_IF_DEFAULT(UseNUMAInterleaving, true);
}
if (UseParallelGC && UseNUMA && UseLargePages && !can_commit_large_page_memory()) { // With SHM and HugeTLBFS large pages we cannot uncommit a page, so there's no way // we can make the adaptive lgrp chunk resizing work. If the user specified both // UseNUMA and UseLargePages (or UseSHM/UseHugeTLBFS) on the command line - warn // and disable adaptive resizing. if (UseAdaptiveSizePolicy || UseAdaptiveNUMAChunkSizing) {
warning("UseNUMA is not fully compatible with SHM/HugeTLBFS large pages, " "disabling adaptive resizing (-XX:-UseAdaptiveSizePolicy -XX:-UseAdaptiveNUMAChunkSizing)");
UseAdaptiveSizePolicy = false;
UseAdaptiveNUMAChunkSizing = false;
}
}
}
#ifdefined(IA32) && !defined(ZERO) /* * Work-around (execute code at a high address) for broken NX emulation using CS limit, * Red Hat patch "Exec-Shield" (IA32 only). * * Map and execute at a high VA to prevent CS lazy updates race with SMP MM * invalidation.Further code generation by the JVM will no longer cause CS limit * updates. * * Affects IA32: RHEL 5 & 6, Ubuntu 10.04 (LTS), 10.10, 11.04, 11.10, 12.04. * @see JDK-8023956
*/ staticvoid workaround_expand_exec_shield_cs_limit() {
assert(os::Linux::initial_thread_stack_bottom() != NULL, "sanity");
size_t page_size = os::vm_page_size();
/* * JDK-8197429 * * Expand the stack mapping to the end of the initial stack before * attempting to install the codebuf. This is needed because newer * Linux kernels impose a distance of a megabyte between stack * memory and other memory regions. If we try to install the * codebuf before expanding the stack the installation will appear * to succeed but we'll get a segfault later if we expand the stack * in Java code. *
*/ if (os::is_primordial_thread()) {
address limit = os::Linux::initial_thread_stack_bottom(); if (! DisablePrimordialThreadGuardPages) {
limit += StackOverflow::stack_red_zone_size() +
StackOverflow::stack_yellow_zone_size();
}
os::Linux::expand_stack_to(limit);
}
/* * Take the highest VA the OS will give us and exec * * Although using -(pagesz) as mmap hint works on newer kernel as you would * think, older variants affected by this work-around don't (search forward only). * * On the affected distributions, we understand the memory layout to be: * * TASK_LIMIT= 3G, main stack base close to TASK_LIMT. * * A few pages south main stack will do it. * * If we are embedded in an app other than launcher (initial != main stack), * we don't have much control or understanding of the address space, just let it slide.
*/ char* hint = (char*)(os::Linux::initial_thread_stack_bottom() -
(StackOverflow::stack_guard_zone_size() + page_size)); char* codebuf = os::attempt_reserve_memory_at(hint, page_size);
if (codebuf == NULL) { // JDK-8197429: There may be a stack gap of one megabyte between // the limit of the stack and the nearest memory region: this is a // Linux kernel workaround for CVE-2017-1000364. If we failed to // map our codebuf, try again at an address one megabyte lower.
hint -= 1 * M;
codebuf = os::attempt_reserve_memory_at(hint, page_size);
}
if ((codebuf == NULL) || (!os::commit_memory(codebuf, page_size, true))) { return; // No matter, we tried, best effort.
}
// Some code to exec: the 'ret' instruction
codebuf[0] = 0xC3;
// Call the code in the codebuf
__asm__ volatile("call *%0" : : "r"(codebuf));
// keep the page mapped so CS limit isn't reduced.
} #endif// defined(IA32) && !defined(ZERO)
// this is called _after_ the global arguments have been parsed
jint os::init_2(void) {
// This could be set after os::Posix::init() but all platforms // have to set it the same so we have to mirror Solaris.
DEBUG_ONLY(os::set_mutex_init_done();)
os::Posix::init_2();
Linux::fast_thread_clock_init();
if (PosixSignals::init() == JNI_ERR) { return JNI_ERR;
}
if (AdjustStackSizeForTLS) {
get_minstack_init();
}
// Check and sets minimum stack sizes against command line options if (set_minimum_stack_sizes() == JNI_ERR) { return JNI_ERR;
}
#ifdefined(IA32) && !defined(ZERO) // Need to ensure we've determined the process's initial stack to // perform the workaround
Linux::capture_initial_stack(JavaThread::stack_size_at_create());
workaround_expand_exec_shield_cs_limit(); #else
suppress_primordial_thread_resolution = Arguments::created_by_java_launcher(); if (!suppress_primordial_thread_resolution) {
Linux::capture_initial_stack(JavaThread::stack_size_at_create());
} #endif
Linux::libpthread_init();
Linux::sched_getcpu_init();
log_info(os)("HotSpot is running with %s, %s",
Linux::libc_version(), Linux::libpthread_version());
if (UseNUMA || UseNUMAInterleaving) {
Linux::numa_init();
}
if (MaxFDLimit) { // set the number of file descriptors to max. print out error // if getrlimit/setrlimit fails but continue regardless. struct rlimit nbr_files; int status = getrlimit(RLIMIT_NOFILE, &nbr_files); if (status != 0) {
log_info(os)("os::init_2 getrlimit failed: %s", os::strerror(errno));
} else {
nbr_files.rlim_cur = nbr_files.rlim_max;
status = setrlimit(RLIMIT_NOFILE, &nbr_files); if (status != 0) {
log_info(os)("os::init_2 setrlimit failed: %s", os::strerror(errno));
}
}
}
// at-exit methods are called in the reverse order of their registration. // atexit functions are called on return from main or as a result of a // call to exit(3C). There can be only 32 of these functions registered // and atexit() does not set errno.
if (PerfAllowAtExitRegistration) { // only register atexit functions if PerfAllowAtExitRegistration is set. // atexit functions can be delayed until process exit time, which // can be problematic for embedded VM situations. Embedded VMs should // call DestroyJavaVM() to assure that VM resources are released.
// note: perfMemory_exit_helper atexit function may be removed in // the future if the appropriate cleanup code can be added to the // VM_Exit VMOperation's doit method. if (atexit(perfMemory_exit_helper) != 0) {
warning("os::init_2 atexit(perfMemory_exit_helper) failed");
}
}
// initialize thread priority policy
prio_init();
if (!FLAG_IS_DEFAULT(AllocateHeapAt)) {
set_coredump_filter(DAX_SHARED_BIT);
}
if (DumpPrivateMappingsInCore) {
set_coredump_filter(FILE_BACKED_PVT_BIT);
}
if (DumpSharedMappingsInCore) {
set_coredump_filter(FILE_BACKED_SHARED_BIT);
}
if (DumpPerfMapAtExit && FLAG_IS_DEFAULT(UseCodeCacheFlushing)) { // Disable code cache flushing to ensure the map file written at // exit contains all nmethods generated during execution.
FLAG_SET_DEFAULT(UseCodeCacheFlushing, false);
}
return JNI_OK;
}
// older glibc versions don't have this macro (which expands to // an optimized bit-counting function) so we have to roll our own #ifndef CPU_COUNT
staticint _cpu_count(const cpu_set_t* cpus) { int count = 0; // only look up to the number of configured processors for (int i = 0; i < os::processor_count(); i++) { if (CPU_ISSET(i, cpus)) {
count++;
}
} return count;
}
#define CPU_COUNT(cpus) _cpu_count(cpus)
#endif// CPU_COUNT
// Get the current number of available processors for this process. // This value can change at any time during a process's lifetime. // sched_getaffinity gives an accurate answer as it accounts for cpusets. // If it appears there may be more than 1024 processors then we do a // dynamic check - see 6515172 for details. // If anything goes wrong we fallback to returning the number of online // processors - which can be greater than the number available to the process. staticint get_active_processor_count() { // Note: keep this function, with its CPU_xx macros, *outside* the os namespace (see JDK-8289477).
cpu_set_t cpus; // can represent at most 1024 (CPU_SETSIZE) processors
cpu_set_t* cpus_p = &cpus; int cpus_size = sizeof(cpu_set_t);
int configured_cpus = os::processor_count(); // upper bound on available cpus int cpu_count = 0;
// old build platforms may not support dynamic cpu sets #ifdef CPU_ALLOC
// To enable easy testing of the dynamic path on different platforms we // introduce a diagnostic flag: UseCpuAllocPath if (configured_cpus >= CPU_SETSIZE || UseCpuAllocPath) { // kernel may use a mask bigger than cpu_set_t
log_trace(os)("active_processor_count: using dynamic path %s" "- configured processors: %d",
UseCpuAllocPath ? "(forced) " : "",
configured_cpus);
cpus_p = CPU_ALLOC(configured_cpus); if (cpus_p != NULL) {
cpus_size = CPU_ALLOC_SIZE(configured_cpus); // zero it just to be safe
CPU_ZERO_S(cpus_size, cpus_p);
} else { // failed to allocate so fallback to online cpus int online_cpus = ::sysconf(_SC_NPROCESSORS_ONLN);
log_trace(os)("active_processor_count: " "CPU_ALLOC failed (%s) - using " "online processor count: %d",
os::strerror(errno), online_cpus); return online_cpus;
}
} else {
log_trace(os)("active_processor_count: using static path - configured processors: %d",
configured_cpus);
} #else// CPU_ALLOC // these stubs won't be executed #define CPU_COUNT_S(size, cpus) -1 #define CPU_FREE(cpus)
log_trace(os)("active_processor_count: only static path available - configured processors: %d",
configured_cpus); #endif// CPU_ALLOC
// pid 0 means the current thread - which we have to assume represents the process if (sched_getaffinity(0, cpus_size, cpus_p) == 0) { if (cpus_p != &cpus) { // can only be true when CPU_ALLOC used
cpu_count = CPU_COUNT_S(cpus_size, cpus_p);
} else {
cpu_count = CPU_COUNT(cpus_p);
}
log_trace(os)("active_processor_count: sched_getaffinity processor count: %d", cpu_count);
} else {
cpu_count = ::sysconf(_SC_NPROCESSORS_ONLN);
warning("sched_getaffinity failed (%s)- using online processor count (%d) " "which may exceed available processors", os::strerror(errno), cpu_count);
}
if (cpus_p != &cpus) { // can only be true when CPU_ALLOC used
CPU_FREE(cpus_p);
}
int os::Linux::active_processor_count() { return get_active_processor_count();
}
// Determine the active processor count from one of // three different sources: // // 1. User option -XX:ActiveProcessorCount // 2. kernel os calls (sched_getaffinity or sysconf(_SC_NPROCESSORS_ONLN) // 3. extracted from cgroup cpu subsystem (shares and quotas) // // Option 1, if specified, will always override. // If the cgroup subsystem is active and configured, we // will return the min of the cgroup and option 2 results. // This is required since tools, such as numactl, that // alter cpu affinity do not update cgroup subsystem // cpuset configuration files. int os::active_processor_count() { // User has overridden the number of active processors if (ActiveProcessorCount > 0) {
log_trace(os)("active_processor_count: " "active processor count set by user : %d",
ActiveProcessorCount); return ActiveProcessorCount;
}
int active_cpus; if (OSContainer::is_containerized()) {
active_cpus = OSContainer::active_processor_count();
log_trace(os)("active_processor_count: determined by OSContainer: %d",
active_cpus);
} else {
active_cpus = os::Linux::active_processor_count();
}
return active_cpus;
}
staticbool should_warn_invalid_processor_id() { if (os::processor_count() == 1) { // Don't warn if we only have one processor returnfalse;
}
staticvolatileint warn_once = 1;
if (Atomic::load(&warn_once) == 0 ||
Atomic::xchg(&warn_once, 0) == 0) { // Don't warn more than once returnfalse;
}
returntrue;
}
uint os::processor_id() { constint id = Linux::sched_getcpu();
if (id < processor_count()) { return (uint)id;
}
// Some environments (e.g. openvz containers and the rr debugger) incorrectly // report a processor id that is higher than the number of processors available. // This is problematic, for example, when implementing CPU-local data structures, // where the processor id is used to index into an array of length processor_count(). // If this happens we return 0 here. This is is safe since we always have at least // one processor, but it's not optimal for performance if we're actually executing // in an environment with more than one processor. if (should_warn_invalid_processor_id()) {
log_warning(os)("Invalid processor id reported by the operating system " "(got processor id %d, valid processor id range is 0-%d)",
id, processor_count() - 1);
log_warning(os)("Falling back to assuming processor id is 0. " "This could have a negative impact on performance.");
}
return 0;
}
void os::set_native_thread_name(constchar *name) { if (Linux::_pthread_setname_np) { char buf [16]; // according to glibc manpage, 16 chars incl. '/0'
snprintf(buf, sizeof(buf), "%s", name);
buf[sizeof(buf) - 1] = '\0'; constint rc = Linux::_pthread_setname_np(pthread_self(), buf); // ERANGE should not happen; all other errors should just be ignored.
assert(rc != ERANGE, "pthread_setname_np failed");
}
}
//////////////////////////////////////////////////////////////////////////////// // debug support
// This does not do anything on Linux. This is basically a hook for being // able to use structured exception handling (thread-local exception filters) // on, e.g., Win32. void
os::os_exception_wrapper(java_call_t f, JavaValue* value, const methodHandle& method,
JavaCallArguments* args, JavaThread* thread) {
f(value, method, args, thread);
}
// This code originates from JDK's sysOpen and open64_w // from src/solaris/hpi/src/system_md.c
int os::open(constchar *path, int oflag, int mode) { if (strlen(path) > MAX_PATH - 1) {
errno = ENAMETOOLONG; return -1;
}
// All file descriptors that are opened in the Java process and not // specifically destined for a subprocess should have the close-on-exec // flag set. If we don't set it, then careless 3rd party native code // might fork and exec without closing all appropriate file descriptors // (e.g. as we do in closeDescriptors in UNIXProcess.c), and this in // turn might: // // - cause end-of-file to fail to be detected on some file // descriptors, resulting in mysterious hangs, or // // - might cause an fopen in the subprocess to fail on a system // suffering from bug 1085341. // // (Yes, the default setting of the close-on-exec flag is a Unix // design flaw) // // See: // 1085341: 32-bit stdio routines should support file descriptors >255 // 4843136: (process) pipe file descriptor from Runtime.exec not being closed // 6339493: (process) Runtime.exec does not close all file descriptors on Solaris 9 // // Modern Linux kernels (after 2.6.23 2007) support O_CLOEXEC with open(). // O_CLOEXEC is preferable to using FD_CLOEXEC on an open file descriptor // because it saves a system call and removes a small window where the flag // is unset. On ancient Linux kernels the O_CLOEXEC flag will be ignored // and we fall back to using FD_CLOEXEC (see below). #ifdef O_CLOEXEC
oflag |= O_CLOEXEC; #endif
int fd = ::open64(path, oflag, mode); if (fd == -1) return -1;
//If the open succeeded, the file might still be a directory
{ struct stat64 buf64; int ret = ::fstat64(fd, &buf64); int st_mode = buf64.st_mode;
#ifdef FD_CLOEXEC // Validate that the use of the O_CLOEXEC flag on open above worked. // With recent kernels, we will perform this check exactly once. static sig_atomic_t O_CLOEXEC_is_known_to_work = 0; if (!O_CLOEXEC_is_known_to_work) { int flags = ::fcntl(fd, F_GETFD); if (flags != -1) { if ((flags & FD_CLOEXEC) != 0)
O_CLOEXEC_is_known_to_work = 1; else
::fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
}
} #endif
static jlong fast_cpu_time(Thread *thread) {
clockid_t clockid; int rc = os::Linux::pthread_getcpuclockid(thread->osthread()->pthread_id(),
&clockid); if (rc == 0) { return os::Linux::fast_thread_cpu_time(clockid);
} else { // It's possible to encounter a terminated native thread that failed // to detach itself from the VM - which should result in ESRCH.
assert_status(rc == ESRCH, rc, "pthread_getcpuclockid failed"); return -1;
}
}
// current_thread_cpu_time(bool) and thread_cpu_time(Thread*, bool) // are used by JVM M&M and JVMTI to get user+sys or user CPU time // of a thread. // // current_thread_cpu_time() and thread_cpu_time(Thread*) returns // the fast estimate available on the platform.
jlong os::current_thread_cpu_time() { if (os::Linux::supports_fast_thread_cpu_time()) { return os::Linux::fast_thread_cpu_time(CLOCK_THREAD_CPUTIME_ID);
} else { // return user + sys since the cost is the same return slow_thread_cpu_time(Thread::current(), true/* user + sys */);
}
}
jlong os::thread_cpu_time(Thread* thread) { // consistent with what current_thread_cpu_time() returns if (os::Linux::supports_fast_thread_cpu_time()) { return fast_cpu_time(thread);
} else { return slow_thread_cpu_time(thread, true/* user + sys */);
}
}
// Skip pid and the command string. Note that we could be dealing with // weird command names, e.g. user could decide to rename java launcher // to "java 1.4.2 :)", then the stat file would look like // 1234 (java 1.4.2 :)) R ... ... // We don't really need to know the command string, just find the last // occurrence of ")" and then start parsing from there. See bug 4726580.
s = strrchr(stat, ')'); if (s == NULL) return -1;
// Skip blank chars do { s++; } while (s && isspace(*s));
void os::current_thread_cpu_time_info(jvmtiTimerInfo *info_ptr) {
info_ptr->max_value = ALL_64_BITS; // will not wrap in less than 64 bits
info_ptr->may_skip_backward = false; // elapsed time not wall time
info_ptr->may_skip_forward = false; // elapsed time not wall time
info_ptr->kind = JVMTI_TIMER_TOTAL_CPU; // user+system time is returned
}
void os::thread_cpu_time_info(jvmtiTimerInfo *info_ptr) {
info_ptr->max_value = ALL_64_BITS; // will not wrap in less than 64 bits
info_ptr->may_skip_backward = false; // elapsed time not wall time
info_ptr->may_skip_forward = false; // elapsed time not wall time
info_ptr->kind = JVMTI_TIMER_TOTAL_CPU; // user+system time is returned
}
// System loadavg support. Returns -1 if load average cannot be obtained. // Linux doesn't yet have a (official) notion of processor sets, // so just return the system wide load average. int os::loadavg(double loadavg[], int nelem) { return ::getloadavg(loadavg, nelem);
}
// Get the default path to the core file // Returns the length of the string int os::get_core_path(char* buffer, size_t bufferSize) { /* * Max length of /proc/sys/kernel/core_pattern is 128 characters. * See https://www.kernel.org/doc/Documentation/sysctl/kernel.txt
*/ constint core_pattern_len = 129; char core_pattern[core_pattern_len] = {0};
int core_pattern_file = ::open("/proc/sys/kernel/core_pattern", O_RDONLY); if (core_pattern_file == -1) { return -1;
}
ssize_t ret = ::read(core_pattern_file, core_pattern, core_pattern_len);
::close(core_pattern_file); if (ret <= 0 || ret >= core_pattern_len || core_pattern[0] == '\n') { return -1;
} if (core_pattern[ret-1] == '\n') {
core_pattern[ret-1] = '\0';
} else {
core_pattern[ret] = '\0';
}
// Replace the %p in the core pattern with the process id. NOTE: we do this // only if the pattern doesn't start with "|", and we support only one %p in // the pattern. char *pid_pos = strstr(core_pattern, "%p"); constchar* tail = (pid_pos != NULL) ? (pid_pos + 2) : ""; // skip over the "%p" int written;
if (core_pattern[0] == '/') { if (pid_pos != NULL) {
*pid_pos = '\0';
written = jio_snprintf(buffer, bufferSize, "%s%d%s", core_pattern,
current_process_id(), tail);
} else {
written = jio_snprintf(buffer, bufferSize, "%s", core_pattern);
}
} else { char cwd[PATH_MAX];
constchar* p = get_current_directory(cwd, PATH_MAX); if (p == NULL) { return -1;
}
if (core_pattern[0] == '|') {
written = jio_snprintf(buffer, bufferSize, "\"%s\" (or dumping to %s/core.%d)",
&core_pattern[1], p, current_process_id());
} elseif (pid_pos != NULL) {
*pid_pos = '\0';
written = jio_snprintf(buffer, bufferSize, "%s/%s%d%s", p, core_pattern,
current_process_id(), tail);
} else {
written = jio_snprintf(buffer, bufferSize, "%s/%s", p, core_pattern);
}
}
if (written < 0) { return -1;
}
if (((size_t)written < bufferSize) && (pid_pos == NULL) && (core_pattern[0] != '|')) { int core_uses_pid_file = ::open("/proc/sys/kernel/core_uses_pid", O_RDONLY);
if (core_uses_pid_file != -1) { char core_uses_pid = 0;
ssize_t ret = ::read(core_uses_pid_file, &core_uses_pid, 1);
::close(core_uses_pid_file);
bool os::start_debugging(char *buf, int buflen) { int len = (int)strlen(buf); char *p = &buf[len];
jio_snprintf(p, buflen-len, "\n\n" "Do you want to debug the problem?\n\n" "To debug, run 'gdb /proc/%d/exe %d'; then switch to thread " UINTX_FORMAT " (" INTPTR_FORMAT ")\n" "Enter 'yes' to launch gdb automatically (PATH must include gdb)\n" "Otherwise, press RETURN to abort...",
os::current_process_id(), os::current_process_id(),
os::current_thread_id(), os::current_thread_id());
if (yes) { // yes, user asked VM to launch debugger
jio_snprintf(buf, sizeof(char)*buflen, "gdb /proc/%d/exe %d",
os::current_process_id(), os::current_process_id());
size_t os::current_stack_size() { // This stack size includes the usable stack and HotSpot guard pages // (for the threads that have Hotspot guard pages).
address bottom;
size_t size;
current_stack_region(&bottom, &size); return size;
} #endif
staticinlinestruct timespec get_mtime(constchar* filename) { struct stat st; int ret = os::stat(filename, &st);
assert(ret == 0, "failed to stat() file '%s': %s", filename, os::strerror(errno)); return st.st_mtim;
}
¤ Diese beiden folgenden Angebotsgruppen bietet das Unternehmen0.89Angebot
(Wie Sie bei der Firma Beratungs- und Dienstleistungen beauftragen können 2026-04-26)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.