/*
 * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

// no precompiled headers
#include "classfile/classLoader.hpp"
#include "classfile/systemDictionary.hpp"
#include "classfile/vmSymbols.hpp"
#include "code/icBuffer.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/compileBroker.hpp"
#include "compiler/disassembler.hpp"
#include "interpreter/interpreter.hpp"
#include "jvm_linux.h"
#include "logging/log.hpp"
#include "memory/allocation.inline.hpp"
#include "memory/filemap.hpp"
#include "oops/oop.inline.hpp"
#include "os_linux.inline.hpp"
#include "os_share_linux.hpp"
#include "prims/jniFastGetField.hpp"
#include "prims/jvm.h"
#include "prims/jvm_misc.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomic.hpp"
#include "runtime/extendedPC.hpp"
#include "runtime/globals.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/init.hpp"
#include "runtime/java.hpp"
#include "runtime/javaCalls.hpp"
#include "runtime/mutexLocker.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/orderAccess.inline.hpp"
#include "runtime/osThread.hpp"
#include "runtime/perfMemory.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/statSampler.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "runtime/threadCritical.hpp"
#include "runtime/timer.hpp"
#include "semaphore_posix.hpp"
#include "services/attachListener.hpp"
#include "services/memTracker.hpp"
#include "services/runtimeService.hpp"
#include "utilities/decoder.hpp"
#include "utilities/defaultStream.hpp"
#include "utilities/events.hpp"
#include "utilities/elfFile.hpp"
#include "utilities/growableArray.hpp"
#include "utilities/macros.hpp"
#include "utilities/vmError.hpp"

// put OS-includes here
# include <sys/types.h>
# include <sys/mman.h>
# include <sys/stat.h>
# include <sys/select.h>
# include <pthread.h>
# include <signal.h>
# include <errno.h>
# include <dlfcn.h>
# include <stdio.h>
# include <unistd.h>
# include <sys/resource.h>
# include <sys/time.h>
# include <sys/times.h>
# include <sys/utsname.h>
# include <sys/socket.h>
# include <sys/wait.h>
# include <pwd.h>
# include <poll.h>
# include <semaphore.h>
# include <fcntl.h>
# include <string.h>
# include <syscall.h>
# include <sys/sysinfo.h>
# include <sys/ipc.h>
# include <sys/shm.h>
# include <link.h>
# include <stdint.h>
# include <inttypes.h>
# include <sys/ioctl.h>

#ifndef _GNU_SOURCE
  #define _GNU_SOURCE
  #include <sched.h>
  #undef _GNU_SOURCE
#else
  #include <sched.h>
#endif
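
// Note: <sched.h> is included under _GNU_SOURCE so that GNU extensions
// (e.g. sched_getaffinity() and the CPU_* macros) are visible; the macro is
// undefined again afterwards so that no other header in this file is affected.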

// If RUSAGE_THREAD for getrusage() has not been defined, do it here. The code
// calling getrusage() is prepared to handle the associated failure.
#ifndef RUSAGE_THREAD
  #define RUSAGE_THREAD   (1)               /* only the calling thread */
#endif
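
// For reference: on kernels that support it (Linux 2.6.26 and later),
// getrusage(RUSAGE_THREAD, ...) reports usage for the calling thread only;
// on older kernels the call fails with EINVAL and callers such as
// os::elapsedVTime() below fall back to process elapsed time.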

#define MAX_PATH    (2 * K)

#define MAX_SECS 100000000

// For timer info max values, which include all bits
#define ALL_64_BITS CONST64(0xFFFFFFFFFFFFFFFF)

#define LARGEPAGES_BIT (1 << 6)

////////////////////////////////////////////////////////////////////////////////
// global variables
julong os::Linux::_physical_memory = 0;

address   os::Linux::_initial_thread_stack_bottom = NULL;
uintptr_t os::Linux::_initial_thread_stack_size   = 0;

int (*os::Linux::_clock_gettime)(clockid_t, struct timespec *) = NULL;
int (*os::Linux::_pthread_getcpuclockid)(pthread_t, clockid_t *) = NULL;
int (*os::Linux::_pthread_setname_np)(pthread_t, const char*) = NULL;
Mutex* os::Linux::_createThread_lock = NULL;
pthread_t os::Linux::_main_thread;
int os::Linux::_page_size = -1;
bool os::Linux::_supports_fast_thread_cpu_time = false;
uint32_t os::Linux::_os_version = 0;
const char * os::Linux::_glibc_version = "unknown";
const char * os::Linux::_libpthread_version = "unknown";
pthread_condattr_t os::Linux::_condattr[1];

static jlong initial_time_count = 0;

static int clock_tics_per_sec = 100;

// For diagnostics: print a message once. See run_periodic_checks.
static sigset_t check_signal_done;
static bool check_signals = true;

// Signal number used to suspend/resume a thread

// do not use any signal number less than SIGSEGV, see 4355769
static int SR_signum = SIGUSR2;
sigset_t SR_sigset;

// Declarations
static void unpackTime(timespec* absTime, bool isAbsolute, jlong time);

// utility functions

static int SR_initialize();

julong os::available_memory() {
  return Linux::available_memory();
}

julong os::Linux::available_memory() {
  // values in struct sysinfo are "unsigned long"
  struct sysinfo si;
  sysinfo(&si);

  return (julong)si.freeram * si.mem_unit;
}

julong os::physical_memory() {
  return Linux::physical_memory();
}

// Return true if the process is running with special (set-uid or set-gid)
// privileges, i.e. the real and effective user or group ids differ.

bool os::have_special_privileges() {
  static bool init = false;
  static bool privileges = false;
  if (!init) {
    privileges = (getuid() != geteuid()) || (getgid() != getegid());
    init = true;
  }
  return privileges;
}


#ifndef SYS_gettid
// i386: 224, ia64: 1105, amd64: 186, sparc: 143
  #if defined(__ia64__)
    #define SYS_gettid 1105
  #elif defined(__i386__)
    #define SYS_gettid 224
  #elif defined(__amd64__)
    #define SYS_gettid 186
  #elif defined(__sparc__)
    #define SYS_gettid 143
  #else
    #error define gettid for the arch
  #endif
#endif


// pid_t gettid()
//
// Returns the kernel thread id of the currently running thread. The kernel
// thread id is used to access /proc.
pid_t os::Linux::gettid() {
  int rslt = syscall(SYS_gettid);
  assert(rslt != -1, "must be."); // old linuxthreads implementation?
  return (pid_t)rslt;
}
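
// Note: unlike pthread_self(), which returns an opaque handle that is only
// meaningful to the pthread library, the kernel thread id returned above
// identifies the thread to the kernel, e.g. as /proc/<pid>/task/<tid> or in
// the output of ps -eL.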

// Most versions of Linux determine the number of processors by looking at
// the /proc file system. In a chroot environment where /proc is not mounted,
// the sysconf call therefore returns 1. This causes the VM to act as if it
// is running on a single processor and to elide locking (see is_MP() call).
static bool unsafe_chroot_detected = false;
static const char *unstable_chroot_error = "/proc file system not found.\n"
                     "Java may be unstable running multithreaded in a chroot "
                     "environment on Linux when /proc filesystem is not mounted.";

void os::Linux::initialize_system_info() {
  set_processor_count(sysconf(_SC_NPROCESSORS_CONF));
  if (processor_count() == 1) {
    pid_t pid = os::Linux::gettid();
    char fname[32];
    jio_snprintf(fname, sizeof(fname), "/proc/%d", pid);
    FILE *fp = fopen(fname, "r");
    if (fp == NULL) {
      unsafe_chroot_detected = true;
    } else {
      fclose(fp);
    }
  }
  _physical_memory = (julong)sysconf(_SC_PHYS_PAGES) * (julong)sysconf(_SC_PAGESIZE);
  assert(processor_count() > 0, "linux error");
}
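
// Note (assumption): glibc implements sysconf(_SC_NPROCESSORS_CONF) by
// consulting /proc or /sys, which is why a missing /proc manifests as a
// processor count of 1, and why checking for /proc/<tid> above is a usable
// proxy for detecting the problem.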

void os::init_system_properties_values() {
  // The next steps are taken in the product version:
  //
  // Obtain the JAVA_HOME value from the location of libjvm.so.
  // This library should be located at:
  // <JAVA_HOME>/lib/{client|server}/libjvm.so.
  //
  // If "/jre/lib/" appears at the right place in the path, then we
  // assume libjvm.so is installed in a JDK and we use this path.
  //
  // Otherwise exit with message: "Could not create the Java virtual machine."
  //
  // The following extra steps are taken in the debugging version:
  //
  // If "/jre/lib/" does NOT appear at the right place in the path
  // then, instead of exiting, check for the $JAVA_HOME environment variable.
  //
  // If it is defined and we are able to locate $JAVA_HOME/jre/lib/<arch>,
  // then we append a fake suffix "hotspot/libjvm.so" to this path so
  // it looks like libjvm.so is installed there
  // <JAVA_HOME>/jre/lib/<arch>/hotspot/libjvm.so.
  //
  // Otherwise exit.
  //
  // Important note: if the location of libjvm.so changes this
  // code needs to be changed accordingly.

  // See ld(1):
  //      The linker uses the following search paths to locate required
  //      shared libraries:
  //        1: ...
  //        ...
  //        7: The default directories, normally /lib and /usr/lib.
#if defined(AMD64) || (defined(_LP64) && defined(SPARC)) || defined(PPC64) || defined(S390)
  #define DEFAULT_LIBPATH "/usr/lib64:/lib64:/lib:/usr/lib"
#else
  #define DEFAULT_LIBPATH "/lib:/usr/lib"
#endif

// Base path of extensions installed on the system.
#define SYS_EXT_DIR     "/usr/java/packages"
#define EXTENSIONS_DIR  "/lib/ext"

  // Buffer that fits several sprintfs.
  // Note that the space for the colon and the trailing null is provided
  // by the nulls included by the sizeof operator.
  const size_t bufsize =
    MAX2((size_t)MAXPATHLEN,  // For dll_dir & friends.
         (size_t)MAXPATHLEN + sizeof(EXTENSIONS_DIR) + sizeof(SYS_EXT_DIR) + sizeof(EXTENSIONS_DIR)); // extensions dir
  char *buf = (char *)NEW_C_HEAP_ARRAY(char, bufsize, mtInternal);

  // sysclasspath, java_home, dll_dir
  {
    char *pslash;
    os::jvm_path(buf, bufsize);

    // Found the full path to libjvm.so.
    // Now cut the path to <java_home>/jre if we can.
    pslash = strrchr(buf, '/');
    if (pslash != NULL) {
      *pslash = '\0';            // Get rid of /libjvm.so.
    }
    pslash = strrchr(buf, '/');
    if (pslash != NULL) {
      *pslash = '\0';            // Get rid of /{client|server|hotspot}.
    }
    Arguments::set_dll_dir(buf);

    if (pslash != NULL) {
      pslash = strrchr(buf, '/');
      if (pslash != NULL) {
        *pslash = '\0';        // Get rid of /lib.
      }
    }
    Arguments::set_java_home(buf);
    set_boot_path('/', ':');
  }

  // Where to look for native libraries.
  //
  // Note: Due to a legacy implementation, most of the library path
  // is set in the launcher. This was to accommodate linking restrictions
  // on legacy Linux implementations (which are no longer supported).
  // Eventually, all the library path setting will be done here.
  //
  // However, to prevent the proliferation of improperly built native
  // libraries, the new path component /usr/java/packages is added here.
  {
    // Get the user setting of LD_LIBRARY_PATH, and prepend it. It
    // should always exist (until the legacy problem cited above is
    // addressed).
    const char *v = ::getenv("LD_LIBRARY_PATH");
    const char *v_colon = ":";
    if (v == NULL) { v = ""; v_colon = ""; }
    // That's +1 for the colon and +1 for the trailing '\0'.
    char *ld_library_path = (char *)NEW_C_HEAP_ARRAY(char,
                                                     strlen(v) + 1 +
                                                     sizeof(SYS_EXT_DIR) + sizeof("/lib/") + sizeof(DEFAULT_LIBPATH) + 1,
                                                     mtInternal);
    sprintf(ld_library_path, "%s%s" SYS_EXT_DIR "/lib:" DEFAULT_LIBPATH, v, v_colon);
    Arguments::set_library_path(ld_library_path);
    FREE_C_HEAP_ARRAY(char, ld_library_path);
  }

  // Extensions directories.
  sprintf(buf, "%s" EXTENSIONS_DIR ":" SYS_EXT_DIR EXTENSIONS_DIR, Arguments::get_java_home());
  Arguments::set_ext_dirs(buf);

  FREE_C_HEAP_ARRAY(char, buf);

#undef DEFAULT_LIBPATH
#undef SYS_EXT_DIR
#undef EXTENSIONS_DIR
}

////////////////////////////////////////////////////////////////////////////////
// breakpoint support

void os::breakpoint() {
  BREAKPOINT;
}

extern "C" void breakpoint() {
  // use debugger to set breakpoint here
}

////////////////////////////////////////////////////////////////////////////////
// signal support

debug_only(static bool signal_sets_initialized = false);
static sigset_t unblocked_sigs, vm_sigs, allowdebug_blocked_sigs;

bool os::Linux::is_sig_ignored(int sig) {
  struct sigaction oact;
  sigaction(sig, (struct sigaction*)NULL, &oact);
  void* ohlr = oact.sa_sigaction ? CAST_FROM_FN_PTR(void*,  oact.sa_sigaction)
                                 : CAST_FROM_FN_PTR(void*,  oact.sa_handler);
  if (ohlr == CAST_FROM_FN_PTR(void*, SIG_IGN)) {
    return true;
  } else {
    return false;
  }
}

void os::Linux::signal_sets_init() {
  // Should also have an assertion stating we are still single-threaded.
  assert(!signal_sets_initialized, "Already initialized");
  // Fill in signals that are necessarily unblocked for all threads in
  // the VM. Currently, we unblock the following signals:
  // SHUTDOWN{1,2,3}_SIGNAL: for shutdown hooks support (unless overridden
  //                         by -Xrs (=ReduceSignalUsage));
  // BREAK_SIGNAL which is unblocked only by the VM thread and blocked by all
  // other threads. The "ReduceSignalUsage" boolean tells us not to alter
  // the dispositions or masks wrt these signals.
  // Programs embedding the VM that want to use the above signals for their
  // own purposes must, at this time, use the "-Xrs" option to prevent
  // interference with shutdown hooks and BREAK_SIGNAL thread dumping.
  // (See bug 4345157, and other related bugs).
  // In reality, though, unblocking these signals is really a nop, since
  // these signals are not blocked by default.
  sigemptyset(&unblocked_sigs);
  sigemptyset(&allowdebug_blocked_sigs);
  sigaddset(&unblocked_sigs, SIGILL);
  sigaddset(&unblocked_sigs, SIGSEGV);
  sigaddset(&unblocked_sigs, SIGBUS);
  sigaddset(&unblocked_sigs, SIGFPE);
#if defined(PPC64)
  sigaddset(&unblocked_sigs, SIGTRAP);
#endif
  sigaddset(&unblocked_sigs, SR_signum);

  if (!ReduceSignalUsage) {
    if (!os::Linux::is_sig_ignored(SHUTDOWN1_SIGNAL)) {
      sigaddset(&unblocked_sigs, SHUTDOWN1_SIGNAL);
      sigaddset(&allowdebug_blocked_sigs, SHUTDOWN1_SIGNAL);
    }
    if (!os::Linux::is_sig_ignored(SHUTDOWN2_SIGNAL)) {
      sigaddset(&unblocked_sigs, SHUTDOWN2_SIGNAL);
      sigaddset(&allowdebug_blocked_sigs, SHUTDOWN2_SIGNAL);
    }
    if (!os::Linux::is_sig_ignored(SHUTDOWN3_SIGNAL)) {
      sigaddset(&unblocked_sigs, SHUTDOWN3_SIGNAL);
      sigaddset(&allowdebug_blocked_sigs, SHUTDOWN3_SIGNAL);
    }
  }
  // Fill in signals that are blocked by all but the VM thread.
  sigemptyset(&vm_sigs);
  if (!ReduceSignalUsage) {
    sigaddset(&vm_sigs, BREAK_SIGNAL);
  }
  debug_only(signal_sets_initialized = true);
}

// These are signals that are unblocked while a thread is running Java.
// (For some reason, they get blocked by default.)
sigset_t* os::Linux::unblocked_signals() {
  assert(signal_sets_initialized, "Not initialized");
  return &unblocked_sigs;
}

// These are the signals that are blocked while a (non-VM) thread is
// running Java. Only the VM thread handles these signals.
sigset_t* os::Linux::vm_signals() {
  assert(signal_sets_initialized, "Not initialized");
  return &vm_sigs;
}

// These are signals that are blocked during cond_wait to allow the debugger in.
sigset_t* os::Linux::allowdebug_blocked_signals() {
  assert(signal_sets_initialized, "Not initialized");
  return &allowdebug_blocked_sigs;
}

void os::Linux::hotspot_sigmask(Thread* thread) {

  // Save caller's signal mask before setting VM signal mask.
  sigset_t caller_sigmask;
  pthread_sigmask(SIG_BLOCK, NULL, &caller_sigmask);

  OSThread* osthread = thread->osthread();
  osthread->set_caller_sigmask(caller_sigmask);

  pthread_sigmask(SIG_UNBLOCK, os::Linux::unblocked_signals(), NULL);

  if (!ReduceSignalUsage) {
    if (thread->is_VM_thread()) {
      // Only the VM thread handles BREAK_SIGNAL ...
      pthread_sigmask(SIG_UNBLOCK, vm_signals(), NULL);
    } else {
      // ... all other threads block BREAK_SIGNAL
      pthread_sigmask(SIG_BLOCK, vm_signals(), NULL);
    }
  }
}

//////////////////////////////////////////////////////////////////////////////
// detecting pthread library

void os::Linux::libpthread_init() {
  // Save glibc and pthread version strings.
#if !defined(_CS_GNU_LIBC_VERSION) || \
    !defined(_CS_GNU_LIBPTHREAD_VERSION)
  #error "glibc too old (< 2.3.2)"
#endif

  size_t n;

  n = confstr(_CS_GNU_LIBC_VERSION, NULL, 0);
  if (n > 0) {
    char* str = (char *)malloc(n, mtInternal);
    confstr(_CS_GNU_LIBC_VERSION, str, n);
    os::Linux::set_glibc_version(str);
  }

  n = confstr(_CS_GNU_LIBPTHREAD_VERSION, NULL, 0);
  if (n > 0) {
    char* str = (char *)malloc(n, mtInternal);
    confstr(_CS_GNU_LIBPTHREAD_VERSION, str, n);
    os::Linux::set_libpthread_version(str);
  }
}
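
// Note: confstr() called with a NULL buffer returns the buffer size needed
// (including the terminating null), which is why each version string is
// queried twice above. The returned strings typically look like
// "glibc 2.x" and "NPTL 2.x" on modern distributions.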

/////////////////////////////////////////////////////////////////////////////
// thread stack expansion

// os::Linux::manually_expand_stack() takes care of expanding the thread
// stack. Note that this is normally not needed: pthread stacks are allocated
// using mmap() without MAP_NORESERVE, so the stack is already committed.
// Therefore it is not necessary to expand the stack manually.
//
// Manually expanding the stack was historically needed on LinuxThreads
// thread stacks, which were allocated with mmap(MAP_GROWSDOWN). Nowadays
// it is kept to deal with very rare corner cases:
//
// For one, the user may run the VM on their own implementation of threads
// whose stacks are - like the old LinuxThreads - implemented using
// mmap(MAP_GROWSDOWN).
//
// Also, this code may be needed if the VM is running on the primordial
// thread. Normally we avoid running on the primordial thread; however,
// the user may still invoke the VM on the primordial thread.
//
// The following historical comment describes the details of running
// on a thread stack allocated with mmap(MAP_GROWSDOWN):


// Force the Linux kernel to expand the current thread stack. If "bottom" is
// close to the stack guard, the caller should block all signals.
//
// MAP_GROWSDOWN:
//   A special mmap() flag that is used to implement thread stacks. It tells
//   the kernel that the memory region should extend downwards when needed.
//   This allowed early versions of LinuxThreads to only mmap the first few
//   pages when creating a new thread. The Linux kernel will automatically
//   expand the thread stack as needed (on page faults).
//
//   However, because the memory region of a MAP_GROWSDOWN stack can grow on
//   demand, if a page fault happens outside an already mapped MAP_GROWSDOWN
//   region, it's hard to tell if the fault is due to a legitimate stack
//   access or to reading/writing non-existent memory (e.g. a buffer
//   overrun). As a rule, if the fault happens below the current stack
//   pointer, the Linux kernel does not expand the stack; instead a SIGSEGV
//   is sent to the application (see Linux kernel fault.c).
//
//   This Linux feature can cause SIGSEGV when the VM bangs the thread stack
//   for stack overflow detection.
//
//   Newer versions of LinuxThreads (since glibc-2.2, or, RH-7.x) and NPTL do
//   not use MAP_GROWSDOWN.
//
// To get around the problem and allow stack banging on Linux, we need to
// manually expand the thread stack after receiving the SIGSEGV.
//
// There are two ways to expand the thread stack to address "bottom"; we used
// both of them in the JVM before 1.5:
//   1. adjust the stack pointer first so that it is below "bottom", and then
//      touch "bottom"
//   2. mmap() the page in question
//
// Now that the alternate signal stack is gone, it's harder to use 2. For
// instance, if the current sp is already near the lower end of page 101, and
// we need to call mmap() to map page 100, it is possible that part of the
// mmap() frame will be placed in page 100. When page 100 is mapped, it is
// zero-filled. That will destroy the mmap() frame and cause the VM to crash.
//
// The following code works by adjusting sp first, then accessing the "bottom"
// page to force a page fault. The Linux kernel will then automatically expand
// the stack mapping.
//
// _expand_stack_to() assumes its frame size is less than the page size, which
// should always be true if the function is not inlined.

static void NOINLINE _expand_stack_to(address bottom) {
  address sp;
  size_t size;
  volatile char *p;

  // Adjust bottom to point to the largest address within the same page; this
  // gives us a one-page buffer if alloca() allocates slightly more memory.
  bottom = (address)align_size_down((uintptr_t)bottom, os::Linux::page_size());
  bottom += os::Linux::page_size() - 1;

  // sp might be slightly above the current stack pointer; if that's the case,
  // we will alloca() a little more space than necessary, which is OK. Don't
  // use os::current_stack_pointer(), as its result can be slightly below the
  // current stack pointer, causing us to not alloca enough to reach "bottom".
  sp = (address)&sp;

  if (sp > bottom) {
    size = sp - bottom;
    p = (volatile char *)alloca(size);
    assert(p != NULL && p <= (volatile char *)bottom, "alloca problem?");
    p[0] = '\0';
  }
}

bool os::Linux::manually_expand_stack(JavaThread * t, address addr) {
  assert(t != NULL, "just checking");
  assert(t->osthread()->expanding_stack(), "expand should be set");
  assert(t->stack_base() != NULL, "stack_base was not initialized");

  if (addr < t->stack_base() && addr >= t->stack_reserved_zone_base()) {
    sigset_t mask_all, old_sigset;
    sigfillset(&mask_all);
    pthread_sigmask(SIG_SETMASK, &mask_all, &old_sigset);
    _expand_stack_to(addr);
    pthread_sigmask(SIG_SETMASK, &old_sigset, NULL);
    return true;
  }
  return false;
}

//////////////////////////////////////////////////////////////////////////////
// create new thread

// Thread start routine for all newly created threads
static void *thread_native_entry(Thread *thread) {
  // Try to randomize the cache line index of hot stack frames.
  // This helps when threads with the same stack traces evict each other's
  // cache lines. The threads can be either from the same JVM instance, or
  // from different JVM instances. The benefit is especially pronounced for
  // processors with hyperthreading technology.
  static int counter = 0;
  int pid = os::current_process_id();
  alloca(((pid ^ counter++) & 7) * 128);

  thread->initialize_thread_current();

  OSThread* osthread = thread->osthread();
  Monitor* sync = osthread->startThread_lock();

  osthread->set_thread_id(os::current_thread_id());

  log_info(os, thread)("Thread is alive (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
    os::current_thread_id(), (uintx) pthread_self());

  if (UseNUMA) {
    int lgrp_id = os::numa_get_group_id();
    if (lgrp_id != -1) {
      thread->set_lgrp_id(lgrp_id);
    }
  }
  // initialize signal mask for this thread
  os::Linux::hotspot_sigmask(thread);

  // initialize floating point control register
  os::Linux::init_thread_fpu_state();

  // handshaking with parent thread
  {
    MutexLockerEx ml(sync, Mutex::_no_safepoint_check_flag);

    // notify parent thread
    osthread->set_state(INITIALIZED);
    sync->notify_all();

    // wait until os::start_thread()
    while (osthread->get_state() == INITIALIZED) {
      sync->wait(Mutex::_no_safepoint_check_flag);
    }
  }

  // call one more level of the start routine
  thread->run();

  log_info(os, thread)("Thread finished (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
    os::current_thread_id(), (uintx) pthread_self());

  // If a thread has not deleted itself ("delete this") as part of its
  // termination sequence, we have to ensure thread-local-storage is
  // cleared before we actually terminate. No threads should ever be
  // deleted asynchronously with respect to their termination.
  if (Thread::current_or_null_safe() != NULL) {
    assert(Thread::current_or_null_safe() == thread, "current thread is wrong");
    thread->clear_thread_current();
  }

  return 0;
}

bool os::create_thread(Thread* thread, ThreadType thr_type,
                       size_t req_stack_size) {
  assert(thread->osthread() == NULL, "caller responsible");

  // Allocate the OSThread object
  OSThread* osthread = new OSThread(NULL, NULL);
  if (osthread == NULL) {
    return false;
  }

  // set the correct thread state
  osthread->set_thread_type(thr_type);

  // Initial state is ALLOCATED but not INITIALIZED
  osthread->set_state(ALLOCATED);

  thread->set_osthread(osthread);

  // init thread attributes
  pthread_attr_t attr;
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);

  // Calculate stack size if it's not specified by caller.
  size_t stack_size = os::Posix::get_initial_stack_size(thr_type, req_stack_size);
  // In the Linux NPTL pthread implementation the guard size mechanism
  // is not implemented properly. The POSIX standard requires adding
  // the size of the guard pages to the stack size; instead, Linux
  // takes the space out of 'stacksize'. Thus we adapt the requested
  // stack_size by the size of the guard pages to mimic proper
  // behaviour.
  stack_size = align_size_up(stack_size + os::Linux::default_guard_size(thr_type), vm_page_size());
  pthread_attr_setstacksize(&attr, stack_size);

  // Configure glibc guard page.
  pthread_attr_setguardsize(&attr, os::Linux::default_guard_size(thr_type));

  ThreadState state;

  {
    pthread_t tid;
    int ret = pthread_create(&tid, &attr, (void* (*)(void*)) thread_native_entry, thread);

    char buf[64];
    if (ret == 0) {
      log_info(os, thread)("Thread started (pthread id: " UINTX_FORMAT ", attributes: %s). ",
        (uintx) tid, os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
    } else {
      log_warning(os, thread)("Failed to start thread - pthread_create failed (%s) for attributes: %s.",
        os::errno_name(ret), os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
    }

    pthread_attr_destroy(&attr);

    if (ret != 0) {
      // Need to clean up stuff we've allocated so far
      thread->set_osthread(NULL);
      delete osthread;
      return false;
    }

    // Store pthread info into the OSThread
    osthread->set_pthread_id(tid);

    // Wait until child thread is either initialized or aborted
    {
      Monitor* sync_with_child = osthread->startThread_lock();
      MutexLockerEx ml(sync_with_child, Mutex::_no_safepoint_check_flag);
      while ((state = osthread->get_state()) == ALLOCATED) {
        sync_with_child->wait(Mutex::_no_safepoint_check_flag);
      }
    }
  }

  // Aborted due to thread limit being reached
  if (state == ZOMBIE) {
    thread->set_osthread(NULL);
    delete osthread;
    return false;
  }

  // The thread is returned suspended (in state INITIALIZED),
  // and is started higher up in the call chain
  assert(state == INITIALIZED, "race condition");
  return true;
}

/////////////////////////////////////////////////////////////////////////////
// attach existing thread

// bootstrap the main thread
bool os::create_main_thread(JavaThread* thread) {
  assert(os::Linux::_main_thread == pthread_self(), "should be called inside main thread");
  return create_attached_thread(thread);
}

bool os::create_attached_thread(JavaThread* thread) {
#ifdef ASSERT
  thread->verify_not_published();
#endif

  // Allocate the OSThread object
  OSThread* osthread = new OSThread(NULL, NULL);

  if (osthread == NULL) {
    return false;
  }

  // Store pthread info into the OSThread
  osthread->set_thread_id(os::Linux::gettid());
  osthread->set_pthread_id(::pthread_self());

  // initialize floating point control register
  os::Linux::init_thread_fpu_state();

  // Initial thread state is RUNNABLE
  osthread->set_state(RUNNABLE);

  thread->set_osthread(osthread);

  if (UseNUMA) {
    int lgrp_id = os::numa_get_group_id();
    if (lgrp_id != -1) {
      thread->set_lgrp_id(lgrp_id);
    }
  }

  if (os::Linux::is_initial_thread()) {
    // If the current thread is the initial thread, its stack is mapped on
    // demand; see the notes about MAP_GROWSDOWN. Here we try to force the
    // kernel to map the entire stack region to avoid a SEGV in stack banging.
    // It is also useful to get around the heap-stack-gap problem on SuSE
    // kernels (see 4821821 for details). We first expand the stack to the top
    // of the yellow zone, then enable the stack yellow zone (order is
    // significant; enabling the yellow zone first will crash the JVM on SuSE
    // Linux), so there is no gap between the last two virtual memory regions.

    JavaThread *jt = (JavaThread *)thread;
    address addr = jt->stack_reserved_zone_base();
    assert(addr != NULL, "initialization problem?");
    assert(jt->stack_available(addr) > 0, "stack guard should not be enabled");

    osthread->set_expanding_stack();
    os::Linux::manually_expand_stack(jt, addr);
    osthread->clear_expanding_stack();
  }

  // initialize signal mask for this thread
  // and save the caller's signal mask
  os::Linux::hotspot_sigmask(thread);

  log_info(os, thread)("Thread attached (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
    os::current_thread_id(), (uintx) pthread_self());

  return true;
}

void os::pd_start_thread(Thread* thread) {
  OSThread * osthread = thread->osthread();
  assert(osthread->get_state() != INITIALIZED, "just checking");
  Monitor* sync_with_child = osthread->startThread_lock();
  MutexLockerEx ml(sync_with_child, Mutex::_no_safepoint_check_flag);
  sync_with_child->notify();
}

// Free Linux resources related to the OSThread
void os::free_thread(OSThread* osthread) {
  assert(osthread != NULL, "osthread not set");

  // We are told to free resources of the argument thread,
  // but we can only really operate on the current thread.
  assert(Thread::current()->osthread() == osthread,
         "os::free_thread but not current thread");

#ifdef ASSERT
  sigset_t current;
  sigemptyset(&current);
  pthread_sigmask(SIG_SETMASK, NULL, &current);
  assert(!sigismember(&current, SR_signum), "SR signal should not be blocked!");
#endif

  // Restore caller's signal mask
  sigset_t sigmask = osthread->caller_sigmask();
  pthread_sigmask(SIG_SETMASK, &sigmask, NULL);

  delete osthread;
}

//////////////////////////////////////////////////////////////////////////////
// initial thread

// Check if the current thread is the initial thread, similar to Solaris thr_main.
bool os::Linux::is_initial_thread(void) {
  char dummy;
  // If called before init complete, thread stack bottom will be null.
  // Can be called if fatal error occurs before initialization.
  if (initial_thread_stack_bottom() == NULL) return false;
  assert(initial_thread_stack_bottom() != NULL &&
         initial_thread_stack_size()   != 0,
         "os::init did not locate initial thread's stack region");
  if ((address)&dummy >= initial_thread_stack_bottom() &&
      (address)&dummy < initial_thread_stack_bottom() + initial_thread_stack_size()) {
    return true;
  } else {
    return false;
  }
}

// Find the virtual memory area that contains addr
static bool find_vma(address addr, address* vma_low, address* vma_high) {
  FILE *fp = fopen("/proc/self/maps", "r");
  if (fp) {
    address low, high;
    while (!feof(fp)) {
      if (fscanf(fp, "%p-%p", &low, &high) == 2) {
        if (low <= addr && addr < high) {
          if (vma_low)  *vma_low  = low;
          if (vma_high) *vma_high = high;
          fclose(fp);
          return true;
        }
      }
      // Skip the rest of the current line.
      for (;;) {
        int ch = fgetc(fp);
        if (ch == EOF || ch == (int)'\n') break;
      }
    }
    fclose(fp);
  }
  return false;
}
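
// For reference, each line of /proc/self/maps starts with the address range
// of one mapping, e.g.:
//   7ffd7a2e9000-7ffd7a30a000 rw-p 00000000 00:00 0          [stack]
// so parsing "%p-%p" above is enough to bound-check addr against each vma.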

// Locate the initial thread stack. This special handling of the initial
// thread stack is needed because pthread_getattr_np() on most (all?) Linux
// distros returns a bogus value for the primordial process thread. While the
// launcher has created the VM in a new thread since JDK 6, we still have to
// allow for the use of the JNI invocation API from a primordial thread.
void os::Linux::capture_initial_stack(size_t max_size) {

  // max_size is either 0 (which means accept the OS default for thread
  // stacks) or a user-specified value known to be at least the minimum
  // needed. If we are actually on the primordial thread we can make it appear
  // that we have a smaller max_size stack by inserting the guard pages at
  // that location. But we cannot do anything to emulate a larger stack than
  // what has been provided by the OS or threading library. In fact if we try
  // to use a stack greater than what is set by rlimit then we will crash the
  // hosting process.

  // Maximum stack size is the easy part, get it from RLIMIT_STACK.
  // If this is "unlimited" then it will be a huge value.
  struct rlimit rlim;
  getrlimit(RLIMIT_STACK, &rlim);
  size_t stack_size = rlim.rlim_cur;

  // 6308388: a bug in ld.so will relocate its own .data section to the
  //   lower end of the primordial stack; reduce the ulimit -s value a little
  //   so we won't install a guard page on ld.so's data section.
  stack_size -= 2 * page_size();

  // Try to figure out where the stack base (top) is. This is harder.
  //
  // When an application is started, glibc saves the initial stack pointer in
  // a global variable "__libc_stack_end", which is then used by system
  // libraries. __libc_stack_end should be pretty close to the stack top. The
  // variable has been available since the very early days. However, because
  // it is a private interface, it could disappear in the future.
  //
  // The Linux kernel saves start_stack information in /proc/<pid>/stat.
  // Similar to __libc_stack_end, it is very close to the stack top, but
  // isn't the real stack top. Note that /proc may not exist if the VM is
  // running as a chroot program, so reading /proc/<pid>/stat could fail.
  // Also the contents of /proc/<pid>/stat could change in the future (though
  // unlikely).
  //
  // We try __libc_stack_end first. If that doesn't work, look for
  // /proc/<pid>/stat. If neither of them works, we use the current stack
  // pointer as a hint, which should work well in most cases.

  uintptr_t stack_start;

  // try __libc_stack_end first
  uintptr_t *p = (uintptr_t *)dlsym(RTLD_DEFAULT, "__libc_stack_end");
  if (p && *p) {
    stack_start = *p;
  } else {
    // see if we can get the start_stack field from /proc/self/stat
    FILE *fp;
    int pid;
    char state;
    int ppid;
    int pgrp;
    int session;
    int nr;
    int tpgrp;
    unsigned long flags;
    unsigned long minflt;
    unsigned long cminflt;
    unsigned long majflt;
    unsigned long cmajflt;
    unsigned long utime;
    unsigned long stime;
    long cutime;
    long cstime;
    long prio;
    long nice;
    long junk;
    long it_real;
    uintptr_t start;
    uintptr_t vsize;
    intptr_t rss;
    uintptr_t rsslim;
    uintptr_t scodes;
    uintptr_t ecode;
    int i;

    // Figure out what the primordial thread stack base is. Code is inspired
    // by email from Hans Boehm. /proc/self/stat begins with the current pid,
    // followed by the command name surrounded by parentheses, state, etc.
    char stat[2048];
    int statlen;

    fp = fopen("/proc/self/stat", "r");
    if (fp) {
      statlen = fread(stat, 1, 2047, fp);
      stat[statlen] = '\0';
      fclose(fp);

      // Skip pid and the command string. Note that we could be dealing with
      // weird command names, e.g. the user could decide to rename the java
      // launcher to "java 1.4.2 :)", then the stat file would look like
      //                1234 (java 1.4.2 :)) R ... ...
      // We don't really need to know the command string, just find the last
      // occurrence of ")" and then start parsing from there. See bug 4726580.
      char * s = strrchr(stat, ')');

      i = 0;
      if (s) {
        // Skip blank chars
        do { s++; } while (s && isspace(*s));

#define _UFM UINTX_FORMAT
#define _DFM INTX_FORMAT

        //                                     1   1   1   1   1   1   1   1   1   1   2   2    2    2    2    2    2    2    2
        //              3  4  5  6  7  8   9   0   1   2   3   4   5   6   7   8   9   0   1    2    3    4    5    6    7    8
        i = sscanf(s, "%c %d %d %d %d %d %lu %lu %lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld " _UFM _UFM _DFM _UFM _UFM _UFM _UFM,
                   &state,          // 3  %c
                   &ppid,           // 4  %d
                   &pgrp,           // 5  %d
                   &session,        // 6  %d
                   &nr,             // 7  %d
                   &tpgrp,          // 8  %d
                   &flags,          // 9  %lu
                   &minflt,         // 10 %lu
                   &cminflt,        // 11 %lu
                   &majflt,         // 12 %lu
                   &cmajflt,        // 13 %lu
                   &utime,          // 14 %lu
                   &stime,          // 15 %lu
                   &cutime,         // 16 %ld
                   &cstime,         // 17 %ld
                   &prio,           // 18 %ld
                   &nice,           // 19 %ld
                   &junk,           // 20 %ld
                   &it_real,        // 21 %ld
                   &start,          // 22 UINTX_FORMAT
                   &vsize,          // 23 UINTX_FORMAT
                   &rss,            // 24 INTX_FORMAT
                   &rsslim,         // 25 UINTX_FORMAT
                   &scodes,         // 26 UINTX_FORMAT
                   &ecode,          // 27 UINTX_FORMAT
                   &stack_start);   // 28 UINTX_FORMAT
      }

#undef _UFM
#undef _DFM

      // Expect 26 successful conversions: fields 3 through 28.
      if (i != 28 - 2) {
        assert(false, "Bad conversion from /proc/self/stat");
        // product mode - assume we are the initial thread, good luck in the
        // embedded case.
        warning("Can't detect initial thread stack location - bad conversion");
        stack_start = (uintptr_t) &rlim;
      }
    } else {
      // For some reason we can't open /proc/self/stat (for example, when
      // running on FreeBSD with a Linux emulator, or inside a chroot). The
      // fallback below should work for most cases, so don't abort:
      warning("Can't detect initial thread stack location - no /proc/self/stat");
      stack_start = (uintptr_t) &rlim;
    }
  }

  // Now we have a pointer (stack_start) very close to the stack top; the
  // next thing to do is to figure out the exact location of the stack top.
  // We can find the virtual memory area that contains stack_start by reading
  // /proc/self/maps; it should be the last vma in /proc/self/maps, and its
  // upper limit is the real stack top. (Again, this would fail if running
  // inside a chroot, because /proc may not exist.)

  uintptr_t stack_top;
  address low, high;
  if (find_vma((address)stack_start, &low, &high)) {
    // Success: "high" is the true stack top. (Ignore "low", because the
    // initial thread stack grows on demand; its real bottom is
    // high - RLIMIT_STACK.)
    stack_top = (uintptr_t)high;
  } else {
    // Failed, likely because /proc/self/maps does not exist.
    warning("Can't detect initial thread stack location - find_vma failed");
    // Best effort: stack_start is normally within a few pages below the real
    // stack top; use it as the stack top, and reduce the stack size so we
    // won't put the guard page outside the stack.
    stack_top = stack_start;
    stack_size -= 16 * page_size();
  }

  // stack_top could be partway down the page, so align it up.
  stack_top = align_size_up(stack_top, page_size());

  // The allowed stack value is the minimum of max_size and what we derived
  // from rlimit.
  if (max_size > 0) {
    _initial_thread_stack_size = MIN2(max_size, stack_size);
  } else {
    // Accept the rlimit max, but if the stack is unlimited then it will be
    // huge, so clamp it at 8MB as we do on Solaris.
    _initial_thread_stack_size = MIN2(stack_size, 8*M);
  }
  _initial_thread_stack_size = align_size_down(_initial_thread_stack_size, page_size());
  _initial_thread_stack_bottom = (address)stack_top - _initial_thread_stack_size;

  assert(_initial_thread_stack_bottom < (address)stack_top, "overflow!");

  if (log_is_enabled(Info, os, thread)) {
    // See if we seem to be on the primordial process thread.
    bool primordial = uintptr_t(&rlim) > uintptr_t(_initial_thread_stack_bottom) &&
                      uintptr_t(&rlim) < stack_top;

    log_info(os, thread)("Capturing initial stack in %s thread: req. size: " SIZE_FORMAT "K, actual size: "
                         SIZE_FORMAT "K, top=" INTPTR_FORMAT ", bottom=" INTPTR_FORMAT,
                         primordial ? "primordial" : "user", max_size / K, _initial_thread_stack_size / K,
                         stack_top, intptr_t(_initial_thread_stack_bottom));
  }
}

////////////////////////////////////////////////////////////////////////////////
// time support

// Time since start-up in seconds to a fine granularity.
// Used by VMSelfDestructTimer and the MemProfiler.
double os::elapsedTime() {
  return ((double)os::elapsed_counter()) / os::elapsed_frequency(); // nanosecond resolution
}

jlong os::elapsed_counter() {
  return javaTimeNanos() - initial_time_count;
}

jlong os::elapsed_frequency() {
  return NANOSECS_PER_SEC; // nanosecond resolution
}

bool os::supports_vtime() { return true; }
bool os::enable_vtime()   { return false; }
bool os::vtime_enabled()  { return false; }

double os::elapsedVTime() {
  struct rusage usage;
  int retval = getrusage(RUSAGE_THREAD, &usage);
  if (retval == 0) {
    return (double) (usage.ru_utime.tv_sec + usage.ru_stime.tv_sec) + (double) (usage.ru_utime.tv_usec + usage.ru_stime.tv_usec) / (1000 * 1000);
  } else {
    // better than nothing, but not much
    return elapsedTime();
  }
}

jlong os::javaTimeMillis() {
  timeval time;
  int status = gettimeofday(&time, NULL);
  assert(status != -1, "linux error");
  return jlong(time.tv_sec) * 1000  +  jlong(time.tv_usec / 1000);
}

void os::javaTimeSystemUTC(jlong &seconds, jlong &nanos) {
  timeval time;
  int status = gettimeofday(&time, NULL);
  assert(status != -1, "linux error");
  seconds = jlong(time.tv_sec);
  nanos = jlong(time.tv_usec) * 1000;
}


#ifndef CLOCK_MONOTONIC
  #define CLOCK_MONOTONIC (1)
#endif
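
// Note: the fallback value 1 matches the Linux kernel's clockid for
// CLOCK_MONOTONIC (see <linux/time.h>); the define is only needed when
// building against very old headers.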

void os::Linux::clock_init() {
  // We do the dlopens in this particular order due to a bug in the Linux
  // dynamic loader (see 6348968) leading to a crash on exit.
  void* handle = dlopen("librt.so.1", RTLD_LAZY);
  if (handle == NULL) {
    handle = dlopen("librt.so", RTLD_LAZY);
  }

  if (handle) {
    int (*clock_getres_func)(clockid_t, struct timespec*) =
           (int(*)(clockid_t, struct timespec*))dlsym(handle, "clock_getres");
    int (*clock_gettime_func)(clockid_t, struct timespec*) =
           (int(*)(clockid_t, struct timespec*))dlsym(handle, "clock_gettime");
    if (clock_getres_func && clock_gettime_func) {
      // See if the monotonic clock is supported by the kernel. Note that some
      // early implementations simply return kernel jiffies (updated every
      // 1/100 or 1/1000 second). It would be bad to use such a low-res clock
      // for nano time (though the monotonic property is still nice to have).
      // It's fixed in newer kernels; however, clock_getres() still returns
      // 1/HZ. We check if clock_getres() works, but will ignore its reported
      // resolution for now. Hopefully as people move to new kernels, this
      // won't be a problem.
      struct timespec res;
      struct timespec tp;
      if (clock_getres_func (CLOCK_MONOTONIC, &res) == 0 &&
          clock_gettime_func(CLOCK_MONOTONIC, &tp)  == 0) {
        // yes, the monotonic clock is supported
        _clock_gettime = clock_gettime_func;
        return;
      } else {
        // close librt if there is no monotonic clock
        dlclose(handle);
      }
    }
  }
  warning("No monotonic clock was available - timed services may " \
          "be adversely affected if the time-of-day clock changes");
}

#ifndef SYS_clock_getres
  #if defined(X86) || defined(PPC64) || defined(S390)
    #define SYS_clock_getres AMD64_ONLY(229) IA32_ONLY(266) PPC64_ONLY(247) S390_ONLY(261)
    #define sys_clock_getres(x,y)  ::syscall(SYS_clock_getres, x, y)
  #else
    #warning "SYS_clock_getres not defined for this platform, disabling fast_thread_cpu_time"
    #define sys_clock_getres(x,y)  -1
  #endif
#else
  #define sys_clock_getres(x,y)  ::syscall(SYS_clock_getres, x, y)
#endif

void os::Linux::fast_thread_clock_init() {
  if (!UseLinuxPosixThreadCPUClocks) {
    return;
  }
  clockid_t clockid;
  struct timespec tp;
  int (*pthread_getcpuclockid_func)(pthread_t, clockid_t *) =
      (int(*)(pthread_t, clockid_t *)) dlsym(RTLD_DEFAULT, "pthread_getcpuclockid");

  // Switch to using fast clocks for thread cpu time if
  // sys_clock_getres() returns error code 0.
  // Note that some kernels may support the current thread
  // clock (CLOCK_THREAD_CPUTIME_ID) but not the clocks
  // returned by pthread_getcpuclockid().
  // If the fast POSIX clocks are supported, then sys_clock_getres()
  // must return at least tp.tv_sec == 0, which means a resolution
  // better than 1 sec. This is an extra check for reliability.

  if (pthread_getcpuclockid_func &&
      pthread_getcpuclockid_func(_main_thread, &clockid) == 0 &&
      sys_clock_getres(clockid, &tp) == 0 && tp.tv_sec == 0) {
    _supports_fast_thread_cpu_time = true;
    _pthread_getcpuclockid = pthread_getcpuclockid_func;
  }
}
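
// Note: once _supports_fast_thread_cpu_time is set, per-thread CPU time can
// be read by obtaining a thread's clockid via _pthread_getcpuclockid and
// passing it to clock_gettime(); this is (presumably) the fast path used by
// the thread CPU time queries elsewhere in this file.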

jlong os::javaTimeNanos() {
  if (os::supports_monotonic_clock()) {
    struct timespec tp;
    int status = Linux::clock_gettime(CLOCK_MONOTONIC, &tp);
    assert(status == 0, "gettime error");
    jlong result = jlong(tp.tv_sec) * (1000 * 1000 * 1000) + jlong(tp.tv_nsec);
    return result;
  } else {
    timeval time;
    int status = gettimeofday(&time, NULL);
    assert(status != -1, "linux error");
    jlong usecs = jlong(time.tv_sec) * (1000 * 1000) + jlong(time.tv_usec);
    return 1000 * usecs;
  }
}
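
// Note: in the gettimeofday() fallback above, the result is wall-clock based
// and therefore not monotonic - it can jump backward if the time of day is
// set back. javaTimeNanos_info() below reports exactly this distinction.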

void os::javaTimeNanos_info(jvmtiTimerInfo *info_ptr) {
  if (os::supports_monotonic_clock()) {
    info_ptr->max_value = ALL_64_BITS;

    // CLOCK_MONOTONIC - amount of time since some arbitrary point in the past
    info_ptr->may_skip_backward = false;      // not subject to resetting or drifting
    info_ptr->may_skip_forward = false;       // not subject to resetting or drifting
  } else {
    // gettimeofday - based on time in seconds since the Epoch, thus does not wrap
    info_ptr->max_value = ALL_64_BITS;

    // gettimeofday is a real-time clock, so it can skip
    info_ptr->may_skip_backward = true;
    info_ptr->may_skip_forward = true;
  }

  info_ptr->kind = JVMTI_TIMER_ELAPSED;                // elapsed not CPU time
}

// Return the real, user, and system times in seconds from an
// arbitrary fixed point in the past.
bool os::getTimesSecs(double* process_real_time,
                      double* process_user_time,
                      double* process_system_time) {
  struct tms ticks;
  clock_t real_ticks = times(&ticks);

  if (real_ticks == (clock_t) (-1)) {
    return false;
  } else {
    double ticks_per_second = (double) clock_tics_per_sec;
    *process_user_time = ((double) ticks.tms_utime) / ticks_per_second;
    *process_system_time = ((double) ticks.tms_stime) / ticks_per_second;
    *process_real_time = ((double) real_ticks) / ticks_per_second;

    return true;
  }
}


char * os::local_time_string(char *buf, size_t buflen) {
  struct tm t;
  time_t long_time;
  time(&long_time);
  localtime_r(&long_time, &t);
  jio_snprintf(buf, buflen, "%d-%02d-%02d %02d:%02d:%02d",
               t.tm_year + 1900, t.tm_mon + 1, t.tm_mday,
               t.tm_hour, t.tm_min, t.tm_sec);
  return buf;
}

struct tm* os::localtime_pd(const time_t* clock, struct tm*  res) {
  return localtime_r(clock, res);
}

////////////////////////////////////////////////////////////////////////////////
// runtime exit support

// Note: os::shutdown() might be called very early during initialization, or
// called from a signal handler. Before adding something to os::shutdown(),
// make sure it is async-safe and can handle a partially initialized VM.
void os::shutdown() {

  // allow PerfMemory to attempt cleanup of any persistent resources
  perfMemory_exit();

  // needs to remove object in file system
  AttachListener::abort();

  // flush buffered output, finish log files
  ostream_abort();

  // Check for abort hook
  abort_hook_t abort_hook = Arguments::abort_hook();
  if (abort_hook != NULL) {
    abort_hook();
  }

}

// Note: os::abort() might be called very early during initialization, or
// called from a signal handler. Before adding something to os::abort(),
// make sure it is async-safe and can handle a partially initialized VM.
void os::abort(bool dump_core, void* siginfo, const void* context) {
  os::shutdown();
  if (dump_core) {
#ifndef PRODUCT
    fdStream out(defaultStream::output_fd());
    out.print_raw("Current thread is ");
    char buf[16];
    jio_snprintf(buf, sizeof(buf), UINTX_FORMAT, os::current_thread_id());
    out.print_raw_cr(buf);
    out.print_raw_cr("Dumping core ...");
#endif
    ::abort(); // dump core
  }

  ::exit(1);
}

// Die immediately, no exit hook, no abort hook, no cleanup.
void os::die() {
  ::abort();
}


// This method is a copy of JDK's sysGetLastErrorString
// from src/solaris/hpi/src/system_md.c

size_t os::lasterror(char *buf, size_t len) {
  if (errno == 0)  return 0;

  const char *s = os::strerror(errno);
  size_t n = ::strlen(s);
  if (n >= len) {
    n = len - 1;
  }
  ::strncpy(buf, s, n);
  buf[n] = '\0';
  return n;
}

// thread_id is the kernel thread id (similar to the Solaris LWP id)
intx os::current_thread_id() { return os::Linux::gettid(); }
int os::current_process_id() {
  return ::getpid();
}
1415 // DLL functions
1416 
1417 const char* os::dll_file_extension() { return ".so"; }
1418 
// This must be hard coded because it's the system's temporary
// directory, not the java application's temp directory (a la java.io.tmpdir).
1421 const char* os::get_temp_directory() { return "/tmp"; }
1422 
1423 static bool file_exists(const char* filename) {
1424   struct stat statbuf;
1425   if (filename == NULL || strlen(filename) == 0) {
1426     return false;
1427   }
1428   return os::stat(filename, &statbuf) == 0;
1429 }
1430 
1431 bool os::dll_build_name(char* buffer, size_t buflen,
1432                         const char* pname, const char* fname) {
1433   bool retval = false;
1434   // Copied from libhpi
1435   const size_t pnamelen = pname ? strlen(pname) : 0;
1436 
1437   // Return error on buffer overflow.
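  // ("lib" + ".so" + '/' + NUL need 8 extra bytes; 10 leaves a little slack.)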
1438   if (pnamelen + strlen(fname) + 10 > (size_t) buflen) {
1439     return retval;
1440   }
1441 
1442   if (pnamelen == 0) {
1443     snprintf(buffer, buflen, "lib%s.so", fname);
1444     retval = true;
1445   } else if (strchr(pname, *os::path_separator()) != NULL) {
1446     int n;
1447     char** pelements = split_path(pname, &n);
1448     if (pelements == NULL) {
1449       return false;
1450     }
1451     for (int i = 0; i < n; i++) {
1452       // Really shouldn't be NULL, but check can't hurt
1453       if (pelements[i] == NULL || strlen(pelements[i]) == 0) {
1454         continue; // skip the empty path values
1455       }
1456       snprintf(buffer, buflen, "%s/lib%s.so", pelements[i], fname);
1457       if (file_exists(buffer)) {
1458         retval = true;
1459         break;
1460       }
1461     }
1462     // release the storage
1463     for (int i = 0; i < n; i++) {
1464       if (pelements[i] != NULL) {
1465         FREE_C_HEAP_ARRAY(char, pelements[i]);
1466       }
1467     }
1468     if (pelements != NULL) {
1469       FREE_C_HEAP_ARRAY(char*, pelements);
1470     }
1471   } else {
1472     snprintf(buffer, buflen, "%s/lib%s.so", pname, fname);
1473     retval = true;
1474   }
1475   return retval;
1476 }
1477 
1478 // check if addr is inside libjvm.so
1479 bool os::address_is_in_vm(address addr) {
1480   static address libjvm_base_addr;
1481   Dl_info dlinfo;
1482 
1483   if (libjvm_base_addr == NULL) {
1484     if (dladdr(CAST_FROM_FN_PTR(void *, os::address_is_in_vm), &dlinfo) != 0) {
1485       libjvm_base_addr = (address)dlinfo.dli_fbase;
1486     }
    assert(libjvm_base_addr != NULL, "Cannot obtain base address for libjvm");
1488   }
1489 
1490   if (dladdr((void *)addr, &dlinfo) != 0) {
1491     if (libjvm_base_addr == (address)dlinfo.dli_fbase) return true;
1492   }
1493 
1494   return false;
1495 }
1496 
1497 bool os::dll_address_to_function_name(address addr, char *buf,
1498                                       int buflen, int *offset,
1499                                       bool demangle) {
1500   // buf is not optional, but offset is optional
1501   assert(buf != NULL, "sanity check");
1502 
1503   Dl_info dlinfo;
1504 
1505   if (dladdr((void*)addr, &dlinfo) != 0) {
1506     // see if we have a matching symbol
1507     if (dlinfo.dli_saddr != NULL && dlinfo.dli_sname != NULL) {
1508       if (!(demangle && Decoder::demangle(dlinfo.dli_sname, buf, buflen))) {
1509         jio_snprintf(buf, buflen, "%s", dlinfo.dli_sname);
1510       }
1511       if (offset != NULL) *offset = addr - (address)dlinfo.dli_saddr;
1512       return true;
1513     }
1514     // no matching symbol so try for just file info
1515     if (dlinfo.dli_fname != NULL && dlinfo.dli_fbase != NULL) {
1516       if (Decoder::decode((address)(addr - (address)dlinfo.dli_fbase),
1517                           buf, buflen, offset, dlinfo.dli_fname, demangle)) {
1518         return true;
1519       }
1520     }
1521   }
1522 
1523   buf[0] = '\0';
1524   if (offset != NULL) *offset = -1;
1525   return false;
1526 }
1527 
1528 struct _address_to_library_name {
1529   address addr;          // input : memory address
1530   size_t  buflen;        //         size of fname
1531   char*   fname;         // output: library name
1532   address base;          //         library base addr
1533 };
1534 
1535 static int address_to_library_name_callback(struct dl_phdr_info *info,
1536                                             size_t size, void *data) {
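  // dl_iterate_phdr() callback; returning non-zero stops the iteration.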
1537   int i;
1538   bool found = false;
1539   address libbase = NULL;
1540   struct _address_to_library_name * d = (struct _address_to_library_name *)data;
1541 
1542   // iterate through all loadable segments
1543   for (i = 0; i < info->dlpi_phnum; i++) {
1544     address segbase = (address)(info->dlpi_addr + info->dlpi_phdr[i].p_vaddr);
1545     if (info->dlpi_phdr[i].p_type == PT_LOAD) {
1546       // base address of a library is the lowest address of its loaded
1547       // segments.
1548       if (libbase == NULL || libbase > segbase) {
1549         libbase = segbase;
1550       }
1551       // see if 'addr' is within current segment
1552       if (segbase <= d->addr &&
1553           d->addr < segbase + info->dlpi_phdr[i].p_memsz) {
1554         found = true;
1555       }
1556     }
1557   }
1558 
  // dlpi_name is NULL or empty if the ELF file is the executable itself.
  // Return 0 in that case so dll_address_to_library_name() can fall through
  // to dladdr(), which can figure out the executable name from argv[0].
1562   if (found && info->dlpi_name && info->dlpi_name[0]) {
1563     d->base = libbase;
1564     if (d->fname) {
1565       jio_snprintf(d->fname, d->buflen, "%s", info->dlpi_name);
1566     }
1567     return 1;
1568   }
1569   return 0;
1570 }
1571 
1572 bool os::dll_address_to_library_name(address addr, char* buf,
1573                                      int buflen, int* offset) {
1574   // buf is not optional, but offset is optional
1575   assert(buf != NULL, "sanity check");
1576 
1577   Dl_info dlinfo;
1578   struct _address_to_library_name data;
1579 
  // Old glibc dladdr() implementations have a bug: they can resolve to the
  // wrong library name if the .so file has a base address != NULL. Here
  // we iterate through the program headers of all loaded libraries to find
  // out which library 'addr' really belongs to. This workaround can be
  // removed once the minimum requirement for glibc is moved to 2.3.x.
1585   data.addr = addr;
1586   data.fname = buf;
1587   data.buflen = buflen;
1588   data.base = NULL;
1589   int rslt = dl_iterate_phdr(address_to_library_name_callback, (void *)&data);
1590 
1591   if (rslt) {
1592     // buf already contains library name
1593     if (offset) *offset = addr - data.base;
1594     return true;
1595   }
1596   if (dladdr((void*)addr, &dlinfo) != 0) {
1597     if (dlinfo.dli_fname != NULL) {
1598       jio_snprintf(buf, buflen, "%s", dlinfo.dli_fname);
1599     }
1600     if (dlinfo.dli_fbase != NULL && offset != NULL) {
1601       *offset = addr - (address)dlinfo.dli_fbase;
1602     }
1603     return true;
1604   }
1605 
1606   buf[0] = '\0';
1607   if (offset) *offset = -1;
1608   return false;
1609 }
1610 
// Loads a .dll/.so and, in case of error, checks whether the .dll/.so was
// built for the same architecture that HotSpot is running on.
1614 
1615 
1616 // Remember the stack's state. The Linux dynamic linker will change
1617 // the stack to 'executable' at most once, so we must safepoint only once.
1618 bool os::Linux::_stack_is_executable = false;
1619 
1620 // VM operation that loads a library.  This is necessary if stack protection
1621 // of the Java stacks can be lost during loading the library.  If we
1622 // do not stop the Java threads, they can stack overflow before the stacks
1623 // are protected again.
1624 class VM_LinuxDllLoad: public VM_Operation {
1625  private:
1626   const char *_filename;
1627   char *_ebuf;
1628   int _ebuflen;
1629   void *_lib;
1630  public:
1631   VM_LinuxDllLoad(const char *fn, char *ebuf, int ebuflen) :
1632     _filename(fn), _ebuf(ebuf), _ebuflen(ebuflen), _lib(NULL) {}
1633   VMOp_Type type() const { return VMOp_LinuxDllLoad; }
1634   void doit() {
1635     _lib = os::Linux::dll_load_in_vmthread(_filename, _ebuf, _ebuflen);
1636     os::Linux::_stack_is_executable = true;
1637   }
1638   void* loaded_library() { return _lib; }
1639 };
1640 
1641 void * os::dll_load(const char *filename, char *ebuf, int ebuflen) {
1642   void * result = NULL;
1643   bool load_attempted = false;
1644 
1645   // Check whether the library to load might change execution rights
1646   // of the stack. If they are changed, the protection of the stack
1647   // guard pages will be lost. We need a safepoint to fix this.
1648   //
1649   // See Linux man page execstack(8) for more info.
1650   if (os::uses_stack_guard_pages() && !os::Linux::_stack_is_executable) {
1651     ElfFile ef(filename);
1652     if (!ef.specifies_noexecstack()) {
1653       if (!is_init_completed()) {
1654         os::Linux::_stack_is_executable = true;
1655         // This is OK - No Java threads have been created yet, and hence no
1656         // stack guard pages to fix.
1657         //
1658         // This should happen only when you are building JDK7 using a very
1659         // old version of JDK6 (e.g., with JPRT) and running test_gamma.
1660         //
1661         // Dynamic loader will make all stacks executable after
1662         // this function returns, and will not do that again.
1663         assert(Threads::first() == NULL, "no Java threads should exist yet.");
1664       } else {
1665         warning("You have loaded library %s which might have disabled stack guard. "
1666                 "The VM will try to fix the stack guard now.\n"
1667                 "It's highly recommended that you fix the library with "
1668                 "'execstack -c <libfile>', or link it with '-z noexecstack'.",
1669                 filename);
1670 
1671         assert(Thread::current()->is_Java_thread(), "must be Java thread");
1672         JavaThread *jt = JavaThread::current();
1673         if (jt->thread_state() != _thread_in_native) {
1674           // This happens when a compiler thread tries to load a hsdis-<arch>.so file
1675           // that requires ExecStack. Cannot enter safe point. Let's give up.
1676           warning("Unable to fix stack guard. Giving up.");
1677         } else {
1678           if (!LoadExecStackDllInVMThread) {
            // This is for the case where the DLL has a static
            // constructor function that executes JNI code. We cannot
            // load such DLLs in the VMThread.
1682             result = os::Linux::dlopen_helper(filename, ebuf, ebuflen);
1683           }
1684 
1685           ThreadInVMfromNative tiv(jt);
1686           debug_only(VMNativeEntryWrapper vew;)
1687 
1688           VM_LinuxDllLoad op(filename, ebuf, ebuflen);
1689           VMThread::execute(&op);
1690           if (LoadExecStackDllInVMThread) {
1691             result = op.loaded_library();
1692           }
1693           load_attempted = true;
1694         }
1695       }
1696     }
1697   }
1698 
1699   if (!load_attempted) {
1700     result = os::Linux::dlopen_helper(filename, ebuf, ebuflen);
1701   }
1702 
1703   if (result != NULL) {
1704     // Successful loading
1705     return result;
1706   }
1707 
1708   Elf32_Ehdr elf_head;
  int diag_msg_max_length = ebuflen - strlen(ebuf);
  char* diag_msg_buf = ebuf + strlen(ebuf);

  if (diag_msg_max_length == 0) {
    // No more space in ebuf for an additional diagnostic message.
    return NULL;
  }
1716 
1717 
  int file_descriptor = ::open(filename, O_RDONLY | O_NONBLOCK);
1719 
1720   if (file_descriptor < 0) {
1721     // Can't open library, report dlerror() message
1722     return NULL;
1723   }
1724 
  bool failed_to_read_elf_head =
    (sizeof(elf_head) != (::read(file_descriptor, &elf_head, sizeof(elf_head))));
1728 
1729   ::close(file_descriptor);
1730   if (failed_to_read_elf_head) {
1731     // file i/o error - report dlerror() msg
1732     return NULL;
1733   }
1734 
1735   typedef struct {
1736     Elf32_Half    code;         // Actual value as defined in elf.h
    Elf32_Half    compat_class; // Compatibility of archs in the VM's sense
    unsigned char elf_class;    // 32 or 64 bit
    unsigned char endianness;   // MSB or LSB
1740     char*         name;         // String representation
1741   } arch_t;
1742 
1743 #ifndef EM_486
1744   #define EM_486          6               /* Intel 80486 */
1745 #endif
1746 #ifndef EM_AARCH64
1747   #define EM_AARCH64    183               /* ARM AARCH64 */
1748 #endif
1749 
1750   static const arch_t arch_array[]={
1751     {EM_386,         EM_386,     ELFCLASS32, ELFDATA2LSB, (char*)"IA 32"},
1752     {EM_486,         EM_386,     ELFCLASS32, ELFDATA2LSB, (char*)"IA 32"},
1753     {EM_IA_64,       EM_IA_64,   ELFCLASS64, ELFDATA2LSB, (char*)"IA 64"},
1754     {EM_X86_64,      EM_X86_64,  ELFCLASS64, ELFDATA2LSB, (char*)"AMD 64"},
1755     {EM_SPARC,       EM_SPARC,   ELFCLASS32, ELFDATA2MSB, (char*)"Sparc 32"},
1756     {EM_SPARC32PLUS, EM_SPARC,   ELFCLASS32, ELFDATA2MSB, (char*)"Sparc 32"},
1757     {EM_SPARCV9,     EM_SPARCV9, ELFCLASS64, ELFDATA2MSB, (char*)"Sparc v9 64"},
1758     {EM_PPC,         EM_PPC,     ELFCLASS32, ELFDATA2MSB, (char*)"Power PC 32"},
#if defined(VM_LITTLE_ENDIAN)
    {EM_PPC64,       EM_PPC64,   ELFCLASS64, ELFDATA2LSB, (char*)"Power PC 64 LE"},
#else
    {EM_PPC64,       EM_PPC64,   ELFCLASS64, ELFDATA2MSB, (char*)"Power PC 64"},
#endif
1764     {EM_ARM,         EM_ARM,     ELFCLASS32,   ELFDATA2LSB, (char*)"ARM"},
1765     {EM_S390,        EM_S390,    ELFCLASSNONE, ELFDATA2MSB, (char*)"IBM System/390"},
1766     {EM_ALPHA,       EM_ALPHA,   ELFCLASS64, ELFDATA2LSB, (char*)"Alpha"},
1767     {EM_MIPS_RS3_LE, EM_MIPS_RS3_LE, ELFCLASS32, ELFDATA2LSB, (char*)"MIPSel"},
1768     {EM_MIPS,        EM_MIPS,    ELFCLASS32, ELFDATA2MSB, (char*)"MIPS"},
1769     {EM_PARISC,      EM_PARISC,  ELFCLASS32, ELFDATA2MSB, (char*)"PARISC"},
1770     {EM_68K,         EM_68K,     ELFCLASS32, ELFDATA2MSB, (char*)"M68k"},
1771     {EM_AARCH64,     EM_AARCH64, ELFCLASS64, ELFDATA2LSB, (char*)"AARCH64"},
1772   };
1773 
#if defined(IA32)
  static Elf32_Half running_arch_code = EM_386;
#elif defined(AMD64)
  static Elf32_Half running_arch_code = EM_X86_64;
#elif defined(IA64)
  static Elf32_Half running_arch_code = EM_IA_64;
#elif defined(__sparc) && defined(_LP64)
  static Elf32_Half running_arch_code = EM_SPARCV9;
#elif defined(__sparc) && !defined(_LP64)
  static Elf32_Half running_arch_code = EM_SPARC;
#elif defined(__powerpc64__)
  static Elf32_Half running_arch_code = EM_PPC64;
#elif defined(__powerpc__)
  static Elf32_Half running_arch_code = EM_PPC;
#elif defined(AARCH64)
  static Elf32_Half running_arch_code = EM_AARCH64;
#elif defined(ARM)
  static Elf32_Half running_arch_code = EM_ARM;
#elif defined(S390)
  static Elf32_Half running_arch_code = EM_S390;
#elif defined(ALPHA)
  static Elf32_Half running_arch_code = EM_ALPHA;
#elif defined(MIPSEL)
  static Elf32_Half running_arch_code = EM_MIPS_RS3_LE;
#elif defined(PARISC)
  static Elf32_Half running_arch_code = EM_PARISC;
#elif defined(MIPS)
  static Elf32_Half running_arch_code = EM_MIPS;
#elif defined(M68K)
  static Elf32_Half running_arch_code = EM_68K;
#else
  #error Method os::dll_load requires that one of the following is defined:\
      AARCH64, ALPHA, ARM, AMD64, IA32, IA64, M68K, MIPS, MIPSEL, PARISC, __powerpc__, __powerpc64__, S390, __sparc
#endif
1808 
  // Identify the compatibility class for the VM's architecture and the
  // library's architecture, and obtain string descriptions for both.
1811 
  arch_t lib_arch = {elf_head.e_machine, 0, elf_head.e_ident[EI_CLASS], elf_head.e_ident[EI_DATA], NULL};
  int running_arch_index = -1;

  for (unsigned int i = 0; i < ARRAY_SIZE(arch_array); i++) {
1816     if (running_arch_code == arch_array[i].code) {
1817       running_arch_index    = i;
1818     }
1819     if (lib_arch.code == arch_array[i].code) {
1820       lib_arch.compat_class = arch_array[i].compat_class;
1821       lib_arch.name         = arch_array[i].name;
1822     }
1823   }
1824 
1825   assert(running_arch_index != -1,
1826          "Didn't find running architecture code (running_arch_code) in arch_array");
1827   if (running_arch_index == -1) {
1828     // Even though running architecture detection failed
1829     // we may still continue with reporting dlerror() message
1830     return NULL;
1831   }
1832 
  if (lib_arch.endianness != arch_array[running_arch_index].endianness) {
1834     ::snprintf(diag_msg_buf, diag_msg_max_length-1," (Possible cause: endianness mismatch)");
1835     return NULL;
1836   }
1837 
1838 #ifndef S390
1839   if (lib_arch.elf_class != arch_array[running_arch_index].elf_class) {
1840     ::snprintf(diag_msg_buf, diag_msg_max_length-1," (Possible cause: architecture word width mismatch)");
1841     return NULL;
1842   }
1843 #endif // !S390
1844 
1845   if (lib_arch.compat_class != arch_array[running_arch_index].compat_class) {
1846     if (lib_arch.name!=NULL) {
1847       ::snprintf(diag_msg_buf, diag_msg_max_length-1,
1848                  " (Possible cause: can't load %s-bit .so on a %s-bit platform)",
1849                  lib_arch.name, arch_array[running_arch_index].name);
1850     } else {
1851       ::snprintf(diag_msg_buf, diag_msg_max_length-1,
1852                  " (Possible cause: can't load this .so (machine code=0x%x) on a %s-bit platform)",
1853                  lib_arch.code,
1854                  arch_array[running_arch_index].name);
1855     }
1856   }
1857 
1858   return NULL;
1859 }
1860 
1861 void * os::Linux::dlopen_helper(const char *filename, char *ebuf,
1862                                 int ebuflen) {
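  // RTLD_LAZY defers symbol resolution until a symbol is first referenced.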
1863   void * result = ::dlopen(filename, RTLD_LAZY);
1864   if (result == NULL) {
1865     ::strncpy(ebuf, ::dlerror(), ebuflen - 1);
1866     ebuf[ebuflen-1] = '\0';
1867   }
1868   return result;
1869 }
1870 
1871 void * os::Linux::dll_load_in_vmthread(const char *filename, char *ebuf,
1872                                        int ebuflen) {
1873   void * result = NULL;
1874   if (LoadExecStackDllInVMThread) {
1875     result = dlopen_helper(filename, ebuf, ebuflen);
1876   }
1877 
1878   // Since 7019808, libjvm.so is linked with -noexecstack. If the VM loads a
1879   // library that requires an executable stack, or which does not have this
1880   // stack attribute set, dlopen changes the stack attribute to executable. The
1881   // read protection of the guard pages gets lost.
1882   //
1883   // Need to check _stack_is_executable again as multiple VM_LinuxDllLoad
1884   // may have been queued at the same time.
1885 
1886   if (!_stack_is_executable) {
1887     JavaThread *jt = Threads::first();
1888 
1889     while (jt) {
1890       if (!jt->stack_guard_zone_unused() &&     // Stack not yet fully initialized
1891           jt->stack_guards_enabled()) {         // No pending stack overflow exceptions
1892         if (!os::guard_memory((char *)jt->stack_end(), jt->stack_guard_zone_size())) {
1893           warning("Attempt to reguard stack yellow zone failed.");
1894         }
1895       }
1896       jt = jt->next();
1897     }
1898   }
1899 
1900   return result;
1901 }
1902 
1903 void* os::dll_lookup(void* handle, const char* name) {
1904   void* res = dlsym(handle, name);
1905   return res;
1906 }
1907 
1908 void* os::get_default_process_handle() {
1909   return (void*)::dlopen(NULL, RTLD_LAZY);
1910 }
1911 
1912 static bool _print_ascii_file(const char* filename, outputStream* st) {
1913   int fd = ::open(filename, O_RDONLY);
1914   if (fd == -1) {
1915     return false;
1916   }
1917 
1918   char buf[33];
1919   int bytes;
1920   buf[32] = '\0';
1921   while ((bytes = ::read(fd, buf, sizeof(buf)-1)) > 0) {
1922     st->print_raw(buf, bytes);
1923   }
1924 
1925   ::close(fd);
1926 
1927   return true;
1928 }
1929 
1930 void os::print_dll_info(outputStream *st) {
1931   st->print_cr("Dynamic libraries:");
1932 
1933   char fname[32];
1934   pid_t pid = os::Linux::gettid();
1935 
1936   jio_snprintf(fname, sizeof(fname), "/proc/%d/maps", pid);
1937 
1938   if (!_print_ascii_file(fname, st)) {
1939     st->print("Can not get library information for pid = %d\n", pid);
1940   }
1941 }
1942 
1943 int os::get_loaded_modules_info(os::LoadedModulesCallbackFunc callback, void *param) {
1944   FILE *procmapsFile = NULL;
1945 
1946   // Open the procfs maps file for the current process
1947   if ((procmapsFile = fopen("/proc/self/maps", "r")) != NULL) {
1948     // Allocate PATH_MAX for file name plus a reasonable size for other fields.
1949     char line[PATH_MAX + 100];
1950 
1951     // Read line by line from 'file'
1952     while (fgets(line, sizeof(line), procmapsFile) != NULL) {
1953       u8 base, top, offset, inode;
1954       char permissions[5];
1955       char device[6];
      char name[PATH_MAX + 1];
      name[0] = '\0';  // the name field is optional in a maps line
1957 
1958       // Parse fields from line
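      // A typical (hypothetical) line:
      //   00400000-0040c000 r-xp 00000000 08:02 173521  /usr/bin/dbus-daemon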
1959       sscanf(line, UINT64_FORMAT_X "-" UINT64_FORMAT_X " %4s " UINT64_FORMAT_X " %5s " INT64_FORMAT " %s",
1960              &base, &top, permissions, &offset, device, &inode, name);
1961 
      // Skip mappings with device id '00:00' (anonymous memory) so that we
      // only report file system mapped files.
1963       if (strcmp(device, "00:00") != 0) {
1964 
        // Call the callback with the fields of interest.
        if (callback(name, (address)base, (address)top, param)) {
          // The callback asked us to stop iterating.
1968           fclose(procmapsFile);
1969           return 1;
1970         }
1971       }
1972     }
1973     fclose(procmapsFile);
1974   }
1975   return 0;
1976 }
1977 
1978 void os::print_os_info_brief(outputStream* st) {
1979   os::Linux::print_distro_info(st);
1980 
1981   os::Posix::print_uname_info(st);
1982 
1983   os::Linux::print_libversion_info(st);
1984 
1985 }
1986 
1987 void os::print_os_info(outputStream* st) {
1988   st->print("OS:");
1989 
1990   os::Linux::print_distro_info(st);
1991 
1992   os::Posix::print_uname_info(st);
1993 
1994   // Print warning if unsafe chroot environment detected
1995   if (unsafe_chroot_detected) {
1996     st->print("WARNING!! ");
1997     st->print_cr("%s", unstable_chroot_error);
1998   }
1999 
2000   os::Linux::print_libversion_info(st);
2001 
2002   os::Posix::print_rlimit_info(st);
2003 
2004   os::Posix::print_load_average(st);
2005 
2006   os::Linux::print_full_memory_info(st);
2007 }
2008 
2009 // Try to identify popular distros.
2010 // Most Linux distributions have a /etc/XXX-release file, which contains
2011 // the OS version string. Newer Linux distributions have a /etc/lsb-release
2012 // file that also contains the OS version string. Some have more than one
2013 // /etc/XXX-release file (e.g. Mandrake has both /etc/mandrake-release and
2014 // /etc/redhat-release.), so the order is important.
// Any Linux that is based on Red Hat (e.g. Oracle, Mandrake, Sun JDS...) has
// its own specific XXX-release file as well as a redhat-release file.
// Because of this the XXX-release file needs to be searched for before the
// redhat-release file.
// Since Red Hat and SuSE have an lsb-release file that is not very descriptive,
// the search for redhat-release / SuSE-release needs to be before lsb-release.
// Since the lsb-release file is the new standard it needs to be searched
// before the older style release files.
// Searching system-release (Red Hat) and os-release (other Linuxes) is the
// next-to-last resort.  The os-release file is a new standard that contains
// distribution information and the system-release file seems to be an old
// standard that has been replaced by the lsb-release and os-release files.
2027 // Searching for the debian_version file is the last resort.  It contains
2028 // an informative string like "6.0.6" or "wheezy/sid". Because of this
2029 // "Debian " is printed before the contents of the debian_version file.
2030 
2031 const char* distro_files[] = {
2032   "/etc/oracle-release",
2033   "/etc/mandriva-release",
2034   "/etc/mandrake-release",
2035   "/etc/sun-release",
2036   "/etc/redhat-release",
2037   "/etc/SuSE-release",
2038   "/etc/lsb-release",
2039   "/etc/turbolinux-release",
2040   "/etc/gentoo-release",
2041   "/etc/ltib-release",
2042   "/etc/angstrom-version",
2043   "/etc/system-release",
2044   "/etc/os-release",
2045   NULL };
2046 
2047 void os::Linux::print_distro_info(outputStream* st) {
2048   for (int i = 0;; i++) {
2049     const char* file = distro_files[i];
2050     if (file == NULL) {
2051       break;  // done
2052     }
2053     // If file prints, we found it.
2054     if (_print_ascii_file(file, st)) {
2055       return;
2056     }
2057   }
2058 
2059   if (file_exists("/etc/debian_version")) {
2060     st->print("Debian ");
2061     _print_ascii_file("/etc/debian_version", st);
2062   } else {
2063     st->print("Linux");
2064   }
2065   st->cr();
2066 }
2067 
2068 static void parse_os_info_helper(FILE* fp, char* distro, size_t length, bool get_first_line) {
  char buf[256] = "";  // stays empty if fgets() never succeeds
2070   while (fgets(buf, sizeof(buf), fp)) {
    // Strip the surrounding decoration in the expected formats.
2072     if (strstr(buf, "DISTRIB_DESCRIPTION=") != NULL || strstr(buf, "PRETTY_NAME=") != NULL) {
2073       char* ptr = strstr(buf, "\"");  // the name is in quotes
2074       if (ptr != NULL) {
2075         ptr++; // go beyond first quote
2076         char* nl = strchr(ptr, '\"');
2077         if (nl != NULL) *nl = '\0';
2078         strncpy(distro, ptr, length);
2079       } else {
2080         ptr = strstr(buf, "=");
        ptr++; // go beyond the equals sign
2082         char* nl = strchr(ptr, '\n');
2083         if (nl != NULL) *nl = '\0';
2084         strncpy(distro, ptr, length);
2085       }
2086       return;
2087     } else if (get_first_line) {
2088       char* nl = strchr(buf, '\n');
2089       if (nl != NULL) *nl = '\0';
2090       strncpy(distro, buf, length);
2091       return;
2092     }
2093   }
  // No matching line was found: fall back to the last line read.
2095   char* nl = strchr(buf, '\n');
2096   if (nl != NULL) *nl = '\0';
2097   strncpy(distro, buf, length);
2098 }
2099 
2100 static void parse_os_info(char* distro, size_t length, const char* file) {
2101   FILE* fp = fopen(file, "r");
2102   if (fp != NULL) {
    // For the SuSE format, use the first line.
2104     bool get_first_line = (strcmp(file, "/etc/SuSE-release") == 0);
2105     parse_os_info_helper(fp, distro, length, get_first_line);
2106     fclose(fp);
2107   }
2108 }
2109 
2110 void os::get_summary_os_info(char* buf, size_t buflen) {
2111   for (int i = 0;; i++) {
2112     const char* file = distro_files[i];
2113     if (file == NULL) {
2114       break; // ran out of distro_files
2115     }
2116     if (file_exists(file)) {
2117       parse_os_info(buf, buflen, file);
2118       return;
2119     }
2120   }
2121   // special case for debian
2122   if (file_exists("/etc/debian_version")) {
2123     strncpy(buf, "Debian ", buflen);
2124     parse_os_info(&buf[7], buflen-7, "/etc/debian_version");
2125   } else {
2126     strncpy(buf, "Linux", buflen);
2127   }
2128 }
2129 
2130 void os::Linux::print_libversion_info(outputStream* st) {
2131   // libc, pthread
2132   st->print("libc:");
2133   st->print("%s ", os::Linux::glibc_version());
2134   st->print("%s ", os::Linux::libpthread_version());
2135   st->cr();
2136 }
2137 
2138 void os::Linux::print_full_memory_info(outputStream* st) {
2139   st->print("\n/proc/meminfo:\n");
2140   _print_ascii_file("/proc/meminfo", st);
2141   st->cr();
2142 }
2143 
2144 void os::print_memory_info(outputStream* st) {
2145 
2146   st->print("Memory:");
2147   st->print(" %dk page", os::vm_page_size()>>10);
2148 
2149   // values in struct sysinfo are "unsigned long"
2150   struct sysinfo si;
2151   sysinfo(&si);
2152 
2153   st->print(", physical " UINT64_FORMAT "k",
2154             os::physical_memory() >> 10);
2155   st->print("(" UINT64_FORMAT "k free)",
2156             os::available_memory() >> 10);
2157   st->print(", swap " UINT64_FORMAT "k",
2158             ((jlong)si.totalswap * si.mem_unit) >> 10);
2159   st->print("(" UINT64_FORMAT "k free)",
2160             ((jlong)si.freeswap * si.mem_unit) >> 10);
2161   st->cr();
2162 }
2163 
2164 // Print the first "model name" line and the first "flags" line
2165 // that we find and nothing more. We assume "model name" comes
2166 // before "flags" so if we find a second "model name", then the
2167 // "flags" field is considered missing.
2168 static bool print_model_name_and_flags(outputStream* st, char* buf, size_t buflen) {
2169 #if defined(IA32) || defined(AMD64)
2170   // Other platforms have less repetitive cpuinfo files
2171   FILE *fp = fopen("/proc/cpuinfo", "r");
2172   if (fp) {
    // Assume "model name" comes before "flags"; track across lines whether we
    // have already printed it, so a second "model name" means flags are missing.
    bool model_name_printed = false;
    while (!feof(fp)) {
      if (fgets(buf, buflen, fp)) {
        if (strstr(buf, "model name") != NULL) {
2178           if (!model_name_printed) {
2179             st->print_raw("CPU Model and flags from /proc/cpuinfo:\n");
2180             st->print_raw(buf);
2181             model_name_printed = true;
2182           } else {
2183             // model name printed but not flags?  Odd, just return
2184             fclose(fp);
2185             return true;
2186           }
2187         }
2188         // print the flags line too
2189         if (strstr(buf, "flags") != NULL) {
2190           st->print_raw(buf);
2191           fclose(fp);
2192           return true;
2193         }
2194       }
2195     }
2196     fclose(fp);
2197   }
2198 #endif // x86 platforms
2199   return false;
2200 }
2201 
2202 void os::pd_print_cpu_info(outputStream* st, char* buf, size_t buflen) {
2203   // Only print the model name if the platform provides this as a summary
2204   if (!print_model_name_and_flags(st, buf, buflen)) {
2205     st->print("\n/proc/cpuinfo:\n");
2206     if (!_print_ascii_file("/proc/cpuinfo", st)) {
2207       st->print_cr("  <Not Available>");
2208     }
2209   }
2210 }
2211 
2212 #if defined(AMD64) || defined(IA32) || defined(X32)
2213 const char* search_string = "model name";
2214 #elif defined(PPC64)
2215 const char* search_string = "cpu";
2216 #elif defined(S390)
2217 const char* search_string = "processor";
2218 #elif defined(SPARC)
2219 const char* search_string = "cpu";
2220 #else
2221 const char* search_string = "Processor";
2222 #endif
2223 
// Parses the cpuinfo file for the string representing the model name.
2225 void os::get_summary_cpu_info(char* cpuinfo, size_t length) {
2226   FILE* fp = fopen("/proc/cpuinfo", "r");
2227   if (fp != NULL) {
2228     while (!feof(fp)) {
2229       char buf[256];
2230       if (fgets(buf, sizeof(buf), fp)) {
2231         char* start = strstr(buf, search_string);
2232         if (start != NULL) {
2233           char *ptr = start + strlen(search_string);
2234           char *end = buf + strlen(buf);
2235           while (ptr != end) {
2236              // skip whitespace and colon for the rest of the name.
2237              if (*ptr != ' ' && *ptr != '\t' && *ptr != ':') {
2238                break;
2239              }
2240              ptr++;
2241           }
2242           if (ptr != end) {
2243             // reasonable string, get rid of newline and keep the rest
2244             char* nl = strchr(buf, '\n');
2245             if (nl != NULL) *nl = '\0';
2246             strncpy(cpuinfo, ptr, length);
2247             fclose(fp);
2248             return;
2249           }
2250         }
2251       }
2252     }
2253     fclose(fp);
2254   }
  // cpuinfo not found or parsing failed; just use a generic string. The entire
  // /proc/cpuinfo file will be printed later in the output (or enough of it for x86).
2257 #if   defined(AARCH64)
2258   strncpy(cpuinfo, "AArch64", length);
2259 #elif defined(AMD64)
2260   strncpy(cpuinfo, "x86_64", length);
2261 #elif defined(ARM)  // Order wrt. AARCH64 is relevant!
2262   strncpy(cpuinfo, "ARM", length);
2263 #elif defined(IA32)
2264   strncpy(cpuinfo, "x86_32", length);
2265 #elif defined(IA64)
2266   strncpy(cpuinfo, "IA64", length);
2267 #elif defined(PPC)
2268   strncpy(cpuinfo, "PPC64", length);
2269 #elif defined(S390)
2270   strncpy(cpuinfo, "S390", length);
2271 #elif defined(SPARC)
2272   strncpy(cpuinfo, "sparcv9", length);
2273 #elif defined(ZERO_LIBARCH)
2274   strncpy(cpuinfo, ZERO_LIBARCH, length);
2275 #else
2276   strncpy(cpuinfo, "unknown", length);
2277 #endif
2278 }
2279 
2280 static void print_signal_handler(outputStream* st, int sig,
2281                                  char* buf, size_t buflen);
2282 
2283 void os::print_signal_handlers(outputStream* st, char* buf, size_t buflen) {
2284   st->print_cr("Signal Handlers:");
2285   print_signal_handler(st, SIGSEGV, buf, buflen);
2286   print_signal_handler(st, SIGBUS , buf, buflen);
2287   print_signal_handler(st, SIGFPE , buf, buflen);
2288   print_signal_handler(st, SIGPIPE, buf, buflen);
2289   print_signal_handler(st, SIGXFSZ, buf, buflen);
2290   print_signal_handler(st, SIGILL , buf, buflen);
2291   print_signal_handler(st, SR_signum, buf, buflen);
2292   print_signal_handler(st, SHUTDOWN1_SIGNAL, buf, buflen);
2293   print_signal_handler(st, SHUTDOWN2_SIGNAL , buf, buflen);
2294   print_signal_handler(st, SHUTDOWN3_SIGNAL , buf, buflen);
2295   print_signal_handler(st, BREAK_SIGNAL, buf, buflen);
2296 #if defined(PPC64)
2297   print_signal_handler(st, SIGTRAP, buf, buflen);
2298 #endif
2299 }
2300 
2301 static char saved_jvm_path[MAXPATHLEN] = {0};
2302 
2303 // Find the full path to the current module, libjvm.so
2304 void os::jvm_path(char *buf, jint buflen) {
2305   // Error checking.
2306   if (buflen < MAXPATHLEN) {
2307     assert(false, "must use a large-enough buffer");
2308     buf[0] = '\0';
2309     return;
2310   }
2311   // Lazy resolve the path to current module.
2312   if (saved_jvm_path[0] != 0) {
2313     strcpy(buf, saved_jvm_path);
2314     return;
2315   }
2316 
2317   char dli_fname[MAXPATHLEN];
2318   bool ret = dll_address_to_library_name(
2319                                          CAST_FROM_FN_PTR(address, os::jvm_path),
2320                                          dli_fname, sizeof(dli_fname), NULL);
2321   assert(ret, "cannot locate libjvm");
2322   char *rp = NULL;
2323   if (ret && dli_fname[0] != '\0') {
2324     rp = realpath(dli_fname, buf);
2325   }
2326   if (rp == NULL) {
2327     return;
2328   }
2329 
2330   if (Arguments::sun_java_launcher_is_altjvm()) {
2331     // Support for the java launcher's '-XXaltjvm=<path>' option. Typical
2332     // value for buf is "<JAVA_HOME>/jre/lib/<vmtype>/libjvm.so".
2333     // If "/jre/lib/" appears at the right place in the string, then
2334     // assume we are installed in a JDK and we're done. Otherwise, check
2335     // for a JAVA_HOME environment variable and fix up the path so it
2336     // looks like libjvm.so is installed there (append a fake suffix
2337     // hotspot/libjvm.so).
2338     const char *p = buf + strlen(buf) - 1;
2339     for (int count = 0; p > buf && count < 5; ++count) {
2340       for (--p; p > buf && *p != '/'; --p)
2341         /* empty */ ;
2342     }
2343 
2344     if (strncmp(p, "/jre/lib/", 9) != 0) {
2345       // Look for JAVA_HOME in the environment.
2346       char* java_home_var = ::getenv("JAVA_HOME");
2347       if (java_home_var != NULL && java_home_var[0] != 0) {
2348         char* jrelib_p;
2349         int len;
2350 
2351         // Check the current module name "libjvm.so".
2352         p = strrchr(buf, '/');
2353         if (p == NULL) {
2354           return;
2355         }
2356         assert(strstr(p, "/libjvm") == p, "invalid library name");
2357 
2358         rp = realpath(java_home_var, buf);
2359         if (rp == NULL) {
2360           return;
2361         }
2362 
2363         // determine if this is a legacy image or modules image
2364         // modules image doesn't have "jre" subdirectory
2365         len = strlen(buf);
2366         assert(len < buflen, "Ran out of buffer room");
2367         jrelib_p = buf + len;
2368         snprintf(jrelib_p, buflen-len, "/jre/lib");
2369         if (0 != access(buf, F_OK)) {
2370           snprintf(jrelib_p, buflen-len, "/lib");
2371         }
2372 
2373         if (0 == access(buf, F_OK)) {
2374           // Use current module name "libjvm.so"
2375           len = strlen(buf);
2376           snprintf(buf + len, buflen-len, "/hotspot/libjvm.so");
2377         } else {
2378           // Go back to path of .so
2379           rp = realpath(dli_fname, buf);
2380           if (rp == NULL) {
2381             return;
2382           }
2383         }
2384       }
2385     }
2386   }
2387 
2388   strncpy(saved_jvm_path, buf, MAXPATHLEN);
2389   saved_jvm_path[MAXPATHLEN - 1] = '\0';
2390 }
2391 
2392 void os::print_jni_name_prefix_on(outputStream* st, int args_size) {
2393   // no prefix required, not even "_"
2394 }
2395 
2396 void os::print_jni_name_suffix_on(outputStream* st, int args_size) {
2397   // no suffix required
2398 }
2399 
2400 ////////////////////////////////////////////////////////////////////////////////
2401 // sun.misc.Signal support
2402 
2403 static volatile jint sigint_count = 0;
2404 
2405 static void UserHandler(int sig, void *siginfo, void *context) {
2406   // 4511530 - sem_post is serialized and handled by the manager thread. When
2407   // the program is interrupted by Ctrl-C, SIGINT is sent to every thread. We
2408   // don't want to flood the manager thread with sem_post requests.
2409   if (sig == SIGINT && Atomic::add(1, &sigint_count) > 1) {
2410     return;
2411   }
2412 
  // If Ctrl-C is pressed during error reporting, it is likely because the
  // error handler failed to abort. Let the VM die immediately.
2415   if (sig == SIGINT && is_error_reported()) {
2416     os::die();
2417   }
2418 
2419   os::signal_notify(sig);
2420 }
2421 
2422 void* os::user_handler() {
2423   return CAST_FROM_FN_PTR(void*, UserHandler);
2424 }
2425 
2426 struct timespec PosixSemaphore::create_timespec(unsigned int sec, int nsec) {
2427   struct timespec ts;
  // Semaphores are always associated with CLOCK_REALTIME
2429   os::Linux::clock_gettime(CLOCK_REALTIME, &ts);
2430   // see unpackTime for discussion on overflow checking
2431   if (sec >= MAX_SECS) {
2432     ts.tv_sec += MAX_SECS;
2433     ts.tv_nsec = 0;
2434   } else {
2435     ts.tv_sec += sec;
2436     ts.tv_nsec += nsec;
2437     if (ts.tv_nsec >= NANOSECS_PER_SEC) {
2438       ts.tv_nsec -= NANOSECS_PER_SEC;
      ++ts.tv_sec; // note: this must be <= MAX_SECS
2440     }
2441   }
2442 
2443   return ts;
2444 }
2445 
2446 extern "C" {
2447   typedef void (*sa_handler_t)(int);
2448   typedef void (*sa_sigaction_t)(int, siginfo_t *, void *);
2449 }
2450 
2451 void* os::signal(int signal_number, void* handler) {
2452   struct sigaction sigAct, oldSigAct;
2453 
2454   sigfillset(&(sigAct.sa_mask));
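  // SA_SIGINFO: pass siginfo/context to the handler; SA_RESTART: restart
  // interruptible syscalls rather than failing them with EINTR.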
2455   sigAct.sa_flags   = SA_RESTART|SA_SIGINFO;
2456   sigAct.sa_handler = CAST_TO_FN_PTR(sa_handler_t, handler);
2457 
2458   if (sigaction(signal_number, &sigAct, &oldSigAct)) {
2459     // -1 means registration failed
2460     return (void *)-1;
2461   }
2462 
2463   return CAST_FROM_FN_PTR(void*, oldSigAct.sa_handler);
2464 }
2465 
2466 void os::signal_raise(int signal_number) {
2467   ::raise(signal_number);
2468 }
2469 
// The following code was moved from os.cpp to make it platform specific,
// which it is by its very nature.
2472 
2473 // Will be modified when max signal is changed to be dynamic
2474 int os::sigexitnum_pd() {
2475   return NSIG;
2476 }
2477 
2478 // a counter for each possible signal value
2479 static volatile jint pending_signals[NSIG+1] = { 0 };
2480 
// Linux (POSIX) specific handshaking semaphore.
2482 static sem_t sig_sem;
2483 static PosixSemaphore sr_semaphore;
2484 
2485 void os::signal_init_pd() {
2486   // Initialize signal structures
2487   ::memset((void*)pending_signals, 0, sizeof(pending_signals));
2488 
2489   // Initialize signal semaphore
2490   ::sem_init(&sig_sem, 0, 0);
2491 }
2492 
2493 void os::signal_notify(int sig) {
2494   Atomic::inc(&pending_signals[sig]);
2495   ::sem_post(&sig_sem);
2496 }
2497 
2498 static int check_pending_signals(bool wait) {
2499   Atomic::store(0, &sigint_count);
2500   for (;;) {
2501     for (int i = 0; i < NSIG + 1; i++) {
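      // Atomically claim one pending occurrence of signal i: the cmpxchg
      // succeeds only if no other thread decremented the counter first.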
2502       jint n = pending_signals[i];
2503       if (n > 0 && n == Atomic::cmpxchg(n - 1, &pending_signals[i], n)) {
2504         return i;
2505       }
2506     }
2507     if (!wait) {
2508       return -1;
2509     }
2510     JavaThread *thread = JavaThread::current();
2511     ThreadBlockInVM tbivm(thread);
2512 
2513     bool threadIsSuspended;
2514     do {
2515       thread->set_suspend_equivalent();
2516       // cleared by handle_special_suspend_equivalent_condition() or java_suspend_self()
2517       ::sem_wait(&sig_sem);
2518 
2519       // were we externally suspended while we were waiting?
2520       threadIsSuspended = thread->handle_special_suspend_equivalent_condition();
2521       if (threadIsSuspended) {
2522         // The semaphore has been incremented, but while we were waiting
2523         // another thread suspended us. We don't want to continue running
2524         // while suspended because that would surprise the thread that
2525         // suspended us.
2526         ::sem_post(&sig_sem);
2527 
2528         thread->java_suspend_self();
2529       }
2530     } while (threadIsSuspended);
2531   }
2532 }
2533 
2534 int os::signal_lookup() {
2535   return check_pending_signals(false);
2536 }
2537 
2538 int os::signal_wait() {
2539   return check_pending_signals(true);
2540 }
2541 
2542 ////////////////////////////////////////////////////////////////////////////////
2543 // Virtual Memory
2544 
2545 int os::vm_page_size() {
2546   // Seems redundant as all get out
2547   assert(os::Linux::page_size() != -1, "must call os::init");
2548   return os::Linux::page_size();
2549 }
2550 
// Linux allocates memory by pages, so the allocation granularity is the page size.
2552 int os::vm_allocation_granularity() {
2553   assert(os::Linux::page_size() != -1, "must call os::init");
2554   return os::Linux::page_size();
2555 }
2556 
// Rationale behind this function:
//  current (Mon Apr 25 20:12:18 MSD 2005) oprofile drops samples without an
//  executable mapping for the address (see lookup_dcookie() in the kernel
//  module), thus we cannot get samples for JITted code. Here we create a
//  private executable mapping over the code cache, and then we can use the
//  standard (well, almost, as the mapping can change) way to provide info
//  for the reporting script by storing the timestamp and location of symbols.
2563 void linux_wrap_code(char* base, size_t size) {
2564   static volatile jint cnt = 0;
2565 
2566   if (!UseOprofile) {
2567     return;
2568   }
2569 
2570   char buf[PATH_MAX+1];
2571   int num = Atomic::add(1, &cnt);
2572 
2573   snprintf(buf, sizeof(buf), "%s/hs-vm-%d-%d",
2574            os::get_temp_directory(), os::current_process_id(), num);
2575   unlink(buf);
2576 
2577   int fd = ::open(buf, O_CREAT | O_RDWR, S_IRWXU);
2578 
2579   if (fd != -1) {
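    // Grow the file with lseek + a one-byte write so the fixed mapping of the
    // code cache below has file backing for oprofile to see.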
2580     off_t rv = ::lseek(fd, size-2, SEEK_SET);
2581     if (rv != (off_t)-1) {
2582       if (::write(fd, "", 1) == 1) {
2583         mmap(base, size,
2584              PROT_READ|PROT_WRITE|PROT_EXEC,
2585              MAP_PRIVATE|MAP_FIXED|MAP_NORESERVE, fd, 0);
2586       }
2587     }
2588     ::close(fd);
2589     unlink(buf);
2590   }
2591 }
2592 
2593 static bool recoverable_mmap_error(int err) {
2594   // See if the error is one we can let the caller handle. This
2595   // list of errno values comes from JBS-6843484. I can't find a
2596   // Linux man page that documents this specific set of errno
2597   // values so while this list currently matches Solaris, it may
2598   // change as we gain experience with this failure mode.
2599   switch (err) {
2600   case EBADF:
2601   case EINVAL:
2602   case ENOTSUP:
2603     // let the caller deal with these errors
2604     return true;
2605 
2606   default:
2607     // Any remaining errors on this OS can cause our reserved mapping
2608     // to be lost. That can cause confusion where different data
2609     // structures think they have the same memory mapped. The worst
2610     // scenario is if both the VM and a library think they have the
2611     // same memory mapped.
2612     return false;
2613   }
2614 }
2615 
2616 static void warn_fail_commit_memory(char* addr, size_t size, bool exec,
2617                                     int err) {
2618   warning("INFO: os::commit_memory(" PTR_FORMAT ", " SIZE_FORMAT
2619           ", %d) failed; error='%s' (errno=%d)", p2i(addr), size, exec,
2620           os::strerror(err), err);
2621 }
2622 
2623 static void warn_fail_commit_memory(char* addr, size_t size,
2624                                     size_t alignment_hint, bool exec,
2625                                     int err) {
2626   warning("INFO: os::commit_memory(" PTR_FORMAT ", " SIZE_FORMAT
2627           ", " SIZE_FORMAT ", %d) failed; error='%s' (errno=%d)", p2i(addr), size,
2628           alignment_hint, exec, os::strerror(err), err);
2629 }
2630 
2631 // NOTE: Linux kernel does not really reserve the pages for us.
2632 //       All it does is to check if there are enough free pages
2633 //       left at the time of mmap(). This could be a potential
2634 //       problem.
2635 int os::Linux::commit_memory_impl(char* addr, size_t size, bool exec) {
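  // MAP_FIXED is safe here because commit is only expected to target ranges
  // that were previously reserved, so the overwrite is intentional.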
2636   int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
2637   uintptr_t res = (uintptr_t) ::mmap(addr, size, prot,
2638                                      MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0);
2639   if (res != (uintptr_t) MAP_FAILED) {
2640     if (UseNUMAInterleaving) {
2641       numa_make_global(addr, size);
2642     }
2643     return 0;
2644   }
2645 
2646   int err = errno;  // save errno from mmap() call above
2647 
2648   if (!recoverable_mmap_error(err)) {
2649     warn_fail_commit_memory(addr, size, exec, err);
2650     vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "committing reserved memory.");
2651   }
2652 
2653   return err;
2654 }
2655 
2656 bool os::pd_commit_memory(char* addr, size_t size, bool exec) {
2657   return os::Linux::commit_memory_impl(addr, size, exec) == 0;
2658 }
2659 
2660 void os::pd_commit_memory_or_exit(char* addr, size_t size, bool exec,
2661                                   const char* mesg) {
2662   assert(mesg != NULL, "mesg must be specified");
2663   int err = os::Linux::commit_memory_impl(addr, size, exec);
2664   if (err != 0) {
2665     // the caller wants all commit errors to exit with the specified mesg:
2666     warn_fail_commit_memory(addr, size, exec, err);
2667     vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "%s", mesg);
2668   }
2669 }
2670 
2671 // Define MAP_HUGETLB here so we can build HotSpot on old systems.
2672 #ifndef MAP_HUGETLB
2673   #define MAP_HUGETLB 0x40000
2674 #endif
2675 
2676 // Define MADV_HUGEPAGE here so we can build HotSpot on old systems.
2677 #ifndef MADV_HUGEPAGE
2678   #define MADV_HUGEPAGE 14
2679 #endif
2680 
2681 int os::Linux::commit_memory_impl(char* addr, size_t size,
2682                                   size_t alignment_hint, bool exec) {
2683   int err = os::Linux::commit_memory_impl(addr, size, exec);
2684   if (err == 0) {
2685     realign_memory(addr, size, alignment_hint);
2686   }
2687   return err;
2688 }
2689 
2690 bool os::pd_commit_memory(char* addr, size_t size, size_t alignment_hint,
2691                           bool exec) {
2692   return os::Linux::commit_memory_impl(addr, size, alignment_hint, exec) == 0;
2693 }
2694 
2695 void os::pd_commit_memory_or_exit(char* addr, size_t size,
2696                                   size_t alignment_hint, bool exec,
2697                                   const char* mesg) {
2698   assert(mesg != NULL, "mesg must be specified");
2699   int err = os::Linux::commit_memory_impl(addr, size, alignment_hint, exec);
2700   if (err != 0) {
2701     // the caller wants all commit errors to exit with the specified mesg:
2702     warn_fail_commit_memory(addr, size, alignment_hint, exec, err);
2703     vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "%s", mesg);
2704   }
2705 }
2706 
2707 void os::pd_realign_memory(char *addr, size_t bytes, size_t alignment_hint) {
2708   if (UseTransparentHugePages && alignment_hint > (size_t)vm_page_size()) {
2709     // We don't check the return value: madvise(MADV_HUGEPAGE) may not
2710     // be supported or the memory may already be backed by huge pages.
2711     ::madvise(addr, bytes, MADV_HUGEPAGE);
2712   }
2713 }
2714 
2715 void os::pd_free_memory(char *addr, size_t bytes, size_t alignment_hint) {
2716   // This method works by doing an mmap over an existing mmaping and effectively discarding
2717   // the existing pages. However it won't work for SHM-based large pages that cannot be
2718   // uncommitted at all. We don't do anything in this case to avoid creating a segment with
2719   // small pages on top of the SHM segment. This method always works for small pages, so we
2720   // allow that in any case.
2721   if (alignment_hint <= (size_t)os::vm_page_size() || can_commit_large_page_memory()) {
2722     commit_memory(addr, bytes, alignment_hint, !ExecMem);
2723   }
2724 }
2725 
2726 void os::numa_make_global(char *addr, size_t bytes) {
2727   Linux::numa_interleave_memory(addr, bytes);
2728 }
2729 
2730 // Define for numa_set_bind_policy(int). Setting the argument to 0 will set the
2731 // bind policy to MPOL_PREFERRED for the current thread.
2732 #define USE_MPOL_PREFERRED 0
2733 
2734 void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) {
2735   // To make NUMA and large pages more robust when both enabled, we need to ease
2736   // the requirements on where the memory should be allocated. MPOL_BIND is the
2737   // default policy and it will force memory to be allocated on the specified
2738   // node. Changing this to MPOL_PREFERRED will prefer to allocate the memory on
2739   // the specified node, but will not force it. Using this policy will prevent
2740   // getting SIGBUS when trying to allocate large pages on NUMA nodes with no
2741   // free large pages.
2742   Linux::numa_set_bind_policy(USE_MPOL_PREFERRED);
2743   Linux::numa_tonode_memory(addr, bytes, lgrp_hint);
2744 }
2745 
2746 bool os::numa_topology_changed() { return false; }
2747 
2748 size_t os::numa_get_groups_num() {
2749   int max_node = Linux::numa_max_node();
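  // numa_max_node() returns the highest node number available, so the node
  // count is max_node + 1.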
2750   return max_node > 0 ? max_node + 1 : 1;
2751 }
2752 
2753 int os::numa_get_group_id() {
2754   int cpu_id = Linux::sched_getcpu();
2755   if (cpu_id != -1) {
2756     int lgrp_id = Linux::get_node_by_cpu(cpu_id);
2757     if (lgrp_id != -1) {
2758       return lgrp_id;
2759     }
2760   }
2761   return 0;
2762 }
2763 
2764 size_t os::numa_get_leaf_groups(int *ids, size_t size) {
2765   for (size_t i = 0; i < size; i++) {
2766     ids[i] = i;
2767   }
2768   return size;
2769 }
2770 
2771 bool os::get_page_info(char *start, page_info* info) {
2772   return false;
2773 }
2774 
2775 char *os::scan_pages(char *start, char* end, page_info* page_expected,
2776                      page_info* page_found) {
2777   return end;
2778 }
2779 
2780 
2781 int os::Linux::sched_getcpu_syscall(void) {
2782   unsigned int cpu = 0;
2783   int retval = -1;
2784 
2785 #if defined(IA32)
2786   #ifndef SYS_getcpu
2787     #define SYS_getcpu 318
2788   #endif
2789   retval = syscall(SYS_getcpu, &cpu, NULL, NULL);
2790 #elif defined(AMD64)
2791 // Unfortunately we have to bring all these macros here from vsyscall.h
2792 // to be able to compile on old linuxes.
2793   #define __NR_vgetcpu 2
2794   #define VSYSCALL_START (-10UL << 20)
2795   #define VSYSCALL_SIZE 1024
2796   #define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr))
2797   typedef long (*vgetcpu_t)(unsigned int *cpu, unsigned int *node, unsigned long *tcache);
2798   vgetcpu_t vgetcpu = (vgetcpu_t)VSYSCALL_ADDR(__NR_vgetcpu);
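  // The legacy x86_64 vsyscall page maps vgetcpu at this fixed address in
  // every process.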
2799   retval = vgetcpu(&cpu, NULL, NULL);
2800 #endif
2801 
2802   return (retval == -1) ? retval : cpu;
2803 }
2804 
// libnuma reports warnings and errors through these hooks, which an application
// may override; provide empty implementations to suppress libnuma's own output.
2806 extern "C" JNIEXPORT void numa_warn(int number, char *where, ...) { }
2807 extern "C" JNIEXPORT void numa_error(char *where) { }
2808 
2809 
// If we are running with a libnuma version > 2, we should try to use symbols
// with version 1.1. If we are running with an earlier version, which did not
// have symbol versions, we should use the base versions.
2814 void* os::Linux::libnuma_dlsym(void* handle, const char *name) {
2815   typedef void* (*dlvsym_func_type)(void* handle, const char* name, const char* version);
2816   static dlvsym_func_type dlvsym_func;
2817   static bool initialized = false;
2818 
2819   if (!initialized) {
2820     dlvsym_func = (dlvsym_func_type)dlsym(RTLD_NEXT, "dlvsym");
2821     initialized = true;
2822   }
2823 
2824   if (dlvsym_func != NULL) {
2825     void *f = dlvsym_func(handle, name, "libnuma_1.1");
2826     if (f != NULL) {
2827       return f;
2828     }
2829   }
2830 
2831   return dlsym(handle, name);
2832 }
2833 
2834 bool os::Linux::libnuma_init() {
2835   // sched_getcpu() should be in libc.
2836   set_sched_getcpu(CAST_TO_FN_PTR(sched_getcpu_func_t,
2837                                   dlsym(RTLD_DEFAULT, "sched_getcpu")));
2838 
2839   // If it's not, try a direct syscall.
2840   if (sched_getcpu() == -1) {
2841     set_sched_getcpu(CAST_TO_FN_PTR(sched_getcpu_func_t,
2842                                     (void*)&sched_getcpu_syscall));
2843   }
2844 
2845   if (sched_getcpu() != -1) { // Does it work?
2846     void *handle = dlopen("libnuma.so.1", RTLD_LAZY);
2847     if (handle != NULL) {
2848       set_numa_node_to_cpus(CAST_TO_FN_PTR(numa_node_to_cpus_func_t,
2849                                            libnuma_dlsym(handle, "numa_node_to_cpus")));
2850       set_numa_max_node(CAST_TO_FN_PTR(numa_max_node_func_t,
2851                                        libnuma_dlsym(handle, "numa_max_node")));
2852       set_numa_available(CAST_TO_FN_PTR(numa_available_func_t,
2853                                         libnuma_dlsym(handle, "numa_available")));
2854       set_numa_tonode_memory(CAST_TO_FN_PTR(numa_tonode_memory_func_t,
2855                                             libnuma_dlsym(handle, "numa_tonode_memory")));
2856       set_numa_interleave_memory(CAST_TO_FN_PTR(numa_interleave_memory_func_t,
2857                                                 libnuma_dlsym(handle, "numa_interleave_memory")));
2858       set_numa_set_bind_policy(CAST_TO_FN_PTR(numa_set_bind_policy_func_t,
2859                                               libnuma_dlsym(handle, "numa_set_bind_policy")));
2860 
2861 
2862       if (numa_available() != -1) {
2863         set_numa_all_nodes((unsigned long*)libnuma_dlsym(handle, "numa_all_nodes"));
2864         // Create a cpu -> node mapping
2865         _cpu_to_node = new (ResourceObj::C_HEAP, mtInternal) GrowableArray<int>(0, true);
2866         rebuild_cpu_to_node_map();
2867         return true;
2868       }
2869     }
2870   }
2871   return false;
2872 }
2873 
2874 size_t os::Linux::default_guard_size(os::ThreadType thr_type) {
  // Creating a guard page is very expensive. Java threads have HotSpot
  // guard pages, so only enable the glibc guard page for non-Java threads.
  // (Remember: a compiler thread is a Java thread, too!)
2878   return ((thr_type == java_thread || thr_type == compiler_thread) ? 0 : page_size());
2879 }
2880 
// rebuild_cpu_to_node_map() constructs a table mapping cpu id to node id.
2882 // The table is later used in get_node_by_cpu().
2883 void os::Linux::rebuild_cpu_to_node_map() {
  // The buffer size computation in libnuma is very obscure (possible values
  // start at 16 and continue up with every other power of 2, but stay below
  // the maximum number of CPUs supported by the kernel) and is subject to
  // change (in libnuma version 2 the requirements are more reasonable), so
  // we just hardcode the number the library uses.
  const size_t NCPUS = 32768;
2891   const size_t BitsPerCLong = sizeof(long) * CHAR_BIT;
2892 
2893   size_t cpu_num = processor_count();
2894   size_t cpu_map_size = NCPUS / BitsPerCLong;
2895   size_t cpu_map_valid_size =
2896     MIN2((cpu_num + BitsPerCLong - 1) / BitsPerCLong, cpu_map_size);
2897 
2898   cpu_to_node()->clear();
2899   cpu_to_node()->at_grow(cpu_num - 1);
2900   size_t node_num = numa_get_groups_num();
2901 
2902   unsigned long *cpu_map = NEW_C_HEAP_ARRAY(unsigned long, cpu_map_size, mtInternal);
2903   for (size_t i = 0; i < node_num; i++) {
2904     if (numa_node_to_cpus(i, cpu_map, cpu_map_size * sizeof(unsigned long)) != -1) {
2905       for (size_t j = 0; j < cpu_map_valid_size; j++) {
2906         if (cpu_map[j] != 0) {
2907           for (size_t k = 0; k < BitsPerCLong; k++) {
2908             if (cpu_map[j] & (1UL << k)) {
2909               cpu_to_node()->at_put(j * BitsPerCLong + k, i);
2910             }
2911           }
2912         }
2913       }
2914     }
2915   }
2916   FREE_C_HEAP_ARRAY(unsigned long, cpu_map);
2917 }
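
// Illustrative example (not VM code, values assumed): with BitsPerCLong == 64,
// if numa_node_to_cpus() filled cpu_map[0] with 0x5 (binary ...0101) for
// node 0, bits 0 and 2 are set, and the loop above records
//   cpu_to_node()->at_put(0 * BitsPerCLong + 0, 0);  // CPU 0 -> node 0
//   cpu_to_node()->at_put(0 * BitsPerCLong + 2, 0);  // CPU 2 -> node 0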
2918 
2919 int os::Linux::get_node_by_cpu(int cpu_id) {
2920   if (cpu_to_node() != NULL && cpu_id >= 0 && cpu_id < cpu_to_node()->length()) {
2921     return cpu_to_node()->at(cpu_id);
2922   }
2923   return -1;
2924 }
2925 
2926 GrowableArray<int>* os::Linux::_cpu_to_node;
2927 os::Linux::sched_getcpu_func_t os::Linux::_sched_getcpu;
2928 os::Linux::numa_node_to_cpus_func_t os::Linux::_numa_node_to_cpus;
2929 os::Linux::numa_max_node_func_t os::Linux::_numa_max_node;
2930 os::Linux::numa_available_func_t os::Linux::_numa_available;
2931 os::Linux::numa_tonode_memory_func_t os::Linux::_numa_tonode_memory;
2932 os::Linux::numa_interleave_memory_func_t os::Linux::_numa_interleave_memory;
2933 os::Linux::numa_set_bind_policy_func_t os::Linux::_numa_set_bind_policy;
2934 unsigned long* os::Linux::_numa_all_nodes;
2935 
2936 bool os::pd_uncommit_memory(char* addr, size_t size) {
2937   uintptr_t res = (uintptr_t) ::mmap(addr, size, PROT_NONE,
2938                                      MAP_PRIVATE|MAP_FIXED|MAP_NORESERVE|MAP_ANONYMOUS, -1, 0);
2939   return res != (uintptr_t) MAP_FAILED;
2940 }
2941 
2942 static address get_stack_committed_bottom(address bottom, size_t size) {
2943   address nbot = bottom;
2944   address ntop = bottom + size;
2945 
2946   size_t page_sz = os::vm_page_size();
2947   unsigned pages = size / page_sz;
2948 
2949   unsigned char vec[1];
2950   unsigned imin = 1, imax = pages + 1, imid;
2951   int mincore_return_value = 0;
2952 
2953   assert(imin <= imax, "Unexpected page size");
2954 
2955   while (imin < imax) {
2956     imid = (imax + imin) / 2;
2957     nbot = ntop - (imid * page_sz);
2958 
2959     // Use a trick with mincore to check whether the page is mapped or not.
2960     // mincore sets vec to 1 if the page resides in memory and to 0 if it
2961     // is swapped out, but if the page we are asking about is unmapped,
2962     // it fails with -1 and errno == ENOMEM.
2963     mincore_return_value = mincore(nbot, page_sz, vec);
2964 
2965     if (mincore_return_value == -1) {
2966       // Page is not mapped; go up
2967       // to find the first mapped page.
2968       if (errno != EAGAIN) {
2969         assert(errno == ENOMEM, "Unexpected mincore errno");
2970         imax = imid;
2971       }
2972     } else {
2973       // Page is mapped; go down
2974       // to find the first unmapped page.
2975       imin = imid + 1;
2976     }
2977   }
2978 
2979   nbot = nbot + page_sz;
2980 
2981   // Adjust stack bottom one page up if last checked page is not mapped
2982   if (mincore_return_value == -1) {
2983     nbot = nbot + page_sz;
2984   }
2985 
2986   return nbot;
2987 }
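
// A minimal sketch of the mincore() probe used above, for a single page of
// size page_sz at a hypothetical address page_addr (illustrative, not VM
// code):
//
//   unsigned char vec[1];
//   if (mincore(page_addr, page_sz, vec) == -1 && errno == ENOMEM) {
//     // the range is not mapped at all
//   } else {
//     // a mapping exists; vec[0] & 1 would indicate residency, which
//     // the binary search above does not need to inspect
//   }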
2988 
2989 
2990 // Linux uses a growable mapping for the stack, and if the mapping for
2991 // the stack guard pages is not removed when we detach a thread the
2992 // stack cannot grow beyond the pages where the stack guard was
2993 // mapped.  If at some point later in the process the stack expands to
2994 // that point, the Linux kernel cannot expand the stack any further
2995 // because the guard pages are in the way, and a segfault occurs.
2996 //
2997 // However, it's essential not to split the stack region by unmapping
2998 // a region (leaving a hole) that's already part of the stack mapping,
2999 // so if the stack mapping has already grown beyond the guard pages at
3000 // the time we create them, we have to truncate the stack mapping.
3001 // So, we need to know the extent of the stack mapping when
3002 // create_stack_guard_pages() is called.
3003 
3004 // We only need this for stacks that are growable: at the time of
3005 // writing thread stacks don't use growable mappings (i.e. those
3006 // created with MAP_GROWSDOWN), and aren't marked "[stack]", so this
3007 // only applies to the main thread.
3008 
3009 // If the (growable) stack mapping already extends beyond the point
3010 // where we're going to put our guard pages, truncate the mapping at
3011 // that point by munmap()ping it.  This ensures that when we later
3012 // munmap() the guard pages we don't leave a hole in the stack
3013 // mapping. This only affects the main/initial thread.
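//
// A rough sketch of the main-thread case handled below (high addresses at
// the top; names refer to pd_create_stack_guard_pages()):
//
//   +------------------+  <- stack top
//   |   thread stack   |
//   +------------------+  <- addr: guard pages go at [addr, addr + size)
//   |   stack already  |
//   |  grown past addr |
//   +------------------+  <- stack_extent (committed bottom); when
//                            stack_extent < addr, [stack_extent, addr) is
//                            munmap()ed before the guard pages are committed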
3014 
3015 bool os::pd_create_stack_guard_pages(char* addr, size_t size) {
3016   if (os::Linux::is_initial_thread()) {
3017     // As we manually grow the stack up to bottom inside create_attached_thread(),
3018     // it's likely that os::Linux::initial_thread_stack_bottom is mapped and
3019     // we don't need to do anything special.
3020     // Check that first, before calling the heavy function.
3021     uintptr_t stack_extent = (uintptr_t) os::Linux::initial_thread_stack_bottom();
3022     unsigned char vec[1];
3023 
3024     if (mincore((address)stack_extent, os::vm_page_size(), vec) == -1) {
3025       // Fallback to slow path on all errors, including EAGAIN
3026       stack_extent = (uintptr_t) get_stack_committed_bottom(
3027                                                              os::Linux::initial_thread_stack_bottom(),
3028                                                              (size_t)addr - stack_extent);
3029     }
3030 
3031     if (stack_extent < (uintptr_t)addr) {
3032       ::munmap((void*)stack_extent, (uintptr_t)(addr - stack_extent));
3033     }
3034   }
3035 
3036   return os::commit_memory(addr, size, !ExecMem);
3037 }
3038 
3039 // If this is a growable mapping, remove the guard pages entirely by
3040 // munmap()ping them.  If not, just call uncommit_memory(). This only
3041 // affects the main/initial thread, but guard against future OS changes.
3042 // It's safe to always unmap guard pages for the initial thread because we
3043 // always place them right after the end of the mapped region.
3044 
3045 bool os::remove_stack_guard_pages(char* addr, size_t size) {
3048   if (os::Linux::is_initial_thread()) {
3049     return ::munmap(addr, size) == 0;
3050   }
3051 
3052   return os::uncommit_memory(addr, size);
3053 }
3054 
3055 // If 'fixed' is true, anon_mmap() will attempt to reserve anonymous memory
3056 // at 'requested_addr'. If there are existing memory mappings at the same
3057 // location, however, they will be overwritten. If 'fixed' is false,
3058 // 'requested_addr' is only treated as a hint, the return value may or
3059 // may not start from the requested address. Unlike Linux mmap(), this
3060 // function returns NULL to indicate failure.
3061 static char* anon_mmap(char* requested_addr, size_t bytes, bool fixed) {
3062   char * addr;
3063   int flags;
3064 
3065   flags = MAP_PRIVATE | MAP_NORESERVE | MAP_ANONYMOUS;
3066   if (fixed) {
3067     assert((uintptr_t)requested_addr % os::Linux::page_size() == 0, "unaligned address");
3068     flags |= MAP_FIXED;
3069   }
3070 
3071   // Map reserved/uncommitted pages PROT_NONE so we fail early if we
3072   // touch an uncommitted page. Otherwise, the read/write might
3073   // succeed if we have enough swap space to back the physical page.
3074   addr = (char*)::mmap(requested_addr, bytes, PROT_NONE,
3075                        flags, -1, 0);
3076 
3077   return addr == MAP_FAILED ? NULL : addr;
3078 }
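
// Example use of anon_mmap() (illustrative only): reserve 1M at a
// kernel-chosen address, then release it again:
//
//   char* p = anon_mmap(NULL, 1 * M, false /* not fixed */);
//   if (p != NULL) {
//     ::munmap(p, 1 * M);
//   }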
3079 
3080 // Allocate (using mmap, NO_RESERVE, with small pages) at either a given request address
3081 //   (req_addr != NULL) or with a given alignment.
3082 //  - bytes shall be a multiple of alignment.
3083 //  - req_addr can be NULL. If not NULL, it must be a multiple of alignment.
3084 //  - alignment sets the alignment at which memory shall be allocated.
3085 //     It must be a multiple of allocation granularity.
3086 // Returns address of memory or NULL. If req_addr was not NULL, will only return
3087 //  req_addr or NULL.
3088 static char* anon_mmap_aligned(size_t bytes, size_t alignment, char* req_addr) {
3089 
3090   size_t extra_size = bytes;
3091   if (req_addr == NULL && alignment > 0) {
3092     extra_size += alignment;
3093   }
3094 
3095   char* start = (char*) ::mmap(req_addr, extra_size, PROT_NONE,
3096     MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE,
3097     -1, 0);
3098   if (start == MAP_FAILED) {
3099     start = NULL;
3100   } else {
3101     if (req_addr != NULL) {
3102       if (start != req_addr) {
3103         ::munmap(start, extra_size);
3104         start = NULL;
3105       }
3106     } else {
3107       char* const start_aligned = (char*) align_ptr_up(start, alignment);
3108       char* const end_aligned = start_aligned + bytes;
3109       char* const end = start + extra_size;
3110       if (start_aligned > start) {
3111         ::munmap(start, start_aligned - start);
3112       }
3113       if (end_aligned < end) {
3114         ::munmap(end_aligned, end - end_aligned);
3115       }
3116       start = start_aligned;
3117     }
3118   }
3119   return start;
3120 }
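
// Worked example with illustrative numbers: bytes = 4M, alignment = 2M,
// req_addr = NULL. We over-reserve extra_size = 6M; suppose mmap() returns
// start = 0x7f0000100000. Then
//   start_aligned = align_ptr_up(start, 2M) = 0x7f0000200000
//   end_aligned   = start_aligned + 4M      = 0x7f0000600000
// and the leading 1M [start, start_aligned) plus the trailing 1M
// [end_aligned, start + 6M) are munmap()ed, leaving a 4M region aligned
// to 2M.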
3121 
3122 static int anon_munmap(char * addr, size_t size) {
3123   return ::munmap(addr, size) == 0;
3124 }
3125 
3126 char* os::pd_reserve_memory(size_t bytes, char* requested_addr,
3127                             size_t alignment_hint) {
3128   return anon_mmap(requested_addr, bytes, (requested_addr != NULL));
3129 }
3130 
3131 bool os::pd_release_memory(char* addr, size_t size) {
3132   return anon_munmap(addr, size);
3133 }
3134 
3135 static bool linux_mprotect(char* addr, size_t size, int prot) {
3136   // Linux wants the mprotect address argument to be page aligned.
3137   char* bottom = (char*)align_size_down((intptr_t)addr, os::Linux::page_size());
3138 
3139   // According to SUSv3, mprotect() should only be used with mappings
3140   // established by mmap(), and mmap() always maps whole pages. Unaligned
3141   // 'addr' likely indicates problem in the VM (e.g. trying to change
3142   // protection of malloc'ed or statically allocated memory). Check the
3143   // caller if you hit this assert.
3144   assert(addr == bottom, "sanity check");
3145 
3146   size = align_size_up(pointer_delta(addr, bottom, 1) + size, os::Linux::page_size());
3147   return ::mprotect(bottom, size, prot) == 0;
3148 }
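
// For example, on a system with 4K pages a call with a page-aligned 'addr'
// and size == 5000 rounds the size up to 8192 before calling mprotect(),
// since protection can only be changed for whole pages.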
3149 
3150 // Set protections specified
3151 bool os::protect_memory(char* addr, size_t bytes, ProtType prot,
3152                         bool is_committed) {
3153   unsigned int p = 0;
3154   switch (prot) {
3155   case MEM_PROT_NONE: p = PROT_NONE; break;
3156   case MEM_PROT_READ: p = PROT_READ; break;
3157   case MEM_PROT_RW:   p = PROT_READ|PROT_WRITE; break;
3158   case MEM_PROT_RWX:  p = PROT_READ|PROT_WRITE|PROT_EXEC; break;
3159   default:
3160     ShouldNotReachHere();
3161   }
3162   // is_committed is unused.
3163   return linux_mprotect(addr, bytes, p);
3164 }
3165 
3166 bool os::guard_memory(char* addr, size_t size) {
3167   return linux_mprotect(addr, size, PROT_NONE);
3168 }
3169 
3170 bool os::unguard_memory(char* addr, size_t size) {
3171   return linux_mprotect(addr, size, PROT_READ|PROT_WRITE);
3172 }
3173 
3174 bool os::Linux::transparent_huge_pages_sanity_check(bool warn,
3175                                                     size_t page_size) {
3176   bool result = false;
3177   void *p = mmap(NULL, page_size * 2, PROT_READ|PROT_WRITE,
3178                  MAP_ANONYMOUS|MAP_PRIVATE,
3179                  -1, 0);
3180   if (p != MAP_FAILED) {
3181     void *aligned_p = align_ptr_up(p, page_size);
3182 
3183     result = madvise(aligned_p, page_size, MADV_HUGEPAGE) == 0;
3184 
3185     munmap(p, page_size * 2);
3186   }
3187 
3188   if (warn && !result) {
3189     warning("TransparentHugePages is not supported by the operating system.");
3190   }
3191 
3192   return result;
3193 }
3194 
3195 bool os::Linux::hugetlbfs_sanity_check(bool warn, size_t page_size) {
3196   bool result = false;
3197   void *p = mmap(NULL, page_size, PROT_READ|PROT_WRITE,
3198                  MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB,
3199                  -1, 0);
3200 
3201   if (p != MAP_FAILED) {
3202     // We don't know if this really is a huge page or not.
3203     FILE *fp = fopen("/proc/self/maps", "r");
3204     if (fp) {
3205       while (!feof(fp)) {
3206         char chars[257];
3207         long x = 0;
3208         if (fgets(chars, sizeof(chars), fp)) {
3209           if (sscanf(chars, "%lx-%*x", &x) == 1
3210               && x == (long)p) {
3211             if (strstr(chars, "hugepage")) {
3212               result = true;
3213               break;
3214             }
3215           }
3216         }
3217       }
3218       fclose(fp);
3219     }
3220     munmap(p, page_size);
3221   }
3222 
3223   if (warn && !result) {
3224     warning("HugeTLBFS is not supported by the operating system.");
3225   }
3226 
3227   return result;
3228 }
3229 
3230 // Set the coredump_filter bits to include largepages in core dump (bit 6)
3231 //
3232 // From the coredump_filter documentation:
3233 //
3234 // - (bit 0) anonymous private memory
3235 // - (bit 1) anonymous shared memory
3236 // - (bit 2) file-backed private memory
3237 // - (bit 3) file-backed shared memory
3238 // - (bit 4) ELF header pages in file-backed private memory areas (it is
3239 //           effective only if the bit 2 is cleared)
3240 // - (bit 5) hugetlb private memory
3241 // - (bit 6) hugetlb shared memory
3242 //
3243 static void set_coredump_filter(void) {
3244   FILE *f;
3245   long cdm;
3246 
3247   if ((f = fopen("/proc/self/coredump_filter", "r+")) == NULL) {
3248     return;
3249   }
3250 
3251   if (fscanf(f, "%lx", &cdm) != 1) {
3252     fclose(f);
3253     return;
3254   }
3255 
3256   rewind(f);
3257 
3258   if ((cdm & LARGEPAGES_BIT) == 0) {
3259     cdm |= LARGEPAGES_BIT;
3260     fprintf(f, "%#lx", cdm);
3261   }
3262 
3263   fclose(f);
3264 }
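
// Illustrative values: if /proc/self/coredump_filter currently reads 0x23
// (bits 0, 1 and 5 set), OR-ing in LARGEPAGES_BIT (bit 6, i.e. 0x40)
// rewrites it as 0x63, so hugetlb shared memory is then included in core
// dumps as well.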
3265 
3266 // Large page support
3267 
3268 static size_t _large_page_size = 0;
3269 
3270 size_t os::Linux::find_large_page_size() {
3271   size_t large_page_size = 0;
3272 
3273   // large_page_size on Linux is used to round up heap size. x86 uses either
3274   // a 2M or a 4M page, depending on whether PAE (Physical Address Extensions)
3275   // mode is enabled. AMD64/EM64T uses 2M pages in 64-bit mode. IA64 can use
3276   // pages as large as 256M.
3277   //
3278   // Here we try to figure out page size by parsing /proc/meminfo and looking
3279   // for a line with the following format:
3280   //    Hugepagesize:     2048 kB
3281   //
3282   // If we can't determine the value (e.g. /proc is not mounted, or the text
3283   // format has been changed), we'll use the largest page size supported by
3284   // the processor.
3285 
3286 #ifndef ZERO
3287   large_page_size =
3288     AARCH64_ONLY(2 * M)
3289     AMD64_ONLY(2 * M)
3290     ARM32_ONLY(2 * M)
3291     IA32_ONLY(4 * M)
3292     IA64_ONLY(256 * M)
3293     PPC_ONLY(4 * M)
3294     S390_ONLY(1 * M)
3295     SPARC_ONLY(4 * M);
3296 #endif // ZERO
3297 
3298   FILE *fp = fopen("/proc/meminfo", "r");
3299   if (fp) {
3300     while (!feof(fp)) {
3301       int x = 0;
3302       char buf[16];
3303       if (fscanf(fp, "Hugepagesize: %d", &x) == 1) {
3304         if (x && fgets(buf, sizeof(buf), fp) && strcmp(buf, " kB\n") == 0) {
3305           large_page_size = x * K;
3306           break;
3307         }
3308       } else {
3309         // skip to next line
3310         for (;;) {
3311           int ch = fgetc(fp);
3312           if (ch == EOF || ch == (int)'\n') break;
3313         }
3314       }
3315     }
3316     fclose(fp);
3317   }
3318 
3319   if (!FLAG_IS_DEFAULT(LargePageSizeInBytes) && LargePageSizeInBytes != large_page_size) {
3320     warning("Setting LargePageSizeInBytes has no effect on this OS. Large page size is "
3321             SIZE_FORMAT "%s.", byte_size_in_proper_unit(large_page_size),
3322             proper_unit_for_byte_size(large_page_size));
3323   }
3324 
3325   return large_page_size;
3326 }
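
// For instance, with the typical x86_64 /proc/meminfo entry
//   Hugepagesize:     2048 kB
// the fscanf() above yields x == 2048, the trailing " kB\n" matches, and
// find_large_page_size() returns 2048 * K == 2M.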
3327 
3328 size_t os::Linux::setup_large_page_size() {
3329   _large_page_size = Linux::find_large_page_size();
3330   const size_t default_page_size = (size_t)Linux::page_size();
3331   if (_large_page_size > default_page_size) {
3332     _page_sizes[0] = _large_page_size;
3333     _page_sizes[1] = default_page_size;
3334     _page_sizes[2] = 0;
3335   }
3336 
3337   return _large_page_size;
3338 }
3339 
3340 bool os::Linux::setup_large_page_type(size_t page_size) {
3341   if (FLAG_IS_DEFAULT(UseHugeTLBFS) &&
3342       FLAG_IS_DEFAULT(UseSHM) &&
3343       FLAG_IS_DEFAULT(UseTransparentHugePages)) {
3344 
3345     // The type of large pages has not been specified by the user.
3346 
3347     // Try UseHugeTLBFS and then UseSHM.
3348     UseHugeTLBFS = UseSHM = true;
3349 
3350     // Don't try UseTransparentHugePages since there are known
3351     // performance issues with it turned on. This might change in the future.
3352     UseTransparentHugePages = false;
3353   }
3354 
3355   if (UseTransparentHugePages) {
3356     bool warn_on_failure = !FLAG_IS_DEFAULT(UseTransparentHugePages);
3357     if (transparent_huge_pages_sanity_check(warn_on_failure, page_size)) {
3358       UseHugeTLBFS = false;
3359       UseSHM = false;
3360       return true;
3361     }
3362     UseTransparentHugePages = false;
3363   }
3364 
3365   if (UseHugeTLBFS) {
3366     bool warn_on_failure = !FLAG_IS_DEFAULT(UseHugeTLBFS);
3367     if (hugetlbfs_sanity_check(warn_on_failure, page_size)) {
3368       UseSHM = false;
3369       return true;
3370     }
3371     UseHugeTLBFS = false;
3372   }
3373 
3374   return UseSHM;
3375 }
3376 
3377 void os::large_page_init() {
3378   if (!UseLargePages &&
3379       !UseTransparentHugePages &&
3380       !UseHugeTLBFS &&
3381       !UseSHM) {
3382     // Not using large pages.
3383     return;
3384   }
3385 
3386   if (!FLAG_IS_DEFAULT(UseLargePages) && !UseLargePages) {
3387     // The user explicitly turned off large pages.
3388     // Ignore the rest of the large pages flags.
3389     UseTransparentHugePages = false;
3390     UseHugeTLBFS = false;
3391     UseSHM = false;
3392     return;
3393   }
3394 
3395   size_t large_page_size = Linux::setup_large_page_size();
3396   UseLargePages          = Linux::setup_large_page_type(large_page_size);
3397 
3398   set_coredump_filter();
3399 }
3400 
3401 #ifndef SHM_HUGETLB
3402   #define SHM_HUGETLB 04000
3403 #endif
3404 
3405 #define shm_warning_format(format, ...)              \
3406   do {                                               \
3407     if (UseLargePages &&                             \
3408         (!FLAG_IS_DEFAULT(UseLargePages) ||          \
3409          !FLAG_IS_DEFAULT(UseSHM) ||                 \
3410          !FLAG_IS_DEFAULT(LargePageSizeInBytes))) {  \
3411       warning(format, __VA_ARGS__);                  \
3412     }                                                \
3413   } while (0)
3414 
3415 #define shm_warning(str) shm_warning_format("%s", str)
3416 
3417 #define shm_warning_with_errno(str)                \
3418   do {                                             \
3419     int err = errno;                               \
3420     shm_warning_format(str " (error = %d)", err);  \
3421   } while (0)
3422 
3423 static char* shmat_with_alignment(int shmid, size_t bytes, size_t alignment) {
3424   assert(is_size_aligned(bytes, alignment), "Must be divisible by the alignment");
3425 
3426   if (!is_size_aligned(alignment, SHMLBA)) {
3427     assert(false, "Code below assumes that alignment is at least SHMLBA aligned");
3428     return NULL;
3429   }
3430 
3431   // To ensure that we get 'alignment' aligned memory from shmat,
3432   // we pre-reserve aligned virtual memory and then attach to that.
3433 
3434   char* pre_reserved_addr = anon_mmap_aligned(bytes, alignment, NULL);
3435   if (pre_reserved_addr == NULL) {
3436     // Couldn't pre-reserve aligned memory.
3437     shm_warning("Failed to pre-reserve aligned memory for shmat.");
3438     return NULL;
3439   }
3440 
3441   // SHM_REMAP is needed to allow shmat to map over an existing mapping.
3442   char* addr = (char*)shmat(shmid, pre_reserved_addr, SHM_REMAP);
3443 
3444   if ((intptr_t)addr == -1) {
3445     int err = errno;
3446     shm_warning_with_errno("Failed to attach shared memory.");
3447 
3448     assert(err != EACCES, "Unexpected error");
3449     assert(err != EIDRM,  "Unexpected error");
3450     assert(err != EINVAL, "Unexpected error");
3451 
3452     // Since we don't know if the kernel unmapped the pre-reserved memory area
3453     // we can't unmap it, since that would potentially unmap memory that was
3454     // mapped from other threads.
3455     return NULL;
3456   }
3457 
3458   return addr;
3459 }
3460 
3461 static char* shmat_at_address(int shmid, char* req_addr) {
3462   if (!is_ptr_aligned(req_addr, SHMLBA)) {
3463     assert(false, "Requested address needs to be SHMLBA aligned");
3464     return NULL;
3465   }
3466 
3467   char* addr = (char*)shmat(shmid, req_addr, 0);
3468 
3469   if ((intptr_t)addr == -1) {
3470     shm_warning_with_errno("Failed to attach shared memory.");
3471     return NULL;
3472   }
3473 
3474   return addr;
3475 }
3476 
3477 static char* shmat_large_pages(int shmid, size_t bytes, size_t alignment, char* req_addr) {
3478   // If a req_addr has been provided, we assume that the caller has already aligned the address.
3479   if (req_addr != NULL) {
3480     assert(is_ptr_aligned(req_addr, os::large_page_size()), "Must be divisible by the large page size");
3481     assert(is_ptr_aligned(req_addr, alignment), "Must be divisible by given alignment");
3482     return shmat_at_address(shmid, req_addr);
3483   }
3484 
3485   // Since shmid has been setup with SHM_HUGETLB, shmat will automatically
3486   // return large page size aligned memory addresses when req_addr == NULL.
3487   // However, if the alignment is larger than the large page size, we have
3488   // to manually ensure that the memory returned is 'alignment' aligned.
3489   if (alignment > os::large_page_size()) {
3490     assert(is_size_aligned(alignment, os::large_page_size()), "Must be divisible by the large page size");
3491     return shmat_with_alignment(shmid, bytes, alignment);
3492   } else {
3493     return shmat_at_address(shmid, NULL);
3494   }
3495 }
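
// For example, with a 2M large page size a request with alignment == 1G is
// routed through shmat_with_alignment(), while a request with
// alignment <= 2M attaches at a kernel-chosen, large-page-aligned address
// via shmat_at_address(shmid, NULL).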
3496 
3497 char* os::Linux::reserve_memory_special_shm(size_t bytes, size_t alignment,
3498                                             char* req_addr, bool exec) {
3499   // "exec" is passed in but not used.  Creating the shared image for
3500   // the code cache doesn't have an SHM_X executable permission to check.
3501   assert(UseLargePages && UseSHM, "only for SHM large pages");
3502   assert(is_ptr_aligned(req_addr, os::large_page_size()), "Unaligned address");
3503   assert(is_ptr_aligned(req_addr, alignment), "Unaligned address");
3504 
3505   if (!is_size_aligned(bytes, os::large_page_size())) {
3506     return NULL; // Fallback to small pages.
3507   }
3508 
3509   // Create a large shared memory region to attach to based on size.
3510   // Currently, size is the total size of the heap.
3511   int shmid = shmget(IPC_PRIVATE, bytes, SHM_HUGETLB|IPC_CREAT|SHM_R|SHM_W);
3512   if (shmid == -1) {
3513     // Possible reasons for shmget failure:
3514     // 1. shmmax is too small for Java heap.
3515     //    > check shmmax value: cat /proc/sys/kernel/shmmax
3516     //    > increase shmmax value: echo "0xffffffff" > /proc/sys/kernel/shmmax
3517     // 2. not enough large page memory.
3518     //    > check available large pages: cat /proc/meminfo
3519     //    > increase amount of large pages:
3520     //          echo new_value > /proc/sys/vm/nr_hugepages
3521     //      Note 1: different Linux distributions may use different names for
3522     //            this property, e.g. on Redhat AS-3 it is "hugetlb_pool".
3523     //      Note 2: it's possible there's enough physical memory available but
3524     //            it is so fragmented after a long run that it can't be
3525     //            coalesced into large pages. Try to reserve large pages when
3526     //            the system is still "fresh".
3527     shm_warning_with_errno("Failed to reserve shared memory.");
3528     return NULL;
3529   }
3530 
3531   // Attach to the region.
3532   char* addr = shmat_large_pages(shmid, bytes, alignment, req_addr);
3533 
3534   // Remove shmid. If shmat() is successful, the actual shared memory segment
3535   // will be deleted when it's detached by shmdt() or when the process
3536   // terminates. If shmat() is not successful this will remove the shared
3537   // segment immediately.
3538   shmctl(shmid, IPC_RMID, NULL);
3539 
3540   return addr;
3541 }
3542 
3543 static void warn_on_large_pages_failure(char* req_addr, size_t bytes,
3544                                         int error) {
3545   assert(error == ENOMEM, "Only expect to fail if no memory is available");
3546 
3547   bool warn_on_failure = UseLargePages &&
3548       (!FLAG_IS_DEFAULT(UseLargePages) ||
3549        !FLAG_IS_DEFAULT(UseHugeTLBFS) ||
3550        !FLAG_IS_DEFAULT(LargePageSizeInBytes));
3551 
3552   if (warn_on_failure) {
3553     char msg[128];
3554     jio_snprintf(msg, sizeof(msg), "Failed to reserve large pages memory req_addr: "
3555                  PTR_FORMAT " bytes: " SIZE_FORMAT " (errno = %d).", req_addr, bytes, error);
3556     warning("%s", msg);
3557   }
3558 }
3559 
3560 char* os::Linux::reserve_memory_special_huge_tlbfs_only(size_t bytes,
3561                                                         char* req_addr,
3562                                                         bool exec) {
3563   assert(UseLargePages && UseHugeTLBFS, "only for Huge TLBFS large pages");
3564   assert(is_size_aligned(bytes, os::large_page_size()), "Unaligned size");
3565   assert(is_ptr_aligned(req_addr, os::large_page_size()), "Unaligned address");
3566 
3567   int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
3568   char* addr = (char*)::mmap(req_addr, bytes, prot,
3569                              MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB,
3570                              -1, 0);
3571 
3572   if (addr == MAP_FAILED) {
3573     warn_on_large_pages_failure(req_addr, bytes, errno);
3574     return NULL;
3575   }
3576 
3577   assert(is_ptr_aligned(addr, os::large_page_size()), "Must be");
3578 
3579   return addr;
3580 }
3581 
3582 // Reserve memory using mmap(MAP_HUGETLB).
3583 //  - bytes shall be a multiple of alignment.
3584 //  - req_addr can be NULL. If not NULL, it must be a multiple of alignment.
3585 //  - alignment sets the alignment at which memory shall be allocated.
3586 //     It must be a multiple of allocation granularity.
3587 // Returns address of memory or NULL. If req_addr was not NULL, will only return
3588 //  req_addr or NULL.
3589 char* os::Linux::reserve_memory_special_huge_tlbfs_mixed(size_t bytes,
3590                                                          size_t alignment,
3591                                                          char* req_addr,
3592                                                          bool exec) {
3593   size_t large_page_size = os::large_page_size();
3594   assert(bytes >= large_page_size, "Shouldn't allocate large pages for small sizes");
3595 
3596   assert(is_ptr_aligned(req_addr, alignment), "Must be");
3597   assert(is_size_aligned(bytes, alignment), "Must be");
3598 
3599   // First reserve - but not commit - the address range in small pages.
3600   char* const start = anon_mmap_aligned(bytes, alignment, req_addr);
3601 
3602   if (start == NULL) {
3603     return NULL;
3604   }
3605 
3606   assert(is_ptr_aligned(start, alignment), "Must be");
3607 
3608   char* end = start + bytes;
3609 
3610   // Find the regions of the allocated chunk that can be promoted to large pages.
3611   char* lp_start = (char*)align_ptr_up(start, large_page_size);
3612   char* lp_end   = (char*)align_ptr_down(end, large_page_size);
3613 
3614   size_t lp_bytes = lp_end - lp_start;
3615 
3616   assert(is_size_aligned(lp_bytes, large_page_size), "Must be");
3617 
3618   if (lp_bytes == 0) {
3619     // The mapped region doesn't even span the start and the end of a large page.
3620     // Fall back to allocate a non-special area.
3621     ::munmap(start, end - start);
3622     return NULL;
3623   }
3624 
3625   int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
3626 
3627   void* result;
3628 
3629   // Commit small-paged leading area.
3630   if (start != lp_start) {
3631     result = ::mmap(start, lp_start - start, prot,
3632                     MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED,
3633                     -1, 0);
3634     if (result == MAP_FAILED) {
3635       ::munmap(lp_start, end - lp_start);
3636       return NULL;
3637     }
3638   }
3639 
3640   // Commit large-paged area.
3641   result = ::mmap(lp_start, lp_bytes, prot,
3642                   MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED|MAP_HUGETLB,
3643                   -1, 0);
3644   if (result == MAP_FAILED) {
3645     warn_on_large_pages_failure(lp_start, lp_bytes, errno);
3646     // If the mmap above fails, the large pages region will be unmapped and we
3647     // have regions before and after with small pages. Release these regions.
3648     //
3649     // |  mapped  |  unmapped  |  mapped  |
3650     // ^          ^            ^          ^
3651     // start      lp_start     lp_end     end
3652     //
3653     ::munmap(start, lp_start - start);
3654     ::munmap(lp_end, end - lp_end);
3655     return NULL;
3656   }
3657 
3658   // Commit small-paged trailing area.
3659   if (lp_end != end) {
3660     result = ::mmap(lp_end, end - lp_end, prot,
3661                     MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED,
3662                     -1, 0);
3663     if (result == MAP_FAILED) {
3664       ::munmap(start, lp_end - start);
3665       return NULL;
3666     }
3667   }
3668 
3669   return start;
3670 }
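
// Worked example with illustrative numbers: bytes = 5M, alignment = 1M,
// large_page_size == 2M, and anon_mmap_aligned() returns
// start = 0x7f0000100000. Then
//   end      = start + 5M              = 0x7f0000600000
//   lp_start = align_ptr_up(start, 2M) = 0x7f0000200000
//   lp_end   = align_ptr_down(end, 2M) = 0x7f0000600000 (== end)
// so the 1M head [start, lp_start) is committed with small pages, the 4M
// middle [lp_start, lp_end) is committed with MAP_HUGETLB, and the
// trailing mmap is skipped because lp_end == end.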
3671 
3672 char* os::Linux::reserve_memory_special_huge_tlbfs(size_t bytes,
3673                                                    size_t alignment,
3674                                                    char* req_addr,
3675                                                    bool exec) {
3676   assert(UseLargePages && UseHugeTLBFS, "only for Huge TLBFS large pages");
3677   assert(is_ptr_aligned(req_addr, alignment), "Must be");
3678   assert(is_size_aligned(alignment, os::vm_allocation_granularity()), "Must be");
3679   assert(is_power_of_2(os::large_page_size()), "Must be");
3680   assert(bytes >= os::large_page_size(), "Shouldn't allocate large pages for small sizes");
3681 
3682   if (is_size_aligned(bytes, os::large_page_size()) && alignment <= os::large_page_size()) {
3683     return reserve_memory_special_huge_tlbfs_only(bytes, req_addr, exec);
3684   } else {
3685     return reserve_memory_special_huge_tlbfs_mixed(bytes, alignment, req_addr, exec);
3686   }
3687 }
3688 
3689 char* os::reserve_memory_special(size_t bytes, size_t alignment,
3690                                  char* req_addr, bool exec) {
3691   assert(UseLargePages, "only for large pages");
3692 
3693   char* addr;
3694   if (UseSHM) {
3695     addr = os::Linux::reserve_memory_special_shm(bytes, alignment, req_addr, exec);
3696   } else {
3697     assert(UseHugeTLBFS, "must be");
3698     addr = os::Linux::reserve_memory_special_huge_tlbfs(bytes, alignment, req_addr, exec);
3699   }
3700 
3701   if (addr != NULL) {
3702     if (UseNUMAInterleaving) {
3703       numa_make_global(addr, bytes);
3704     }
3705 
3706     // The memory is committed
3707     MemTracker::record_virtual_memory_reserve_and_commit((address)addr, bytes, CALLER_PC);
3708   }
3709 
3710   return addr;
3711 }
3712 
3713 bool os::Linux::release_memory_special_shm(char* base, size_t bytes) {
3714   // detaching the SHM segment will also delete it, see reserve_memory_special_shm()
3715   return shmdt(base) == 0;
3716 }
3717 
3718 bool os::Linux::release_memory_special_huge_tlbfs(char* base, size_t bytes) {
3719   return pd_release_memory(base, bytes);
3720 }
3721 
3722 bool os::release_memory_special(char* base, size_t bytes) {
3723   bool res;
3724   if (MemTracker::tracking_level() > NMT_minimal) {
3725     Tracker tkr = MemTracker::get_virtual_memory_release_tracker();
3726     res = os::Linux::release_memory_special_impl(base, bytes);
3727     if (res) {
3728       tkr.record((address)base, bytes);
3729     }
3730 
3731   } else {
3732     res = os::Linux::release_memory_special_impl(base, bytes);
3733   }
3734   return res;
3735 }
3736 
3737 bool os::Linux::release_memory_special_impl(char* base, size_t bytes) {
3738   assert(UseLargePages, "only for large pages");
3739   bool res;
3740 
3741   if (UseSHM) {
3742     res = os::Linux::release_memory_special_shm(base, bytes);
3743   } else {
3744     assert(UseHugeTLBFS, "must be");
3745     res = os::Linux::release_memory_special_huge_tlbfs(base, bytes);
3746   }
3747   return res;
3748 }
3749 
3750 size_t os::large_page_size() {
3751   return _large_page_size;
3752 }
3753 
3754 // With SysV SHM the entire memory region must be allocated as shared
3755 // memory.
3756 // HugeTLBFS allows the application to commit large-page memory on demand.
3757 // However, when committing memory with HugeTLBFS fails, the region
3758 // that was supposed to be committed will lose the old reservation
3759 // and allow other threads to steal that memory region. Because of this
3760 // behavior we can't commit HugeTLBFS memory.
3761 bool os::can_commit_large_page_memory() {
3762   return UseTransparentHugePages;
3763 }
3764 
3765 bool os::can_execute_large_page_memory() {
3766   return UseTransparentHugePages || UseHugeTLBFS;
3767 }
3768 
3769 // Reserve memory at an arbitrary address, only if that area is
3770 // available (and not reserved for something else).
3771 
3772 char* os::pd_attempt_reserve_memory_at(size_t bytes, char* requested_addr) {
3773   const int max_tries = 10;
3774   char* base[max_tries];
3775   size_t size[max_tries];
3776   const size_t gap = 0x000000;
3777 
3778   // Assert only that the size is a multiple of the page size, since
3779   // that's all that mmap requires, and since that's all we really know
3780   // about at this low abstraction level.  If we need higher alignment,
3781   // we can either pass an alignment to this method or verify alignment
3782   // in one of the methods further up the call chain.  See bug 5044738.
3783   assert(bytes % os::vm_page_size() == 0, "reserving unexpected size block");
3784 
3785   // Repeatedly allocate blocks until the block is allocated at the
3786   // right spot.
3787 
3788   // Linux mmap allows the caller to pass an address as a hint; give it a try
3789   // first. If the kernel honors the hint, we can return immediately.
3790   char * addr = anon_mmap(requested_addr, bytes, false);
3791   if (addr == requested_addr) {
3792     return requested_addr;
3793   }
3794 
3795   if (addr != NULL) {
3796     // mmap() succeeded, but failed to reserve memory at the requested address
3797     anon_munmap(addr, bytes);
3798   }
3799 
3800   int i;
3801   for (i = 0; i < max_tries; ++i) {
3802     base[i] = reserve_memory(bytes);
3803 
3804     if (base[i] != NULL) {
3805       // Is this the block we wanted?
3806       if (base[i] == requested_addr) {
3807         size[i] = bytes;
3808         break;
3809       }
3810 
3811       // Does this overlap the block we wanted? Give back the overlapped
3812       // parts and try again.
3813 
3814       ptrdiff_t top_overlap = requested_addr + (bytes + gap) - base[i];
3815       if (top_overlap >= 0 && (size_t)top_overlap < bytes) {
3816         unmap_memory(base[i], top_overlap);
3817         base[i] += top_overlap;
3818         size[i] = bytes - top_overlap;
3819       } else {
3820         ptrdiff_t bottom_overlap = base[i] + bytes - requested_addr;
3821         if (bottom_overlap >= 0 && (size_t)bottom_overlap < bytes) {
3822           unmap_memory(requested_addr, bottom_overlap);
3823           size[i] = bytes - bottom_overlap;
3824         } else {
3825           size[i] = bytes;
3826         }
3827       }
3828     }
3829   }
3830 
3831   // Give back the unused reserved pieces.
3832 
3833   for (int j = 0; j < i; ++j) {
3834     if (base[j] != NULL) {
3835       unmap_memory(base[j], size[j]);
3836     }
3837   }
3838 
3839   if (i < max_tries) {
3840     return requested_addr;
3841   } else {
3842     return NULL;
3843   }
3844 }
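
// Overlap arithmetic above, with illustrative numbers and gap == 0: suppose
// requested_addr = 0x40000000, bytes = 0x10000000 (256M), and a try lands
// at base[i] = 0x48000000. Then
//   top_overlap = requested_addr + bytes - base[i] = 0x08000000
// so the low 128M of base[i], which overlaps the top of the wanted block,
// is unmapped immediately, and the non-overlapping remainder is recorded
// so the cleanup loop can give it back.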
3845 
3846 size_t os::read(int fd, void *buf, unsigned int nBytes) {
3847   return ::read(fd, buf, nBytes);
3848 }
3849 
3850 size_t os::read_at(int fd, void *buf, unsigned int nBytes, jlong offset) {
3851   return ::pread(fd, buf, nBytes, offset);
3852 }
3853 
3854 // Short sleep, direct OS call.
3855 //
3856 // Note: certain versions of the Linux CFS scheduler (since 2.6.23) do not
3857 // guarantee that sched_yield(2) will actually give up the CPU:
3858 //
3859 //   * If the thread is alone on this particular CPU, it keeps running.
3860 //   * Before the introduction of "skip_buddy" with "compat_yield" disabled
3861 //     (pre 2.6.39).
3862 //
3863 // So calling this with 0 is an alternative.
3864 //
3865 void os::naked_short_sleep(jlong ms) {
3866   struct timespec req;
3867 
3868   assert(ms < 1000, "Un-interruptible sleep, short time use only");
3869   req.tv_sec = 0;
3870   if (ms > 0) {
3871     req.tv_nsec = (ms % 1000) * 1000000;
3872   } else {
3873     req.tv_nsec = 1;
3874   }
3875 
3876   nanosleep(&req, NULL);
3877 
3878   return;
3879 }
3880 
3881 // Sleep forever; naked call to OS-specific sleep; use with CAUTION
3882 void os::infinite_sleep() {
3883   while (true) {    // sleep forever ...
3884     ::sleep(100);   // ... 100 seconds at a time
3885   }
3886 }
3887 
3888 // Used to convert frequent JVM_Yield() to nops
3889 bool os::dont_yield() {
3890   return DontYieldALot;
3891 }
3892 
3893 void os::naked_yield() {
3894   sched_yield();
3895 }
3896 
3897 ////////////////////////////////////////////////////////////////////////////////
3898 // thread priority support
3899 
3900 // Note: Normal Linux applications are run with SCHED_OTHER policy. SCHED_OTHER
3901 // only supports dynamic priority; static priority must be zero. For real-time
3902 // applications, Linux supports SCHED_RR which allows static priority (1-99).
3903 // However, for large multi-threaded applications, SCHED_RR is not only slower
3904 // than SCHED_OTHER, but also very unstable (my volano tests hang hard 4 out
3905 // of 5 runs - Sep 2005).
3906 //
3907 // The following code actually changes the niceness of kernel-thread/LWP. It
3908 // has an assumption that setpriority() only modifies one kernel-thread/LWP,
3909 // not the entire user process, and user level threads are 1:1 mapped to kernel
3910 // threads. It has always been the case, but could change in the future. For
3911 // this reason, the code should not be used as default (ThreadPriorityPolicy=0).
3912 // It is only used when ThreadPriorityPolicy=1 and requires root privilege.
3913 
3914 int os::java_to_os_priority[CriticalPriority + 1] = {
3915   19,              // 0 Entry should never be used
3916 
3917    4,              // 1 MinPriority
3918    3,              // 2
3919    2,              // 3
3920 
3921    1,              // 4
3922    0,              // 5 NormPriority
3923   -1,              // 6
3924 
3925   -2,              // 7
3926   -3,              // 8
3927   -4,              // 9 NearMaxPriority
3928 
3929   -5,              // 10 MaxPriority
3930 
3931   -5               // 11 CriticalPriority
3932 };
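
// For example, with this table a Java thread at NormPriority (5) maps to
// nice value 0 and one at MaxPriority (10) maps to -5; set_native_priority()
// below passes the mapped value straight to setpriority(PRIO_PROCESS, ...).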
3933 
3934 static int prio_init() {
3935   if (ThreadPriorityPolicy == 1) {
3936     // Only root can raise thread priority. Don't allow ThreadPriorityPolicy=1
3937     // if the effective uid is not root. Perhaps a more elegant way of doing
3938     // this is to test the CAP_SYS_NICE capability, but that would require libcap.so.
3939     if (geteuid() != 0) {
3940       if (!FLAG_IS_DEFAULT(ThreadPriorityPolicy)) {
3941         warning("-XX:ThreadPriorityPolicy requires root privilege on Linux");
3942       }
3943       ThreadPriorityPolicy = 0;
3944     }
3945   }
3946   if (UseCriticalJavaThreadPriority) {
3947     os::java_to_os_priority[MaxPriority] = os::java_to_os_priority[CriticalPriority];
3948   }
3949   return 0;
3950 }
3951 
3952 OSReturn os::set_native_priority(Thread* thread, int newpri) {
3953   if (!UseThreadPriorities || ThreadPriorityPolicy == 0) return OS_OK;
3954 
3955   int ret = setpriority(PRIO_PROCESS, thread->osthread()->thread_id(), newpri);
3956   return (ret == 0) ? OS_OK : OS_ERR;
3957 }
3958 
3959 OSReturn os::get_native_priority(const Thread* const thread,
3960                                  int *priority_ptr) {
3961   if (!UseThreadPriorities || ThreadPriorityPolicy == 0) {
3962     *priority_ptr = java_to_os_priority[NormPriority];
3963     return OS_OK;
3964   }
3965 
3966   errno = 0;
3967   *priority_ptr = getpriority(PRIO_PROCESS, thread->osthread()->thread_id());
3968   return (*priority_ptr != -1 || errno == 0 ? OS_OK : OS_ERR);
3969 }
3970 
3971 // Hint to the underlying OS that a task switch would not be good.
3972 // Void return because it's a hint and can fail.
3973 void os::hint_no_preempt() {}
3974 
3975 ////////////////////////////////////////////////////////////////////////////////
3976 // suspend/resume support
3977 
3978 //  The low-level signal-based suspend/resume support is a remnant of the
3979 //  old VM-suspension mechanism that used to serve java-suspension,
3980 //  safepoints, etc. within HotSpot. Now there is a single use-case for this:
3981 //    - calling get_thread_pc() on the VMThread by the flat-profiler task
3982 //      that runs in the watcher thread.
3983 //  The remaining code is greatly simplified from the more general suspension
3984 //  code that used to be used.
3985 //
3986 //  The protocol is quite simple:
3987 //  - suspend:
3988 //      - sends a signal to the target thread
3989 //      - polls the suspend state of the osthread using a yield loop
3990 //      - target thread signal handler (SR_handler) sets suspend state
3991 //        and blocks in sigsuspend until continued
3992 //  - resume:
3993 //      - sets target osthread state to continue
3994 //      - sends signal to end the sigsuspend loop in the SR_handler
3995 //
3996 //  Note that the SR_lock plays no role in this suspend/resume protocol,
3997 //  but is checked for NULL in SR_handler as a thread termination indicator.
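//
//  A sketch of the os::SuspendResume state transitions involved:
//
//    do_suspend():  SR_RUNNING --> SR_SUSPEND_REQUEST
//                     --(SR_handler)--> SR_SUSPENDED
//    do_resume():   SR_SUSPENDED --> SR_WAKEUP_REQUEST
//                     --(SR_handler)--> SR_RUNNING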
3998 
3999 static void resume_clear_context(OSThread *osthread) {
4000   osthread->set_ucontext(NULL);
4001   osthread->set_siginfo(NULL);
4002 }
4003 
4004 static void suspend_save_context(OSThread *osthread, siginfo_t* siginfo,
4005                                  ucontext_t* context) {
4006   osthread->set_ucontext(context);
4007   osthread->set_siginfo(siginfo);
4008 }
4009 
4010 // Handler function invoked when a thread's execution is suspended or
4011 // resumed. We have to be careful that only async-safe functions are
4012 // called here (Note: most pthread functions are not async safe and
4013 // should be avoided.)
4014 //
4015 // Note: sigwait() is a more natural fit than sigsuspend() from an
4016 // interface point of view, but sigwait() prevents the signal handler
4017 // from being run. libpthread would get very confused by not having
4018 // its signal handlers run, which prevents sigwait()'s use with the
4019 // mutex-granting signal.
4020 //
4021 // Currently only ever called on the VMThread and JavaThreads (PC sampling)
4022 //
4023 static void SR_handler(int sig, siginfo_t* siginfo, ucontext_t* context) {
4024   // Save and restore errno to avoid confusing native code with EINTR
4025   // after sigsuspend.
4026   int old_errno = errno;
4027 
4028   Thread* thread = Thread::current_or_null_safe();
4029   assert(thread != NULL, "Missing current thread in SR_handler");
4030 
4031   // On some systems we have seen signal delivery get "stuck" until the signal
4032   // mask is changed as part of thread termination. Check that the current thread
4033   // has not already terminated (via SR_lock()) - else the following assertion
4034   // will fail because the thread is no longer a JavaThread as the ~JavaThread
4035   // destructor has completed.
4036 
4037   if (thread->SR_lock() == NULL) {
4038     return;
4039   }
4040 
4041   assert(thread->is_VM_thread() || thread->is_Java_thread(), "Must be VMThread or JavaThread");
4042 
4043   OSThread* osthread = thread->osthread();
4044 
4045   os::SuspendResume::State current = osthread->sr.state();
4046   if (current == os::SuspendResume::SR_SUSPEND_REQUEST) {
4047     suspend_save_context(osthread, siginfo, context);
4048 
4049     // Attempt to switch the state; we assume we had a SUSPEND_REQUEST.
4050     os::SuspendResume::State state = osthread->sr.suspended();
4051     if (state == os::SuspendResume::SR_SUSPENDED) {
4052       sigset_t suspend_set;  // signals for sigsuspend()
4053       sigemptyset(&suspend_set);
4054       // get current set of blocked signals and unblock resume signal
4055       pthread_sigmask(SIG_BLOCK, NULL, &suspend_set);
4056       sigdelset(&suspend_set, SR_signum);
4057 
4058       sr_semaphore.signal();
4059       // wait here until we are resumed
4060       while (1) {
4061         sigsuspend(&suspend_set);
4062 
4063         os::SuspendResume::State result = osthread->sr.running();
4064         if (result == os::SuspendResume::SR_RUNNING) {
4065           sr_semaphore.signal();
4066           break;
4067         }
4068       }
4069 
4070     } else if (state == os::SuspendResume::SR_RUNNING) {
4071       // request was cancelled, continue
4072     } else {
4073       ShouldNotReachHere();
4074     }
4075 
4076     resume_clear_context(osthread);
4077   } else if (current == os::SuspendResume::SR_RUNNING) {
4078     // request was cancelled, continue
4079   } else if (current == os::SuspendResume::SR_WAKEUP_REQUEST) {
4080     // ignore
4081   } else {
4082     // ignore
4083   }
4084 
4085   errno = old_errno;
4086 }
4087 
4088 static int SR_initialize() {
4089   struct sigaction act;
4090   char *s;
4091 
4092   // Get signal number to use for suspend/resume
4093   if ((s = ::getenv("_JAVA_SR_SIGNUM")) != 0) {
4094     int sig = ::strtol(s, 0, 10);
4095     if (sig > MAX2(SIGSEGV, SIGBUS) &&  // See 4355769.
4096         sig < NSIG) {                   // Must be legal signal and fit into sigflags[].
4097       SR_signum = sig;
4098     } else {
4099       warning("You set _JAVA_SR_SIGNUM=%d. It must be in range [%d, %d]. Using %d instead.",
4100               sig, MAX2(SIGSEGV, SIGBUS)+1, NSIG-1, SR_signum);
4101     }
4102   }
4103 
4104   assert(SR_signum > SIGSEGV && SR_signum > SIGBUS,
4105          "SR_signum must be greater than max(SIGSEGV, SIGBUS), see 4355769");
4106 
4107   sigemptyset(&SR_sigset);
4108   sigaddset(&SR_sigset, SR_signum);
4109 
4110   // Set up signal handler for suspend/resume
4111   act.sa_flags = SA_RESTART|SA_SIGINFO;
4112   act.sa_handler = (void (*)(int)) SR_handler;
4113 
4114   // SR_signum is blocked by default.
4115   // 4528190 - We also need to block the pthread restart signal (32 on all
4116   // supported Linux platforms). Note that LinuxThreads needs to block
4117   // this signal for all threads to work properly. So we don't have
4118   // to use a hard-coded signal number when setting up the mask.
4119   pthread_sigmask(SIG_BLOCK, NULL, &act.sa_mask);
4120 
4121   if (sigaction(SR_signum, &act, 0) == -1) {
4122     return -1;
4123   }
4124 
4125   // Save signal flag
4126   os::Linux::set_our_sigflags(SR_signum, act.sa_flags);
4127   return 0;
4128 }
4129 
4130 static int sr_notify(OSThread* osthread) {
4131   int status = pthread_kill(osthread->pthread_id(), SR_signum);
4132   assert_status(status == 0, status, "pthread_kill");
4133   return status;
4134 }
4135 
4136 // "Randomly" selected value for how long we want to spin
4137 // before bailing out on suspending a thread, also how often
4138 // we send a signal to a thread we want to resume
4139 static const int RANDOMLY_LARGE_INTEGER = 1000000;
4140 static const int RANDOMLY_LARGE_INTEGER2 = 100;
4141 
4142 // returns true on success and false on error - really an error is fatal
4143 // but this seems the normal response to library errors
4144 static bool do_suspend(OSThread* osthread) {
4145   assert(osthread->sr.is_running(), "thread should be running");
4146   assert(!sr_semaphore.trywait(), "semaphore has invalid state");
4147 
4148   // mark as suspended and send signal
4149   if (osthread->sr.request_suspend() != os::SuspendResume::SR_SUSPEND_REQUEST) {
4150     // failed to switch, state wasn't running?
4151     ShouldNotReachHere();
4152     return false;
4153   }
4154 
4155   if (sr_notify(osthread) != 0) {
4156     ShouldNotReachHere();
4157   }
4158 
4159   // managed to send the signal and switch to SUSPEND_REQUEST, now wait for SUSPENDED
4160   while (true) {
4161     if (sr_semaphore.timedwait(0, 2 * NANOSECS_PER_MILLISEC)) {
4162       break;
4163     } else {
4164       // timeout
4165       os::SuspendResume::State cancelled = osthread->sr.cancel_suspend();
4166       if (cancelled == os::SuspendResume::SR_RUNNING) {
4167         return false;
4168       } else if (cancelled == os::SuspendResume::SR_SUSPENDED) {
4169         // make sure that we consume the signal on the semaphore as well
4170         sr_semaphore.wait();
4171         break;
4172       } else {
4173         ShouldNotReachHere();
4174         return false;
4175       }
4176     }
4177   }
4178 
4179   guarantee(osthread->sr.is_suspended(), "Must be suspended");
4180   return true;
4181 }
4182 
4183 static void do_resume(OSThread* osthread) {
4184   assert(osthread->sr.is_suspended(), "thread should be suspended");
4185   assert(!sr_semaphore.trywait(), "invalid semaphore state");
4186 
4187   if (osthread->sr.request_wakeup() != os::SuspendResume::SR_WAKEUP_REQUEST) {
4188     // failed to switch to WAKEUP_REQUEST
4189     ShouldNotReachHere();
4190     return;
4191   }
4192 
4193   while (true) {
4194     if (sr_notify(osthread) == 0) {
4195       if (sr_semaphore.timedwait(0, 2 * NANOSECS_PER_MILLISEC)) {
4196         if (osthread->sr.is_running()) {
4197           return;
4198         }
4199       }
4200     } else {
4201       ShouldNotReachHere();
4202     }
4203   }
4204 
4205   guarantee(osthread->sr.is_running(), "Must be running!");
4206 }
4207 
4208 ///////////////////////////////////////////////////////////////////////////////////
4209 // signal handling (except suspend/resume)
4210 
4211 // This routine may be used by user applications as a "hook" to catch signals.
4212 // The user-defined signal handler must pass unrecognized signals to this
4213 // routine, and if it returns true (non-zero), then the signal handler must
4214 // return immediately.  If the flag "abort_if_unrecognized" is true, then this
4215 // routine will never return false (zero), but instead will execute a VM panic
4216 // routine that kills the process.
4217 //
4218 // If this routine returns false, it is OK to call it again.  This allows
4219 // the user-defined signal handler to perform checks either before or after
4220 // the VM performs its own checks.  Naturally, the user code would be making
4221 // a serious error if it tried to handle an exception (such as a null check
4222 // or breakpoint) that the VM was generating for its own correct operation.
4223 //
4224 // This routine may recognize any of the following kinds of signals:
4225 //    SIGBUS, SIGSEGV, SIGILL, SIGFPE, SIGQUIT, SIGPIPE, SIGXFSZ, SIGUSR1.
4226 // It should be consulted by handlers for any of those signals.
4227 //
4228 // The caller of this routine must pass in the three arguments supplied
4229 // to the function referred to in the "sa_sigaction" (not the "sa_handler")
4230 // field of the structure passed to sigaction().  This routine assumes that
4231 // the sa_flags field passed to sigaction() includes SA_SIGINFO and SA_RESTART.
4232 //
4233 // Note that the VM will print warnings if it detects conflicting signal
4234 // handlers, unless invoked with the option "-XX:+AllowUserSignalHandlers".
4235 //
4236 extern "C" JNIEXPORT int JVM_handle_linux_signal(int signo,
4237                                                  siginfo_t* siginfo,
4238                                                  void* ucontext,
4239                                                  int abort_if_unrecognized);
4240 
4241 void signalHandler(int sig, siginfo_t* info, void* uc) {
4242   assert(info != NULL && uc != NULL, "NULL siginfo or ucontext - must be an old kernel");
4243   int orig_errno = errno;  // Preserve errno value over signal handler.
4244   JVM_handle_linux_signal(sig, info, uc, true);
4245   errno = orig_errno;
4246 }
4247 
4248 
4249 // This boolean allows users to forward their own non-matching signals
4250 // to JVM_handle_linux_signal, harmlessly.
4251 bool os::Linux::signal_handlers_are_installed = false;
4252 
4253 // For signal-chaining
4254 struct sigaction sigact[NSIG];
4255 uint64_t sigs = 0;
4256 #if (64 < NSIG-1)
4257 #error "Not all signals can be encoded in sigs. Adapt its type!"
4258 #endif
4259 bool os::Linux::libjsig_is_loaded = false;
4260 typedef struct sigaction *(*get_signal_t)(int);
4261 get_signal_t os::Linux::get_signal_action = NULL;
4262 
4263 struct sigaction* os::Linux::get_chained_signal_action(int sig) {
4264   struct sigaction *actp = NULL;
4265 
4266   if (libjsig_is_loaded) {
4267     // Retrieve the old signal handler from libjsig
4268     actp = (*get_signal_action)(sig);
4269   }
4270   if (actp == NULL) {
4271     // Retrieve the preinstalled signal handler from jvm
4272     actp = get_preinstalled_handler(sig);
4273   }
4274 
4275   return actp;
4276 }
4277 
4278 static bool call_chained_handler(struct sigaction *actp, int sig,
4279                                  siginfo_t *siginfo, void *context) {
4280   // Call the old signal handler
4281   if (actp->sa_handler == SIG_DFL) {
4282     // It's more reasonable to let jvm treat it as an unexpected exception
4283     // instead of taking the default action.
4284     return false;
4285   } else if (actp->sa_handler != SIG_IGN) {
4286     if ((actp->sa_flags & SA_NODEFER) == 0) {
4287       // automatically block the signal
4288       sigaddset(&(actp->sa_mask), sig);
4289     }
4290 
4291     sa_handler_t hand = NULL;
4292     sa_sigaction_t sa = NULL;
4293     bool siginfo_flag_set = (actp->sa_flags & SA_SIGINFO) != 0;
4294     // retrieve the chained handler
4295     if (siginfo_flag_set) {
4296       sa = actp->sa_sigaction;
4297     } else {
4298       hand = actp->sa_handler;
4299     }
4300 
4301     if ((actp->sa_flags & SA_RESETHAND) != 0) {
4302       actp->sa_handler = SIG_DFL;
4303     }
4304 
4305     // try to honor the signal mask
4306     sigset_t oset;
4307     sigemptyset(&oset);
4308     pthread_sigmask(SIG_SETMASK, &(actp->sa_mask), &oset);
4309 
4310     // call into the chained handler
4311     if (siginfo_flag_set) {
4312       (*sa)(sig, siginfo, context);
4313     } else {
4314       (*hand)(sig);
4315     }
4316 
4317     // restore the signal mask
4318     pthread_sigmask(SIG_SETMASK, &oset, NULL);
4319   }
4320   // Tell jvm's signal handler the signal is taken care of.
4321   return true;
4322 }
4323 
4324 bool os::Linux::chained_handler(int sig, siginfo_t* siginfo, void* context) {
4325   bool chained = false;
4326   // signal-chaining
4327   if (UseSignalChaining) {
4328     struct sigaction *actp = get_chained_signal_action(sig);
4329     if (actp != NULL) {
4330       chained = call_chained_handler(actp, sig, siginfo, context);
4331     }
4332   }
4333   return chained;
4334 }
4335 
4336 struct sigaction* os::Linux::get_preinstalled_handler(int sig) {
4337   if ((((uint64_t)1 << (sig-1)) & sigs) != 0) {
4338     return &sigact[sig];
4339   }
4340   return NULL;
4341 }
4342 
4343 void os::Linux::save_preinstalled_handler(int sig, struct sigaction& oldAct) {
4344   assert(sig > 0 && sig < NSIG, "vm signal out of expected range");
4345   sigact[sig] = oldAct;
4346   sigs |= (uint64_t)1 << (sig-1);
4347 }
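
// For example (Linux x86 signal numbering), saving the old handler for
// SIGSEGV (11) sets bit 10 of 'sigs' (sigs |= (uint64_t)1 << 10), which is
// exactly the bit get_preinstalled_handler() tests before returning
// &sigact[11].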
4348 
// for diagnostics
4350 int sigflags[NSIG];
4351 
4352 int os::Linux::get_our_sigflags(int sig) {
4353   assert(sig > 0 && sig < NSIG, "vm signal out of expected range");
4354   return sigflags[sig];
4355 }
4356 
4357 void os::Linux::set_our_sigflags(int sig, int flags) {
4358   assert(sig > 0 && sig < NSIG, "vm signal out of expected range");
4359   if (sig > 0 && sig < NSIG) {
4360     sigflags[sig] = flags;
4361   }
4362 }
4363 
4364 void os::Linux::set_signal_handler(int sig, bool set_installed) {
4365   // Check for overwrite.
4366   struct sigaction oldAct;
4367   sigaction(sig, (struct sigaction*)NULL, &oldAct);
4368 
4369   void* oldhand = oldAct.sa_sigaction
4370                 ? CAST_FROM_FN_PTR(void*,  oldAct.sa_sigaction)
4371                 : CAST_FROM_FN_PTR(void*,  oldAct.sa_handler);
4372   if (oldhand != CAST_FROM_FN_PTR(void*, SIG_DFL) &&
4373       oldhand != CAST_FROM_FN_PTR(void*, SIG_IGN) &&
4374       oldhand != CAST_FROM_FN_PTR(void*, (sa_sigaction_t)signalHandler)) {
4375     if (AllowUserSignalHandlers || !set_installed) {
4376       // Do not overwrite; user takes responsibility to forward to us.
4377       return;
4378     } else if (UseSignalChaining) {
4379       // save the old handler in jvm
4380       save_preinstalled_handler(sig, oldAct);
4381       // libjsig also interposes the sigaction() call below and saves the
    // old sigaction on its own.
4383     } else {
4384       fatal("Encountered unexpected pre-existing sigaction handler "
4385             "%#lx for signal %d.", (long)oldhand, sig);
4386     }
4387   }
4388 
4389   struct sigaction sigAct;
4390   sigfillset(&(sigAct.sa_mask));
4391   sigAct.sa_handler = SIG_DFL;
4392   if (!set_installed) {
4393     sigAct.sa_flags = SA_SIGINFO|SA_RESTART;
4394   } else {
4395     sigAct.sa_sigaction = signalHandler;
4396     sigAct.sa_flags = SA_SIGINFO|SA_RESTART;
4397   }
  // Save the flags we are about to install for this signal.
4399   assert(sig > 0 && sig < NSIG, "vm signal out of expected range");
4400   sigflags[sig] = sigAct.sa_flags;
4401 
4402   int ret = sigaction(sig, &sigAct, &oldAct);
4403   assert(ret == 0, "check");
4404 
4405   void* oldhand2  = oldAct.sa_sigaction
4406                   ? CAST_FROM_FN_PTR(void*, oldAct.sa_sigaction)
4407                   : CAST_FROM_FN_PTR(void*, oldAct.sa_handler);
4408   assert(oldhand2 == oldhand, "no concurrent signal handler installation");
4409 }
4410 
4411 // install signal handlers for signals that HotSpot needs to
4412 // handle in order to support Java-level exception handling.
4413 
4414 void os::Linux::install_signal_handlers() {
4415   if (!signal_handlers_are_installed) {
4416     signal_handlers_are_installed = true;
4417 
4418     // signal-chaining
4419     typedef void (*signal_setting_t)();
4420     signal_setting_t begin_signal_setting = NULL;
4421     signal_setting_t end_signal_setting = NULL;
4422     begin_signal_setting = CAST_TO_FN_PTR(signal_setting_t,
4423                                           dlsym(RTLD_DEFAULT, "JVM_begin_signal_setting"));
4424     if (begin_signal_setting != NULL) {
4425       end_signal_setting = CAST_TO_FN_PTR(signal_setting_t,
4426                                           dlsym(RTLD_DEFAULT, "JVM_end_signal_setting"));
4427       get_signal_action = CAST_TO_FN_PTR(get_signal_t,
4428                                          dlsym(RTLD_DEFAULT, "JVM_get_signal_action"));
4429       libjsig_is_loaded = true;
4430       assert(UseSignalChaining, "should enable signal-chaining");
4431     }
4432     if (libjsig_is_loaded) {
4433       // Tell libjsig jvm is setting signal handlers
4434       (*begin_signal_setting)();
4435     }
4436 
4437     set_signal_handler(SIGSEGV, true);
4438     set_signal_handler(SIGPIPE, true);
4439     set_signal_handler(SIGBUS, true);
4440     set_signal_handler(SIGILL, true);
4441     set_signal_handler(SIGFPE, true);
4442 #if defined(PPC64)
4443     set_signal_handler(SIGTRAP, true);
4444 #endif
4445     set_signal_handler(SIGXFSZ, true);
4446 
4447     if (libjsig_is_loaded) {
4448       // Tell libjsig jvm finishes setting signal handlers
4449       (*end_signal_setting)();
4450     }
4451 
    // We don't activate the signal checker if libjsig is in place, since we
    // trust ourselves, and if a user signal handler is installed all bets are
    // off. Note that the messages below are logged only if -verbose:jni is
    // specified.
4455     if (CheckJNICalls) {
4456       if (libjsig_is_loaded) {
4457         if (PrintJNIResolving) {
4458           tty->print_cr("Info: libjsig is activated, all active signal checking is disabled");
4459         }
4460         check_signals = false;
4461       }
4462       if (AllowUserSignalHandlers) {
4463         if (PrintJNIResolving) {
4464           tty->print_cr("Info: AllowUserSignalHandlers is activated, all active signal checking is disabled");
4465         }
4466         check_signals = false;
4467       }
4468     }
4469   }
4470 }
4471 
// This is the fastest way to get thread cpu time on Linux.
// Returns cpu time (user+sys) for any thread, not only for the current one.
// POSIX-compliant clocks are implemented in kernels 2.6.16 and later.
// They might work on 2.6.10+ with a special kernel/glibc patch.
// For reference, see IEEE Std 1003.1-2004:
//   http://www.unix.org/single_unix_specification
4478 
4479 jlong os::Linux::fast_thread_cpu_time(clockid_t clockid) {
4480   struct timespec tp;
4481   int rc = os::Linux::clock_gettime(clockid, &tp);
4482   assert(rc == 0, "clock_gettime is expected to return 0 code");
4483 
4484   return (tp.tv_sec * NANOSECS_PER_SEC) + tp.tv_nsec;
4485 }
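
// Example usage (illustrative only): the fast path for the current thread,
// which is what current_thread_cpu_time() later in this file does when
// supports_fast_thread_cpu_time() returns true:
//
//   jlong self_ns = os::Linux::fast_thread_cpu_time(CLOCK_THREAD_CPUTIME_ID);
//
// For another thread, the clockid comes from pthread_getcpuclockid()
// (see thread_cpu_clockid() later in this file).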
4486 
4487 void os::Linux::initialize_os_info() {
4488   assert(_os_version == 0, "OS info already initialized");
4489 
4490   struct utsname _uname;
4491 
4492   uint32_t major;
4493   uint32_t minor;
4494   uint32_t fix;
4495 
4496   int rc;
4497 
4498   // Kernel version is unknown if
4499   // verification below fails.
4500   _os_version = 0x01000000;
4501 
4502   rc = uname(&_uname);
4503   if (rc != -1) {
4504 
    // The version fields are uint32_t, so use %u rather than %d.
    rc = sscanf(_uname.release, "%u.%u.%u", &major, &minor, &fix);
4506     if (rc == 3) {
4507 
4508       if (major < 256 && minor < 256 && fix < 256) {
4509         // Kernel version format is as expected,
4510         // set it overriding unknown state.
4511         _os_version = (major << 16) |
4512                       (minor << 8 ) |
4513                       (fix   << 0 ) ;
4514       }
4515     }
4516   }
4517 }
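
// Worked example (illustrative only): for a kernel release string "4.15.2"
// the fields pack as
//
//   _os_version = (4 << 16) | (15 << 8) | (2 << 0)  ==  0x00040F02
//
// If the release string cannot be parsed, the "unknown" marker bit is left
// set: _os_version == 0x01000000, so os_version_is_known() returns false
// and os_version() returns 0.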
4518 
4519 uint32_t os::Linux::os_version() {
4520   assert(_os_version != 0, "not initialized");
4521   return _os_version & 0x00FFFFFF;
4522 }
4523 
4524 bool os::Linux::os_version_is_known() {
4525   assert(_os_version != 0, "not initialized");
  return (_os_version & 0x01000000) == 0;
4527 }
4528 
/////
// glibc on Linux uses an undocumented sa_flags bit (SA_RESTORER, 0x04000000)
// to indicate that a special signal trampoline is in use.
// We never set this flag ourselves, so we ignore it in our diagnostics.
4535 #ifdef SIGNIFICANT_SIGNAL_MASK
4536   #undef SIGNIFICANT_SIGNAL_MASK
4537 #endif
4538 #define SIGNIFICANT_SIGNAL_MASK (~0x04000000)
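
// Example (illustrative only): on Linux/x86, SA_SIGINFO is 0x00000004,
// SA_RESTART is 0x10000000, and the glibc-managed trampoline bit is
// 0x04000000 (SA_RESTORER). A handler we installed with
// sa_flags = SA_SIGINFO|SA_RESTART may therefore read back as 0x14000004;
// masking strips the trampoline bit:
//
//   0x14000004 & SIGNIFICANT_SIGNAL_MASK  ==  0x10000004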
4539 
4540 static const char* get_signal_handler_name(address handler,
4541                                            char* buf, int buflen) {
4542   int offset = 0;
4543   bool found = os::dll_address_to_library_name(handler, buf, buflen, &offset);
4544   if (found) {
4545     // skip directory names
4546     const char *p1, *p2;
4547     p1 = buf;
4548     size_t len = strlen(os::file_separator());
4549     while ((p2 = strstr(p1, os::file_separator())) != NULL) p1 = p2 + len;
4550     jio_snprintf(buf, buflen, "%s+0x%x", p1, offset);
4551   } else {
4552     jio_snprintf(buf, buflen, PTR_FORMAT, handler);
4553   }
4554   return buf;
4555 }
4556 
4557 static void print_signal_handler(outputStream* st, int sig,
4558                                  char* buf, size_t buflen) {
4559   struct sigaction sa;
4560 
4561   sigaction(sig, NULL, &sa);
4562 
4563   // See comment for SIGNIFICANT_SIGNAL_MASK define
4564   sa.sa_flags &= SIGNIFICANT_SIGNAL_MASK;
4565 
4566   st->print("%s: ", os::exception_name(sig, buf, buflen));
4567 
4568   address handler = (sa.sa_flags & SA_SIGINFO)
4569     ? CAST_FROM_FN_PTR(address, sa.sa_sigaction)
4570     : CAST_FROM_FN_PTR(address, sa.sa_handler);
4571 
4572   if (handler == CAST_FROM_FN_PTR(address, SIG_DFL)) {
4573     st->print("SIG_DFL");
4574   } else if (handler == CAST_FROM_FN_PTR(address, SIG_IGN)) {
4575     st->print("SIG_IGN");
4576   } else {
4577     st->print("[%s]", get_signal_handler_name(handler, buf, buflen));
4578   }
4579 
4580   st->print(", sa_mask[0]=");
4581   os::Posix::print_signal_set_short(st, &sa.sa_mask);
4582 
  // The handler may have been reset by VMError.
  address rh = VMError::get_resetted_sighandler(sig);
4585   if (rh != NULL) {
4586     handler = rh;
4587     sa.sa_flags = VMError::get_resetted_sigflags(sig) & SIGNIFICANT_SIGNAL_MASK;
4588   }
4589 
4590   st->print(", sa_flags=");
4591   os::Posix::print_sa_flags(st, sa.sa_flags);
4592 
4593   // Check: is it our handler?
4594   if (handler == CAST_FROM_FN_PTR(address, (sa_sigaction_t)signalHandler) ||
4595       handler == CAST_FROM_FN_PTR(address, (sa_sigaction_t)SR_handler)) {
    // It is our signal handler.
    // Check the flags; warn if they differ from the ones we installed.
4598     if ((int)sa.sa_flags != os::Linux::get_our_sigflags(sig)) {
4599       st->print(
                ", flags were changed from " PTR32_FORMAT ", consider using the jsig library",
4601                 os::Linux::get_our_sigflags(sig));
4602     }
4603   }
4604   st->cr();
4605 }
4606 
4607 
4608 #define DO_SIGNAL_CHECK(sig)                      \
4609   do {                                            \
4610     if (!sigismember(&check_signal_done, sig)) {  \
4611       os::Linux::check_signal_handler(sig);       \
4612     }                                             \
4613   } while (0)
4614 
// This method is a periodic task that checks for misbehaving JNI applications
// under CheckJNI. Any other periodic checks can be added here.
4617 
4618 void os::run_periodic_checks() {
4619   if (check_signals == false) return;
4620 
  // If the SIGSEGV or SIGBUS handlers are overridden, generation of the
  // hs_err*.log file in the event of a crash could be prevented; debugging
  // such a case can be very challenging, so we check the following for
  // good measure:
4625   DO_SIGNAL_CHECK(SIGSEGV);
4626   DO_SIGNAL_CHECK(SIGILL);
4627   DO_SIGNAL_CHECK(SIGFPE);
4628   DO_SIGNAL_CHECK(SIGBUS);
4629   DO_SIGNAL_CHECK(SIGPIPE);
4630   DO_SIGNAL_CHECK(SIGXFSZ);
4631 #if defined(PPC64)
4632   DO_SIGNAL_CHECK(SIGTRAP);
4633 #endif
4634 
4635   // ReduceSignalUsage allows the user to override these handlers
  // see comments at the very top and jvm_linux.h
4637   if (!ReduceSignalUsage) {
4638     DO_SIGNAL_CHECK(SHUTDOWN1_SIGNAL);
4639     DO_SIGNAL_CHECK(SHUTDOWN2_SIGNAL);
4640     DO_SIGNAL_CHECK(SHUTDOWN3_SIGNAL);
4641     DO_SIGNAL_CHECK(BREAK_SIGNAL);
4642   }
4643 
4644   DO_SIGNAL_CHECK(SR_signum);
4645 }
4646 
4647 typedef int (*os_sigaction_t)(int, const struct sigaction *, struct sigaction *);
4648 
4649 static os_sigaction_t os_sigaction = NULL;
4650 
4651 void os::Linux::check_signal_handler(int sig) {
4652   char buf[O_BUFLEN];
4653   address jvmHandler = NULL;
4654 
4655 
4656   struct sigaction act;
4657   if (os_sigaction == NULL) {
4658     // only trust the default sigaction, in case it has been interposed
4659     os_sigaction = (os_sigaction_t)dlsym(RTLD_DEFAULT, "sigaction");
4660     if (os_sigaction == NULL) return;
4661   }
4662 
4663   os_sigaction(sig, (struct sigaction*)NULL, &act);
4664 
4665 
4666   act.sa_flags &= SIGNIFICANT_SIGNAL_MASK;
4667 
4668   address thisHandler = (act.sa_flags & SA_SIGINFO)
4669     ? CAST_FROM_FN_PTR(address, act.sa_sigaction)
4670     : CAST_FROM_FN_PTR(address, act.sa_handler);
4671 
4672 
4673   switch (sig) {
4674   case SIGSEGV:
4675   case SIGBUS:
4676   case SIGFPE:
4677   case SIGPIPE:
4678   case SIGILL:
4679   case SIGXFSZ:
4680     jvmHandler = CAST_FROM_FN_PTR(address, (sa_sigaction_t)signalHandler);
4681     break;
4682 
4683   case SHUTDOWN1_SIGNAL:
4684   case SHUTDOWN2_SIGNAL:
4685   case SHUTDOWN3_SIGNAL:
4686   case BREAK_SIGNAL:
4687     jvmHandler = (address)user_handler();
4688     break;
4689 
4690   default:
4691     if (sig == SR_signum) {
4692       jvmHandler = CAST_FROM_FN_PTR(address, (sa_sigaction_t)SR_handler);
4693     } else {
4694       return;
4695     }
4696     break;
4697   }
4698 
4699   if (thisHandler != jvmHandler) {
4700     tty->print("Warning: %s handler ", exception_name(sig, buf, O_BUFLEN));
4701     tty->print("expected:%s", get_signal_handler_name(jvmHandler, buf, O_BUFLEN));
4702     tty->print_cr("  found:%s", get_signal_handler_name(thisHandler, buf, O_BUFLEN));
4703     // No need to check this sig any longer
4704     sigaddset(&check_signal_done, sig);
    // When running under a non-interactive shell, SHUTDOWN2_SIGNAL will be reassigned to SIG_IGN
4706     if (sig == SHUTDOWN2_SIGNAL && !isatty(fileno(stdin))) {
4707       tty->print_cr("Running in non-interactive shell, %s handler is replaced by shell",
4708                     exception_name(sig, buf, O_BUFLEN));
4709     }
  } else if (os::Linux::get_our_sigflags(sig) != 0 &&
             (int)act.sa_flags != os::Linux::get_our_sigflags(sig)) {
4711     tty->print("Warning: %s handler flags ", exception_name(sig, buf, O_BUFLEN));
4712     tty->print("expected:");
4713     os::Posix::print_sa_flags(tty, os::Linux::get_our_sigflags(sig));
4714     tty->cr();
4715     tty->print("  found:");
4716     os::Posix::print_sa_flags(tty, act.sa_flags);
4717     tty->cr();
4718     // No need to check this sig any longer
4719     sigaddset(&check_signal_done, sig);
4720   }
4721 
  // Dump all the signal handlers
4723   if (sigismember(&check_signal_done, sig)) {
4724     print_signal_handlers(tty, buf, O_BUFLEN);
4725   }
4726 }
4727 
4728 extern void report_error(char* file_name, int line_no, char* title,
4729                          char* format, ...);
4730 
// Some Linux distributions (notably: Alpine Linux) include the
// grsecurity patch set in the kernel by default. Of particular interest
// from a JVM perspective is PaX (https://pax.grsecurity.net/), which adds
// some security features related to page attributes. Specifically,
// the MPROTECT PaX functionality
// (https://pax.grsecurity.net/docs/mprotect.txt) prevents dynamic
// code generation by disallowing a (previously) writable page to be
// marked as executable. This is, of course, exactly what HotSpot does
// for JIT-compiled methods, as well as for stubs, adapters, etc.
4740 //
4741 // Instead of crashing "lazily" when trying to make a page executable,
4742 // this code probes for the presence of PaX and reports the failure
4743 // eagerly.
4744 static void check_pax(void) {
4745   // Zero doesn't generate code dynamically, so no need to perform the PaX check
4746 #ifndef ZERO
4747   size_t size = os::Linux::page_size();
4748 
4749   void* p = ::mmap(NULL, size, PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
4750   if (p == MAP_FAILED) {
4751     vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "failed to allocate memory for PaX check.");
4752   }
4753 
4754   int res = ::mprotect(p, size, PROT_WRITE|PROT_EXEC);
4755   if (res == -1) {
4756     vm_exit_during_initialization("Failed to mark memory page as executable",
4757                                   "Please check if grsecurity/PaX is enabled in your kernel.\n"
4758                                   "\n"
4759                                   "For example, you can do this by running (note: you may need root privileges):\n"
4760                                   "\n"
4761                                   "    sysctl kernel.pax.softmode\n"
4762                                   "\n"
4763                                   "If PaX is included in the kernel you will see something like this:\n"
4764                                   "\n"
4765                                   "    kernel.pax.softmode = 0\n"
4766                                   "\n"
4767                                   "In particular, if the value is 0 (zero), then PaX is enabled.\n"
4768                                   "\n"
4769                                   "PaX includes security functionality which interferes with the dynamic code\n"
4770                                   "generation the JVM relies on. Specifically, the MPROTECT functionality as\n"
4771                                   "described on https://pax.grsecurity.net/docs/mprotect.txt is not compatible\n"
4772                                   "with the JVM. If you want to allow the JVM to run you will have to disable PaX.\n"
4773                                   "You can do this on a per-executable basis using the paxctl tool, for example:\n"
4774                                   "\n"
4775                                   "    paxctl -cm bin/java\n"
4776                                   "\n"
                                  "Please note that this modifies the executable binary in-place, so you may\n"
                                  "want to make a backup of it first. Also note that you have to repeat this\n"
                                  "for other executables like javac, jar, jcmd, etc.\n"
4780                                   );
4781 
4782   }
4783 
4784   ::munmap(p, size);
4785 #endif
4786 }
4787 
// this is called _before_ most of the global arguments have been parsed
4789 void os::init(void) {
4790   char dummy;   // used to get a guess on initial stack address
4791 //  first_hrtime = gethrtime();
4792 
4793   clock_tics_per_sec = sysconf(_SC_CLK_TCK);
4794 
4795   init_random(1234567);
4796 
4797   ThreadCritical::initialize();
4798 
4799   Linux::set_page_size(sysconf(_SC_PAGESIZE));
4800   if (Linux::page_size() == -1) {
4801     fatal("os_linux.cpp: os::init: sysconf failed (%s)",
4802           os::strerror(errno));
4803   }
4804   init_page_sizes((size_t) Linux::page_size());
4805 
4806   Linux::initialize_system_info();
4807 
4808   Linux::initialize_os_info();
4809 
4810   // main_thread points to the aboriginal thread
4811   Linux::_main_thread = pthread_self();
4812 
4813   Linux::clock_init();
4814   initial_time_count = javaTimeNanos();
4815 
4816   // pthread_condattr initialization for monotonic clock
4817   int status;
4818   pthread_condattr_t* _condattr = os::Linux::condAttr();
4819   if ((status = pthread_condattr_init(_condattr)) != 0) {
4820     fatal("pthread_condattr_init: %s", os::strerror(status));
4821   }
4822   // Only set the clock if CLOCK_MONOTONIC is available
4823   if (os::supports_monotonic_clock()) {
4824     if ((status = pthread_condattr_setclock(_condattr, CLOCK_MONOTONIC)) != 0) {
4825       if (status == EINVAL) {
4826         warning("Unable to use monotonic clock with relative timed-waits" \
                " - changes to the time-of-day clock may have adverse effects");
4828       } else {
4829         fatal("pthread_condattr_setclock: %s", os::strerror(status));
4830       }
4831     }
4832   }
4833   // else it defaults to CLOCK_REALTIME
4834 
4835   // retrieve entry point for pthread_setname_np
4836   Linux::_pthread_setname_np =
4837     (int(*)(pthread_t, const char*))dlsym(RTLD_DEFAULT, "pthread_setname_np");
4838 
4839   check_pax();
4840 }
4841 
4842 // To install functions for atexit system call
4843 extern "C" {
4844   static void perfMemory_exit_helper() {
4845     perfMemory_exit();
4846   }
4847 }
4848 
4849 // this is called _after_ the global arguments have been parsed
4850 jint os::init_2(void) {
4851   Linux::fast_thread_clock_init();
4852 
4853   // Allocate a single page and mark it as readable for safepoint polling
4854   address polling_page = (address) ::mmap(NULL, Linux::page_size(), PROT_READ, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
4855   guarantee(polling_page != MAP_FAILED, "os::init_2: failed to allocate polling page");
4856 
4857   os::set_polling_page(polling_page);
4858   log_info(os)("SafePoint Polling address: " INTPTR_FORMAT, p2i(polling_page));
4859 
4860   if (!UseMembar) {
4861     address mem_serialize_page = (address) ::mmap(NULL, Linux::page_size(), PROT_READ | PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
4862     guarantee(mem_serialize_page != MAP_FAILED, "mmap Failed for memory serialize page");
4863     os::set_memory_serialize_page(mem_serialize_page);
4864     log_info(os)("Memory Serialize Page address: " INTPTR_FORMAT, p2i(mem_serialize_page));
4865   }
4866 
4867   // initialize suspend/resume support - must do this before signal_sets_init()
4868   if (SR_initialize() != 0) {
4869     perror("SR_initialize failed");
4870     return JNI_ERR;
4871   }
4872 
4873   Linux::signal_sets_init();
4874   Linux::install_signal_handlers();
4875 
4876   // Check and sets minimum stack sizes against command line options
4877   if (Posix::set_minimum_stack_sizes() == JNI_ERR) {
4878     return JNI_ERR;
4879   }
4880   Linux::capture_initial_stack(JavaThread::stack_size_at_create());
4881 
4882 #if defined(IA32)
4883   workaround_expand_exec_shield_cs_limit();
4884 #endif
4885 
4886   Linux::libpthread_init();
4887   log_info(os)("HotSpot is running with %s, %s",
4888                Linux::glibc_version(), Linux::libpthread_version());
4889 
4890   if (UseNUMA) {
4891     if (!Linux::libnuma_init()) {
4892       UseNUMA = false;
4893     } else {
      if (Linux::numa_max_node() < 1) {
        // There's only one node (they start from 0); disable NUMA.
4896         UseNUMA = false;
4897       }
4898     }
4899     // With SHM and HugeTLBFS large pages we cannot uncommit a page, so there's no way
4900     // we can make the adaptive lgrp chunk resizing work. If the user specified
4901     // both UseNUMA and UseLargePages (or UseSHM/UseHugeTLBFS) on the command line - warn and
4902     // disable adaptive resizing.
4903     if (UseNUMA && UseLargePages && !can_commit_large_page_memory()) {
4904       if (FLAG_IS_DEFAULT(UseNUMA)) {
4905         UseNUMA = false;
4906       } else {
4907         if (FLAG_IS_DEFAULT(UseLargePages) &&
4908             FLAG_IS_DEFAULT(UseSHM) &&
4909             FLAG_IS_DEFAULT(UseHugeTLBFS)) {
4910           UseLargePages = false;
4911         } else if (UseAdaptiveSizePolicy || UseAdaptiveNUMAChunkSizing) {
4912           warning("UseNUMA is not fully compatible with SHM/HugeTLBFS large pages, disabling adaptive resizing (-XX:-UseAdaptiveSizePolicy -XX:-UseAdaptiveNUMAChunkSizing)");
4913           UseAdaptiveSizePolicy = false;
4914           UseAdaptiveNUMAChunkSizing = false;
4915         }
4916       }
4917     }
4918     if (!UseNUMA && ForceNUMA) {
4919       UseNUMA = true;
4920     }
4921   }
4922 
4923   if (MaxFDLimit) {
4924     // set the number of file descriptors to max. print out error
4925     // if getrlimit/setrlimit fails but continue regardless.
4926     struct rlimit nbr_files;
4927     int status = getrlimit(RLIMIT_NOFILE, &nbr_files);
4928     if (status != 0) {
4929       log_info(os)("os::init_2 getrlimit failed: %s", os::strerror(errno));
4930     } else {
4931       nbr_files.rlim_cur = nbr_files.rlim_max;
4932       status = setrlimit(RLIMIT_NOFILE, &nbr_files);
4933       if (status != 0) {
4934         log_info(os)("os::init_2 setrlimit failed: %s", os::strerror(errno));
4935       }
4936     }
4937   }
4938 
4939   // Initialize lock used to serialize thread creation (see os::create_thread)
4940   Linux::set_createThread_lock(new Mutex(Mutex::leaf, "createThread_lock", false));
4941 
4942   // at-exit methods are called in the reverse order of their registration.
4943   // atexit functions are called on return from main or as a result of a
4944   // call to exit(3C). There can be only 32 of these functions registered
4945   // and atexit() does not set errno.
4946 
4947   if (PerfAllowAtExitRegistration) {
4948     // only register atexit functions if PerfAllowAtExitRegistration is set.
4949     // atexit functions can be delayed until process exit time, which
4950     // can be problematic for embedded VM situations. Embedded VMs should
4951     // call DestroyJavaVM() to assure that VM resources are released.
4952 
4953     // note: perfMemory_exit_helper atexit function may be removed in
4954     // the future if the appropriate cleanup code can be added to the
4955     // VM_Exit VMOperation's doit method.
4956     if (atexit(perfMemory_exit_helper) != 0) {
4957       warning("os::init_2 atexit(perfMemory_exit_helper) failed");
4958     }
4959   }
4960 
4961   // initialize thread priority policy
4962   prio_init();
4963 
4964   return JNI_OK;
4965 }
4966 
4967 // Mark the polling page as unreadable
4968 void os::make_polling_page_unreadable(void) {
4969   if (!guard_memory((char*)_polling_page, Linux::page_size())) {
4970     fatal("Could not disable polling page");
4971   }
4972 }
4973 
4974 // Mark the polling page as readable
4975 void os::make_polling_page_readable(void) {
4976   if (!linux_mprotect((char *)_polling_page, Linux::page_size(), PROT_READ)) {
4977     fatal("Could not enable polling page");
4978   }
4979 }
4980 
4981 // older glibc versions don't have this macro (which expands to
4982 // an optimized bit-counting function) so we have to roll our own
4983 #ifndef CPU_COUNT
4984 
4985 static int _cpu_count(const cpu_set_t* cpus) {
4986   int count = 0;
4987   // only look up to the number of configured processors
4988   for (int i = 0; i < os::processor_count(); i++) {
4989     if (CPU_ISSET(i, cpus)) {
4990       count++;
4991     }
4992   }
4993   return count;
4994 }
4995 
4996 #define CPU_COUNT(cpus) _cpu_count(cpus)
4997 
4998 #endif // CPU_COUNT
4999 
5000 // Get the current number of available processors for this process.
5001 // This value can change at any time during a process's lifetime.
5002 // sched_getaffinity gives an accurate answer as it accounts for cpusets.
5003 // If it appears there may be more than 1024 processors then we do a
5004 // dynamic check - see 6515172 for details.
// If anything goes wrong we fall back to returning the number of online
// processors - which can be greater than the number available to the process.
5007 int os::active_processor_count() {
5008   cpu_set_t cpus;  // can represent at most 1024 (CPU_SETSIZE) processors
5009   cpu_set_t* cpus_p = &cpus;
5010   int cpus_size = sizeof(cpu_set_t);
5011 
5012   int configured_cpus = processor_count();  // upper bound on available cpus
5013   int cpu_count = 0;
5014 
5015 // old build platforms may not support dynamic cpu sets
5016 #ifdef CPU_ALLOC
5017 
5018   // To enable easy testing of the dynamic path on different platforms we
5019   // introduce a diagnostic flag: UseCpuAllocPath
5020   if (configured_cpus >= CPU_SETSIZE || UseCpuAllocPath) {
5021     // kernel may use a mask bigger than cpu_set_t
5022     log_trace(os)("active_processor_count: using dynamic path %s"
5023                   "- configured processors: %d",
5024                   UseCpuAllocPath ? "(forced) " : "",
5025                   configured_cpus);
5026     cpus_p = CPU_ALLOC(configured_cpus);
5027     if (cpus_p != NULL) {
5028       cpus_size = CPU_ALLOC_SIZE(configured_cpus);
5029       // zero it just to be safe
5030       CPU_ZERO_S(cpus_size, cpus_p);
    } else {
      // failed to allocate, so fall back to online cpus
      int online_cpus = ::sysconf(_SC_NPROCESSORS_ONLN);
      log_trace(os)("active_processor_count: "
                    "CPU_ALLOC failed (%s) - using "
                    "online processor count: %d",
                    os::strerror(errno), online_cpus);
      return online_cpus;
    }
  } else {
    log_trace(os)("active_processor_count: using static path - configured processors: %d",
                  configured_cpus);
  }
5046 #else // CPU_ALLOC
5047 // these stubs won't be executed
5048 #define CPU_COUNT_S(size, cpus) -1
5049 #define CPU_FREE(cpus)
5050 
5051   log_trace(os)("active_processor_count: only static path available - configured processors: %d",
5052                 configured_cpus);
5053 #endif // CPU_ALLOC
5054 
5055   // pid 0 means the current thread - which we have to assume represents the process
5056   if (sched_getaffinity(0, cpus_size, cpus_p) == 0) {
5057     if (cpus_p != &cpus) { // can only be true when CPU_ALLOC used
5058       cpu_count = CPU_COUNT_S(cpus_size, cpus_p);
    } else {
      cpu_count = CPU_COUNT(cpus_p);
    }
    log_trace(os)("active_processor_count: sched_getaffinity processor count: %d", cpu_count);
  } else {
    cpu_count = ::sysconf(_SC_NPROCESSORS_ONLN);
    warning("sched_getaffinity failed (%s) - using online processor count (%d) "
            "which may exceed available processors", os::strerror(errno), cpu_count);
  }
5070 
5071   if (cpus_p != &cpus) { // can only be true when CPU_ALLOC used
5072     CPU_FREE(cpus_p);
5073   }
5074 
5075   assert(cpu_count > 0 && cpu_count <= processor_count(), "sanity check");
5076   return cpu_count;
5077 }
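
// Example (illustrative only): a VM launched with "taskset -c 0-2 java ..."
// on a 64-cpu machine sees processor_count() == 64, but sched_getaffinity()
// returns a mask with only three bits set, so active_processor_count()
// reports 3. Only machines with CPU_SETSIZE (1024) or more configured cpus
// (or -XX:+UseCpuAllocPath) take the CPU_ALLOC path above instead of the
// stack-allocated cpu_set_t.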
5078 
5079 void os::set_native_thread_name(const char *name) {
5080   if (Linux::_pthread_setname_np) {
    char buf[16]; // according to the glibc manpage, 16 chars incl. '\0'
5082     snprintf(buf, sizeof(buf), "%s", name);
5083     buf[sizeof(buf) - 1] = '\0';
5084     const int rc = Linux::_pthread_setname_np(pthread_self(), buf);
5085     // ERANGE should not happen; all other errors should just be ignored.
5086     assert(rc != ERANGE, "pthread_setname_np failed");
5087   }
5088 }
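
// Example (illustrative only): the buffer above limits names to 15 visible
// characters plus the terminating '\0', so
//
//   os::set_native_thread_name("C2 CompilerThread0");
//
// shows up in /proc/<pid>/task/<tid>/comm as the truncated "C2 CompilerThre".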
5089 
5090 bool os::distribute_processes(uint length, uint* distribution) {
5091   // Not yet implemented.
5092   return false;
5093 }
5094 
5095 bool os::bind_to_processor(uint processor_id) {
5096   // Not yet implemented.
5097   return false;
5098 }
5099 
5100 ///
5101 
5102 void os::SuspendedThreadTask::internal_do_task() {
5103   if (do_suspend(_thread->osthread())) {
5104     SuspendedThreadTaskContext context(_thread, _thread->osthread()->ucontext());
5105     do_task(context);
5106     do_resume(_thread->osthread());
5107   }
5108 }
5109 
5110 class PcFetcher : public os::SuspendedThreadTask {
5111  public:
5112   PcFetcher(Thread* thread) : os::SuspendedThreadTask(thread) {}
5113   ExtendedPC result();
5114  protected:
5115   void do_task(const os::SuspendedThreadTaskContext& context);
5116  private:
5117   ExtendedPC _epc;
5118 };
5119 
5120 ExtendedPC PcFetcher::result() {
5121   guarantee(is_done(), "task is not done yet.");
5122   return _epc;
5123 }
5124 
5125 void PcFetcher::do_task(const os::SuspendedThreadTaskContext& context) {
5126   Thread* thread = context.thread();
5127   OSThread* osthread = thread->osthread();
5128   if (osthread->ucontext() != NULL) {
5129     _epc = os::Linux::ucontext_get_pc((const ucontext_t *) context.ucontext());
5130   } else {
5131     // NULL context is unexpected, double-check this is the VMThread
5132     guarantee(thread->is_VM_thread(), "can only be called for VMThread");
5133   }
5134 }
5135 
5136 // Suspends the target using the signal mechanism and then grabs the PC before
5137 // resuming the target. Used by the flat-profiler only
5138 ExtendedPC os::get_thread_pc(Thread* thread) {
5139   // Make sure that it is called by the watcher for the VMThread
5140   assert(Thread::current()->is_Watcher_thread(), "Must be watcher");
5141   assert(thread->is_VM_thread(), "Can only be called for VMThread");
5142 
5143   PcFetcher fetcher(thread);
5144   fetcher.run();
5145   return fetcher.result();
5146 }
5147 
5148 ////////////////////////////////////////////////////////////////////////////////
5149 // debug support
5150 
5151 bool os::find(address addr, outputStream* st) {
5152   Dl_info dlinfo;
5153   memset(&dlinfo, 0, sizeof(dlinfo));
5154   if (dladdr(addr, &dlinfo) != 0) {
5155     st->print(PTR_FORMAT ": ", p2i(addr));
5156     if (dlinfo.dli_sname != NULL && dlinfo.dli_saddr != NULL) {
5157       st->print("%s+" PTR_FORMAT, dlinfo.dli_sname,
5158                 p2i(addr) - p2i(dlinfo.dli_saddr));
5159     } else if (dlinfo.dli_fbase != NULL) {
5160       st->print("<offset " PTR_FORMAT ">", p2i(addr) - p2i(dlinfo.dli_fbase));
5161     } else {
5162       st->print("<absolute address>");
5163     }
5164     if (dlinfo.dli_fname != NULL) {
5165       st->print(" in %s", dlinfo.dli_fname);
5166     }
5167     if (dlinfo.dli_fbase != NULL) {
5168       st->print(" at " PTR_FORMAT, p2i(dlinfo.dli_fbase));
5169     }
5170     st->cr();
5171 
5172     if (Verbose) {
5173       // decode some bytes around the PC
5174       address begin = clamp_address_in_page(addr-40, addr, os::vm_page_size());
5175       address end   = clamp_address_in_page(addr+40, addr, os::vm_page_size());
5176       address       lowest = (address) dlinfo.dli_sname;
5177       if (!lowest)  lowest = (address) dlinfo.dli_fbase;
5178       if (begin < lowest)  begin = lowest;
5179       Dl_info dlinfo2;
5180       if (dladdr(end, &dlinfo2) != 0 && dlinfo2.dli_saddr != dlinfo.dli_saddr
5181           && end > dlinfo2.dli_saddr && dlinfo2.dli_saddr > begin) {
5182         end = (address) dlinfo2.dli_saddr;
5183       }
5184       Disassembler::decode(begin, end, st);
5185     }
5186     return true;
5187   }
5188   return false;
5189 }
5190 
5191 ////////////////////////////////////////////////////////////////////////////////
5192 // misc
5193 
5194 // This does not do anything on Linux. This is basically a hook for being
5195 // able to use structured exception handling (thread-local exception filters)
5196 // on, e.g., Win32.
5197 void
5198 os::os_exception_wrapper(java_call_t f, JavaValue* value, const methodHandle& method,
5199                          JavaCallArguments* args, Thread* thread) {
5200   f(value, method, args, thread);
5201 }
5202 
5203 void os::print_statistics() {
5204 }
5205 
5206 bool os::message_box(const char* title, const char* message) {
5207   int i;
5208   fdStream err(defaultStream::error_fd());
5209   for (i = 0; i < 78; i++) err.print_raw("=");
5210   err.cr();
5211   err.print_raw_cr(title);
5212   for (i = 0; i < 78; i++) err.print_raw("-");
5213   err.cr();
5214   err.print_raw_cr(message);
5215   for (i = 0; i < 78; i++) err.print_raw("=");
5216   err.cr();
5217 
5218   char buf[16];
5219   // Prevent process from exiting upon "read error" without consuming all CPU
5220   while (::read(0, buf, sizeof(buf)) <= 0) { ::sleep(100); }
5221 
5222   return buf[0] == 'y' || buf[0] == 'Y';
5223 }
5224 
5225 int os::stat(const char *path, struct stat *sbuf) {
5226   char pathbuf[MAX_PATH];
5227   if (strlen(path) > MAX_PATH - 1) {
5228     errno = ENAMETOOLONG;
5229     return -1;
5230   }
5231   os::native_path(strcpy(pathbuf, path));
5232   return ::stat(pathbuf, sbuf);
5233 }
5234 
5235 // Is a (classpath) directory empty?
5236 bool os::dir_is_empty(const char* path) {
5237   DIR *dir = NULL;
5238   struct dirent *ptr;
5239 
5240   dir = opendir(path);
5241   if (dir == NULL) return true;
5242 
5243   // Scan the directory
5244   bool result = true;
  while (result && (ptr = ::readdir(dir)) != NULL) {
5247     if (strcmp(ptr->d_name, ".") != 0 && strcmp(ptr->d_name, "..") != 0) {
5248       result = false;
5249     }
5250   }
5251   closedir(dir);
5252   return result;
5253 }
5254 
5255 // This code originates from JDK's sysOpen and open64_w
5256 // from src/solaris/hpi/src/system_md.c
5257 
5258 int os::open(const char *path, int oflag, int mode) {
5259   if (strlen(path) > MAX_PATH - 1) {
5260     errno = ENAMETOOLONG;
5261     return -1;
5262   }
5263 
5264   // All file descriptors that are opened in the Java process and not
5265   // specifically destined for a subprocess should have the close-on-exec
5266   // flag set.  If we don't set it, then careless 3rd party native code
5267   // might fork and exec without closing all appropriate file descriptors
5268   // (e.g. as we do in closeDescriptors in UNIXProcess.c), and this in
5269   // turn might:
5270   //
5271   // - cause end-of-file to fail to be detected on some file
5272   //   descriptors, resulting in mysterious hangs, or
5273   //
5274   // - might cause an fopen in the subprocess to fail on a system
5275   //   suffering from bug 1085341.
5276   //
5277   // (Yes, the default setting of the close-on-exec flag is a Unix
5278   // design flaw)
5279   //
5280   // See:
5281   // 1085341: 32-bit stdio routines should support file descriptors >255
5282   // 4843136: (process) pipe file descriptor from Runtime.exec not being closed
5283   // 6339493: (process) Runtime.exec does not close all file descriptors on Solaris 9
5284   //
5285   // Modern Linux kernels (after 2.6.23 2007) support O_CLOEXEC with open().
5286   // O_CLOEXEC is preferable to using FD_CLOEXEC on an open file descriptor
5287   // because it saves a system call and removes a small window where the flag
5288   // is unset.  On ancient Linux kernels the O_CLOEXEC flag will be ignored
5289   // and we fall back to using FD_CLOEXEC (see below).
5290 #ifdef O_CLOEXEC
5291   oflag |= O_CLOEXEC;
5292 #endif
5293 
5294   int fd = ::open64(path, oflag, mode);
5295   if (fd == -1) return -1;
5296 
  // If the open succeeded, the file might still be a directory.
5298   {
5299     struct stat64 buf64;
5300     int ret = ::fstat64(fd, &buf64);
5301     int st_mode = buf64.st_mode;
5302 
5303     if (ret != -1) {
5304       if ((st_mode & S_IFMT) == S_IFDIR) {
5305         errno = EISDIR;
5306         ::close(fd);
5307         return -1;
5308       }
5309     } else {
5310       ::close(fd);
5311       return -1;
5312     }
5313   }
5314 
5315 #ifdef FD_CLOEXEC
5316   // Validate that the use of the O_CLOEXEC flag on open above worked.
5317   // With recent kernels, we will perform this check exactly once.
5318   static sig_atomic_t O_CLOEXEC_is_known_to_work = 0;
5319   if (!O_CLOEXEC_is_known_to_work) {
5320     int flags = ::fcntl(fd, F_GETFD);
5321     if (flags != -1) {
      if ((flags & FD_CLOEXEC) != 0) {
        O_CLOEXEC_is_known_to_work = 1;
      } else {
        ::fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
      }
5326     }
5327   }
5328 #endif
5329 
5330   return fd;
5331 }
5332 
5333 
5334 // create binary file, rewriting existing file if required
5335 int os::create_binary_file(const char* path, bool rewrite_existing) {
5336   int oflags = O_WRONLY | O_CREAT;
5337   if (!rewrite_existing) {
5338     oflags |= O_EXCL;
5339   }
5340   return ::open64(path, oflags, S_IREAD | S_IWRITE);
5341 }
5342 
5343 // return current position of file pointer
5344 jlong os::current_file_offset(int fd) {
5345   return (jlong)::lseek64(fd, (off64_t)0, SEEK_CUR);
5346 }
5347 
5348 // move file pointer to the specified offset
5349 jlong os::seek_to_file_offset(int fd, jlong offset) {
5350   return (jlong)::lseek64(fd, (off64_t)offset, SEEK_SET);
5351 }
5352 
5353 // This code originates from JDK's sysAvailable
5354 // from src/solaris/hpi/src/native_threads/src/sys_api_td.c
5355 
5356 int os::available(int fd, jlong *bytes) {
5357   jlong cur, end;
5358   int mode;
5359   struct stat64 buf64;
5360 
5361   if (::fstat64(fd, &buf64) >= 0) {
5362     mode = buf64.st_mode;
5363     if (S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) {
5364       int n;
5365       if (::ioctl(fd, FIONREAD, &n) >= 0) {
5366         *bytes = n;
5367         return 1;
5368       }
5369     }
5370   }
5371   if ((cur = ::lseek64(fd, 0L, SEEK_CUR)) == -1) {
5372     return 0;
5373   } else if ((end = ::lseek64(fd, 0L, SEEK_END)) == -1) {
5374     return 0;
5375   } else if (::lseek64(fd, cur, SEEK_SET) == -1) {
5376     return 0;
5377   }
5378   *bytes = end - cur;
5379   return 1;
5380 }
5381 
5382 // Map a block of memory.
5383 char* os::pd_map_memory(int fd, const char* file_name, size_t file_offset,
5384                         char *addr, size_t bytes, bool read_only,
5385                         bool allow_exec) {
5386   int prot;
5387   int flags = MAP_PRIVATE;
5388 
5389   if (read_only) {
5390     prot = PROT_READ;
5391   } else {
5392     prot = PROT_READ | PROT_WRITE;
5393   }
5394 
5395   if (allow_exec) {
5396     prot |= PROT_EXEC;
5397   }
5398 
5399   if (addr != NULL) {
5400     flags |= MAP_FIXED;
5401   }
5402 
5403   char* mapped_address = (char*)mmap(addr, (size_t)bytes, prot, flags,
5404                                      fd, file_offset);
5405   if (mapped_address == MAP_FAILED) {
5406     return NULL;
5407   }
5408   return mapped_address;
5409 }
5410 
5411 
5412 // Remap a block of memory.
5413 char* os::pd_remap_memory(int fd, const char* file_name, size_t file_offset,
5414                           char *addr, size_t bytes, bool read_only,
5415                           bool allow_exec) {
5416   // same as map_memory() on this OS
5417   return os::map_memory(fd, file_name, file_offset, addr, bytes, read_only,
5418                         allow_exec);
5419 }
5420 
5421 
5422 // Unmap a block of memory.
5423 bool os::pd_unmap_memory(char* addr, size_t bytes) {
5424   return munmap(addr, bytes) == 0;
5425 }
5426 
5427 static jlong slow_thread_cpu_time(Thread *thread, bool user_sys_cpu_time);
5428 
5429 static clockid_t thread_cpu_clockid(Thread* thread) {
5430   pthread_t tid = thread->osthread()->pthread_id();
5431   clockid_t clockid;
5432 
5433   // Get thread clockid
5434   int rc = os::Linux::pthread_getcpuclockid(tid, &clockid);
5435   assert(rc == 0, "pthread_getcpuclockid is expected to return 0 code");
5436   return clockid;
5437 }
5438 
5439 // current_thread_cpu_time(bool) and thread_cpu_time(Thread*, bool)
5440 // are used by JVM M&M and JVMTI to get user+sys or user CPU time
5441 // of a thread.
5442 //
// current_thread_cpu_time() and thread_cpu_time(Thread*) return
// the fast estimate available on the platform.
5445 
5446 jlong os::current_thread_cpu_time() {
5447   if (os::Linux::supports_fast_thread_cpu_time()) {
5448     return os::Linux::fast_thread_cpu_time(CLOCK_THREAD_CPUTIME_ID);
5449   } else {
5450     // return user + sys since the cost is the same
5451     return slow_thread_cpu_time(Thread::current(), true /* user + sys */);
5452   }
5453 }
5454 
5455 jlong os::thread_cpu_time(Thread* thread) {
5456   // consistent with what current_thread_cpu_time() returns
5457   if (os::Linux::supports_fast_thread_cpu_time()) {
5458     return os::Linux::fast_thread_cpu_time(thread_cpu_clockid(thread));
5459   } else {
5460     return slow_thread_cpu_time(thread, true /* user + sys */);
5461   }
5462 }
5463 
5464 jlong os::current_thread_cpu_time(bool user_sys_cpu_time) {
5465   if (user_sys_cpu_time && os::Linux::supports_fast_thread_cpu_time()) {
5466     return os::Linux::fast_thread_cpu_time(CLOCK_THREAD_CPUTIME_ID);
5467   } else {
5468     return slow_thread_cpu_time(Thread::current(), user_sys_cpu_time);
5469   }
5470 }
5471 
5472 jlong os::thread_cpu_time(Thread *thread, bool user_sys_cpu_time) {
5473   if (user_sys_cpu_time && os::Linux::supports_fast_thread_cpu_time()) {
5474     return os::Linux::fast_thread_cpu_time(thread_cpu_clockid(thread));
5475   } else {
5476     return slow_thread_cpu_time(thread, user_sys_cpu_time);
5477   }
5478 }
5479 
5480 //  -1 on error.
5481 static jlong slow_thread_cpu_time(Thread *thread, bool user_sys_cpu_time) {
5482   pid_t  tid = thread->osthread()->thread_id();
5483   char *s;
5484   char stat[2048];
5485   int statlen;
5486   char proc_name[64];
5487   int count;
5488   long sys_time, user_time;
5489   char cdummy;
5490   int idummy;
5491   long ldummy;
5492   FILE *fp;
5493 
5494   snprintf(proc_name, 64, "/proc/self/task/%d/stat", tid);
5495   fp = fopen(proc_name, "r");
5496   if (fp == NULL) return -1;
5497   statlen = fread(stat, 1, 2047, fp);
5498   stat[statlen] = '\0';
5499   fclose(fp);
5500 
5501   // Skip pid and the command string. Note that we could be dealing with
5502   // weird command names, e.g. user could decide to rename java launcher
5503   // to "java 1.4.2 :)", then the stat file would look like
5504   //                1234 (java 1.4.2 :)) R ... ...
5505   // We don't really need to know the command string, just find the last
5506   // occurrence of ")" and then start parsing from there. See bug 4726580.
5507   s = strrchr(stat, ')');
5508   if (s == NULL) return -1;
5509 
5510   // Skip blank chars
5511   do { s++; } while (s && isspace(*s));
5512 
5513   count = sscanf(s,"%c %d %d %d %d %d %lu %lu %lu %lu %lu %lu %lu",
5514                  &cdummy, &idummy, &idummy, &idummy, &idummy, &idummy,
5515                  &ldummy, &ldummy, &ldummy, &ldummy, &ldummy,
5516                  &user_time, &sys_time);
5517   if (count != 13) return -1;
5518   if (user_sys_cpu_time) {
5519     return ((jlong)sys_time + (jlong)user_time) * (1000000000 / clock_tics_per_sec);
5520   } else {
5521     return (jlong)user_time * (1000000000 / clock_tics_per_sec);
5522   }
5523 }
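
// Example (illustrative only): for a stat line such as
//
//   1234 (java 1.4.2 :)) S 1 1234 1234 0 -1 4202496 861 0 0 0 327 15 ...
//
// parsing starts after the last ')'. The 13 converted fields end with
// utime (field 14, here 327) and stime (field 15, here 15), both in clock
// ticks; multiplying by (1000000000 / clock_tics_per_sec) converts ticks
// to nanoseconds.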
5524 
5525 void os::current_thread_cpu_time_info(jvmtiTimerInfo *info_ptr) {
5526   info_ptr->max_value = ALL_64_BITS;       // will not wrap in less than 64 bits
5527   info_ptr->may_skip_backward = false;     // elapsed time not wall time
5528   info_ptr->may_skip_forward = false;      // elapsed time not wall time
5529   info_ptr->kind = JVMTI_TIMER_TOTAL_CPU;  // user+system time is returned
5530 }
5531 
5532 void os::thread_cpu_time_info(jvmtiTimerInfo *info_ptr) {
5533   info_ptr->max_value = ALL_64_BITS;       // will not wrap in less than 64 bits
5534   info_ptr->may_skip_backward = false;     // elapsed time not wall time
5535   info_ptr->may_skip_forward = false;      // elapsed time not wall time
5536   info_ptr->kind = JVMTI_TIMER_TOTAL_CPU;  // user+system time is returned
5537 }
5538 
5539 bool os::is_thread_cpu_time_supported() {
5540   return true;
5541 }
5542 
// System loadavg support.  Returns -1 if load average cannot be obtained.
// Linux doesn't yet have an (official) notion of processor sets,
// so just return the system-wide load average.
5546 int os::loadavg(double loadavg[], int nelem) {
5547   return ::getloadavg(loadavg, nelem);
5548 }
5549 
5550 void os::pause() {
5551   char filename[MAX_PATH];
5552   if (PauseAtStartupFile && PauseAtStartupFile[0]) {
5553     jio_snprintf(filename, MAX_PATH, "%s", PauseAtStartupFile);
5554   } else {
5555     jio_snprintf(filename, MAX_PATH, "./vm.paused.%d", current_process_id());
5556   }
5557 
5558   int fd = ::open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0666);
5559   if (fd != -1) {
5560     struct stat buf;
5561     ::close(fd);
5562     while (::stat(filename, &buf) == 0) {
5563       (void)::poll(NULL, 0, 100);
5564     }
5565   } else {
5566     jio_fprintf(stderr,
5567                 "Could not open pause file '%s', continuing immediately.\n", filename);
5568   }
5569 }
5570 
5571 
5572 // Refer to the comments in os_solaris.cpp park-unpark. The next two
5573 // comment paragraphs are worth repeating here:
5574 //
5575 // Assumption:
5576 //    Only one parker can exist on an event, which is why we allocate
5577 //    them per-thread. Multiple unparkers can coexist.
5578 //
5579 // _Event serves as a restricted-range semaphore.
5580 //   -1 : thread is blocked, i.e. there is a waiter
5581 //    0 : neutral: thread is running or ready,
5582 //        could have been signaled after a wait started
5583 //    1 : signaled - thread is running or ready
5584 //
5585 
5586 // utility to compute the abstime argument to timedwait:
5587 // millis is the relative timeout time
5588 // abstime will be the absolute timeout time
5589 // TODO: replace compute_abstime() with unpackTime()
5590 
5591 static struct timespec* compute_abstime(timespec* abstime, jlong millis) {
5592   if (millis < 0)  millis = 0;
5593 
5594   jlong seconds = millis / 1000;
5595   millis %= 1000;
5596   if (seconds > 50000000) { // see man cond_timedwait(3T)
5597     seconds = 50000000;
5598   }
5599 
5600   if (os::supports_monotonic_clock()) {
5601     struct timespec now;
5602     int status = os::Linux::clock_gettime(CLOCK_MONOTONIC, &now);
5603     assert_status(status == 0, status, "clock_gettime");
5604     abstime->tv_sec = now.tv_sec  + seconds;
5605     long nanos = now.tv_nsec + millis * NANOSECS_PER_MILLISEC;
5606     if (nanos >= NANOSECS_PER_SEC) {
5607       abstime->tv_sec += 1;
5608       nanos -= NANOSECS_PER_SEC;
5609     }
5610     abstime->tv_nsec = nanos;
5611   } else {
5612     struct timeval now;
5613     int status = gettimeofday(&now, NULL);
5614     assert(status == 0, "gettimeofday");
5615     abstime->tv_sec = now.tv_sec  + seconds;
5616     long usec = now.tv_usec + millis * 1000;
5617     if (usec >= 1000000) {
5618       abstime->tv_sec += 1;
5619       usec -= 1000000;
5620     }
5621     abstime->tv_nsec = usec * 1000;
5622   }
5623   return abstime;
5624 }
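
// Worked example (illustrative only): park(2500) with a monotonic 'now' of
// {tv_sec = 100, tv_nsec = 900000000} computes
//
//   seconds = 2, millis = 500
//   nanos   = 900000000 + 500 * 1000000 = 1400000000  (>= NANOSECS_PER_SEC)
//   abstime = {tv_sec = 103, tv_nsec = 400000000}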
5625 
5626 void os::PlatformEvent::park() {       // AKA "down()"
5627   // Transitions for _Event:
5628   //   -1 => -1 : illegal
5629   //    1 =>  0 : pass - return immediately
5630   //    0 => -1 : block; then set _Event to 0 before returning
5631 
5632   // Invariant: Only the thread associated with the Event/PlatformEvent
5633   // may call park().
5634   // TODO: assert that _Assoc != NULL or _Assoc == Self
5635   assert(_nParked == 0, "invariant");
5636 
5637   int v;
5638   for (;;) {
5639     v = _Event;
5640     if (Atomic::cmpxchg(v-1, &_Event, v) == v) break;
5641   }
5642   guarantee(v >= 0, "invariant");
5643   if (v == 0) {
5644     // Do this the hard way by blocking ...
5645     int status = pthread_mutex_lock(_mutex);
5646     assert_status(status == 0, status, "mutex_lock");
5647     guarantee(_nParked == 0, "invariant");
5648     ++_nParked;
5649     while (_Event < 0) {
5650       status = pthread_cond_wait(_cond, _mutex);
5651       // for some reason, under 2.7 lwp_cond_wait() may return ETIME ...
5652       // Treat this the same as if the wait was interrupted
5653       if (status == ETIME) { status = EINTR; }
5654       assert_status(status == 0 || status == EINTR, status, "cond_wait");
5655     }
5656     --_nParked;
5657 
5658     _Event = 0;
5659     status = pthread_mutex_unlock(_mutex);
5660     assert_status(status == 0, status, "mutex_unlock");
5661     // Paranoia to ensure our locked and lock-free paths interact
5662     // correctly with each other.
5663     OrderAccess::fence();
5664   }
5665   guarantee(_Event >= 0, "invariant");
5666 }
5667 
5668 int os::PlatformEvent::park(jlong millis) {
5669   // Transitions for _Event:
5670   //   -1 => -1 : illegal
5671   //    1 =>  0 : pass - return immediately
5672   //    0 => -1 : block; then set _Event to 0 before returning
5673 
5674   guarantee(_nParked == 0, "invariant");
5675 
5676   int v;
5677   for (;;) {
5678     v = _Event;
5679     if (Atomic::cmpxchg(v-1, &_Event, v) == v) break;
5680   }
5681   guarantee(v >= 0, "invariant");
5682   if (v != 0) return OS_OK;
5683 
5684   // We do this the hard way, by blocking the thread.
5685   // Consider enforcing a minimum timeout value.
5686   struct timespec abst;
5687   compute_abstime(&abst, millis);
5688 
5689   int ret = OS_TIMEOUT;
5690   int status = pthread_mutex_lock(_mutex);
5691   assert_status(status == 0, status, "mutex_lock");
5692   guarantee(_nParked == 0, "invariant");
5693   ++_nParked;
5694 
5695   // Object.wait(timo) will return because of
5696   // (a) notification
5697   // (b) timeout
5698   // (c) thread.interrupt
5699   //
5700   // Thread.interrupt and object.notify{All} both call Event::set.
5701   // That is, we treat thread.interrupt as a special case of notification.
5702   // We ignore spurious OS wakeups unless FilterSpuriousWakeups is false.
5703   // We assume all ETIME returns are valid.
5704   //
5705   // TODO: properly differentiate simultaneous notify+interrupt.
5706   // In that case, we should propagate the notify to another waiter.
5707 
5708   while (_Event < 0) {
5709     status = pthread_cond_timedwait(_cond, _mutex, &abst);
5710     assert_status(status == 0 || status == EINTR ||
5711                   status == ETIME || status == ETIMEDOUT,
5712                   status, "cond_timedwait");
5713     if (!FilterSpuriousWakeups) break;                 // previous semantics
5714     if (status == ETIME || status == ETIMEDOUT) break;
5715     // We consume and ignore EINTR and spurious wakeups.
5716   }
5717   --_nParked;
5718   if (_Event >= 0) {
5719     ret = OS_OK;
5720   }
5721   _Event = 0;
5722   status = pthread_mutex_unlock(_mutex);
5723   assert_status(status == 0, status, "mutex_unlock");
5724   assert(_nParked == 0, "invariant");
5725   // Paranoia to ensure our locked and lock-free paths interact
5726   // correctly with each other.
5727   OrderAccess::fence();
5728   return ret;
5729 }
5730 
5731 void os::PlatformEvent::unpark() {
5732   // Transitions for _Event:
5733   //    0 => 1 : just return
5734   //    1 => 1 : just return
5735   //   -1 => either 0 or 1; must signal target thread
5736   //         That is, we can safely transition _Event from -1 to either
5737   //         0 or 1.
5738   // See also: "Semaphores in Plan 9" by Mullender & Cox
5739   //
5740   // Note: Forcing a transition from "-1" to "1" on an unpark() means
5741   // that it will take two back-to-back park() calls for the owning
5742   // thread to block. This has the benefit of forcing a spurious return
5743   // from the first park() call after an unpark() call which will help
5744   // shake out uses of park() and unpark() without condition variables.
5745 
5746   if (Atomic::xchg(1, &_Event) >= 0) return;
5747 
5748   // Wait for the thread associated with the event to vacate
5749   int status = pthread_mutex_lock(_mutex);
5750   assert_status(status == 0, status, "mutex_lock");
5751   int AnyWaiters = _nParked;
5752   assert(AnyWaiters == 0 || AnyWaiters == 1, "invariant");
5753   status = pthread_mutex_unlock(_mutex);
5754   assert_status(status == 0, status, "mutex_unlock");
5755   if (AnyWaiters != 0) {
5756     // Note that we signal() *after* dropping the lock for "immortal" Events.
    // This is safe and avoids a common class of futile wakeups.  In rare
5758     // circumstances this can cause a thread to return prematurely from
5759     // cond_{timed}wait() but the spurious wakeup is benign and the victim
5760     // will simply re-test the condition and re-park itself.
5761     // This provides particular benefit if the underlying platform does not
5762     // provide wait morphing.
5763     status = pthread_cond_signal(_cond);
5764     assert_status(status == 0, status, "cond_signal");
5765   }
5766 }
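
// Example (illustrative only): a park/unpark race under the protocol above.
//
//   T1: park()    _Event: 0 -> -1, T1 blocks in pthread_cond_wait()
//   T2: unpark()  _Event: -1 -> 1, T2 signals _cond after dropping _mutex
//   T1: wakes up, sets _Event = 0 and returns
//
// If T2's unpark() had run first (_Event: 0 -> 1), T1's park() would have
// consumed the permit (_Event: 1 -> 0) and returned without blocking.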
5767 
5768 
5769 // JSR166
5770 // -------------------------------------------------------
5771 
5772 // The solaris and linux implementations of park/unpark are fairly
5773 // conservative for now, but can be improved. They currently use a
// mutex/condvar pair, plus a count.
5775 // Park decrements count if > 0, else does a condvar wait.  Unpark
5776 // sets count to 1 and signals condvar.  Only one thread ever waits
5777 // on the condvar. Contention seen when trying to park implies that someone
5778 // is unparking you, so don't wait. And spurious returns are fine, so there
5779 // is no need to track notifications.
5780 
5781 // This code is common to linux and solaris and will be moved to a
5782 // common place in dolphin.
5783 //
5784 // The passed in time value is either a relative time in nanoseconds
5785 // or an absolute time in milliseconds. Either way it has to be unpacked
5786 // into suitable seconds and nanoseconds components and stored in the
5787 // given timespec structure.
// Given that time is a 64-bit value and the time_t used in the timespec is
// only a signed 32-bit value (except on 64-bit Linux) we have to watch for
// overflow if times way in the future are given. Further, on Solaris versions
// prior to 10 there is a restriction (see cond_timedwait) that the specified
// number of seconds, in abstime, is less than current_time + 100,000,000.
5793 // As it will be 28 years before "now + 100000000" will overflow we can
5794 // ignore overflow and just impose a hard-limit on seconds using the value
5795 // of "now + 100,000,000". This places a limit on the timeout of about 3.17
5796 // years from "now".
5797 
5798 static void unpackTime(timespec* absTime, bool isAbsolute, jlong time) {
5799   assert(time > 0, "convertTime");
5800   time_t max_secs = 0;
5801 
5802   if (!os::supports_monotonic_clock() || isAbsolute) {
5803     struct timeval now;
5804     int status = gettimeofday(&now, NULL);
5805     assert(status == 0, "gettimeofday");
5806 
5807     max_secs = now.tv_sec + MAX_SECS;
5808 
5809     if (isAbsolute) {
5810       jlong secs = time / 1000;
5811       if (secs > max_secs) {
5812         absTime->tv_sec = max_secs;
5813       } else {
5814         absTime->tv_sec = secs;
5815       }
5816       absTime->tv_nsec = (time % 1000) * NANOSECS_PER_MILLISEC;
5817     } else {
5818       jlong secs = time / NANOSECS_PER_SEC;
5819       if (secs >= MAX_SECS) {
5820         absTime->tv_sec = max_secs;
5821         absTime->tv_nsec = 0;
5822       } else {
5823         absTime->tv_sec = now.tv_sec + secs;
5824         absTime->tv_nsec = (time % NANOSECS_PER_SEC) + now.tv_usec*1000;
5825         if (absTime->tv_nsec >= NANOSECS_PER_SEC) {
5826           absTime->tv_nsec -= NANOSECS_PER_SEC;
5827           ++absTime->tv_sec; // note: this must be <= max_secs
5828         }
5829       }
5830     }
5831   } else {
5832     // must be relative using monotonic clock
5833     struct timespec now;
5834     int status = os::Linux::clock_gettime(CLOCK_MONOTONIC, &now);
5835     assert_status(status == 0, status, "clock_gettime");
5836     max_secs = now.tv_sec + MAX_SECS;
5837     jlong secs = time / NANOSECS_PER_SEC;
5838     if (secs >= MAX_SECS) {
5839       absTime->tv_sec = max_secs;
5840       absTime->tv_nsec = 0;
5841     } else {
5842       absTime->tv_sec = now.tv_sec + secs;
5843       absTime->tv_nsec = (time % NANOSECS_PER_SEC) + now.tv_nsec;
5844       if (absTime->tv_nsec >= NANOSECS_PER_SEC) {
5845         absTime->tv_nsec -= NANOSECS_PER_SEC;
5846         ++absTime->tv_sec; // note: this must be <= max_secs
5847       }
5848     }
5849   }
5850   assert(absTime->tv_sec >= 0, "tv_sec < 0");
5851   assert(absTime->tv_sec <= max_secs, "tv_sec > max_secs");
5852   assert(absTime->tv_nsec >= 0, "tv_nsec < 0");
5853   assert(absTime->tv_nsec < NANOSECS_PER_SEC, "tv_nsec >= nanos_per_sec");
5854 }
5855 
5856 void Parker::park(bool isAbsolute, jlong time) {
5857   // Ideally we'd do something useful while spinning, such
5858   // as calling unpackTime().
5859 
5860   // Optional fast-path check:
5861   // Return immediately if a permit is available.
5862   // We depend on Atomic::xchg() having full barrier semantics
5863   // since we are doing a lock-free update to _counter.
5864   if (Atomic::xchg(0, &_counter) > 0) return;
5865 
5866   Thread* thread = Thread::current();
5867   assert(thread->is_Java_thread(), "Must be JavaThread");
5868   JavaThread *jt = (JavaThread *)thread;
5869 
5870   // Optional optimization -- avoid state transitions if there's an interrupt pending.
5871   // Check interrupt before trying to wait
5872   if (Thread::is_interrupted(thread, false)) {
5873     return;
5874   }
5875 
5876   // Next, demultiplex/decode time arguments
5877   timespec absTime;
5878   if (time < 0 || (isAbsolute && time == 0)) { // don't wait at all
5879     return;
5880   }
5881   if (time > 0) {
5882     unpackTime(&absTime, isAbsolute, time);
5883   }
5884 
5885 
5886   // Enter safepoint region
5887   // Beware of deadlocks such as 6317397.
5888   // The per-thread Parker:: mutex is a classic leaf-lock.
5889   // In particular a thread must never block on the Threads_lock while
5890   // holding the Parker:: mutex.  If safepoints are pending, both the
5891   // ThreadBlockInVM() CTOR and DTOR may grab Threads_lock.
5892   ThreadBlockInVM tbivm(jt);
5893 
5894   // Don't wait if we cannot get the lock, since interference arises from
5895   // unblocking.  Also, check the interrupt before trying to wait.
5896   if (Thread::is_interrupted(thread, false) || pthread_mutex_trylock(_mutex) != 0) {
5897     return;
5898   }
5899 
5900   int status;
5901   if (_counter > 0)  { // no wait needed
5902     _counter = 0;
5903     status = pthread_mutex_unlock(_mutex);
5904     assert_status(status == 0, status, "invariant");
5905     // Paranoia to ensure our locked and lock-free paths interact
5906     // correctly with each other and Java-level accesses.
5907     OrderAccess::fence();
5908     return;
5909   }
5910 
5911 #ifdef ASSERT
5912   // Don't catch signals while blocked; let the running threads have the signals.
5913   // (This allows a debugger to break into the running thread.)
5914   sigset_t oldsigs;
5915   sigemptyset(&oldsigs);
5916   sigset_t* allowdebug_blocked = os::Linux::allowdebug_blocked_signals();
5917   pthread_sigmask(SIG_BLOCK, allowdebug_blocked, &oldsigs);
5918 #endif
5919 
5920   OSThreadWaitState osts(thread->osthread(), false /* not Object.wait() */);
5921   jt->set_suspend_equivalent();
5922   // cleared by handle_special_suspend_equivalent_condition() or java_suspend_self()
5923 
5924   assert(_cur_index == -1, "invariant");
5925   if (time == 0) {
5926     _cur_index = REL_INDEX; // arbitrary choice when not timed
5927     status = pthread_cond_wait(&_cond[_cur_index], _mutex);
5928   } else {
5929     _cur_index = isAbsolute ? ABS_INDEX : REL_INDEX;
5930     status = pthread_cond_timedwait(&_cond[_cur_index], _mutex, &absTime);
5931   }
5932   _cur_index = -1;
5933   assert_status(status == 0 || status == EINTR ||
5934                 status == ETIME || status == ETIMEDOUT,
5935                 status, "cond_timedwait");
5936 
5937 #ifdef ASSERT
5938   pthread_sigmask(SIG_SETMASK, &oldsigs, NULL);
5939 #endif
5940 
5941   _counter = 0;
5942   status = pthread_mutex_unlock(_mutex);
5943   assert_status(status == 0, status, "invariant");
5944   // Paranoia to ensure our locked and lock-free paths interact
5945   // correctly with each other and Java-level accesses.
5946   OrderAccess::fence();
5947 
5948   // If externally suspended while waiting, re-suspend
5949   if (jt->handle_special_suspend_equivalent_condition()) {
5950     jt->java_suspend_self();
5951   }
5952 }
5953 
5954 void Parker::unpark() {
5955   int status = pthread_mutex_lock(_mutex);
5956   assert_status(status == 0, status, "invariant");
5957   const int s = _counter;
5958   _counter = 1;
5959   // must capture correct index before unlocking
5960   int index = _cur_index;
5961   status = pthread_mutex_unlock(_mutex);
5962   assert_status(status == 0, status, "invariant");
5963   if (s < 1 && index != -1) {
5964     // thread is definitely parked
5965     status = pthread_cond_signal(&_cond[index]);
5966     assert_status(status == 0, status, "invariant");
5967   }
5968 }
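// Usage sketch (illustrative only; condition_ready() and parker are
// hypothetical): because park() may return spuriously, callers are expected
// to re-check their own condition in a loop:
//
//   while (!condition_ready()) {
//     parker->park(false, 0);   // untimed relative park; may return early
//   }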
5969 
5970 
5971 extern char** environ;
5972 
5973 // Run the specified command in a separate process. Return its exit value,
5974 // or -1 on failure (e.g. can't fork a new process).
5975 // Unlike system(), this function can be called from signal handler. It
5976 // doesn't block SIGINT et al.
5977 int os::fork_and_exec(char* cmd) {
5978   const char * argv[4] = {"sh", "-c", cmd, NULL};
5979 
5980   pid_t pid = fork();
5981 
5982   if (pid < 0) {
5983     // fork failed
5984     return -1;
5985 
5986   } else if (pid == 0) {
5987     // child process
5988 
5989     execve("/bin/sh", (char* const*)argv, environ);
5990 
5991     // execve failed
5992     _exit(-1);
5993 
5994   } else  {
5995     // copied from J2SE ..._waitForProcessExit() in UNIXProcess_md.c; we don't
5996     // care about the actual exit code, for now.
5997 
5998     int status;
5999 
6000     // Wait for the child process to exit.  This returns immediately if
6001     // the child has already exited.
6002     while (waitpid(pid, &status, 0) < 0) {
6003       switch (errno) {
6004       case ECHILD: return 0;
6005       case EINTR: break;
6006       default: return -1;
6007       }
6008     }
6009 
6010     if (WIFEXITED(status)) {
6011       // The child exited normally; get its exit code.
6012       return WEXITSTATUS(status);
6013     } else if (WIFSIGNALED(status)) {
6014       // The child exited because of a signal
6015       // The best value to return is 0x80 + signal number,
6016       // because that is what all Unix shells do, and because
6017       // it allows callers to distinguish between process exit and
6018       // process death by signal.
6019       return 0x80 + WTERMSIG(status);
6020     } else {
6021       // Unknown exit code; pass it through
6022       return status;
6023     }
6024   }
6025 }
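// Usage sketch (illustrative only; the command shown is made up): the command
// is run via "/bin/sh -c", so shell syntax is accepted:
//
//   char cmd[128];
//   jio_snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps",
//                os::current_process_id());
//   int exit_value = os::fork_and_exec(cmd);  // -1 if the fork itself failed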
6026 
6027 // is_headless_jre()
6028 //
6029 // Test for the existence of xawt/libmawt.so or libawt_xawt.so
6030 // in order to report whether we are running in a headless jre.
6031 //
6032 // Since JDK 8, xawt/libmawt.so has been moved into the same directory
6033 // as libawt.so and renamed libawt_xawt.so.
6034 //
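// Example (a sketch; the install layout shown is an assumption): with
// libjvm.so at <JAVA_HOME>/lib/amd64/server/libjvm.so, the two stat() probes
// below test
//   <JAVA_HOME>/lib/amd64/xawt/libmawt.so   (pre-JDK8 name), and
//   <JAVA_HOME>/lib/amd64/libawt_xawt.so    (JDK8+ name)
// and a headless jre is reported only if neither file exists.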
6035 bool os::is_headless_jre() {
6036   struct stat statbuf;
6037   char buf[MAXPATHLEN];
6038   char libmawtpath[MAXPATHLEN];
6039   const char *xawtstr  = "/xawt/libmawt.so";
6040   const char *new_xawtstr = "/libawt_xawt.so";
6041   char *p;
6042 
6043   // Get path to libjvm.so
6044   os::jvm_path(buf, sizeof(buf));
6045 
6046   // Get rid of libjvm.so
6047   p = strrchr(buf, '/');
6048   if (p == NULL) {
6049     return false;
6050   } else {
6051     *p = '\0';
6052   }
6053 
6054   // Get rid of client or server
6055   p = strrchr(buf, '/');
6056   if (p == NULL) {
6057     return false;
6058   } else {
6059     *p = '\0';
6060   }
6061 
6062   // check xawt/libmawt.so
6063   strcpy(libmawtpath, buf);
6064   strcat(libmawtpath, xawtstr);
6065   if (::stat(libmawtpath, &statbuf) == 0) return false;
6066 
6067   // check libawt_xawt.so
6068   strcpy(libmawtpath, buf);
6069   strcat(libmawtpath, new_xawtstr);
6070   if (::stat(libmawtpath, &statbuf) == 0) return false;
6071 
6072   return true;
6073 }
6074 
6075 // Get the default path to the core file
6076 // Returns the length of the string
6077 int os::get_core_path(char* buffer, size_t bufferSize) {
6078   /*
6079    * Max length of /proc/sys/kernel/core_pattern is 128 characters.
6080    * See https://www.kernel.org/doc/Documentation/sysctl/kernel.txt
6081    */
6082   const int core_pattern_len = 129;
6083   char core_pattern[core_pattern_len] = {0};
6084 
6085   int core_pattern_file = ::open("/proc/sys/kernel/core_pattern", O_RDONLY);
6086   if (core_pattern_file == -1) {
6087     return -1;
6088   }
6089 
6090   ssize_t ret = ::read(core_pattern_file, core_pattern, core_pattern_len);
6091   ::close(core_pattern_file);
6092   if (ret <= 0 || ret >= core_pattern_len || core_pattern[0] == '\n') {
6093     return -1;
6094   }
6095   if (core_pattern[ret-1] == '\n') {
6096     core_pattern[ret-1] = '\0';
6097   } else {
6098     core_pattern[ret] = '\0';
6099   }
6100 
6101   char *pid_pos = strstr(core_pattern, "%p");
6102   int written;
6103 
6104   if (core_pattern[0] == '/') {
6105     written = jio_snprintf(buffer, bufferSize, "%s", core_pattern);
6106   } else {
6107     char cwd[PATH_MAX];
6108 
6109     const char* p = get_current_directory(cwd, PATH_MAX);
6110     if (p == NULL) {
6111       return -1;
6112     }
6113 
6114     if (core_pattern[0] == '|') {
6115       written = jio_snprintf(buffer, bufferSize,
6116                              "\"%s\" (or dumping to %s/core.%d)",
6117                              &core_pattern[1], p, current_process_id());
6118     } else {
6119       written = jio_snprintf(buffer, bufferSize, "%s/%s", p, core_pattern);
6120     }
6121   }
6122 
6123   if (written < 0) {
6124     return -1;
6125   }
6126 
6127   if (((size_t)written < bufferSize) && (pid_pos == NULL) && (core_pattern[0] != '|')) {
6128     int core_uses_pid_file = ::open("/proc/sys/kernel/core_uses_pid", O_RDONLY);
6129 
6130     if (core_uses_pid_file != -1) {
6131       char core_uses_pid = 0;
6132       ssize_t ret = ::read(core_uses_pid_file, &core_uses_pid, 1);
6133       ::close(core_uses_pid_file);
6134 
6135       if (core_uses_pid == '1') {
6136         jio_snprintf(buffer + written, bufferSize - written,
6137                                           ".%d", current_process_id());
6138       }
6139     }
6140   }
6141 
6142   return strlen(buffer);
6143 }
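// Example results (a sketch; the patterns and paths are hypothetical):
//   core_pattern "core"            -> "<cwd>/core", with ".<pid>" appended
//                                     when /proc/sys/kernel/core_uses_pid
//                                     reads '1'
//   core_pattern "/cores/core.%p"  -> "/cores/core.%p" (the kernel, not this
//                                     function, expands %p)
//   core_pattern "|/bin/dumper %p" -> "\"/bin/dumper %p\" (or dumping to
//                                     <cwd>/core.<pid>)"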
6144 
6145 bool os::start_debugging(char *buf, int buflen) {
6146   int len = (int)strlen(buf);
6147   char *p = &buf[len];
6148 
6149   jio_snprintf(p, buflen-len,
6150                "\n\n"
6151                "Do you want to debug the problem?\n\n"
6152                "To debug, run 'gdb /proc/%d/exe %d'; then switch to thread " UINTX_FORMAT " (" INTPTR_FORMAT ")\n"
6153                "Enter 'yes' to launch gdb automatically (PATH must include gdb)\n"
6154                "Otherwise, press RETURN to abort...",
6155                os::current_process_id(), os::current_process_id(),
6156                os::current_thread_id(), os::current_thread_id());
6157 
6158   bool yes = os::message_box("Unexpected Error", buf);
6159 
6160   if (yes) {
6161     // yes, user asked VM to launch debugger
6162     jio_snprintf(buf, sizeof(char)*buflen, "gdb /proc/%d/exe %d",
6163                  os::current_process_id(), os::current_process_id());
6164 
6165     os::fork_and_exec(buf);
6166     yes = false;
6167   }
6168   return yes;
6169 }
6170 
6171 
6172 // Java/Compiler thread:
6173 //
6174 //   Low memory addresses
6175 // P0 +------------------------+
6176 //    |                        |\  Java thread created by VM does not have glibc
6177 //    |    glibc guard page    | - guard page, attached Java thread usually has
6178 //    |                        |/  1 glibc guard page.
6179 // P1 +------------------------+ Thread::stack_base() - Thread::stack_size()
6180 //    |                        |\
6181 //    |  HotSpot Guard Pages   | - red, yellow and reserved pages
6182 //    |                        |/
6183 //    +------------------------+ JavaThread::stack_reserved_zone_base()
6184 //    |                        |\
6185 //    |      Normal Stack      | -
6186 //    |                        |/
6187 // P2 +------------------------+ Thread::stack_base()
6188 //
6189 // Non-Java thread:
6190 //
6191 //   Low memory addresses
6192 // P0 +------------------------+
6193 //    |                        |\
6194 //    |  glibc guard page      | - usually 1 page
6195 //    |                        |/
6196 // P1 +------------------------+ Thread::stack_base() - Thread::stack_size()
6197 //    |                        |\
6198 //    |      Normal Stack      | -
6199 //    |                        |/
6200 // P2 +------------------------+ Thread::stack_base()
6201 //
6202 // ** P1 (aka bottom) and size (P2 = P1 + size) are the address and stack size
6203 //    returned from pthread_attr_getstack().
6204 // ** Due to an NPTL implementation error, Linux takes the glibc guard page out
6205 //    of the stack size given in pthread_attr. We work around this for
6206 //    threads created by the VM. (We adapt bottom to be P1 and size accordingly.)
6207 //
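// Example (a sketch with made-up numbers): if pthread_attr_getstack() reports
// bottom = 0x7f0000000000 and size = 0x100000 while the guard size is 0x1000,
// the workaround below yields bottom = 0x7f0000001000 and size = 0xff000,
// keeping the glibc guard page out of the HotSpot-visible stack.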
6208 #ifndef ZERO
6209 static void current_stack_region(address * bottom, size_t * size) {
6210   if (os::Linux::is_initial_thread()) {
6211     // initial thread needs special handling because pthread_getattr_np()
6212     // may return a bogus value.
6213     *bottom = os::Linux::initial_thread_stack_bottom();
6214     *size   = os::Linux::initial_thread_stack_size();
6215   } else {
6216     pthread_attr_t attr;
6217 
6218     int rslt = pthread_getattr_np(pthread_self(), &attr);
6219 
6220     // The JVM needs to know the exact stack location; abort if it fails.
6221     if (rslt != 0) {
6222       if (rslt == ENOMEM) {
6223         vm_exit_out_of_memory(0, OOM_MMAP_ERROR, "pthread_getattr_np");
6224       } else {
6225         fatal("pthread_getattr_np failed with error = %d", rslt);
6226       }
6227     }
6228 
6229     if (pthread_attr_getstack(&attr, (void **)bottom, size) != 0) {
6230       fatal("Cannot locate current stack attributes!");
6231     }
6232 
6233     // Work around NPTL stack guard error.
6234     size_t guard_size = 0;
6235     rslt = pthread_attr_getguardsize(&attr, &guard_size);
6236     if (rslt != 0) {
6237       fatal("pthread_attr_getguardsize failed with error = %d", rslt);
6238     }
6239     *bottom += guard_size;
6240     *size   -= guard_size;
6241 
6242     pthread_attr_destroy(&attr);
6243 
6244   }
6245   assert(os::current_stack_pointer() >= *bottom &&
6246          os::current_stack_pointer() < *bottom + *size, "just checking");
6247 }
6248 
6249 address os::current_stack_base() {
6250   address bottom;
6251   size_t size;
6252   current_stack_region(&bottom, &size);
6253   return (bottom + size);
6254 }
6255 
6256 size_t os::current_stack_size() {
6257   // This stack size includes the usable stack and HotSpot guard pages
6258   // (for the threads that have HotSpot guard pages).
6259   address bottom;
6260   size_t size;
6261   current_stack_region(&bottom, &size);
6262   return size;
6263 }
6264 #endif
6265 
6266 static inline struct timespec get_mtime(const char* filename) {
6267   struct stat st;
6268   int ret = os::stat(filename, &st);
6269   assert(ret == 0, "failed to stat() file '%s': %s", filename, strerror(errno));
6270   return st.st_mtim;
6271 }
6272 
6273 int os::compare_file_modified_times(const char* file1, const char* file2) {
6274   struct timespec filetime1 = get_mtime(file1);
6275   struct timespec filetime2 = get_mtime(file2);
6276   int diff = filetime1.tv_sec - filetime2.tv_sec;
6277   if (diff == 0) {
6278     return filetime1.tv_nsec - filetime2.tv_nsec;
6279   }
6280   return diff;
6281 }
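// Usage sketch (illustrative; the file names are hypothetical): the result is
// strcmp()-style -- negative, zero or positive as file1 is older than, the
// same age as, or newer than file2:
//
//   if (os::compare_file_modified_times("classes.jsa", "rt.jar") < 0) {
//     // the archive predates the jar
//   }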
6282 
6283 /////////////// Unit tests ///////////////
6284 
6285 #ifndef PRODUCT
6286 
6287 #define test_log(...)              \
6288   do {                             \
6289     if (VerboseInternalVMTests) {  \
6290       tty->print_cr(__VA_ARGS__);  \
6291       tty->flush();                \
6292     }                              \
6293   } while (false)
6294 
6295 class TestReserveMemorySpecial : AllStatic {
6296  public:
6297   static void small_page_write(void* addr, size_t size) {
6298     size_t page_size = os::vm_page_size();
6299 
6300     char* end = (char*)addr + size;
6301     for (char* p = (char*)addr; p < end; p += page_size) {
6302       *p = 1;
6303     }
6304   }
6305 
6306   static void test_reserve_memory_special_huge_tlbfs_only(size_t size) {
6307     if (!UseHugeTLBFS) {
6308       return;
6309     }
6310 
6311     test_log("test_reserve_memory_special_huge_tlbfs_only(" SIZE_FORMAT ")", size);
6312 
6313     char* addr = os::Linux::reserve_memory_special_huge_tlbfs_only(size, NULL, false);
6314 
6315     if (addr != NULL) {
6316       small_page_write(addr, size);
6317 
6318       os::Linux::release_memory_special_huge_tlbfs(addr, size);
6319     }
6320   }
6321 
6322   static void test_reserve_memory_special_huge_tlbfs_only() {
6323     if (!UseHugeTLBFS) {
6324       return;
6325     }
6326 
6327     size_t lp = os::large_page_size();
6328 
6329     for (size_t size = lp; size <= lp * 10; size += lp) {
6330       test_reserve_memory_special_huge_tlbfs_only(size);
6331     }
6332   }
6333 
6334   static void test_reserve_memory_special_huge_tlbfs_mixed() {
6335     size_t lp = os::large_page_size();
6336     size_t ag = os::vm_allocation_granularity();
6337 
6338     // sizes to test
6339     const size_t sizes[] = {
6340       lp, lp + ag, lp + lp / 2, lp * 2,
6341       lp * 2 + ag, lp * 2 - ag, lp * 2 + lp / 2,
6342       lp * 10, lp * 10 + lp / 2
6343     };
6344     const int num_sizes = sizeof(sizes) / sizeof(size_t);
6345 
6346     // For each size/alignment combination, we test three scenarios:
6347     // 1) with req_addr == NULL
6348     // 2) with a non-null req_addr at which we expect to successfully allocate
6349     // 3) with a non-null req_addr which contains a pre-existing mapping, at which we
6350     //    expect the allocation to either fail or to ignore req_addr
6351 
6352     // Pre-allocate two areas; they shall be as large as the largest allocation
6353     //  and aligned to the largest alignment we will be testing.
6354     const size_t mapping_size = sizes[num_sizes - 1] * 2;
6355     char* const mapping1 = (char*) ::mmap(NULL, mapping_size,
6356       PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE,
6357       -1, 0);
6358     assert(mapping1 != MAP_FAILED, "should work");
6359 
6360     char* const mapping2 = (char*) ::mmap(NULL, mapping_size,
6361       PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE,
6362       -1, 0);
6363     assert(mapping2 != MAP_FAILED, "should work");
6364 
6365     // Unmap the first mapping, but leave the second mapping intact: the first
6366     // mapping will serve as a value for a "good" req_addr (case 2). The second
6367     // mapping, still intact, serves as the "bad" req_addr (case 3).
6368     ::munmap(mapping1, mapping_size);
6369 
6370     // Case 1
6371     test_log("%s, req_addr NULL:", __FUNCTION__);
6372     test_log("size            align           result");
6373 
6374     for (int i = 0; i < num_sizes; i++) {
6375       const size_t size = sizes[i];
6376       for (size_t alignment = ag; is_size_aligned(size, alignment); alignment *= 2) {
6377         char* p = os::Linux::reserve_memory_special_huge_tlbfs_mixed(size, alignment, NULL, false);
6378         test_log(SIZE_FORMAT_HEX " " SIZE_FORMAT_HEX " ->  " PTR_FORMAT " %s",
6379                  size, alignment, p2i(p), (p != NULL ? "" : "(failed)"));
6380         if (p != NULL) {
6381           assert(is_ptr_aligned(p, alignment), "must be");
6382           small_page_write(p, size);
6383           os::Linux::release_memory_special_huge_tlbfs(p, size);
6384         }
6385       }
6386     }
6387 
6388     // Case 2
6389     test_log("%s, req_addr non-NULL:", __FUNCTION__);
6390     test_log("size            align           req_addr         result");
6391 
6392     for (int i = 0; i < num_sizes; i++) {
6393       const size_t size = sizes[i];
6394       for (size_t alignment = ag; is_size_aligned(size, alignment); alignment *= 2) {
6395         char* const req_addr = (char*) align_ptr_up(mapping1, alignment);
6396         char* p = os::Linux::reserve_memory_special_huge_tlbfs_mixed(size, alignment, req_addr, false);
6397         test_log(SIZE_FORMAT_HEX " " SIZE_FORMAT_HEX " " PTR_FORMAT " ->  " PTR_FORMAT " %s",
6398                  size, alignment, p2i(req_addr), p2i(p),
6399                  ((p != NULL ? (p == req_addr ? "(exact match)" : "") : "(failed)")));
6400         if (p != NULL) {
6401           assert(p == req_addr, "must be");
6402           small_page_write(p, size);
6403           os::Linux::release_memory_special_huge_tlbfs(p, size);
6404         }
6405       }
6406     }
6407 
6408     // Case 3
6409     test_log("%s, req_addr non-NULL with preexisting mapping:", __FUNCTION__);
6410     test_log("size            align           req_addr         result");
6411 
6412     for (int i = 0; i < num_sizes; i++) {
6413       const size_t size = sizes[i];
6414       for (size_t alignment = ag; is_size_aligned(size, alignment); alignment *= 2) {
6415         char* const req_addr = (char*) align_ptr_up(mapping2, alignment);
6416         char* p = os::Linux::reserve_memory_special_huge_tlbfs_mixed(size, alignment, req_addr, false);
6417         test_log(SIZE_FORMAT_HEX " " SIZE_FORMAT_HEX " " PTR_FORMAT " ->  " PTR_FORMAT " %s",
6418                  size, alignment, p2i(req_addr), p2i(p), ((p != NULL ? "" : "(failed)")));
6419         // As the area around req_addr already contains existing mappings, the API should
6420         // always return NULL (as per contract, it cannot return another address).
6421         assert(p == NULL, "must be");
6422       }
6423     }
6424 
6425     ::munmap(mapping2, mapping_size);
6426 
6427   }
6428 
6429   static void test_reserve_memory_special_huge_tlbfs() {
6430     if (!UseHugeTLBFS) {
6431       return;
6432     }
6433 
6434     test_reserve_memory_special_huge_tlbfs_only();
6435     test_reserve_memory_special_huge_tlbfs_mixed();
6436   }
6437 
6438   static void test_reserve_memory_special_shm(size_t size, size_t alignment) {
6439     if (!UseSHM) {
6440       return;
6441     }
6442 
6443     test_log("test_reserve_memory_special_shm(" SIZE_FORMAT ", " SIZE_FORMAT ")", size, alignment);
6444 
6445     char* addr = os::Linux::reserve_memory_special_shm(size, alignment, NULL, false);
6446 
6447     if (addr != NULL) {
6448       assert(is_ptr_aligned(addr, alignment), "Check");
6449       assert(is_ptr_aligned(addr, os::large_page_size()), "Check");
6450 
6451       small_page_write(addr, size);
6452 
6453       os::Linux::release_memory_special_shm(addr, size);
6454     }
6455   }
6456 
6457   static void test_reserve_memory_special_shm() {
6458     size_t lp = os::large_page_size();
6459     size_t ag = os::vm_allocation_granularity();
6460 
6461     for (size_t size = ag; size < lp * 3; size += ag) {
6462       for (size_t alignment = ag; is_size_aligned(size, alignment); alignment *= 2) {
6463         test_reserve_memory_special_shm(size, alignment);
6464       }
6465     }
6466   }
6467 
6468   static void test() {
6469     test_reserve_memory_special_huge_tlbfs();
6470     test_reserve_memory_special_shm();
6471   }
6472 };
6473 
6474 void TestReserveMemorySpecial_test() {
6475   TestReserveMemorySpecial::test();
6476 }
6477 
6478 #endif