/*
 * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

// no precompiled headers
#include "classfile/classLoader.hpp"
#include "classfile/systemDictionary.hpp"
#include "classfile/vmSymbols.hpp"
#include "code/icBuffer.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/compileBroker.hpp"
#include "compiler/disassembler.hpp"
#include "interpreter/interpreter.hpp"
#include "jvm_linux.h"
#include "logging/log.hpp"
#include "memory/allocation.inline.hpp"
#include "memory/filemap.hpp"
#include "oops/oop.inline.hpp"
#include "os_linux.inline.hpp"
#include "os_share_linux.hpp"
#include "prims/jniFastGetField.hpp"
#include "prims/jvm.h"
#include "prims/jvm_misc.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomic.hpp"
#include "runtime/extendedPC.hpp"
#include "runtime/globals.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/init.hpp"
#include "runtime/java.hpp"
#include "runtime/javaCalls.hpp"
#include "runtime/mutexLocker.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/orderAccess.inline.hpp"
#include "runtime/osThread.hpp"
#include "runtime/perfMemory.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/statSampler.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "runtime/threadCritical.hpp"
#include "runtime/timer.hpp"
#include "semaphore_posix.hpp"
#include "services/attachListener.hpp"
#include "services/memTracker.hpp"
#include "services/runtimeService.hpp"
#include "utilities/decoder.hpp"
#include "utilities/defaultStream.hpp"
#include "utilities/events.hpp"
#include "utilities/elfFile.hpp"
#include "utilities/growableArray.hpp"
#include "utilities/macros.hpp"
#include "utilities/vmError.hpp"

// put OS-includes here
# include <sys/types.h>
# include <sys/mman.h>
# include <sys/stat.h>
# include <sys/select.h>
# include <pthread.h>
# include <signal.h>
# include <errno.h>
# include <dlfcn.h>
# include <stdio.h>
# include <unistd.h>
# include <sys/resource.h>
# include <pthread.h>
# include <sys/stat.h>
# include <sys/time.h>
# include <sys/times.h>
# include <sys/utsname.h>
# include <sys/socket.h>
# include <sys/wait.h>
# include <pwd.h>
# include <poll.h>
# include <semaphore.h>
# include <fcntl.h>
# include <string.h>
# include <syscall.h>
# include <sys/sysinfo.h>
# include <sys/ipc.h>
# include <sys/shm.h>
# include <link.h>
# include <stdint.h>
# include <inttypes.h>
# include <sys/ioctl.h>

#ifndef _GNU_SOURCE
  #define _GNU_SOURCE
  #include <sched.h>
  #undef _GNU_SOURCE
#else
  #include <sched.h>
#endif
// If RUSAGE_THREAD for getrusage() has not been defined, do it here. The code
// calling getrusage() is prepared to handle the associated failure.
#ifndef RUSAGE_THREAD
  #define RUSAGE_THREAD   (1)               /* only the calling thread */
#endif
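
// A minimal sketch (illustrative only, never compiled into the VM) of the
// guarded call pattern the comment above refers to: on kernels that predate
// RUSAGE_THREAD, getrusage() fails with EINVAL and the caller must fall back
// to another time source (see os::elapsedVTime() below). The helper name is
// hypothetical.
#if 0
static double thread_cpu_seconds_or_negative() {
  struct rusage usage;
  if (getrusage(RUSAGE_THREAD, &usage) != 0) {
    return -1.0; // RUSAGE_THREAD unsupported; caller picks a fallback
  }
  return (double)(usage.ru_utime.tv_sec + usage.ru_stime.tv_sec) +
         (double)(usage.ru_utime.tv_usec + usage.ru_stime.tv_usec) / 1e6;
}
#endif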

#define MAX_PATH    (2 * K)

#define MAX_SECS 100000000

// for timer info max values which include all bits
#define ALL_64_BITS CONST64(0xFFFFFFFFFFFFFFFF)

#define LARGEPAGES_BIT (1 << 6)

////////////////////////////////////////////////////////////////////////////////
// global variables
julong os::Linux::_physical_memory = 0;

address   os::Linux::_initial_thread_stack_bottom = NULL;
uintptr_t os::Linux::_initial_thread_stack_size   = 0;

int (*os::Linux::_clock_gettime)(clockid_t, struct timespec *) = NULL;
int (*os::Linux::_pthread_getcpuclockid)(pthread_t, clockid_t *) = NULL;
int (*os::Linux::_pthread_setname_np)(pthread_t, const char*) = NULL;
Mutex* os::Linux::_createThread_lock = NULL;
pthread_t os::Linux::_main_thread;
int os::Linux::_page_size = -1;
bool os::Linux::_supports_fast_thread_cpu_time = false;
uint32_t os::Linux::_os_version = 0;
const char * os::Linux::_glibc_version = "unknown";
const char * os::Linux::_libpthread_version = "unknown";
pthread_condattr_t os::Linux::_condattr[1];

static jlong initial_time_count = 0;

static int clock_tics_per_sec = 100;
// For diagnostics to print a message once; see run_periodic_checks.
static sigset_t check_signal_done;
static bool check_signals = true;

// Signal number used to suspend/resume a thread

// do not use any signal number less than SIGSEGV, see 4355769
static int SR_signum = SIGUSR2;
sigset_t SR_sigset;

// Declarations
static void unpackTime(timespec* absTime, bool isAbsolute, jlong time);

// utility functions

static int SR_initialize();
julong os::available_memory() {
  return Linux::available_memory();
}

julong os::Linux::available_memory() {
  // values in struct sysinfo are "unsigned long"
  struct sysinfo si;
  sysinfo(&si);

  return (julong)si.freeram * si.mem_unit;
}

julong os::physical_memory() {
  return Linux::physical_memory();
}

// Return true if the process is running with special privileges, i.e. its
// effective uid/gid differ from its real uid/gid (e.g. a setuid/setgid binary).

bool os::have_special_privileges() {
  static bool init = false;
  static bool privileges = false;
  if (!init) {
    privileges = (getuid() != geteuid()) || (getgid() != getegid());
    init = true;
  }
  return privileges;
}

#ifndef SYS_gettid
// i386: 224, ia64: 1105, amd64: 186, sparc: 143
  #ifdef __ia64__
    #define SYS_gettid 1105
  #else
    #ifdef __i386__
      #define SYS_gettid 224
    #else
      #ifdef __amd64__
        #define SYS_gettid 186
      #else
        #ifdef __sparc__
          #define SYS_gettid 143
        #else
          #error define gettid for the arch
        #endif
      #endif
    #endif
  #endif
#endif

// pid_t gettid()
//
// Returns the kernel thread id of the currently running thread. The kernel
// thread id is used to access /proc.
pid_t os::Linux::gettid() {
  int rslt = syscall(SYS_gettid);
  assert(rslt != -1, "must be."); // old linuxthreads implementation?
  return (pid_t)rslt;
}
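
// Illustrative sketch (not part of the VM): the kernel thread id can be
// combined with the process id to address per-thread /proc entries, e.g.
// /proc/<pid>/task/<tid>/stat. The helper name below is hypothetical.
#if 0
static FILE* open_thread_stat() {
  char fname[64];
  jio_snprintf(fname, sizeof(fname), "/proc/%d/task/%d/stat",
               ::getpid(), os::Linux::gettid());
  return fopen(fname, "r"); // caller checks for NULL and closes
}
#endif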

// Most versions of Linux have a bug where the number of processors is
// determined by looking at the /proc file system.  In a chroot environment,
// the system call returns 1.  This causes the VM to act as if it is
// a single processor and elide locking (see is_MP() call).
static bool unsafe_chroot_detected = false;
static const char *unstable_chroot_error = "/proc file system not found.\n"
                     "Java may be unstable running multithreaded in a chroot "
                     "environment on Linux when /proc filesystem is not mounted.";

void os::Linux::initialize_system_info() {
  set_processor_count(sysconf(_SC_NPROCESSORS_CONF));
  if (processor_count() == 1) {
    pid_t pid = os::Linux::gettid();
    char fname[32];
    jio_snprintf(fname, sizeof(fname), "/proc/%d", pid);
    FILE *fp = fopen(fname, "r");
    if (fp == NULL) {
      unsafe_chroot_detected = true;
    } else {
      fclose(fp);
    }
  }
  _physical_memory = (julong)sysconf(_SC_PHYS_PAGES) * (julong)sysconf(_SC_PAGESIZE);
  assert(processor_count() > 0, "linux error");
}

void os::init_system_properties_values() {
  // The next steps are taken in the product version:
  //
  // Obtain the JAVA_HOME value from the location of libjvm.so.
  // This library should be located at:
  // <JAVA_HOME>/lib/{client|server}/libjvm.so.
  //
  // If "/jre/lib/" appears at the right place in the path, then we
  // assume libjvm.so is installed in a JDK and we use this path.
  //
  // Otherwise exit with message: "Could not create the Java virtual machine."
  //
  // The following extra steps are taken in the debugging version:
  //
  // If "/jre/lib/" does NOT appear at the right place in the path,
  // then instead of exiting, check for the $JAVA_HOME environment variable.
  //
  // If it is defined and we are able to locate $JAVA_HOME/jre/lib/<arch>,
  // then we append a fake suffix "hotspot/libjvm.so" to this path so
  // it looks like libjvm.so is installed there
  // <JAVA_HOME>/jre/lib/<arch>/hotspot/libjvm.so.
  //
  // Otherwise exit.
  //
  // Important note: if the location of libjvm.so changes this
  // code needs to be changed accordingly.

  // See ld(1):
  //      The linker uses the following search paths to locate required
  //      shared libraries:
  //        1: ...
  //        ...
  //        7: The default directories, normally /lib and /usr/lib.
#if defined(AMD64) || (defined(_LP64) && defined(SPARC)) || defined(PPC64) || defined(S390)
  #define DEFAULT_LIBPATH "/usr/lib64:/lib64:/lib:/usr/lib"
#else
  #define DEFAULT_LIBPATH "/lib:/usr/lib"
#endif

// Base path of extensions installed on the system.
#define SYS_EXT_DIR     "/usr/java/packages"
#define EXTENSIONS_DIR  "/lib/ext"

  // Buffer that fits several sprintfs.
  // Note that the space for the colon and the trailing null are provided
  // by the nulls included by the sizeof operator.
  const size_t bufsize =
    MAX2((size_t)MAXPATHLEN,  // For dll_dir & friends.
         (size_t)MAXPATHLEN + sizeof(EXTENSIONS_DIR) + sizeof(SYS_EXT_DIR) + sizeof(EXTENSIONS_DIR)); // extensions dir
  char *buf = (char *)NEW_C_HEAP_ARRAY(char, bufsize, mtInternal);

  // sysclasspath, java_home, dll_dir
  {
    char *pslash;
    os::jvm_path(buf, bufsize);

    // Found the full path to libjvm.so.
    // Now cut the path to <java_home>/jre if we can.
    pslash = strrchr(buf, '/');
    if (pslash != NULL) {
      *pslash = '\0';            // Get rid of /libjvm.so.
    }
    pslash = strrchr(buf, '/');
    if (pslash != NULL) {
      *pslash = '\0';            // Get rid of /{client|server|hotspot}.
    }
    Arguments::set_dll_dir(buf);

    if (pslash != NULL) {
      pslash = strrchr(buf, '/');
      if (pslash != NULL) {
        *pslash = '\0';        // Get rid of /lib.
      }
    }
    Arguments::set_java_home(buf);
    set_boot_path('/', ':');
  }
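
  // Worked example (hypothetical install path): if os::jvm_path() fills buf
  // with /opt/jdk/lib/server/libjvm.so, the first two trims above leave
  // /opt/jdk/lib as the dll dir, and the extra trim sets the java home to
  // /opt/jdk.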

  // Where to look for native libraries.
  //
  // Note: Due to a legacy implementation, most of the library path
  // is set in the launcher. This was to accommodate linking restrictions
  // on legacy Linux implementations (which are no longer supported).
  // Eventually, all the library path setting will be done here.
  //
  // However, to prevent the proliferation of improperly built native
  // libraries, the new path component /usr/java/packages is added here.
  {
    // Get the user setting of LD_LIBRARY_PATH, and prepend it. It
    // should always exist (until the legacy problem cited above is
    // addressed).
    const char *v = ::getenv("LD_LIBRARY_PATH");
    const char *v_colon = ":";
    if (v == NULL) { v = ""; v_colon = ""; }
    // That's +1 for the colon and +1 for the trailing '\0'.
    char *ld_library_path = (char *)NEW_C_HEAP_ARRAY(char,
                                                     strlen(v) + 1 +
                                                     sizeof(SYS_EXT_DIR) + sizeof("/lib/") + sizeof(DEFAULT_LIBPATH) + 1,
                                                     mtInternal);
    sprintf(ld_library_path, "%s%s" SYS_EXT_DIR "/lib:" DEFAULT_LIBPATH, v, v_colon);
    Arguments::set_library_path(ld_library_path);
    FREE_C_HEAP_ARRAY(char, ld_library_path);
  }

  // Extensions directories.
  sprintf(buf, "%s" EXTENSIONS_DIR ":" SYS_EXT_DIR EXTENSIONS_DIR, Arguments::get_java_home());
  Arguments::set_ext_dirs(buf);

  FREE_C_HEAP_ARRAY(char, buf);

#undef DEFAULT_LIBPATH
#undef SYS_EXT_DIR
#undef EXTENSIONS_DIR
}

////////////////////////////////////////////////////////////////////////////////
// breakpoint support

void os::breakpoint() {
  BREAKPOINT;
}

extern "C" void breakpoint() {
  // use debugger to set breakpoint here
}

////////////////////////////////////////////////////////////////////////////////
// signal support

debug_only(static bool signal_sets_initialized = false);
static sigset_t unblocked_sigs, vm_sigs, allowdebug_blocked_sigs;

bool os::Linux::is_sig_ignored(int sig) {
  struct sigaction oact;
  sigaction(sig, (struct sigaction*)NULL, &oact);
  void* ohlr = oact.sa_sigaction ? CAST_FROM_FN_PTR(void*,  oact.sa_sigaction)
                                 : CAST_FROM_FN_PTR(void*,  oact.sa_handler);
  if (ohlr == CAST_FROM_FN_PTR(void*, SIG_IGN)) {
    return true;
  } else {
    return false;
  }
}

void os::Linux::signal_sets_init() {
  // Should also have an assertion stating we are still single-threaded.
  assert(!signal_sets_initialized, "Already initialized");
  // Fill in signals that are necessarily unblocked for all threads in
  // the VM. Currently, we unblock the following signals:
  // SHUTDOWN{1,2,3}_SIGNAL: for shutdown hooks support (unless overridden
  //                         by -Xrs (=ReduceSignalUsage));
  // BREAK_SIGNAL which is unblocked only by the VM thread and blocked by all
  // other threads. The "ReduceSignalUsage" boolean tells us not to alter
  // the dispositions or masks wrt these signals.
  // Programs embedding the VM that want to use the above signals for their
  // own purposes must, at this time, use the "-Xrs" option to prevent
  // interference with shutdown hooks and BREAK_SIGNAL thread dumping.
  // (See bug 4345157, and other related bugs).
  // In reality, though, unblocking these signals is really a nop, since
  // these signals are not blocked by default.
  sigemptyset(&unblocked_sigs);
  sigemptyset(&allowdebug_blocked_sigs);
  sigaddset(&unblocked_sigs, SIGILL);
  sigaddset(&unblocked_sigs, SIGSEGV);
  sigaddset(&unblocked_sigs, SIGBUS);
  sigaddset(&unblocked_sigs, SIGFPE);
#if defined(PPC64)
  sigaddset(&unblocked_sigs, SIGTRAP);
#endif
  sigaddset(&unblocked_sigs, SR_signum);

  if (!ReduceSignalUsage) {
    if (!os::Linux::is_sig_ignored(SHUTDOWN1_SIGNAL)) {
      sigaddset(&unblocked_sigs, SHUTDOWN1_SIGNAL);
      sigaddset(&allowdebug_blocked_sigs, SHUTDOWN1_SIGNAL);
    }
    if (!os::Linux::is_sig_ignored(SHUTDOWN2_SIGNAL)) {
      sigaddset(&unblocked_sigs, SHUTDOWN2_SIGNAL);
      sigaddset(&allowdebug_blocked_sigs, SHUTDOWN2_SIGNAL);
    }
    if (!os::Linux::is_sig_ignored(SHUTDOWN3_SIGNAL)) {
      sigaddset(&unblocked_sigs, SHUTDOWN3_SIGNAL);
      sigaddset(&allowdebug_blocked_sigs, SHUTDOWN3_SIGNAL);
    }
  }
  // Fill in signals that are blocked by all but the VM thread.
  sigemptyset(&vm_sigs);
  if (!ReduceSignalUsage) {
    sigaddset(&vm_sigs, BREAK_SIGNAL);
  }
  debug_only(signal_sets_initialized = true);
}

// These are signals that are unblocked while a thread is running Java.
// (For some reason, they get blocked by default.)
sigset_t* os::Linux::unblocked_signals() {
  assert(signal_sets_initialized, "Not initialized");
  return &unblocked_sigs;
}

// These are the signals that are blocked while a (non-VM) thread is
// running Java. Only the VM thread handles these signals.
sigset_t* os::Linux::vm_signals() {
  assert(signal_sets_initialized, "Not initialized");
  return &vm_sigs;
}

// These are signals that are blocked during cond_wait to allow the debugger in
sigset_t* os::Linux::allowdebug_blocked_signals() {
  assert(signal_sets_initialized, "Not initialized");
  return &allowdebug_blocked_sigs;
}

void os::Linux::hotspot_sigmask(Thread* thread) {
  // Save caller's signal mask before setting VM signal mask
  sigset_t caller_sigmask;
  pthread_sigmask(SIG_BLOCK, NULL, &caller_sigmask);

  OSThread* osthread = thread->osthread();
  osthread->set_caller_sigmask(caller_sigmask);

  pthread_sigmask(SIG_UNBLOCK, os::Linux::unblocked_signals(), NULL);

  if (!ReduceSignalUsage) {
    if (thread->is_VM_thread()) {
      // Only the VM thread handles BREAK_SIGNAL ...
      pthread_sigmask(SIG_UNBLOCK, vm_signals(), NULL);
    } else {
      // ... all other threads block BREAK_SIGNAL
      pthread_sigmask(SIG_BLOCK, vm_signals(), NULL);
    }
  }
}

//////////////////////////////////////////////////////////////////////////////
// detecting pthread library

void os::Linux::libpthread_init() {
  // Save glibc and pthread version strings.
#if !defined(_CS_GNU_LIBC_VERSION) || \
    !defined(_CS_GNU_LIBPTHREAD_VERSION)
  #error "glibc too old (< 2.3.2)"
#endif

  size_t n;

  n = confstr(_CS_GNU_LIBC_VERSION, NULL, 0);
  if (n > 0) {
    char* str = (char *)malloc(n, mtInternal);
    confstr(_CS_GNU_LIBC_VERSION, str, n);
    os::Linux::set_glibc_version(str);
  }

  n = confstr(_CS_GNU_LIBPTHREAD_VERSION, NULL, 0);
  if (n > 0) {
    char* str = (char *)malloc(n, mtInternal);
    confstr(_CS_GNU_LIBPTHREAD_VERSION, str, n);
    os::Linux::set_libpthread_version(str);
  }
}

/////////////////////////////////////////////////////////////////////////////
// thread stack expansion

// os::Linux::manually_expand_stack() takes care of expanding the thread
// stack. Note that this is normally not needed: pthread allocates thread
// stacks using mmap() without MAP_NORESERVE, so the stack is already
// committed. Therefore it is not necessary to expand the stack manually.
//
// Manually expanding the stack was historically needed on LinuxThreads
// thread stacks, which were allocated with mmap(MAP_GROWSDOWN). Nowadays
// it is kept to deal with very rare corner cases:
//
// For one, the user may run the VM on their own thread implementation,
// whose stacks are - like the old LinuxThreads - implemented using
// mmap(MAP_GROWSDOWN).
//
// Also, this coding may be needed if the VM is running on the primordial
// thread. Normally we avoid running on the primordial thread; however,
// the user may still invoke the VM on the primordial thread.
//
// The following historical comment describes the details about running
// on a thread stack allocated with mmap(MAP_GROWSDOWN):


// Force the Linux kernel to expand the current thread stack. If "bottom" is
// close to the stack guard, the caller should block all signals.
//
// MAP_GROWSDOWN:
//   A special mmap() flag that is used to implement thread stacks. It tells
//   the kernel that the memory region should extend downwards when needed.
//   This allowed early versions of LinuxThreads to only mmap the first few
//   pages when creating a new thread. The Linux kernel will automatically
//   expand the thread stack as needed (on page faults).
//
//   However, because the memory region of a MAP_GROWSDOWN stack can grow on
//   demand, if a page fault happens outside an already mapped MAP_GROWSDOWN
//   region, it's hard to tell if the fault is due to a legitimate stack
//   access or because of reading/writing non-existent memory (e.g. a buffer
//   overrun). As a rule, if the fault happens below the current stack pointer,
//   the Linux kernel does not expand the stack; instead a SIGSEGV is sent to
//   the application (see Linux kernel fault.c).
//
//   This Linux feature can cause SIGSEGV when the VM bangs the thread stack
//   for stack overflow detection.
//
//   Newer versions of LinuxThreads (since glibc-2.2, or, RH-7.x) and NPTL do
//   not use MAP_GROWSDOWN.
//
// To get around the problem and allow stack banging on Linux, we need to
// manually expand the thread stack after receiving the SIGSEGV.
//
// There are two ways to expand the thread stack to address "bottom"; we used
// both of them in the JVM before 1.5:
//   1. adjust the stack pointer first so that it is below "bottom", and then
//      touch "bottom"
//   2. mmap() the page in question
//
// Now that the alternate signal stack is gone, it's harder to use 2. For
// instance, if the current sp is already near the lower end of page 101, and
// we need to call mmap() to map page 100, it is possible that part of the
// mmap() frame will be placed in page 100. When page 100 is mapped, it is
// zero-filled. That will destroy the mmap() frame and cause the VM to crash.
//
// The following code works by adjusting sp first, then accessing the "bottom"
// page to force a page fault. The Linux kernel will then automatically expand
// the stack mapping.
//
// _expand_stack_to() assumes its frame size is less than page size, which
// should always be true if the function is not inlined.

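// For reference, a MAP_GROWSDOWN stack of the kind described above would be
// created roughly as follows. This is an illustrative sketch only - the VM
// itself never creates such mappings:
#if 0
static void* grow_down_stack(size_t initial_size) {
  // The kernel extends this mapping downwards on faults near its low end;
  // faults below the current stack pointer raise SIGSEGV instead.
  return mmap(NULL, initial_size, PROT_READ | PROT_WRITE,
              MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN, -1, 0);
}
#endif
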
static void NOINLINE _expand_stack_to(address bottom) {
  address sp;
  size_t size;
  volatile char *p;

  // Adjust bottom to point to the largest address within the same page; this
  // gives us a one-page buffer if alloca() allocates slightly more memory.
  bottom = (address)align_size_down((uintptr_t)bottom, os::Linux::page_size());
  bottom += os::Linux::page_size() - 1;

  // sp might be slightly above the current stack pointer; if that's the case,
  // we will alloca() a little more space than necessary, which is OK. Don't
  // use os::current_stack_pointer(), as its result can be slightly below the
  // current stack pointer, causing us to not alloca enough to reach "bottom".
  sp = (address)&sp;

  if (sp > bottom) {
    size = sp - bottom;
    p = (volatile char *)alloca(size);
    assert(p != NULL && p <= (volatile char *)bottom, "alloca problem?");
    p[0] = '\0';
  }
}

bool os::Linux::manually_expand_stack(JavaThread * t, address addr) {
  assert(t != NULL, "just checking");
  assert(t->osthread()->expanding_stack(), "expand should be set");
  assert(t->stack_base() != NULL, "stack_base was not initialized");

  if (addr < t->stack_base() && addr >= t->stack_reserved_zone_base()) {
    sigset_t mask_all, old_sigset;
    sigfillset(&mask_all);
    pthread_sigmask(SIG_SETMASK, &mask_all, &old_sigset);
    _expand_stack_to(addr);
    pthread_sigmask(SIG_SETMASK, &old_sigset, NULL);
    return true;
  }
  return false;
}

//////////////////////////////////////////////////////////////////////////////
// create new thread

// Thread start routine for all newly created threads
static void *thread_native_entry(Thread *thread) {
  // Try to randomize the cache line index of hot stack frames.
  // This helps when threads with the same stack traces evict each other's
  // cache lines. The threads can be either from the same JVM instance, or
  // from different JVM instances. The benefit is especially true for
  // processors with hyperthreading technology.
  static int counter = 0;
  int pid = os::current_process_id();
  alloca(((pid ^ counter++) & 7) * 128);

  thread->initialize_thread_current();

  OSThread* osthread = thread->osthread();
  Monitor* sync = osthread->startThread_lock();

  osthread->set_thread_id(os::current_thread_id());

  log_info(os, thread)("Thread is alive (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
    os::current_thread_id(), (uintx) pthread_self());

  if (UseNUMA) {
    int lgrp_id = os::numa_get_group_id();
    if (lgrp_id != -1) {
      thread->set_lgrp_id(lgrp_id);
    }
  }

  // initialize signal mask for this thread
  os::Linux::hotspot_sigmask(thread);

  // initialize floating point control register
  os::Linux::init_thread_fpu_state();

  // handshaking with parent thread
  {
    MutexLockerEx ml(sync, Mutex::_no_safepoint_check_flag);

    // notify parent thread
    osthread->set_state(INITIALIZED);
    sync->notify_all();

    // wait until os::start_thread()
    while (osthread->get_state() == INITIALIZED) {
      sync->wait(Mutex::_no_safepoint_check_flag);
    }
  }

  // call one more level start routine
  thread->run();

  log_info(os, thread)("Thread finished (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
    os::current_thread_id(), (uintx) pthread_self());

  // If a thread has not deleted itself ("delete this") as part of its
  // termination sequence, we have to ensure thread-local-storage is
  // cleared before we actually terminate. No threads should ever be
  // deleted asynchronously with respect to their termination.
  if (Thread::current_or_null_safe() != NULL) {
    assert(Thread::current_or_null_safe() == thread, "current thread is wrong");
    thread->clear_thread_current();
  }

  return 0;
}

bool os::create_thread(Thread* thread, ThreadType thr_type,
                       size_t req_stack_size) {
  assert(thread->osthread() == NULL, "caller responsible");

  // Allocate the OSThread object
  OSThread* osthread = new OSThread(NULL, NULL);
  if (osthread == NULL) {
    return false;
  }

  // set the correct thread state
  osthread->set_thread_type(thr_type);

  // Initial state is ALLOCATED but not INITIALIZED
  osthread->set_state(ALLOCATED);

  thread->set_osthread(osthread);

  // init thread attributes
  pthread_attr_t attr;
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);

  // Calculate stack size if it's not specified by caller.
  size_t stack_size = os::Posix::get_initial_stack_size(thr_type, req_stack_size);
  // In the Linux NPTL pthread implementation the guard size mechanism
  // is not implemented properly. The POSIX standard requires adding
  // the size of the guard pages to the stack size; instead Linux
  // takes the space out of 'stacksize'. Thus we adapt the requested
  // stack_size by the size of the guard pages to mimic proper
  // behaviour.
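  // Worked example (hypothetical numbers): a 512K stack request with a 4K
  // guard page becomes pthread_attr_setstacksize(516K, rounded up to page
  // size), so glibc's in-stack guard still leaves the requested 512K usable.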
  stack_size = align_size_up(stack_size + os::Linux::default_guard_size(thr_type), vm_page_size());
  pthread_attr_setstacksize(&attr, stack_size);

  // Configure glibc guard page.
  pthread_attr_setguardsize(&attr, os::Linux::default_guard_size(thr_type));

  ThreadState state;

  {
    pthread_t tid;
    int ret = pthread_create(&tid, &attr, (void* (*)(void*)) thread_native_entry, thread);

    char buf[64];
    if (ret == 0) {
      log_info(os, thread)("Thread started (pthread id: " UINTX_FORMAT ", attributes: %s). ",
        (uintx) tid, os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
    } else {
      log_warning(os, thread)("Failed to start thread - pthread_create failed (%s) for attributes: %s.",
        os::errno_name(ret), os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
    }

    pthread_attr_destroy(&attr);

    if (ret != 0) {
      // Need to clean up stuff we've allocated so far
      thread->set_osthread(NULL);
      delete osthread;
      return false;
    }

    // Store pthread info into the OSThread
    osthread->set_pthread_id(tid);

    // Wait until child thread is either initialized or aborted
    {
      Monitor* sync_with_child = osthread->startThread_lock();
      MutexLockerEx ml(sync_with_child, Mutex::_no_safepoint_check_flag);
      while ((state = osthread->get_state()) == ALLOCATED) {
        sync_with_child->wait(Mutex::_no_safepoint_check_flag);
      }
    }
  }

  // Aborted due to thread limit being reached
  if (state == ZOMBIE) {
    thread->set_osthread(NULL);
    delete osthread;
    return false;
  }

  // The thread is returned suspended (in state INITIALIZED),
  // and is started higher up in the call chain
  assert(state == INITIALIZED, "race condition");
  return true;
}

/////////////////////////////////////////////////////////////////////////////
// attach existing thread

// bootstrap the main thread
bool os::create_main_thread(JavaThread* thread) {
  assert(os::Linux::_main_thread == pthread_self(), "should be called inside main thread");
  return create_attached_thread(thread);
}

bool os::create_attached_thread(JavaThread* thread) {
#ifdef ASSERT
  thread->verify_not_published();
#endif

  // Allocate the OSThread object
  OSThread* osthread = new OSThread(NULL, NULL);

  if (osthread == NULL) {
    return false;
  }

  // Store pthread info into the OSThread
  osthread->set_thread_id(os::Linux::gettid());
  osthread->set_pthread_id(::pthread_self());

  // initialize floating point control register
  os::Linux::init_thread_fpu_state();

  // Initial thread state is RUNNABLE
  osthread->set_state(RUNNABLE);

  thread->set_osthread(osthread);

  if (UseNUMA) {
    int lgrp_id = os::numa_get_group_id();
    if (lgrp_id != -1) {
      thread->set_lgrp_id(lgrp_id);
    }
  }

  if (os::Linux::is_initial_thread()) {
    // If the current thread is the initial thread, its stack is mapped on
    // demand, see notes about MAP_GROWSDOWN. Here we try to force the kernel
    // to map the entire stack region to avoid SEGV in stack banging.
    // It is also useful to get around the heap-stack-gap problem on SuSE
    // kernels (see 4821821 for details). We first expand the stack to the top
    // of the yellow zone, then enable the stack yellow zone (order is
    // significant, enabling the yellow zone first will crash the JVM on SuSE
    // Linux), so there is no gap between the last two virtual memory regions.

    JavaThread *jt = (JavaThread *)thread;
    address addr = jt->stack_reserved_zone_base();
    assert(addr != NULL, "initialization problem?");
    assert(jt->stack_available(addr) > 0, "stack guard should not be enabled");

    osthread->set_expanding_stack();
    os::Linux::manually_expand_stack(jt, addr);
    osthread->clear_expanding_stack();
  }

  // initialize signal mask for this thread
  // and save the caller's signal mask
  os::Linux::hotspot_sigmask(thread);

  log_info(os, thread)("Thread attached (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
    os::current_thread_id(), (uintx) pthread_self());

  return true;
}

void os::pd_start_thread(Thread* thread) {
  OSThread * osthread = thread->osthread();
  assert(osthread->get_state() != INITIALIZED, "just checking");
  Monitor* sync_with_child = osthread->startThread_lock();
  MutexLockerEx ml(sync_with_child, Mutex::_no_safepoint_check_flag);
  sync_with_child->notify();
}

// Free Linux resources related to the OSThread
void os::free_thread(OSThread* osthread) {
  assert(osthread != NULL, "osthread not set");

  // We are told to free resources of the argument thread,
  // but we can only really operate on the current thread.
  assert(Thread::current()->osthread() == osthread,
         "os::free_thread but not current thread");

#ifdef ASSERT
  sigset_t current;
  sigemptyset(&current);
  pthread_sigmask(SIG_SETMASK, NULL, &current);
  assert(!sigismember(&current, SR_signum), "SR signal should not be blocked!");
#endif

  // Restore caller's signal mask
  sigset_t sigmask = osthread->caller_sigmask();
  pthread_sigmask(SIG_SETMASK, &sigmask, NULL);

  delete osthread;
}

//////////////////////////////////////////////////////////////////////////////
// initial thread

// Check if current thread is the initial thread, similar to Solaris thr_main.
bool os::Linux::is_initial_thread(void) {
  char dummy;
  // If called before init complete, thread stack bottom will be null.
  // Can be called if fatal error occurs before initialization.
  if (initial_thread_stack_bottom() == NULL) return false;
  assert(initial_thread_stack_bottom() != NULL &&
         initial_thread_stack_size()   != 0,
         "os::init did not locate initial thread's stack region");
  if ((address)&dummy >= initial_thread_stack_bottom() &&
      (address)&dummy < initial_thread_stack_bottom() + initial_thread_stack_size()) {
    return true;
  } else {
    return false;
  }
}

// Find the virtual memory area that contains addr
static bool find_vma(address addr, address* vma_low, address* vma_high) {
  FILE *fp = fopen("/proc/self/maps", "r");
  if (fp) {
    address low, high;
    while (!feof(fp)) {
      if (fscanf(fp, "%p-%p", &low, &high) == 2) {
        if (low <= addr && addr < high) {
          if (vma_low)  *vma_low  = low;
          if (vma_high) *vma_high = high;
          fclose(fp);
          return true;
        }
      }
      for (;;) {
        int ch = fgetc(fp);
        if (ch == EOF || ch == (int)'\n') break;
      }
    }
    fclose(fp);
  }
  return false;
}
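
// A typical /proc/self/maps line matched by the fscanf() above looks like
// this (the fields after the address range are skipped by the inner loop):
//   7ffc8b2e0000-7ffc8b301000 rw-p 00000000 00:00 0                [stack]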

// Locate the initial thread stack. This special handling of the initial
// thread stack is needed because pthread_getattr_np() on most (all?) Linux
// distros returns a bogus value for the primordial process thread. While the
// launcher has created the VM in a new thread since JDK 6, we still have to
// allow for the use of the JNI invocation API from a primordial thread.
void os::Linux::capture_initial_stack(size_t max_size) {

  // max_size is either 0 (which means accept OS default for thread stacks) or
  // a user-specified value known to be at least the minimum needed. If we
  // are actually on the primordial thread we can make it appear that we have a
  // smaller max_size stack by inserting the guard pages at that location. But we
  // cannot do anything to emulate a larger stack than what has been provided by
  // the OS or threading library. In fact if we try to use a stack greater than
  // what is set by rlimit then we will crash the hosting process.

  // Maximum stack size is the easy part, get it from RLIMIT_STACK.
  // If this is "unlimited" then it will be a huge value.
  struct rlimit rlim;
  getrlimit(RLIMIT_STACK, &rlim);
  size_t stack_size = rlim.rlim_cur;

  // 6308388: a bug in ld.so will relocate its own .data section to the
  //   lower end of the primordial stack; reduce the ulimit -s value a little
  //   bit so we won't install a guard page on ld.so's data section.
  stack_size -= 2 * page_size();

  // Try to figure out where the stack base (top) is. This is harder.
  //
  // When an application is started, glibc saves the initial stack pointer in
  // a global variable "__libc_stack_end", which is then used by system
  // libraries. __libc_stack_end should be pretty close to the stack top. The
  // variable has been available since the very early days. However, because
  // it is a private interface, it could disappear in the future.
  //
  // The Linux kernel saves start_stack information in /proc/<pid>/stat.
  // Similar to __libc_stack_end, it is very close to the stack top, but isn't
  // the real stack top. Note that /proc may not exist if the VM is running
  // inside a chroot, so reading /proc/<pid>/stat could fail. Also the
  // contents of /proc/<pid>/stat could change in the future (though unlikely).
  //
  // We try __libc_stack_end first. If that doesn't work, look for
  // /proc/<pid>/stat. If neither of them works, we use the current stack
  // pointer as a hint, which should work well in most cases.

  uintptr_t stack_start;

  // try __libc_stack_end first
  uintptr_t *p = (uintptr_t *)dlsym(RTLD_DEFAULT, "__libc_stack_end");
  if (p && *p) {
    stack_start = *p;
  } else {
    // see if we can get the start_stack field from /proc/self/stat
    FILE *fp;
    int pid;
    char state;
    int ppid;
    int pgrp;
    int session;
    int nr;
    int tpgrp;
    unsigned long flags;
    unsigned long minflt;
    unsigned long cminflt;
    unsigned long majflt;
    unsigned long cmajflt;
    unsigned long utime;
    unsigned long stime;
    long cutime;
    long cstime;
    long prio;
    long nice;
    long junk;
    long it_real;
    uintptr_t start;
    uintptr_t vsize;
    intptr_t rss;
    uintptr_t rsslim;
    uintptr_t scodes;
    uintptr_t ecode;
    int i;

    // Figure out what the primordial thread stack base is. Code is inspired
    // by email from Hans Boehm. /proc/self/stat begins with the current pid,
    // followed by the command name surrounded by parentheses, state, etc.
    char stat[2048];
    int statlen;

    fp = fopen("/proc/self/stat", "r");
    if (fp) {
      statlen = fread(stat, 1, 2047, fp);
      stat[statlen] = '\0';
      fclose(fp);

      // Skip pid and the command string. Note that we could be dealing with
      // weird command names, e.g. user could decide to rename java launcher
      // to "java 1.4.2 :)", then the stat file would look like
      //                1234 (java 1.4.2 :)) R ... ...
      // We don't really need to know the command string, just find the last
      // occurrence of ")" and then start parsing from there. See bug 4726580.
      char * s = strrchr(stat, ')');

      i = 0;
      if (s) {
        // Skip blank chars
        do { s++; } while (s && isspace(*s));

#define _UFM UINTX_FORMAT
#define _DFM INTX_FORMAT

        //                                     1   1   1   1   1   1   1   1   1   1   2   2    2    2    2    2    2    2    2
        //              3  4  5  6  7  8   9   0   1   2   3   4   5   6   7   8   9   0   1    2    3    4    5    6    7    8
        i = sscanf(s, "%c %d %d %d %d %d %lu %lu %lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld " _UFM _UFM _DFM _UFM _UFM _UFM _UFM,
                   &state,          // 3  %c
                   &ppid,           // 4  %d
                   &pgrp,           // 5  %d
                   &session,        // 6  %d
                   &nr,             // 7  %d
                   &tpgrp,          // 8  %d
                   &flags,          // 9  %lu
                   &minflt,         // 10 %lu
                   &cminflt,        // 11 %lu
                   &majflt,         // 12 %lu
                   &cmajflt,        // 13 %lu
                   &utime,          // 14 %lu
                   &stime,          // 15 %lu
                   &cutime,         // 16 %ld
                   &cstime,         // 17 %ld
                   &prio,           // 18 %ld
                   &nice,           // 19 %ld
                   &junk,           // 20 %ld
                   &it_real,        // 21 %ld
                   &start,          // 22 UINTX_FORMAT
                   &vsize,          // 23 UINTX_FORMAT
                   &rss,            // 24 INTX_FORMAT
                   &rsslim,         // 25 UINTX_FORMAT
                   &scodes,         // 26 UINTX_FORMAT
                   &ecode,          // 27 UINTX_FORMAT
                   &stack_start);   // 28 UINTX_FORMAT
      }

#undef _UFM
#undef _DFM

      if (i != 28 - 2) {
        assert(false, "Bad conversion from /proc/self/stat");
        // product mode - assume we are the initial thread, good luck in the
        // embedded case.
        warning("Can't detect initial thread stack location - bad conversion");
        stack_start = (uintptr_t) &rlim;
      }
    } else {
      // For some reason we can't open /proc/self/stat (for example, running
      // on FreeBSD with a Linux emulator, or inside a chroot). Falling back
      // to the current stack pointer works for most cases, so don't abort:
      warning("Can't detect initial thread stack location - no /proc/self/stat");
      stack_start = (uintptr_t) &rlim;
    }
  }

  // Now we have a pointer (stack_start) very close to the stack top; the
  // next thing to do is to figure out the exact location of the stack top. We
  // can find out the virtual memory area that contains stack_start by
  // reading /proc/self/maps; it should be the last vma in /proc/self/maps,
  // and its upper limit is the real stack top. (Again, this would fail if
  // running inside a chroot, because /proc may not exist.)

  uintptr_t stack_top;
  address low, high;
  if (find_vma((address)stack_start, &low, &high)) {
    // success, "high" is the true stack top. (ignore "low", because the
    // initial thread stack grows on demand, its real bottom is high - RLIMIT_STACK.)
    stack_top = (uintptr_t)high;
  } else {
    // failed, likely because /proc/self/maps does not exist
    warning("Can't detect initial thread stack location - find_vma failed");
    // best effort: stack_start is normally within a few pages below the real
    // stack top, use it as the stack top, and reduce the stack size so we
    // won't put a guard page outside the stack.
    stack_top = stack_start;
    stack_size -= 16 * page_size();
  }

  // stack_top could be partially down the page so align it
  stack_top = align_size_up(stack_top, page_size());

  // Allowed stack value is minimum of max_size and what we derived from rlimit
  if (max_size > 0) {
    _initial_thread_stack_size = MIN2(max_size, stack_size);
  } else {
    // Accept the rlimit max, but if the stack is unlimited then it will be
    // huge, so clamp it at 8MB as we do on Solaris
    _initial_thread_stack_size = MIN2(stack_size, 8*M);
  }
  _initial_thread_stack_size = align_size_down(_initial_thread_stack_size, page_size());
  _initial_thread_stack_bottom = (address)stack_top - _initial_thread_stack_size;

  assert(_initial_thread_stack_bottom < (address)stack_top, "overflow!");

  if (log_is_enabled(Info, os, thread)) {
    // See if we seem to be on the primordial process thread
    bool primordial = uintptr_t(&rlim) > uintptr_t(_initial_thread_stack_bottom) &&
                      uintptr_t(&rlim) < stack_top;

    log_info(os, thread)("Capturing initial stack in %s thread: req. size: " SIZE_FORMAT "K, actual size: "
                         SIZE_FORMAT "K, top=" INTPTR_FORMAT ", bottom=" INTPTR_FORMAT,
                         primordial ? "primordial" : "user", max_size / K,  _initial_thread_stack_size / K,
                         stack_top, intptr_t(_initial_thread_stack_bottom));
  }
}

////////////////////////////////////////////////////////////////////////////////
// time support

// Time since start-up in seconds to a fine granularity.
// Used by VMSelfDestructTimer and the MemProfiler.
double os::elapsedTime() {
  return ((double)os::elapsed_counter()) / os::elapsed_frequency(); // nanosecond resolution
}

jlong os::elapsed_counter() {
  return javaTimeNanos() - initial_time_count;
}

jlong os::elapsed_frequency() {
  return NANOSECS_PER_SEC; // nanosecond resolution
}

bool os::supports_vtime() { return true; }
bool os::enable_vtime()   { return false; }
bool os::vtime_enabled()  { return false; }

double os::elapsedVTime() {
  struct rusage usage;
  int retval = getrusage(RUSAGE_THREAD, &usage);
  if (retval == 0) {
    return (double) (usage.ru_utime.tv_sec + usage.ru_stime.tv_sec) + (double) (usage.ru_utime.tv_usec + usage.ru_stime.tv_usec) / (1000 * 1000);
  } else {
    // better than nothing, but not much
    return elapsedTime();
  }
}

jlong os::javaTimeMillis() {
  timeval time;
  int status = gettimeofday(&time, NULL);
  assert(status != -1, "linux error");
  return jlong(time.tv_sec) * 1000  +  jlong(time.tv_usec / 1000);
}

void os::javaTimeSystemUTC(jlong &seconds, jlong &nanos) {
  timeval time;
  int status = gettimeofday(&time, NULL);
  assert(status != -1, "linux error");
  seconds = jlong(time.tv_sec);
  nanos = jlong(time.tv_usec) * 1000;
}


#ifndef CLOCK_MONOTONIC
  #define CLOCK_MONOTONIC (1)
#endif

void os::Linux::clock_init() {
  // We do dlopen's in this particular order due to a bug in the Linux
  // dynamic loader (see 6348968) leading to a crash on exit.
  void* handle = dlopen("librt.so.1", RTLD_LAZY);
  if (handle == NULL) {
    handle = dlopen("librt.so", RTLD_LAZY);
  }

  if (handle) {
    int (*clock_getres_func)(clockid_t, struct timespec*) =
           (int(*)(clockid_t, struct timespec*))dlsym(handle, "clock_getres");
    int (*clock_gettime_func)(clockid_t, struct timespec*) =
           (int(*)(clockid_t, struct timespec*))dlsym(handle, "clock_gettime");
    if (clock_getres_func && clock_gettime_func) {
      // See if the monotonic clock is supported by the kernel. Note that some
      // early implementations simply return kernel jiffies (updated every
      // 1/100 or 1/1000 second). It would be bad to use such a low-res clock
      // for nano time (though the monotonic property is still nice to have).
      // It's fixed in newer kernels, however clock_getres() still returns
      // 1/HZ. We check if clock_getres() works, but will ignore its reported
      // resolution for now. Hopefully as people move to new kernels, this
      // won't be a problem.
      struct timespec res;
      struct timespec tp;
      if (clock_getres_func (CLOCK_MONOTONIC, &res) == 0 &&
          clock_gettime_func(CLOCK_MONOTONIC, &tp)  == 0) {
        // yes, monotonic clock is supported
        _clock_gettime = clock_gettime_func;
        return;
      } else {
        // close librt if there is no monotonic clock
        dlclose(handle);
      }
    }
  }
  warning("No monotonic clock was available - timed services may " \
          "be adversely affected if the time-of-day clock changes");
}

#ifndef SYS_clock_getres
  #if defined(X86) || defined(PPC64) || defined(S390)
    #define SYS_clock_getres AMD64_ONLY(229) IA32_ONLY(266) PPC64_ONLY(247) S390_ONLY(261)
    #define sys_clock_getres(x,y)  ::syscall(SYS_clock_getres, x, y)
  #else
    #warning "SYS_clock_getres not defined for this platform, disabling fast_thread_cpu_time"
    #define sys_clock_getres(x,y)  -1
  #endif
#else
  #define sys_clock_getres(x,y)  ::syscall(SYS_clock_getres, x, y)
#endif
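
// Usage sketch for the macro above (illustrative only, never compiled):
// probe whether a clock id has sub-second resolution by invoking the
// syscall directly, bypassing librt. fast_thread_clock_init() below
// performs essentially this check on the main thread's CPU clock.
#if 0
static bool clock_has_subsecond_resolution(clockid_t clockid) {
  struct timespec tp;
  return sys_clock_getres(clockid, &tp) == 0 && tp.tv_sec == 0;
}
#endif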

void os::Linux::fast_thread_clock_init() {
  if (!UseLinuxPosixThreadCPUClocks) {
    return;
  }
  clockid_t clockid;
  struct timespec tp;
  int (*pthread_getcpuclockid_func)(pthread_t, clockid_t *) =
      (int(*)(pthread_t, clockid_t *)) dlsym(RTLD_DEFAULT, "pthread_getcpuclockid");

  // Switch to using fast clocks for thread cpu time if
  // sys_clock_getres() returns error code 0.
  // Note that some kernels may support the current thread
  // clock (CLOCK_THREAD_CPUTIME_ID) but not the clocks
  // returned by pthread_getcpuclockid().
  // If the fast POSIX clocks are supported then sys_clock_getres()
  // must return at least tp.tv_sec == 0, which means a resolution
  // better than 1 sec. This is an extra check for reliability.

  if (pthread_getcpuclockid_func &&
      pthread_getcpuclockid_func(_main_thread, &clockid) == 0 &&
      sys_clock_getres(clockid, &tp) == 0 && tp.tv_sec == 0) {
    _supports_fast_thread_cpu_time = true;
    _pthread_getcpuclockid = pthread_getcpuclockid_func;
  }
}

jlong os::javaTimeNanos() {
  if (os::supports_monotonic_clock()) {
    struct timespec tp;
    int status = Linux::clock_gettime(CLOCK_MONOTONIC, &tp);
    assert(status == 0, "gettime error");
    jlong result = jlong(tp.tv_sec) * (1000 * 1000 * 1000) + jlong(tp.tv_nsec);
    return result;
  } else {
    timeval time;
    int status = gettimeofday(&time, NULL);
    assert(status != -1, "linux error");
    jlong usecs = jlong(time.tv_sec) * (1000 * 1000) + jlong(time.tv_usec);
    return 1000 * usecs;
  }
}

void os::javaTimeNanos_info(jvmtiTimerInfo *info_ptr) {
  if (os::supports_monotonic_clock()) {
    info_ptr->max_value = ALL_64_BITS;

    // CLOCK_MONOTONIC - amount of time since some arbitrary point in the past
    info_ptr->may_skip_backward = false;      // not subject to resetting or drifting
    info_ptr->may_skip_forward = false;       // not subject to resetting or drifting
  } else {
    // gettimeofday - based on time in seconds since the Epoch, thus does not wrap
    info_ptr->max_value = ALL_64_BITS;

    // gettimeofday is a real time clock so it skips
    info_ptr->may_skip_backward = true;
    info_ptr->may_skip_forward = true;
  }

  info_ptr->kind = JVMTI_TIMER_ELAPSED;                // elapsed not CPU time
}

// Return the real, user, and system times in seconds from an
// arbitrary fixed point in the past.
bool os::getTimesSecs(double* process_real_time,
                      double* process_user_time,
                      double* process_system_time) {
  struct tms ticks;
  clock_t real_ticks = times(&ticks);

  if (real_ticks == (clock_t) (-1)) {
    return false;
  } else {
    double ticks_per_second = (double) clock_tics_per_sec;
    *process_user_time = ((double) ticks.tms_utime) / ticks_per_second;
    *process_system_time = ((double) ticks.tms_stime) / ticks_per_second;
    *process_real_time = ((double) real_ticks) / ticks_per_second;

    return true;
  }
}

char * os::local_time_string(char *buf, size_t buflen) {
  struct tm t;
  time_t long_time;
  time(&long_time);
  localtime_r(&long_time, &t);
  jio_snprintf(buf, buflen, "%d-%02d-%02d %02d:%02d:%02d",
               t.tm_year + 1900, t.tm_mon + 1, t.tm_mday,
               t.tm_hour, t.tm_min, t.tm_sec);
  return buf;
}

struct tm* os::localtime_pd(const time_t* clock, struct tm* res) {
  return localtime_r(clock, res);
}

////////////////////////////////////////////////////////////////////////////////
// runtime exit support

// Note: os::shutdown() might be called very early during initialization, or
// called from a signal handler. Before adding something to os::shutdown(),
// make sure it is async-safe and can handle a partially initialized VM.
void os::shutdown() {

  // allow PerfMemory to attempt cleanup of any persistent resources
  perfMemory_exit();

  // needs to remove object in file system
  AttachListener::abort();

  // flush buffered output, finish log files
  ostream_abort();

  // Check for abort hook
  abort_hook_t abort_hook = Arguments::abort_hook();
  if (abort_hook != NULL) {
    abort_hook();
  }

}

// Note: os::abort() might be called very early during initialization, or
// called from a signal handler. Before adding something to os::abort(), make
// sure it is async-safe and can handle a partially initialized VM.
void os::abort(bool dump_core, void* siginfo, const void* context) {
  os::shutdown();
  if (dump_core) {
#ifndef PRODUCT
    fdStream out(defaultStream::output_fd());
    out.print_raw("Current thread is ");
    char buf[16];
    jio_snprintf(buf, sizeof(buf), UINTX_FORMAT, os::current_thread_id());
    out.print_raw_cr(buf);
    out.print_raw_cr("Dumping core ...");
#endif
    ::abort(); // dump core
  }

  ::exit(1);
}

// Die immediately, no exit hook, no abort hook, no cleanup.
void os::die() {
  ::abort();
}

// This method is a copy of JDK's sysGetLastErrorString
// from src/solaris/hpi/src/system_md.c

size_t os::lasterror(char *buf, size_t len) {
  if (errno == 0)  return 0;

  const char *s = os::strerror(errno);
  size_t n = ::strlen(s);
  if (n >= len) {
    n = len - 1;
  }
  ::strncpy(buf, s, n);
  buf[n] = '\0';
  return n;
}

// thread_id is kernel thread id (similar to Solaris LWP id)
intx os::current_thread_id() { return os::Linux::gettid(); }

int os::current_process_id() {
  return ::getpid();
}
1414 
1415 // DLL functions
1416 
1417 const char* os::dll_file_extension() { return ".so"; }
1418 
1419 // This must be hard coded because it's the system's temporary
1420 // directory, not the java application's temp directory (cf. java.io.tmpdir).
1421 const char* os::get_temp_directory() { return "/tmp"; }
1422 
1423 static bool file_exists(const char* filename) {
1424   struct stat statbuf;
1425   if (filename == NULL || strlen(filename) == 0) {
1426     return false;
1427   }
1428   return os::stat(filename, &statbuf) == 0;
1429 }
1430 
1431 bool os::dll_build_name(char* buffer, size_t buflen,
1432                         const char* pname, const char* fname) {
1433   bool retval = false;
1434   // Copied from libhpi
1435   const size_t pnamelen = pname ? strlen(pname) : 0;
1436 
1437   // Return error on buffer overflow.
1438   if (pnamelen + strlen(fname) + 10 > (size_t) buflen) {
1439     return retval;
1440   }
1441 
1442   if (pnamelen == 0) {
1443     snprintf(buffer, buflen, "lib%s.so", fname);
1444     retval = true;
1445   } else if (strchr(pname, *os::path_separator()) != NULL) {
1446     int n;
1447     char** pelements = split_path(pname, &n);
1448     if (pelements == NULL) {
1449       return false;
1450     }
1451     for (int i = 0; i < n; i++) {
1452       // Really shouldn't be NULL, but check can't hurt
1453       if (pelements[i] == NULL || strlen(pelements[i]) == 0) {
1454         continue; // skip the empty path values
1455       }
1456       snprintf(buffer, buflen, "%s/lib%s.so", pelements[i], fname);
1457       if (file_exists(buffer)) {
1458         retval = true;
1459         break;
1460       }
1461     }
1462     // release the storage
1463     for (int i = 0; i < n; i++) {
1464       if (pelements[i] != NULL) {
1465         FREE_C_HEAP_ARRAY(char, pelements[i]);
1466       }
1467     }
1468     if (pelements != NULL) {
1469       FREE_C_HEAP_ARRAY(char*, pelements);
1470     }
1471   } else {
1472     snprintf(buffer, buflen, "%s/lib%s.so", pname, fname);
1473     retval = true;
1474   }
1475   return retval;
1476 }
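
// Illustrative sketch (not part of the build): mapping a short library name
// onto a platform file name; the directories are hypothetical.
//
//   char buf[MAXPATHLEN];
//   // Single directory: writes "/opt/tools/libhsdis-amd64.so" into buf.
//   os::dll_build_name(buf, sizeof(buf), "/opt/tools", "hsdis-amd64");
//   // Path list: probes each element, keeping the first file that exists.
//   os::dll_build_name(buf, sizeof(buf), "/usr/lib:/opt/lib", "verify");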
1477 
1478 // check if addr is inside libjvm.so
1479 bool os::address_is_in_vm(address addr) {
1480   static address libjvm_base_addr;
1481   Dl_info dlinfo;
1482 
1483   if (libjvm_base_addr == NULL) {
1484     if (dladdr(CAST_FROM_FN_PTR(void *, os::address_is_in_vm), &dlinfo) != 0) {
1485       libjvm_base_addr = (address)dlinfo.dli_fbase;
1486     }
1487     assert(libjvm_base_addr != NULL, "Cannot obtain base address for libjvm");
1488   }
1489 
1490   if (dladdr((void *)addr, &dlinfo) != 0) {
1491     if (libjvm_base_addr == (address)dlinfo.dli_fbase) return true;
1492   }
1493 
1494   return false;
1495 }
1496 
1497 bool os::dll_address_to_function_name(address addr, char *buf,
1498                                       int buflen, int *offset,
1499                                       bool demangle) {
1500   // buf is not optional, but offset is optional
1501   assert(buf != NULL, "sanity check");
1502 
1503   Dl_info dlinfo;
1504 
1505   if (dladdr((void*)addr, &dlinfo) != 0) {
1506     // see if we have a matching symbol
1507     if (dlinfo.dli_saddr != NULL && dlinfo.dli_sname != NULL) {
1508       if (!(demangle && Decoder::demangle(dlinfo.dli_sname, buf, buflen))) {
1509         jio_snprintf(buf, buflen, "%s", dlinfo.dli_sname);
1510       }
1511       if (offset != NULL) *offset = addr - (address)dlinfo.dli_saddr;
1512       return true;
1513     }
1514     // no matching symbol so try for just file info
1515     if (dlinfo.dli_fname != NULL && dlinfo.dli_fbase != NULL) {
1516       if (Decoder::decode((address)(addr - (address)dlinfo.dli_fbase),
1517                           buf, buflen, offset, dlinfo.dli_fname, demangle)) {
1518         return true;
1519       }
1520     }
1521   }
1522 
1523   buf[0] = '\0';
1524   if (offset != NULL) *offset = -1;
1525   return false;
1526 }
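
// Illustrative sketch (not part of the build): resolving a PC to a symbol
// name, e.g. during error reporting; 'pc' is a hypothetical code address.
//
//   char name[256];
//   int offset;
//   if (os::dll_address_to_function_name(pc, name, sizeof(name), &offset, true)) {
//     tty->print_cr("%s+0x%x", name, offset);  // e.g. "JVM_FindSignal+0x24"
//   }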
1527 
1528 struct _address_to_library_name {
1529   address addr;          // input : memory address
1530   size_t  buflen;        //         size of fname
1531   char*   fname;         // output: library name
1532   address base;          //         library base addr
1533 };
1534 
1535 static int address_to_library_name_callback(struct dl_phdr_info *info,
1536                                             size_t size, void *data) {
1537   int i;
1538   bool found = false;
1539   address libbase = NULL;
1540   struct _address_to_library_name * d = (struct _address_to_library_name *)data;
1541 
1542   // iterate through all loadable segments
1543   for (i = 0; i < info->dlpi_phnum; i++) {
1544     address segbase = (address)(info->dlpi_addr + info->dlpi_phdr[i].p_vaddr);
1545     if (info->dlpi_phdr[i].p_type == PT_LOAD) {
1546       // base address of a library is the lowest address of its loaded
1547       // segments.
1548       if (libbase == NULL || libbase > segbase) {
1549         libbase = segbase;
1550       }
1551       // see if 'addr' is within current segment
1552       if (segbase <= d->addr &&
1553           d->addr < segbase + info->dlpi_phdr[i].p_memsz) {
1554         found = true;
1555       }
1556     }
1557   }
1558 
1559   // dlpi_name is NULL or empty if the ELF file is the executable itself. In
1560   // that case return 0 so dll_address_to_library_name() can fall through to
1561   // dladdr(), which can figure out the executable name from argv[0].
1562   if (found && info->dlpi_name && info->dlpi_name[0]) {
1563     d->base = libbase;
1564     if (d->fname) {
1565       jio_snprintf(d->fname, d->buflen, "%s", info->dlpi_name);
1566     }
1567     return 1;
1568   }
1569   return 0;
1570 }
1571 
1572 bool os::dll_address_to_library_name(address addr, char* buf,
1573                                      int buflen, int* offset) {
1574   // buf is not optional, but offset is optional
1575   assert(buf != NULL, "sanity check");
1576 
1577   Dl_info dlinfo;
1578   struct _address_to_library_name data;
1579 
1580   // There is a bug in the dladdr() implementation of old glibc versions: it
1581   // can resolve to the wrong library name if the .so file has a base address
1582   // != NULL. Here we iterate through the program headers of all loaded
1583   // libraries to find out which library 'addr' really belongs to. This
1584   // workaround can be removed once the minimum glibc requirement is 2.3.x.
1585   data.addr = addr;
1586   data.fname = buf;
1587   data.buflen = buflen;
1588   data.base = NULL;
1589   int rslt = dl_iterate_phdr(address_to_library_name_callback, (void *)&data);
1590 
1591   if (rslt) {
1592     // buf already contains library name
1593     if (offset) *offset = addr - data.base;
1594     return true;
1595   }
1596   if (dladdr((void*)addr, &dlinfo) != 0) {
1597     if (dlinfo.dli_fname != NULL) {
1598       jio_snprintf(buf, buflen, "%s", dlinfo.dli_fname);
1599     }
1600     if (dlinfo.dli_fbase != NULL && offset != NULL) {
1601       *offset = addr - (address)dlinfo.dli_fbase;
1602     }
1603     return true;
1604   }
1605 
1606   buf[0] = '\0';
1607   if (offset) *offset = -1;
1608   return false;
1609 }
1610 
1611 // Loads a .dll/.so and, in case of error, checks whether the .dll/.so was
1612 // built for the same architecture that HotSpot is running on.
1614 
1615 
1616 // Remember the stack's state. The Linux dynamic linker will change
1617 // the stack to 'executable' at most once, so we must safepoint only once.
1618 bool os::Linux::_stack_is_executable = false;
1619 
1620 // VM operation that loads a library.  This is necessary if stack protection
1621 // of the Java stacks can be lost during loading the library.  If we
1622 // do not stop the Java threads, they can stack overflow before the stacks
1623 // are protected again.
1624 class VM_LinuxDllLoad: public VM_Operation {
1625  private:
1626   const char *_filename;
1627   char *_ebuf;
1628   int _ebuflen;
1629   void *_lib;
1630  public:
1631   VM_LinuxDllLoad(const char *fn, char *ebuf, int ebuflen) :
1632     _filename(fn), _ebuf(ebuf), _ebuflen(ebuflen), _lib(NULL) {}
1633   VMOp_Type type() const { return VMOp_LinuxDllLoad; }
1634   void doit() {
1635     _lib = os::Linux::dll_load_in_vmthread(_filename, _ebuf, _ebuflen);
1636     os::Linux::_stack_is_executable = true;
1637   }
1638   void* loaded_library() { return _lib; }
1639 };
1640 
1641 void * os::dll_load(const char *filename, char *ebuf, int ebuflen) {
1642   void * result = NULL;
1643   bool load_attempted = false;
1644 
1645   // Check whether the library to load might change execution rights
1646   // of the stack. If they are changed, the protection of the stack
1647   // guard pages will be lost. We need a safepoint to fix this.
1648   //
1649   // See Linux man page execstack(8) for more info.
1650   if (os::uses_stack_guard_pages() && !os::Linux::_stack_is_executable) {
1651     ElfFile ef(filename);
1652     if (!ef.specifies_noexecstack()) {
1653       if (!is_init_completed()) {
1654         os::Linux::_stack_is_executable = true;
1655         // This is OK - No Java threads have been created yet, and hence no
1656         // stack guard pages to fix.
1657         //
1658         // This should happen only when you are building JDK7 using a very
1659         // old version of JDK6 (e.g., with JPRT) and running test_gamma.
1660         //
1661         // Dynamic loader will make all stacks executable after
1662         // this function returns, and will not do that again.
1663         assert(Threads::first() == NULL, "no Java threads should exist yet.");
1664       } else {
1665         warning("You have loaded library %s which might have disabled stack guard. "
1666                 "The VM will try to fix the stack guard now.\n"
1667                 "It's highly recommended that you fix the library with "
1668                 "'execstack -c <libfile>', or link it with '-z noexecstack'.",
1669                 filename);
1670 
1671         assert(Thread::current()->is_Java_thread(), "must be Java thread");
1672         JavaThread *jt = JavaThread::current();
1673         if (jt->thread_state() != _thread_in_native) {
1674           // This happens when a compiler thread tries to load a hsdis-<arch>.so file
1675           // that requires ExecStack. Cannot enter safe point. Let's give up.
1676           warning("Unable to fix stack guard. Giving up.");
1677         } else {
1678           if (!LoadExecStackDllInVMThread) {
1679             // This is for the case where the DLL has a static
1680             // constructor function that executes JNI code. We cannot
1681             // load such DLLs in the VMThread.
1682             result = os::Linux::dlopen_helper(filename, ebuf, ebuflen);
1683           }
1684 
1685           ThreadInVMfromNative tiv(jt);
1686           debug_only(VMNativeEntryWrapper vew;)
1687 
1688           VM_LinuxDllLoad op(filename, ebuf, ebuflen);
1689           VMThread::execute(&op);
1690           if (LoadExecStackDllInVMThread) {
1691             result = op.loaded_library();
1692           }
1693           load_attempted = true;
1694         }
1695       }
1696     }
1697   }
1698 
1699   if (!load_attempted) {
1700     result = os::Linux::dlopen_helper(filename, ebuf, ebuflen);
1701   }
1702 
1703   if (result != NULL) {
1704     // Successful loading
1705     return result;
1706   }
1707 
1708   Elf32_Ehdr elf_head;
1709   int diag_msg_max_length = ebuflen - strlen(ebuf);
1710   char* diag_msg_buf = ebuf + strlen(ebuf);
1711 
1712   if (diag_msg_max_length == 0) {
1713     // No more space in ebuf for additional diagnostics message
1714     return NULL;
1715   }
1716 
1717 
1718   int file_descriptor = ::open(filename, O_RDONLY | O_NONBLOCK);
1719 
1720   if (file_descriptor < 0) {
1721     // Can't open library, report dlerror() message
1722     return NULL;
1723   }
1724 
1725   bool failed_to_read_elf_head =
1726     (sizeof(elf_head) !=
1727      (::read(file_descriptor, &elf_head, sizeof(elf_head))));
1728 
1729   ::close(file_descriptor);
1730   if (failed_to_read_elf_head) {
1731     // file i/o error - report dlerror() msg
1732     return NULL;
1733   }
1734 
1735   typedef struct {
1736     Elf32_Half    code;         // Actual value as defined in elf.h
1737     Elf32_Half    compat_class; // Compatibility of archs at VM's sense
1738     unsigned char elf_class;    // 32 or 64 bit
1739     unsigned char endianess;    // MSB or LSB
1740     char*         name;         // String representation
1741   } arch_t;
1742 
1743 #ifndef EM_486
1744   #define EM_486          6               /* Intel 80486 */
1745 #endif
1746 #ifndef EM_AARCH64
1747   #define EM_AARCH64    183               /* ARM AARCH64 */
1748 #endif
1749 
1750   static const arch_t arch_array[]={
1751     {EM_386,         EM_386,     ELFCLASS32, ELFDATA2LSB, (char*)"IA 32"},
1752     {EM_486,         EM_386,     ELFCLASS32, ELFDATA2LSB, (char*)"IA 32"},
1753     {EM_IA_64,       EM_IA_64,   ELFCLASS64, ELFDATA2LSB, (char*)"IA 64"},
1754     {EM_X86_64,      EM_X86_64,  ELFCLASS64, ELFDATA2LSB, (char*)"AMD 64"},
1755     {EM_SPARC,       EM_SPARC,   ELFCLASS32, ELFDATA2MSB, (char*)"Sparc 32"},
1756     {EM_SPARC32PLUS, EM_SPARC,   ELFCLASS32, ELFDATA2MSB, (char*)"Sparc 32"},
1757     {EM_SPARCV9,     EM_SPARCV9, ELFCLASS64, ELFDATA2MSB, (char*)"Sparc v9 64"},
1758     {EM_PPC,         EM_PPC,     ELFCLASS32, ELFDATA2MSB, (char*)"Power PC 32"},
1759 #if defined(VM_LITTLE_ENDIAN)
1760     {EM_PPC64,       EM_PPC64,   ELFCLASS64, ELFDATA2LSB, (char*)"Power PC 64 LE"},
1761 #else
1762     {EM_PPC64,       EM_PPC64,   ELFCLASS64, ELFDATA2MSB, (char*)"Power PC 64"},
1763 #endif
1764     {EM_ARM,         EM_ARM,     ELFCLASS32,   ELFDATA2LSB, (char*)"ARM"},
1765     {EM_S390,        EM_S390,    ELFCLASSNONE, ELFDATA2MSB, (char*)"IBM System/390"},
1766     {EM_ALPHA,       EM_ALPHA,   ELFCLASS64, ELFDATA2LSB, (char*)"Alpha"},
1767     {EM_MIPS_RS3_LE, EM_MIPS_RS3_LE, ELFCLASS32, ELFDATA2LSB, (char*)"MIPSel"},
1768     {EM_MIPS,        EM_MIPS,    ELFCLASS32, ELFDATA2MSB, (char*)"MIPS"},
1769     {EM_PARISC,      EM_PARISC,  ELFCLASS32, ELFDATA2MSB, (char*)"PARISC"},
1770     {EM_68K,         EM_68K,     ELFCLASS32, ELFDATA2MSB, (char*)"M68k"},
1771     {EM_AARCH64,     EM_AARCH64, ELFCLASS64, ELFDATA2LSB, (char*)"AARCH64"},
1772   };
1773 
1774 #if  (defined IA32)
1775   static  Elf32_Half running_arch_code=EM_386;
1776 #elif   (defined AMD64)
1777   static  Elf32_Half running_arch_code=EM_X86_64;
1778 #elif  (defined IA64)
1779   static  Elf32_Half running_arch_code=EM_IA_64;
1780 #elif  (defined __sparc) && (defined _LP64)
1781   static  Elf32_Half running_arch_code=EM_SPARCV9;
1782 #elif  (defined __sparc) && (!defined _LP64)
1783   static  Elf32_Half running_arch_code=EM_SPARC;
1784 #elif  (defined __powerpc64__)
1785   static  Elf32_Half running_arch_code=EM_PPC64;
1786 #elif  (defined __powerpc__)
1787   static  Elf32_Half running_arch_code=EM_PPC;
1788 #elif  (defined AARCH64)
1789   static  Elf32_Half running_arch_code=EM_AARCH64;
1790 #elif  (defined ARM)
1791   static  Elf32_Half running_arch_code=EM_ARM;
1792 #elif  (defined S390)
1793   static  Elf32_Half running_arch_code=EM_S390;
1794 #elif  (defined ALPHA)
1795   static  Elf32_Half running_arch_code=EM_ALPHA;
1796 #elif  (defined MIPSEL)
1797   static  Elf32_Half running_arch_code=EM_MIPS_RS3_LE;
1798 #elif  (defined PARISC)
1799   static  Elf32_Half running_arch_code=EM_PARISC;
1800 #elif  (defined MIPS)
1801   static  Elf32_Half running_arch_code=EM_MIPS;
1802 #elif  (defined M68K)
1803   static  Elf32_Half running_arch_code=EM_68K;
1804 #else
1805     #error Method os::dll_load requires that one of following is defined:\
1806         AARCH64, ALPHA, ARM, AMD64, IA32, IA64, M68K, MIPS, MIPSEL, PARISC, __powerpc__, __powerpc64__, S390, __sparc
1807 #endif
1808 
1809   // Identify compatibility class for VM's architecture and library's architecture
1810   // Obtain string descriptions for architectures
1811 
1812   arch_t lib_arch = {elf_head.e_machine, 0, elf_head.e_ident[EI_CLASS], elf_head.e_ident[EI_DATA], NULL};
1813   int running_arch_index = -1;
1814 
1815   for (unsigned int i=0; i < ARRAY_SIZE(arch_array); i++) {
1816     if (running_arch_code == arch_array[i].code) {
1817       running_arch_index    = i;
1818     }
1819     if (lib_arch.code == arch_array[i].code) {
1820       lib_arch.compat_class = arch_array[i].compat_class;
1821       lib_arch.name         = arch_array[i].name;
1822     }
1823   }
1824 
1825   assert(running_arch_index != -1,
1826          "Didn't find running architecture code (running_arch_code) in arch_array");
1827   if (running_arch_index == -1) {
1828     // Even though running architecture detection failed
1829     // we may still continue with reporting dlerror() message
1830     return NULL;
1831   }
1832 
1833   if (lib_arch.endianess != arch_array[running_arch_index].endianess) {
1834     ::snprintf(diag_msg_buf, diag_msg_max_length-1," (Possible cause: endianness mismatch)");
1835     return NULL;
1836   }
1837 
1838 #ifndef S390
1839   if (lib_arch.elf_class != arch_array[running_arch_index].elf_class) {
1840     ::snprintf(diag_msg_buf, diag_msg_max_length-1," (Possible cause: architecture word width mismatch)");
1841     return NULL;
1842   }
1843 #endif // !S390
1844 
1845   if (lib_arch.compat_class != arch_array[running_arch_index].compat_class) {
1846     if (lib_arch.name!=NULL) {
1847       ::snprintf(diag_msg_buf, diag_msg_max_length-1,
1848                  " (Possible cause: can't load %s-bit .so on a %s-bit platform)",
1849                  lib_arch.name, arch_array[running_arch_index].name);
1850     } else {
1851       ::snprintf(diag_msg_buf, diag_msg_max_length-1,
1852                  " (Possible cause: can't load this .so (machine code=0x%x) on a %s-bit platform)",
1853                  lib_arch.code,
1854                  arch_array[running_arch_index].name);
1855     }
1856   }
1857 
1858   return NULL;
1859 }
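
// Illustrative sketch (not part of the build): the ELF identification fields
// examined above can be inspected with a few lines of standalone code; the
// file name is hypothetical.
//
//   Elf32_Ehdr eh;
//   int fd = ::open("/tmp/some_lib.so", O_RDONLY);
//   if (fd >= 0) {
//     if (::read(fd, &eh, sizeof(eh)) == sizeof(eh)) {
//       // eh.e_machine         -> e.g. EM_X86_64
//       // eh.e_ident[EI_CLASS] -> ELFCLASS32 or ELFCLASS64
//       // eh.e_ident[EI_DATA]  -> ELFDATA2LSB or ELFDATA2MSB
//     }
//     ::close(fd);
//   }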
1860 
1861 void * os::Linux::dlopen_helper(const char *filename, char *ebuf,
1862                                 int ebuflen) {
1863   void * result = ::dlopen(filename, RTLD_LAZY);
1864   if (result == NULL) {
1865     ::strncpy(ebuf, ::dlerror(), ebuflen - 1);
1866     ebuf[ebuflen-1] = '\0';
1867   }
1868   return result;
1869 }
1870 
1871 void * os::Linux::dll_load_in_vmthread(const char *filename, char *ebuf,
1872                                        int ebuflen) {
1873   void * result = NULL;
1874   if (LoadExecStackDllInVMThread) {
1875     result = dlopen_helper(filename, ebuf, ebuflen);
1876   }
1877 
1878   // Since 7019808, libjvm.so is linked with -noexecstack. If the VM loads a
1879   // library that requires an executable stack, or which does not have this
1880   // stack attribute set, dlopen changes the stack attribute to executable. The
1881   // read protection of the guard pages gets lost.
1882   //
1883   // Need to check _stack_is_executable again as multiple VM_LinuxDllLoad
1884   // may have been queued at the same time.
1885 
1886   if (!_stack_is_executable) {
1887     JavaThread *jt = Threads::first();
1888 
1889     while (jt) {
1890       if (!jt->stack_guard_zone_unused() &&     // Stack not yet fully initialized
1891           jt->stack_guards_enabled()) {         // No pending stack overflow exceptions
1892         if (!os::guard_memory((char *)jt->stack_end(), jt->stack_guard_zone_size())) {
1893           warning("Attempt to reguard stack yellow zone failed.");
1894         }
1895       }
1896       jt = jt->next();
1897     }
1898   }
1899 
1900   return result;
1901 }
1902 
1903 void* os::dll_lookup(void* handle, const char* name) {
1904   void* res = dlsym(handle, name);
1905   return res;
1906 }
1907 
1908 void* os::get_default_process_handle() {
1909   return (void*)::dlopen(NULL, RTLD_LAZY);
1910 }
1911 
1912 static bool _print_ascii_file(const char* filename, outputStream* st) {
1913   int fd = ::open(filename, O_RDONLY);
1914   if (fd == -1) {
1915     return false;
1916   }
1917 
1918   char buf[33];
1919   int bytes;
1920   buf[32] = '\0';
1921   while ((bytes = ::read(fd, buf, sizeof(buf)-1)) > 0) {
1922     st->print_raw(buf, bytes);
1923   }
1924 
1925   ::close(fd);
1926 
1927   return true;
1928 }
1929 
1930 void os::print_dll_info(outputStream *st) {
1931   st->print_cr("Dynamic libraries:");
1932 
1933   char fname[32];
1934   pid_t pid = os::Linux::gettid();
1935 
1936   jio_snprintf(fname, sizeof(fname), "/proc/%d/maps", pid);
1937 
1938   if (!_print_ascii_file(fname, st)) {
1939     st->print("Can not get library information for pid = %d\n", pid);
1940   }
1941 }
1942 
1943 int os::get_loaded_modules_info(os::LoadedModulesCallbackFunc callback, void *param) {
1944   FILE *procmapsFile = NULL;
1945 
1946   // Open the procfs maps file for the current process
1947   if ((procmapsFile = fopen("/proc/self/maps", "r")) != NULL) {
1948     // Allocate PATH_MAX for file name plus a reasonable size for other fields.
1949     char line[PATH_MAX + 100];
1950 
1951     // Read line by line from 'file'
1952     while (fgets(line, sizeof(line), procmapsFile) != NULL) {
1953       u8 base, top, offset, inode;
1954       char permissions[5];
1955       char device[6];
1956       char name[PATH_MAX + 1];
1957 
1958       // Parse fields from line
1959       sscanf(line, UINT64_FORMAT_X "-" UINT64_FORMAT_X " %4s " UINT64_FORMAT_X " %5s " INT64_FORMAT " %s",
1960              &base, &top, permissions, &offset, device, &inode, name);
1961 
1962       // Filter by device id '00:00' so that we only get file system mapped files.
1963       if (strcmp(device, "00:00") != 0) {
1964 
1965         // Call callback with the fields of interest
1966         if (callback(name, (address)base, (address)top, param)) {
1967           // Callback requested an abort of the iteration
1968           fclose(procmapsFile);
1969           return 1;
1970         }
1971       }
1972     }
1973     fclose(procmapsFile);
1974   }
1975   return 0;
1976 }
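
// Illustrative sketch (not part of the build): a callback that merely counts
// the file-backed mappings; count_module is a hypothetical helper.
//
//   static int count_module(const char* name, address base,
//                           address top, void* param) {
//     ++*(int*)param;
//     return 0;  // zero means: keep iterating
//   }
//   ...
//   int n = 0;
//   os::get_loaded_modules_info(count_module, &n);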
1977 
1978 void os::print_os_info_brief(outputStream* st) {
1979   os::Linux::print_distro_info(st);
1980 
1981   os::Posix::print_uname_info(st);
1982 
1983   os::Linux::print_libversion_info(st);
1984 
1985 }
1986 
1987 void os::print_os_info(outputStream* st) {
1988   st->print("OS:");
1989 
1990   os::Linux::print_distro_info(st);
1991 
1992   os::Posix::print_uname_info(st);
1993 
1994   // Print warning if unsafe chroot environment detected
1995   if (unsafe_chroot_detected) {
1996     st->print("WARNING!! ");
1997     st->print_cr("%s", unstable_chroot_error);
1998   }
1999 
2000   os::Linux::print_libversion_info(st);
2001 
2002   os::Posix::print_rlimit_info(st);
2003 
2004   os::Posix::print_load_average(st);
2005 
2006   os::Linux::print_full_memory_info(st);
2007 }
2008 
2009 // Try to identify popular distros.
2010 // Most Linux distributions have a /etc/XXX-release file, which contains
2011 // the OS version string. Newer Linux distributions have a /etc/lsb-release
2012 // file that also contains the OS version string. Some have more than one
2013 // /etc/XXX-release file (e.g. Mandrake has both /etc/mandrake-release and
2014 // /etc/redhat-release), so the order is important.
2015 // Any Linux that is based on Redhat (e.g. Oracle, Mandrake, Sun JDS...) has
2016 // its own specific XXX-release file as well as a redhat-release file.
2017 // Because of this the XXX-release file needs to be searched for before the
2018 // redhat-release file.
2019 // Since Red Hat and SuSE have an lsb-release file that is not very descriptive the
2020 // search for redhat-release / SuSE-release needs to be before lsb-release.
2021 // Since the lsb-release file is the new standard it needs to be searched
2022 // before the older style release files.
2023 // Searching system-release (Red Hat) and os-release (other Linuxes) is the
2024 // next-to-last resort. The os-release file is a new standard that contains
2025 // distribution information, and the system-release file seems to be an old
2026 // standard that has been replaced by the lsb-release and os-release files.
2027 // Searching for the debian_version file is the last resort.  It contains
2028 // an informative string like "6.0.6" or "wheezy/sid". Because of this
2029 // "Debian " is printed before the contents of the debian_version file.
2030 
2031 const char* distro_files[] = {
2032   "/etc/oracle-release",
2033   "/etc/mandriva-release",
2034   "/etc/mandrake-release",
2035   "/etc/sun-release",
2036   "/etc/redhat-release",
2037   "/etc/SuSE-release",
2038   "/etc/lsb-release",
2039   "/etc/turbolinux-release",
2040   "/etc/gentoo-release",
2041   "/etc/ltib-release",
2042   "/etc/angstrom-version",
2043   "/etc/system-release",
2044   "/etc/os-release",
2045   NULL };
2046 
2047 void os::Linux::print_distro_info(outputStream* st) {
2048   for (int i = 0;; i++) {
2049     const char* file = distro_files[i];
2050     if (file == NULL) {
2051       break;  // done
2052     }
2053     // If file prints, we found it.
2054     if (_print_ascii_file(file, st)) {
2055       return;
2056     }
2057   }
2058 
2059   if (file_exists("/etc/debian_version")) {
2060     st->print("Debian ");
2061     _print_ascii_file("/etc/debian_version", st);
2062   } else {
2063     st->print("Linux");
2064   }
2065   st->cr();
2066 }
2067 
2068 static void parse_os_info_helper(FILE* fp, char* distro, size_t length, bool get_first_line) {
2069   char buf[256] = "";  // keep buf a valid string: the fallback after the loop reads it
2070   while (fgets(buf, sizeof(buf), fp)) {
2071     // Edit out extra stuff in expected format
2072     if (strstr(buf, "DISTRIB_DESCRIPTION=") != NULL || strstr(buf, "PRETTY_NAME=") != NULL) {
2073       char* ptr = strstr(buf, "\"");  // the name is in quotes
2074       if (ptr != NULL) {
2075         ptr++; // go beyond first quote
2076         char* nl = strchr(ptr, '\"');
2077         if (nl != NULL) *nl = '\0';
2078         strncpy(distro, ptr, length);
2079       } else {
2080         ptr = strstr(buf, "=");
2081         ptr++; // skip past the equals sign
2082         char* nl = strchr(ptr, '\n');
2083         if (nl != NULL) *nl = '\0';
2084         strncpy(distro, ptr, length);
2085       }
2086       return;
2087     } else if (get_first_line) {
2088       char* nl = strchr(buf, '\n');
2089       if (nl != NULL) *nl = '\0';
2090       strncpy(distro, buf, length);
2091       return;
2092     }
2093   }
2094   // Nothing matched; fall back to the last line read
2095   char* nl = strchr(buf, '\n');
2096   if (nl != NULL) *nl = '\0';
2097   strncpy(distro, buf, length);
2098 }
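
// Illustrative examples (not part of the build): the two line shapes handled
// above, as they appear in /etc/lsb-release and /etc/os-release respectively:
//
//   DISTRIB_DESCRIPTION="Ubuntu 14.04 LTS"
//   PRETTY_NAME="Oracle Linux Server 7.2"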
2099 
2100 static void parse_os_info(char* distro, size_t length, const char* file) {
2101   FILE* fp = fopen(file, "r");
2102   if (fp != NULL) {
2103     // if suse format, print out first line
2104     bool get_first_line = (strcmp(file, "/etc/SuSE-release") == 0);
2105     parse_os_info_helper(fp, distro, length, get_first_line);
2106     fclose(fp);
2107   }
2108 }
2109 
2110 void os::get_summary_os_info(char* buf, size_t buflen) {
2111   for (int i = 0;; i++) {
2112     const char* file = distro_files[i];
2113     if (file == NULL) {
2114       break; // ran out of distro_files
2115     }
2116     if (file_exists(file)) {
2117       parse_os_info(buf, buflen, file);
2118       return;
2119     }
2120   }
2121   // special case for debian
2122   if (file_exists("/etc/debian_version")) {
2123     strncpy(buf, "Debian ", buflen);
2124     parse_os_info(&buf[7], buflen-7, "/etc/debian_version");
2125   } else {
2126     strncpy(buf, "Linux", buflen);
2127   }
2128 }
2129 
2130 void os::Linux::print_libversion_info(outputStream* st) {
2131   // libc, pthread
2132   st->print("libc:");
2133   st->print("%s ", os::Linux::glibc_version());
2134   st->print("%s ", os::Linux::libpthread_version());
2135   st->cr();
2136 }
2137 
2138 void os::Linux::print_full_memory_info(outputStream* st) {
2139   st->print("\n/proc/meminfo:\n");
2140   _print_ascii_file("/proc/meminfo", st);
2141   st->cr();
2142 }
2143 
2144 void os::print_memory_info(outputStream* st) {
2145 
2146   st->print("Memory:");
2147   st->print(" %dk page", os::vm_page_size()>>10);
2148 
2149   // values in struct sysinfo are "unsigned long"
2150   struct sysinfo si;
2151   sysinfo(&si);
2152 
2153   st->print(", physical " UINT64_FORMAT "k",
2154             os::physical_memory() >> 10);
2155   st->print("(" UINT64_FORMAT "k free)",
2156             os::available_memory() >> 10);
2157   st->print(", swap " UINT64_FORMAT "k",
2158             ((jlong)si.totalswap * si.mem_unit) >> 10);
2159   st->print("(" UINT64_FORMAT "k free)",
2160             ((jlong)si.freeswap * si.mem_unit) >> 10);
2161   st->cr();
2162 }
2163 
2164 // Print the first "model name" line and the first "flags" line
2165 // that we find and nothing more. We assume "model name" comes
2166 // before "flags" so if we find a second "model name", then the
2167 // "flags" field is considered missing.
2168 static bool print_model_name_and_flags(outputStream* st, char* buf, size_t buflen) {
2169 #if defined(IA32) || defined(AMD64)
2170   // Other platforms have less repetitive cpuinfo files
2171   FILE *fp = fopen("/proc/cpuinfo", "r");
2172   if (fp) {
2173     // Assume model name comes before flags
2174     bool model_name_printed = false;
2175     while (!feof(fp)) {
2176       if (fgets(buf, buflen, fp)) {
2177         if (strstr(buf, "model name") != NULL) {
2178           if (!model_name_printed) {
2179             st->print_raw("CPU Model and flags from /proc/cpuinfo:\n");
2180             st->print_raw(buf);
2181             model_name_printed = true;
2182           } else {
2183             // model name printed but not flags?  Odd, just return
2184             fclose(fp);
2185             return true;
2186           }
2187         }
2188         // print the flags line too
2189         if (strstr(buf, "flags") != NULL) {
2190           st->print_raw(buf);
2191           fclose(fp);
2192           return true;
2193         }
2194       }
2195     }
2196     fclose(fp);
2197   }
2198 #endif // x86 platforms
2199   return false;
2200 }
2201 
2202 void os::pd_print_cpu_info(outputStream* st, char* buf, size_t buflen) {
2203   // Only print the model name if the platform provides this as a summary
2204   if (!print_model_name_and_flags(st, buf, buflen)) {
2205     st->print("\n/proc/cpuinfo:\n");
2206     if (!_print_ascii_file("/proc/cpuinfo", st)) {
2207       st->print_cr("  <Not Available>");
2208     }
2209   }
2210 }
2211 
2212 #if defined(AMD64) || defined(IA32) || defined(X32)
2213 const char* search_string = "model name";
2214 #elif defined(PPC64)
2215 const char* search_string = "cpu";
2216 #elif defined(S390)
2217 const char* search_string = "processor";
2218 #elif defined(SPARC)
2219 const char* search_string = "cpu";
2220 #else
2221 const char* search_string = "Processor";
2222 #endif
2223 
2224 // Parses the cpuinfo file for string representing the model name.
2225 void os::get_summary_cpu_info(char* cpuinfo, size_t length) {
2226   FILE* fp = fopen("/proc/cpuinfo", "r");
2227   if (fp != NULL) {
2228     while (!feof(fp)) {
2229       char buf[256];
2230       if (fgets(buf, sizeof(buf), fp)) {
2231         char* start = strstr(buf, search_string);
2232         if (start != NULL) {
2233           char *ptr = start + strlen(search_string);
2234           char *end = buf + strlen(buf);
2235           while (ptr != end) {
2236              // skip whitespace and colon for the rest of the name.
2237              if (*ptr != ' ' && *ptr != '\t' && *ptr != ':') {
2238                break;
2239              }
2240              ptr++;
2241           }
2242           if (ptr != end) {
2243             // reasonable string, get rid of newline and keep the rest
2244             char* nl = strchr(buf, '\n');
2245             if (nl != NULL) *nl = '\0';
2246             strncpy(cpuinfo, ptr, length);
2247             fclose(fp);
2248             return;
2249           }
2250         }
2251       }
2252     }
2253     fclose(fp);
2254   }
2255   // cpuinfo not found or parsing failed, just print a generic string.  The entire
2256   // /proc/cpuinfo file will be printed later in the output (or enough of it for x86)
2257 #if   defined(AARCH64)
2258   strncpy(cpuinfo, "AArch64", length);
2259 #elif defined(AMD64)
2260   strncpy(cpuinfo, "x86_64", length);
2261 #elif defined(ARM)  // Order wrt. AARCH64 is relevant!
2262   strncpy(cpuinfo, "ARM", length);
2263 #elif defined(IA32)
2264   strncpy(cpuinfo, "x86_32", length);
2265 #elif defined(IA64)
2266   strncpy(cpuinfo, "IA64", length);
2267 #elif defined(PPC)
2268   strncpy(cpuinfo, "PPC64", length);
2269 #elif defined(S390)
2270   strncpy(cpuinfo, "S390", length);
2271 #elif defined(SPARC)
2272   strncpy(cpuinfo, "sparcv9", length);
2273 #elif defined(ZERO_LIBARCH)
2274   strncpy(cpuinfo, ZERO_LIBARCH, length);
2275 #else
2276   strncpy(cpuinfo, "unknown", length);
2277 #endif
2278 }
2279 
2280 static void print_signal_handler(outputStream* st, int sig,
2281                                  char* buf, size_t buflen);
2282 
2283 void os::print_signal_handlers(outputStream* st, char* buf, size_t buflen) {
2284   st->print_cr("Signal Handlers:");
2285   print_signal_handler(st, SIGSEGV, buf, buflen);
2286   print_signal_handler(st, SIGBUS , buf, buflen);
2287   print_signal_handler(st, SIGFPE , buf, buflen);
2288   print_signal_handler(st, SIGPIPE, buf, buflen);
2289   print_signal_handler(st, SIGXFSZ, buf, buflen);
2290   print_signal_handler(st, SIGILL , buf, buflen);
2291   print_signal_handler(st, SR_signum, buf, buflen);
2292   print_signal_handler(st, SHUTDOWN1_SIGNAL, buf, buflen);
2293   print_signal_handler(st, SHUTDOWN2_SIGNAL , buf, buflen);
2294   print_signal_handler(st, SHUTDOWN3_SIGNAL , buf, buflen);
2295   print_signal_handler(st, BREAK_SIGNAL, buf, buflen);
2296 #if defined(PPC64)
2297   print_signal_handler(st, SIGTRAP, buf, buflen);
2298 #endif
2299 }
2300 
2301 static char saved_jvm_path[MAXPATHLEN] = {0};
2302 
2303 // Find the full path to the current module, libjvm.so
2304 void os::jvm_path(char *buf, jint buflen) {
2305   // Error checking.
2306   if (buflen < MAXPATHLEN) {
2307     assert(false, "must use a large-enough buffer");
2308     buf[0] = '\0';
2309     return;
2310   }
2311   // Lazy resolve the path to current module.
2312   if (saved_jvm_path[0] != 0) {
2313     strcpy(buf, saved_jvm_path);
2314     return;
2315   }
2316 
2317   char dli_fname[MAXPATHLEN];
2318   bool ret = dll_address_to_library_name(
2319                                          CAST_FROM_FN_PTR(address, os::jvm_path),
2320                                          dli_fname, sizeof(dli_fname), NULL);
2321   assert(ret, "cannot locate libjvm");
2322   char *rp = NULL;
2323   if (ret && dli_fname[0] != '\0') {
2324     rp = realpath(dli_fname, buf);
2325   }
2326   if (rp == NULL) {
2327     return;
2328   }
2329 
2330   if (Arguments::sun_java_launcher_is_altjvm()) {
2331     // Support for the java launcher's '-XXaltjvm=<path>' option. Typical
2332     // value for buf is "<JAVA_HOME>/jre/lib/<vmtype>/libjvm.so".
2333     // If "/jre/lib/" appears at the right place in the string, then
2334     // assume we are installed in a JDK and we're done. Otherwise, check
2335     // for a JAVA_HOME environment variable and fix up the path so it
2336     // looks like libjvm.so is installed there (append a fake suffix
2337     // hotspot/libjvm.so).
2338     const char *p = buf + strlen(buf) - 1;
2339     for (int count = 0; p > buf && count < 5; ++count) {
2340       for (--p; p > buf && *p != '/'; --p)
2341         /* empty */ ;
2342     }
2343 
2344     if (strncmp(p, "/jre/lib/", 9) != 0) {
2345       // Look for JAVA_HOME in the environment.
2346       char* java_home_var = ::getenv("JAVA_HOME");
2347       if (java_home_var != NULL && java_home_var[0] != 0) {
2348         char* jrelib_p;
2349         int len;
2350 
2351         // Check the current module name "libjvm.so".
2352         p = strrchr(buf, '/');
2353         if (p == NULL) {
2354           return;
2355         }
2356         assert(strstr(p, "/libjvm") == p, "invalid library name");
2357 
2358         rp = realpath(java_home_var, buf);
2359         if (rp == NULL) {
2360           return;
2361         }
2362 
2363         // determine if this is a legacy image or modules image
2364         // modules image doesn't have "jre" subdirectory
2365         len = strlen(buf);
2366         assert(len < buflen, "Ran out of buffer room");
2367         jrelib_p = buf + len;
2368         snprintf(jrelib_p, buflen-len, "/jre/lib");
2369         if (0 != access(buf, F_OK)) {
2370           snprintf(jrelib_p, buflen-len, "/lib");
2371         }
2372 
2373         if (0 == access(buf, F_OK)) {
2374           // Use current module name "libjvm.so"
2375           len = strlen(buf);
2376           snprintf(buf + len, buflen-len, "/hotspot/libjvm.so");
2377         } else {
2378           // Go back to path of .so
2379           rp = realpath(dli_fname, buf);
2380           if (rp == NULL) {
2381             return;
2382           }
2383         }
2384       }
2385     }
2386   }
2387 
2388   strncpy(saved_jvm_path, buf, MAXPATHLEN);
2389   saved_jvm_path[MAXPATHLEN - 1] = '\0';
2390 }
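
// Illustrative examples (not part of the build): a plain launch typically
// resolves to something like
//   <JAVA_HOME>/jre/lib/<vmtype>/libjvm.so
// while the -XXaltjvm rewrite above yields
//   <JAVA_HOME>/lib/hotspot/libjvm.so      (modules image, no "jre" dir)
//   <JAVA_HOME>/jre/lib/hotspot/libjvm.so  (legacy image)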
2391 
2392 void os::print_jni_name_prefix_on(outputStream* st, int args_size) {
2393   // no prefix required, not even "_"
2394 }
2395 
2396 void os::print_jni_name_suffix_on(outputStream* st, int args_size) {
2397   // no suffix required
2398 }
2399 
2400 ////////////////////////////////////////////////////////////////////////////////
2401 // sun.misc.Signal support
2402 
2403 static volatile jint sigint_count = 0;
2404 
2405 static void UserHandler(int sig, void *siginfo, void *context) {
2406   // 4511530 - sem_post is serialized and handled by the manager thread. When
2407   // the program is interrupted by Ctrl-C, SIGINT is sent to every thread. We
2408   // don't want to flood the manager thread with sem_post requests.
2409   if (sig == SIGINT && Atomic::add(1, &sigint_count) > 1) {
2410     return;
2411   }
2412 
2413   // Ctrl-C is pressed during error reporting, likely because the error
2414   // handler fails to abort. Let VM die immediately.
2415   if (sig == SIGINT && is_error_reported()) {
2416     os::die();
2417   }
2418 
2419   os::signal_notify(sig);
2420 }
2421 
2422 void* os::user_handler() {
2423   return CAST_FROM_FN_PTR(void*, UserHandler);
2424 }
2425 
2426 struct timespec PosixSemaphore::create_timespec(unsigned int sec, int nsec) {
2427   struct timespec ts;
2428   // Semaphores are always associated with CLOCK_REALTIME
2429   os::Linux::clock_gettime(CLOCK_REALTIME, &ts);
2430   // see unpackTime for discussion on overflow checking
2431   if (sec >= MAX_SECS) {
2432     ts.tv_sec += MAX_SECS;
2433     ts.tv_nsec = 0;
2434   } else {
2435     ts.tv_sec += sec;
2436     ts.tv_nsec += nsec;
2437     if (ts.tv_nsec >= NANOSECS_PER_SEC) {
2438       ts.tv_nsec -= NANOSECS_PER_SEC;
2439       ++ts.tv_sec; // note: this must be <= max_secs
2440     }
2441   }
2442 
2443   return ts;
2444 }
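
// Illustrative sketch (not part of the build): within PosixSemaphore, a
// relative wait of 1.5 seconds would become an absolute CLOCK_REALTIME
// deadline like
//
//   struct timespec ts = create_timespec(1, 500000000);  // now + 1.5 s
//   // ts is then suitable for ::sem_timedwait() on the underlying sem_t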
2445 
2446 extern "C" {
2447   typedef void (*sa_handler_t)(int);
2448   typedef void (*sa_sigaction_t)(int, siginfo_t *, void *);
2449 }
2450 
2451 void* os::signal(int signal_number, void* handler) {
2452   struct sigaction sigAct, oldSigAct;
2453 
2454   sigfillset(&(sigAct.sa_mask));
2455   sigAct.sa_flags   = SA_RESTART|SA_SIGINFO;
2456   sigAct.sa_handler = CAST_TO_FN_PTR(sa_handler_t, handler);
2457 
2458   if (sigaction(signal_number, &sigAct, &oldSigAct)) {
2459     // -1 means registration failed
2460     return (void *)-1;
2461   }
2462 
2463   return CAST_FROM_FN_PTR(void*, oldSigAct.sa_handler);
2464 }
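
// Illustrative sketch (not part of the build): installing a handler through
// the wrapper above; my_handler is hypothetical.
//
//   static void my_handler(int sig) { /* react to sig */ }
//   ...
//   void* old = os::signal(SIGUSR2, CAST_FROM_FN_PTR(void*, my_handler));
//   if (old == (void*)-1) {
//     warning("signal handler registration failed");
//   }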
2465 
2466 void os::signal_raise(int signal_number) {
2467   ::raise(signal_number);
2468 }
2469 
2470 // The following code was moved from os.cpp to make it platform-specific,
2471 // which it is by its very nature.
2472 
2473 // Will be modified when max signal is changed to be dynamic
2474 int os::sigexitnum_pd() {
2475   return NSIG;
2476 }
2477 
2478 // a counter for each possible signal value
2479 static volatile jint pending_signals[NSIG+1] = { 0 };
2480 
2481 // Linux (POSIX) specific handshaking semaphore.
2482 static sem_t sig_sem;
2483 static PosixSemaphore sr_semaphore;
2484 
2485 void os::signal_init_pd() {
2486   // Initialize signal structures
2487   ::memset((void*)pending_signals, 0, sizeof(pending_signals));
2488 
2489   // Initialize signal semaphore
2490   ::sem_init(&sig_sem, 0, 0);
2491 }
2492 
2493 void os::signal_notify(int sig) {
2494   Atomic::inc(&pending_signals[sig]);
2495   ::sem_post(&sig_sem);
2496 }
2497 
2498 static int check_pending_signals(bool wait) {
2499   Atomic::store(0, &sigint_count);
2500   for (;;) {
2501     for (int i = 0; i < NSIG + 1; i++) {
2502       jint n = pending_signals[i];
2503       if (n > 0 && n == Atomic::cmpxchg(n - 1, &pending_signals[i], n)) {
2504         return i;
2505       }
2506     }
2507     if (!wait) {
2508       return -1;
2509     }
2510     JavaThread *thread = JavaThread::current();
2511     ThreadBlockInVM tbivm(thread);
2512 
2513     bool threadIsSuspended;
2514     do {
2515       thread->set_suspend_equivalent();
2516       // cleared by handle_special_suspend_equivalent_condition() or java_suspend_self()
2517       ::sem_wait(&sig_sem);
2518 
2519       // were we externally suspended while we were waiting?
2520       threadIsSuspended = thread->handle_special_suspend_equivalent_condition();
2521       if (threadIsSuspended) {
2522         // The semaphore has been incremented, but while we were waiting
2523         // another thread suspended us. We don't want to continue running
2524         // while suspended because that would surprise the thread that
2525         // suspended us.
2526         ::sem_post(&sig_sem);
2527 
2528         thread->java_suspend_self();
2529       }
2530     } while (threadIsSuspended);
2531   }
2532 }
2533 
2534 int os::signal_lookup() {
2535   return check_pending_signals(false);
2536 }
2537 
2538 int os::signal_wait() {
2539   return check_pending_signals(true);
2540 }
2541 
2542 ////////////////////////////////////////////////////////////////////////////////
2543 // Virtual Memory
2544 
2545 int os::vm_page_size() {
2546   // Seems redundant as all get out
2547   assert(os::Linux::page_size() != -1, "must call os::init");
2548   return os::Linux::page_size();
2549 }
2550 
2551 // Linux allocates memory by pages.
2552 int os::vm_allocation_granularity() {
2553   assert(os::Linux::page_size() != -1, "must call os::init");
2554   return os::Linux::page_size();
2555 }
2556 
2557 // Rationale behind this function:
2558 //  current (Mon Apr 25 20:12:18 MSD 2005) oprofile drops samples without an executable
2559 //  mapping for the address (see lookup_dcookie() in the kernel module), thus we cannot get
2560 //  samples for JITted code. Here we create a private executable mapping over the code cache
2561 //  and then we can use the standard (well, almost, as the mapping can change) way to provide
2562 //  info for the reporting script, by storing the timestamp and location of each symbol
2563 void linux_wrap_code(char* base, size_t size) {
2564   static volatile jint cnt = 0;
2565 
2566   if (!UseOprofile) {
2567     return;
2568   }
2569 
2570   char buf[PATH_MAX+1];
2571   int num = Atomic::add(1, &cnt);
2572 
2573   snprintf(buf, sizeof(buf), "%s/hs-vm-%d-%d",
2574            os::get_temp_directory(), os::current_process_id(), num);
2575   unlink(buf);
2576 
2577   int fd = ::open(buf, O_CREAT | O_RDWR, S_IRWXU);
2578 
2579   if (fd != -1) {
2580     off_t rv = ::lseek(fd, size-2, SEEK_SET);
2581     if (rv != (off_t)-1) {
2582       if (::write(fd, "", 1) == 1) {
2583         mmap(base, size,
2584              PROT_READ|PROT_WRITE|PROT_EXEC,
2585              MAP_PRIVATE|MAP_FIXED|MAP_NORESERVE, fd, 0);
2586       }
2587     }
2588     ::close(fd);
2589     unlink(buf);
2590   }
2591 }
2592 
2593 static bool recoverable_mmap_error(int err) {
2594   // See if the error is one we can let the caller handle. This
2595   // list of errno values comes from JBS-6843484. I can't find a
2596   // Linux man page that documents this specific set of errno
2597   // values so while this list currently matches Solaris, it may
2598   // change as we gain experience with this failure mode.
2599   switch (err) {
2600   case EBADF:
2601   case EINVAL:
2602   case ENOTSUP:
2603     // let the caller deal with these errors
2604     return true;
2605 
2606   default:
2607     // Any remaining errors on this OS can cause our reserved mapping
2608     // to be lost. That can cause confusion where different data
2609     // structures think they have the same memory mapped. The worst
2610     // scenario is if both the VM and a library think they have the
2611     // same memory mapped.
2612     return false;
2613   }
2614 }
2615 
2616 static void warn_fail_commit_memory(char* addr, size_t size, bool exec,
2617                                     int err) {
2618   warning("INFO: os::commit_memory(" PTR_FORMAT ", " SIZE_FORMAT
2619           ", %d) failed; error='%s' (errno=%d)", p2i(addr), size, exec,
2620           os::strerror(err), err);
2621 }
2622 
2623 static void warn_fail_commit_memory(char* addr, size_t size,
2624                                     size_t alignment_hint, bool exec,
2625                                     int err) {
2626   warning("INFO: os::commit_memory(" PTR_FORMAT ", " SIZE_FORMAT
2627           ", " SIZE_FORMAT ", %d) failed; error='%s' (errno=%d)", p2i(addr), size,
2628           alignment_hint, exec, os::strerror(err), err);
2629 }
2630 
2631 // NOTE: Linux kernel does not really reserve the pages for us.
2632 //       All it does is to check if there are enough free pages
2633 //       left at the time of mmap(). This could be a potential
2634 //       problem.
2635 int os::Linux::commit_memory_impl(char* addr, size_t size, bool exec) {
2636   int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
2637   uintptr_t res = (uintptr_t) ::mmap(addr, size, prot,
2638                                      MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0);
2639   if (res != (uintptr_t) MAP_FAILED) {
2640     if (UseNUMAInterleaving) {
2641       numa_make_global(addr, size);
2642     }
2643     return 0;
2644   }
2645 
2646   int err = errno;  // save errno from mmap() call above
2647 
2648   if (!recoverable_mmap_error(err)) {
2649     warn_fail_commit_memory(addr, size, exec, err);
2650     vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "committing reserved memory.");
2651   }
2652 
2653   return err;
2654 }
2655 
2656 bool os::pd_commit_memory(char* addr, size_t size, bool exec) {
2657   return os::Linux::commit_memory_impl(addr, size, exec) == 0;
2658 }
2659 
2660 void os::pd_commit_memory_or_exit(char* addr, size_t size, bool exec,
2661                                   const char* mesg) {
2662   assert(mesg != NULL, "mesg must be specified");
2663   int err = os::Linux::commit_memory_impl(addr, size, exec);
2664   if (err != 0) {
2665     // the caller wants all commit errors to exit with the specified mesg:
2666     warn_fail_commit_memory(addr, size, exec, err);
2667     vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "%s", mesg);
2668   }
2669 }
2670 
2671 // Define MAP_HUGETLB here so we can build HotSpot on old systems.
2672 #ifndef MAP_HUGETLB
2673   #define MAP_HUGETLB 0x40000
2674 #endif
2675 
2676 // Define MADV_HUGEPAGE here so we can build HotSpot on old systems.
2677 #ifndef MADV_HUGEPAGE
2678   #define MADV_HUGEPAGE 14
2679 #endif
2680 
2681 int os::Linux::commit_memory_impl(char* addr, size_t size,
2682                                   size_t alignment_hint, bool exec) {
2683   int err = os::Linux::commit_memory_impl(addr, size, exec);
2684   if (err == 0) {
2685     realign_memory(addr, size, alignment_hint);
2686   }
2687   return err;
2688 }
2689 
2690 bool os::pd_commit_memory(char* addr, size_t size, size_t alignment_hint,
2691                           bool exec) {
2692   return os::Linux::commit_memory_impl(addr, size, alignment_hint, exec) == 0;
2693 }
2694 
2695 void os::pd_commit_memory_or_exit(char* addr, size_t size,
2696                                   size_t alignment_hint, bool exec,
2697                                   const char* mesg) {
2698   assert(mesg != NULL, "mesg must be specified");
2699   int err = os::Linux::commit_memory_impl(addr, size, alignment_hint, exec);
2700   if (err != 0) {
2701     // the caller wants all commit errors to exit with the specified mesg:
2702     warn_fail_commit_memory(addr, size, alignment_hint, exec, err);
2703     vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "%s", mesg);
2704   }
2705 }
2706 
2707 void os::pd_realign_memory(char *addr, size_t bytes, size_t alignment_hint) {
2708   if (UseTransparentHugePages && alignment_hint > (size_t)vm_page_size()) {
2709     // We don't check the return value: madvise(MADV_HUGEPAGE) may not
2710     // be supported or the memory may already be backed by huge pages.
2711     ::madvise(addr, bytes, MADV_HUGEPAGE);
2712   }
2713 }
2714 
2715 void os::pd_free_memory(char *addr, size_t bytes, size_t alignment_hint) {
2716   // This method works by doing an mmap over an existing mapping and effectively discarding
2717   // the existing pages. However it won't work for SHM-based large pages that cannot be
2718   // uncommitted at all. We don't do anything in this case to avoid creating a segment with
2719   // small pages on top of the SHM segment. This method always works for small pages, so we
2720   // allow that in any case.
2721   if (alignment_hint <= (size_t)os::vm_page_size() || can_commit_large_page_memory()) {
2722     commit_memory(addr, bytes, alignment_hint, !ExecMem);
2723   }
2724 }
2725 
2726 void os::numa_make_global(char *addr, size_t bytes) {
2727   Linux::numa_interleave_memory(addr, bytes);
2728 }
2729 
2730 // Define for numa_set_bind_policy(int). Setting the argument to 0 will set the
2731 // bind policy to MPOL_PREFERRED for the current thread.
2732 #define USE_MPOL_PREFERRED 0
2733 
2734 void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) {
2735   // To make NUMA and large pages more robust when both enabled, we need to ease
2736   // the requirements on where the memory should be allocated. MPOL_BIND is the
2737   // default policy and it will force memory to be allocated on the specified
2738   // node. Changing this to MPOL_PREFERRED will prefer to allocate the memory on
2739   // the specified node, but will not force it. Using this policy will prevent
2740   // getting SIGBUS when trying to allocate large pages on NUMA nodes with no
2741   // free large pages.
2742   Linux::numa_set_bind_policy(USE_MPOL_PREFERRED);
2743   Linux::numa_tonode_memory(addr, bytes, lgrp_hint);
2744 }
2745 
2746 bool os::numa_topology_changed() { return false; }
2747 
2748 size_t os::numa_get_groups_num() {
2749   int max_node = Linux::numa_max_node();
2750   return max_node > 0 ? max_node + 1 : 1;
2751 }
2752 
2753 int os::numa_get_group_id() {
2754   int cpu_id = Linux::sched_getcpu();
2755   if (cpu_id != -1) {
2756     int lgrp_id = Linux::get_node_by_cpu(cpu_id);
2757     if (lgrp_id != -1) {
2758       return lgrp_id;
2759     }
2760   }
2761   return 0;
2762 }
2763 
2764 size_t os::numa_get_leaf_groups(int *ids, size_t size) {
2765   for (size_t i = 0; i < size; i++) {
2766     ids[i] = i;
2767   }
2768   return size;
2769 }
2770 
2771 bool os::get_page_info(char *start, page_info* info) {
2772   return false;
2773 }
2774 
2775 char *os::scan_pages(char *start, char* end, page_info* page_expected,
2776                      page_info* page_found) {
2777   return end;
2778 }
2779 
2780 
2781 int os::Linux::sched_getcpu_syscall(void) {
2782   unsigned int cpu = 0;
2783   int retval = -1;
2784 
2785 #if defined(IA32)
2786   #ifndef SYS_getcpu
2787     #define SYS_getcpu 318
2788   #endif
2789   retval = syscall(SYS_getcpu, &cpu, NULL, NULL);
2790 #elif defined(AMD64)
2791 // Unfortunately we have to bring all these macros here from vsyscall.h
2792 // to be able to compile on old linuxes.
2793   #define __NR_vgetcpu 2
2794   #define VSYSCALL_START (-10UL << 20)
2795   #define VSYSCALL_SIZE 1024
2796   #define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr))
2797   typedef long (*vgetcpu_t)(unsigned int *cpu, unsigned int *node, unsigned long *tcache);
2798   vgetcpu_t vgetcpu = (vgetcpu_t)VSYSCALL_ADDR(__NR_vgetcpu);
2799   retval = vgetcpu(&cpu, NULL, NULL);
2800 #endif
2801 
2802   return (retval == -1) ? retval : cpu;
2803 }
2804 
2805 // libnuma reports problems through these user-overridable hooks; provide empty stubs.
2806 extern "C" JNIEXPORT void numa_warn(int number, char *where, ...) { }
2807 extern "C" JNIEXPORT void numa_error(char *where) { }
2808 
2809 
2810 // If we are running with libnuma version > 2, then we should
2811 // be trying to use symbols with versions 1.1.
2812 // If we are running with an earlier version, which did not have symbol
2813 // versions, we should use the base version.
2814 void* os::Linux::libnuma_dlsym(void* handle, const char *name) {
2815   typedef void* (*dlvsym_func_type)(void* handle, const char* name, const char* version);
2816   static dlvsym_func_type dlvsym_func;
2817   static bool initialized = false;
2818 
2819   if (!initialized) {
2820     dlvsym_func = (dlvsym_func_type)dlsym(RTLD_NEXT, "dlvsym");
2821     initialized = true;
2822   }
2823 
2824   if (dlvsym_func != NULL) {
2825     void *f = dlvsym_func(handle, name, "libnuma_1.1");
2826     if (f != NULL) {
2827       return f;
2828     }
2829   }
2830 
2831   return dlsym(handle, name);
2832 }
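
// Illustrative sketch (not part of the build): the effect of the versioned
// lookup above, for a symbol that exists in several versions.
//
//   void* handle = dlopen("libnuma.so.1", RTLD_LAZY);
//   // Prefers the "libnuma_1.1" version when dlvsym() is available and the
//   // library defines it; otherwise falls back to a plain dlsym() lookup.
//   void* f = os::Linux::libnuma_dlsym(handle, "numa_interleave_memory");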
2833 
2834 bool os::Linux::libnuma_init() {
2835   // sched_getcpu() should be in libc.
2836   set_sched_getcpu(CAST_TO_FN_PTR(sched_getcpu_func_t,
2837                                   dlsym(RTLD_DEFAULT, "sched_getcpu")));
2838 
2839   // If it's not, try a direct syscall.
2840   if (sched_getcpu() == -1) {
2841     set_sched_getcpu(CAST_TO_FN_PTR(sched_getcpu_func_t,
2842                                     (void*)&sched_getcpu_syscall));
2843   }
2844 
2845   if (sched_getcpu() != -1) { // Does it work?
2846     void *handle = dlopen("libnuma.so.1", RTLD_LAZY);
2847     if (handle != NULL) {
2848       set_numa_node_to_cpus(CAST_TO_FN_PTR(numa_node_to_cpus_func_t,
2849                                            libnuma_dlsym(handle, "numa_node_to_cpus")));
2850       set_numa_max_node(CAST_TO_FN_PTR(numa_max_node_func_t,
2851                                        libnuma_dlsym(handle, "numa_max_node")));
2852       set_numa_available(CAST_TO_FN_PTR(numa_available_func_t,
2853                                         libnuma_dlsym(handle, "numa_available")));
2854       set_numa_tonode_memory(CAST_TO_FN_PTR(numa_tonode_memory_func_t,
2855                                             libnuma_dlsym(handle, "numa_tonode_memory")));
2856       set_numa_interleave_memory(CAST_TO_FN_PTR(numa_interleave_memory_func_t,
2857                                                 libnuma_dlsym(handle, "numa_interleave_memory")));
2858       set_numa_set_bind_policy(CAST_TO_FN_PTR(numa_set_bind_policy_func_t,
2859                                               libnuma_dlsym(handle, "numa_set_bind_policy")));
2860 
2861 
2862       if (numa_available() != -1) {
2863         set_numa_all_nodes((unsigned long*)libnuma_dlsym(handle, "numa_all_nodes"));
2864         // Create a cpu -> node mapping
2865         _cpu_to_node = new (ResourceObj::C_HEAP, mtInternal) GrowableArray<int>(0, true);
2866         rebuild_cpu_to_node_map();
2867         return true;
2868       }
2869     }
2870   }
2871   return false;
2872 }
2873 
2874 size_t os::Linux::default_guard_size(os::ThreadType thr_type) {
2875   // Creating a guard page is very expensive. Java threads have HotSpot
2876   // guard pages, so only enable the glibc guard page for non-Java threads.
2877   // (Remember: a compiler thread is a Java thread, too!)
2878   return ((thr_type == java_thread || thr_type == compiler_thread) ? 0 : page_size());
2879 }
2880 
2881 // rebuild_cpu_to_node_map() constructs a table mapping cpu id to node id.
2882 // The table is later used in get_node_by_cpu().
2883 void os::Linux::rebuild_cpu_to_node_map() {
2884   const size_t NCPUS = 32768; // Since the buffer size computation is very obscure
2885                               // in libnuma (possible values start at 16 and continue
2886                               // up through every other power of 2, but stay below the
2887                               // maximum number of CPUs supported by the kernel), and
2888                               // is subject to change (in libnuma version 2 the
2889                               // requirements are more reasonable), we just hardcode
2890                               // the number the library uses.
2891   const size_t BitsPerCLong = sizeof(long) * CHAR_BIT;
2892 
2893   size_t cpu_num = processor_count();
2894   size_t cpu_map_size = NCPUS / BitsPerCLong;
2895   size_t cpu_map_valid_size =
2896     MIN2((cpu_num + BitsPerCLong - 1) / BitsPerCLong, cpu_map_size);
2897 
2898   cpu_to_node()->clear();
2899   cpu_to_node()->at_grow(cpu_num - 1);
2900   size_t node_num = numa_get_groups_num();
2901 
2902   unsigned long *cpu_map = NEW_C_HEAP_ARRAY(unsigned long, cpu_map_size, mtInternal);
2903   for (size_t i = 0; i < node_num; i++) {
2904     if (numa_node_to_cpus(i, cpu_map, cpu_map_size * sizeof(unsigned long)) != -1) {
2905       for (size_t j = 0; j < cpu_map_valid_size; j++) {
2906         if (cpu_map[j] != 0) {
2907           for (size_t k = 0; k < BitsPerCLong; k++) {
2908             if (cpu_map[j] & (1UL << k)) {
2909               cpu_to_node()->at_put(j * BitsPerCLong + k, i);
2910             }
2911           }
2912         }
2913       }
2914     }
2915   }
2916   FREE_C_HEAP_ARRAY(unsigned long, cpu_map);
2917 }
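
// For illustration, a self-contained sketch (not taken from this file) of the
// bitmask decoding done above: given one cpumask word as filled in by
// numa_node_to_cpus(), enumerate the CPU ids it encodes. "word_index" is the
// word's position within the mask array, matching 'j' in the loop above.
//
//   #include <limits.h>
//   static void decode_cpumask_word(unsigned long word, size_t word_index) {
//     const size_t bits = sizeof(unsigned long) * CHAR_BIT;
//     for (size_t k = 0; k < bits; k++) {
//       if (word & (1UL << k)) {
//         size_t cpu_id = word_index * bits + k;  // same math as at_put() above
//         // cpu_id belongs to the node whose mask this word came from
//       }
//     }
//   }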
2918 
2919 int os::Linux::get_node_by_cpu(int cpu_id) {
2920   if (cpu_to_node() != NULL && cpu_id >= 0 && cpu_id < cpu_to_node()->length()) {
2921     return cpu_to_node()->at(cpu_id);
2922   }
2923   return -1;
2924 }
2925 
2926 GrowableArray<int>* os::Linux::_cpu_to_node;
2927 os::Linux::sched_getcpu_func_t os::Linux::_sched_getcpu;
2928 os::Linux::numa_node_to_cpus_func_t os::Linux::_numa_node_to_cpus;
2929 os::Linux::numa_max_node_func_t os::Linux::_numa_max_node;
2930 os::Linux::numa_available_func_t os::Linux::_numa_available;
2931 os::Linux::numa_tonode_memory_func_t os::Linux::_numa_tonode_memory;
2932 os::Linux::numa_interleave_memory_func_t os::Linux::_numa_interleave_memory;
2933 os::Linux::numa_set_bind_policy_func_t os::Linux::_numa_set_bind_policy;
2934 unsigned long* os::Linux::_numa_all_nodes;
2935 
2936 bool os::pd_uncommit_memory(char* addr, size_t size) {
2937   uintptr_t res = (uintptr_t) ::mmap(addr, size, PROT_NONE,
2938                                      MAP_PRIVATE|MAP_FIXED|MAP_NORESERVE|MAP_ANONYMOUS, -1, 0);
2939   return res  != (uintptr_t) MAP_FAILED;
2940 }
2941 
2942 static address get_stack_committed_bottom(address bottom, size_t size) {
2943   address nbot = bottom;
2944   address ntop = bottom + size;
2945 
2946   size_t page_sz = os::vm_page_size();
2947   unsigned pages = size / page_sz;
2948 
2949   unsigned char vec[1];
2950   unsigned imin = 1, imax = pages + 1, imid;
2951   int mincore_return_value = 0;
2952 
2953   assert(imin <= imax, "Unexpected page size");
2954 
2955   while (imin < imax) {
2956     imid = (imax + imin) / 2;
2957     nbot = ntop - (imid * page_sz);
2958 
2959     // Use a trick with mincore to check whether the page is mapped or not.
2960     // mincore sets vec to 1 if the page resides in memory and to 0 if the
2961     // page is swapped out, but if the page we are asking for is unmapped
2962     // it fails with -1 and errno set to ENOMEM.
2963     mincore_return_value = mincore(nbot, page_sz, vec);
2964 
2965     if (mincore_return_value == -1) {
2966       // Page is not mapped; go up
2967       // to find the first mapped page.
2968       if (errno != EAGAIN) {
2969         assert(errno == ENOMEM, "Unexpected mincore errno");
2970         imax = imid;
2971       }
2972     } else {
2973       // Page is mapped; go down
2974       // to find the first unmapped page.
2975       imin = imid + 1;
2976     }
2977   }
2978 
2979   nbot = nbot + page_sz;
2980 
2981   // Adjust the stack bottom up one page if the last checked page is not mapped
2982   if (mincore_return_value == -1) {
2983     nbot = nbot + page_sz;
2984   }
2985 
2986   return nbot;
2987 }
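
// A minimal sketch (assumptions noted inline, not part of the original code)
// of the mincore() probe the binary search above is built on: classify a
// single page as mapped or unmapped.
//
//   #include <sys/mman.h>
//   #include <errno.h>
//   // Returns 1 if mapped, 0 if unmapped, -1 on a transient error (EAGAIN).
//   static int page_is_mapped(char* page, size_t page_sz) {
//     unsigned char vec[1];
//     if (mincore(page, page_sz, vec) == 0) {
//       return 1;                         // mapped (resident or swapped out)
//     }
//     return (errno == ENOMEM) ? 0 : -1;  // ENOMEM means the page is unmapped
//   }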
2988 
2989 
2990 // Linux uses a growable mapping for the stack, and if the mapping for
2991 // the stack guard pages is not removed when we detach a thread the
2992 // stack cannot grow beyond the pages where the stack guard was
2993 // mapped.  If at some point later in the process the stack expands to
2994 // that point, the Linux kernel cannot expand the stack any further
2995 // because the guard pages are in the way, and a segfault occurs.
2996 //
2997 // However, it's essential not to split the stack region by unmapping
2998 // a region (leaving a hole) that's already part of the stack mapping,
2999 // so if the stack mapping has already grown beyond the guard pages at
3000 // the time we create them, we have to truncate the stack mapping.
3001 // So, we need to know the extent of the stack mapping when
3002 // create_stack_guard_pages() is called.
3003 
3004 // We only need this for stacks that are growable: at the time of
3005 // writing thread stacks don't use growable mappings (i.e. those
3006 // created with MAP_GROWSDOWN), and aren't marked "[stack]", so this
3007 // only applies to the main thread.
3008 
3009 // If the (growable) stack mapping already extends beyond the point
3010 // where we're going to put our guard pages, truncate the mapping at
3011 // that point by munmap()ping it.  This ensures that when we later
3012 // munmap() the guard pages we don't leave a hole in the stack
3013 // mapping. This only affects the main/initial thread.
3014 
3015 bool os::pd_create_stack_guard_pages(char* addr, size_t size) {
3016   if (os::Linux::is_initial_thread()) {
3017     // As we manually grow the stack up to the bottom inside create_attached_thread(),
3018     // it's likely that os::Linux::initial_thread_stack_bottom is mapped and
3019     // we don't need to do anything special.
3020     // Check that first, before calling the expensive function below.
3021     uintptr_t stack_extent = (uintptr_t) os::Linux::initial_thread_stack_bottom();
3022     unsigned char vec[1];
3023 
3024     if (mincore((address)stack_extent, os::vm_page_size(), vec) == -1) {
3025       // Fallback to slow path on all errors, including EAGAIN
3026       stack_extent = (uintptr_t) get_stack_committed_bottom(
3027                                                            os::Linux::initial_thread_stack_bottom(),
3028                                                            (size_t)addr - stack_extent);
3029     }
3030 
3031     if (stack_extent < (uintptr_t)addr) {
3032       ::munmap((void*)stack_extent, (uintptr_t)(addr - stack_extent));
3033     }
3034   }
3035 
3036   return os::commit_memory(addr, size, !ExecMem);
3037 }
3038 
3039 // If this is a growable mapping, remove the guard pages entirely by
3040 // munmap()ping them.  If not, just call uncommit_memory(). This only
3041 // affects the main/initial thread, but guard against future OS changes.
3042 // It's safe to always unmap guard pages for the initial thread because we
3043 // always place them right after the end of the mapped region.
3044 
3045 bool os::remove_stack_guard_pages(char* addr, size_t size) {
3048   if (os::Linux::is_initial_thread()) {
3049     return ::munmap(addr, size) == 0;
3050   }
3051 
3052   return os::uncommit_memory(addr, size);
3053 }
3054 
3055 // If 'fixed' is true, anon_mmap() will attempt to reserve anonymous memory
3056 // at 'requested_addr'. If there are existing memory mappings at the same
3057 // location, however, they will be overwritten. If 'fixed' is false,
3058 // 'requested_addr' is only treated as a hint, the return value may or
3059 // may not start from the requested address. Unlike Linux mmap(), this
3060 // function returns NULL to indicate failure.
3061 static char* anon_mmap(char* requested_addr, size_t bytes, bool fixed) {
3062   char * addr;
3063   int flags;
3064 
3065   flags = MAP_PRIVATE | MAP_NORESERVE | MAP_ANONYMOUS;
3066   if (fixed) {
3067     assert((uintptr_t)requested_addr % os::Linux::page_size() == 0, "unaligned address");
3068     flags |= MAP_FIXED;
3069   }
3070 
3071   // Map reserved/uncommitted pages PROT_NONE so we fail early if we
3072   // touch an uncommitted page. Otherwise, the read/write might
3073   // succeed if we have enough swap space to back the physical page.
3074   addr = (char*)::mmap(requested_addr, bytes, PROT_NONE,
3075                        flags, -1, 0);
3076 
3077   return addr == MAP_FAILED ? NULL : addr;
3078 }
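
// An illustrative reserve-then-commit sketch (example size assumed, not from
// the original source): reserve a PROT_NONE range with anon_mmap(), then make
// just the part that is needed accessible.
//
//   #include <sys/mman.h>
//   char* reserved = anon_mmap(NULL, 2 * M, false);  // address is a hint only
//   if (reserved != NULL) {
//     // Allow access to the first page; the rest stays reserved/uncommitted.
//     ::mprotect(reserved, os::vm_page_size(), PROT_READ | PROT_WRITE);
//   }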
3079 
3080 // Allocate (using mmap, NO_RESERVE, with small pages) at either a given request address
3081 //   (req_addr != NULL) or with a given alignment.
3082 //  - bytes shall be a multiple of alignment.
3083 //  - req_addr can be NULL. If not NULL, it must be a multiple of alignment.
3084 //  - alignment sets the alignment at which memory shall be allocated.
3085 //     It must be a multiple of allocation granularity.
3086 // Returns address of memory or NULL. If req_addr was not NULL, will only return
3087 //  req_addr or NULL.
3088 static char* anon_mmap_aligned(size_t bytes, size_t alignment, char* req_addr) {
3089 
3090   size_t extra_size = bytes;
3091   if (req_addr == NULL && alignment > 0) {
3092     extra_size += alignment;
3093   }
3094 
3095   char* start = (char*) ::mmap(req_addr, extra_size, PROT_NONE,
3096     MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE,
3097     -1, 0);
3098   if (start == MAP_FAILED) {
3099     start = NULL;
3100   } else {
3101     if (req_addr != NULL) {
3102       if (start != req_addr) {
3103         ::munmap(start, extra_size);
3104         start = NULL;
3105       }
3106     } else {
3107       char* const start_aligned = (char*) align_ptr_up(start, alignment);
3108       char* const end_aligned = start_aligned + bytes;
3109       char* const end = start + extra_size;
3110       if (start_aligned > start) {
3111         ::munmap(start, start_aligned - start);
3112       }
3113       if (end_aligned < end) {
3114         ::munmap(end_aligned, end - end_aligned);
3115       }
3116       start = start_aligned;
3117     }
3118   }
3119   return start;
3120 }
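
// The trimming above, as a worked example with assumed values (not from the
// original source): alignment = 1M, bytes = 2M, so extra_size = 3M. If mmap()
// returns start = 0x7f0000123000, then:
//
//   start_aligned = align_ptr_up(start, 1M)  -> 0x7f0000200000
//   end_aligned   = start_aligned + 2M       -> 0x7f0000400000
//   end           = start + 3M               -> 0x7f0000423000
//
// munmap() releases the [start, start_aligned) head (0xdd000 bytes) and the
// [end_aligned, end) tail (0x23000 bytes), leaving exactly 2M of reserved
// space aligned at a 1M boundary.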
3121 
3122 static int anon_munmap(char * addr, size_t size) {
3123   return ::munmap(addr, size) == 0;
3124 }
3125 
3126 char* os::pd_reserve_memory(size_t bytes, char* requested_addr,
3127                             size_t alignment_hint) {
3128   return anon_mmap(requested_addr, bytes, (requested_addr != NULL));
3129 }
3130 
3131 bool os::pd_release_memory(char* addr, size_t size) {
3132   return anon_munmap(addr, size);
3133 }
3134 
3135 static bool linux_mprotect(char* addr, size_t size, int prot) {
3136   // Linux wants the mprotect address argument to be page aligned.
3137   char* bottom = (char*)align_size_down((intptr_t)addr, os::Linux::page_size());
3138 
3139   // According to SUSv3, mprotect() should only be used with mappings
3140   // established by mmap(), and mmap() always maps whole pages. Unaligned
3141   // 'addr' likely indicates a problem in the VM (e.g. trying to change
3142   // protection of malloc'ed or statically allocated memory). Check the
3143   // caller if you hit this assert.
3144   assert(addr == bottom, "sanity check");
3145 
3146   size = align_size_up(pointer_delta(addr, bottom, 1) + size, os::Linux::page_size());
3147   return ::mprotect(bottom, size, prot) == 0;
3148 }
3149 
3150 // Set protections specified
3151 bool os::protect_memory(char* addr, size_t bytes, ProtType prot,
3152                         bool is_committed) {
3153   unsigned int p = 0;
3154   switch (prot) {
3155   case MEM_PROT_NONE: p = PROT_NONE; break;
3156   case MEM_PROT_READ: p = PROT_READ; break;
3157   case MEM_PROT_RW:   p = PROT_READ|PROT_WRITE; break;
3158   case MEM_PROT_RWX:  p = PROT_READ|PROT_WRITE|PROT_EXEC; break;
3159   default:
3160     ShouldNotReachHere();
3161   }
3162   // is_committed is unused.
3163   return linux_mprotect(addr, bytes, p);
3164 }
3165 
3166 bool os::guard_memory(char* addr, size_t size) {
3167   return linux_mprotect(addr, size, PROT_NONE);
3168 }
3169 
3170 bool os::unguard_memory(char* addr, size_t size) {
3171   return linux_mprotect(addr, size, PROT_READ|PROT_WRITE);
3172 }
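
// A usage sketch (hypothetical call site, not from the original source) of
// the guard/unguard pair above: temporarily revoke access to a page so that
// stray accesses fault, then restore read/write access.
//
//   char* page = os::reserve_memory(os::vm_page_size());
//   if (page != NULL && os::commit_memory(page, os::vm_page_size(), !ExecMem)) {
//     os::guard_memory(page, os::vm_page_size());    // now PROT_NONE: accesses fault
//     os::unguard_memory(page, os::vm_page_size());  // back to PROT_READ|PROT_WRITE
//   }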
3173 
3174 bool os::Linux::transparent_huge_pages_sanity_check(bool warn,
3175                                                     size_t page_size) {
3176   bool result = false;
3177   void *p = mmap(NULL, page_size * 2, PROT_READ|PROT_WRITE,
3178                  MAP_ANONYMOUS|MAP_PRIVATE,
3179                  -1, 0);
3180   if (p != MAP_FAILED) {
3181     void *aligned_p = align_ptr_up(p, page_size);
3182 
3183     result = madvise(aligned_p, page_size, MADV_HUGEPAGE) == 0;
3184 
3185     munmap(p, page_size * 2);
3186   }
3187 
3188   if (warn && !result) {
3189     warning("TransparentHugePages is not supported by the operating system.");
3190   }
3191 
3192   return result;
3193 }
3194 
3195 bool os::Linux::hugetlbfs_sanity_check(bool warn, size_t page_size) {
3196   bool result = false;
3197   void *p = mmap(NULL, page_size, PROT_READ|PROT_WRITE,
3198                  MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB,
3199                  -1, 0);
3200 
3201   if (p != MAP_FAILED) {
3202     // We don't know if this really is a huge page or not.
3203     FILE *fp = fopen("/proc/self/maps", "r");
3204     if (fp) {
3205       while (!feof(fp)) {
3206         char chars[257];
3207         long x = 0;
3208         if (fgets(chars, sizeof(chars), fp)) {
3209           if (sscanf(chars, "%lx-%*x", &x) == 1
3210               && x == (long)p) {
3211             if (strstr(chars, "hugepage")) {
3212               result = true;
3213               break;
3214             }
3215           }
3216         }
3217       }
3218       fclose(fp);
3219     }
3220     munmap(p, page_size);
3221   }
3222 
3223   if (warn && !result) {
3224     warning("HugeTLBFS is not supported by the operating system.");
3225   }
3226 
3227   return result;
3228 }
3229 
3230 // Set the coredump_filter bits to include largepages in core dump (bit 6)
3231 //
3232 // From the coredump_filter documentation:
3233 //
3234 // - (bit 0) anonymous private memory
3235 // - (bit 1) anonymous shared memory
3236 // - (bit 2) file-backed private memory
3237 // - (bit 3) file-backed shared memory
3238 // - (bit 4) ELF header pages in file-backed private memory areas (it is
3239 //           effective only if the bit 2 is cleared)
3240 // - (bit 5) hugetlb private memory
3241 // - (bit 6) hugetlb shared memory
3242 //
3243 static void set_coredump_filter(void) {
3244   FILE *f;
3245   long cdm;
3246 
3247   if ((f = fopen("/proc/self/coredump_filter", "r+")) == NULL) {
3248     return;
3249   }
3250 
3251   if (fscanf(f, "%lx", &cdm) != 1) {
3252     fclose(f);
3253     return;
3254   }
3255 
3256   rewind(f);
3257 
3258   if ((cdm & LARGEPAGES_BIT) == 0) {
3259     cdm |= LARGEPAGES_BIT;
3260     fprintf(f, "%#lx", cdm);
3261   }
3262 
3263   fclose(f);
3264 }
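
// A worked example of the read-modify-write above, assuming the current
// filter value is 0x23 and LARGEPAGES_BIT is bit 6 (per the list above):
//
//   long cdm = 0x23;           // read via fscanf(f, "%lx", &cdm)
//   cdm |= (1L << 6);          // cdm is now 0x63
//   // rewind(f); fprintf(f, "%#lx", cdm) writes "0x63" back to the file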
3265 
3266 // Large page support
3267 
3268 static size_t _large_page_size = 0;
3269 
3270 size_t os::Linux::find_large_page_size() {
3271   size_t large_page_size = 0;
3272 
3273   // large_page_size on Linux is used to round up heap size. x86 uses either
3274   // a 2M or a 4M page, depending on whether PAE (Physical Address Extensions)
3275   // mode is enabled. AMD64/EM64T uses a 2M page in 64-bit mode. IA64 can use
3276   // pages as large as 256M.
3277   //
3278   // Here we try to figure out page size by parsing /proc/meminfo and looking
3279   // for a line with the following format:
3280   //    Hugepagesize:     2048 kB
3281   //
3282   // If we can't determine the value (e.g. /proc is not mounted, or the text
3283   // format has been changed), we'll use the largest page size supported by
3284   // the processor.
3285 
3286 #ifndef ZERO
3287   large_page_size =
3288     AARCH64_ONLY(2 * M)
3289     AMD64_ONLY(2 * M)
3290     ARM32_ONLY(2 * M)
3291     IA32_ONLY(4 * M)
3292     IA64_ONLY(256 * M)
3293     PPC_ONLY(4 * M)
3294     S390_ONLY(1 * M)
3295     SPARC_ONLY(4 * M);
3296 #endif // ZERO
3297 
3298   FILE *fp = fopen("/proc/meminfo", "r");
3299   if (fp) {
3300     while (!feof(fp)) {
3301       int x = 0;
3302       char buf[16];
3303       if (fscanf(fp, "Hugepagesize: %d", &x) == 1) {
3304         if (x && fgets(buf, sizeof(buf), fp) && strcmp(buf, " kB\n") == 0) {
3305           large_page_size = x * K;
3306           break;
3307         }
3308       } else {
3309         // skip to next line
3310         for (;;) {
3311           int ch = fgetc(fp);
3312           if (ch == EOF || ch == (int)'\n') break;
3313         }
3314       }
3315     }
3316     fclose(fp);
3317   }
3318 
3319   if (!FLAG_IS_DEFAULT(LargePageSizeInBytes) && LargePageSizeInBytes != large_page_size) {
3320     warning("Setting LargePageSizeInBytes has no effect on this OS. Large page size is "
3321             SIZE_FORMAT "%s.", byte_size_in_proper_unit(large_page_size),
3322             proper_unit_for_byte_size(large_page_size));
3323   }
3324 
3325   return large_page_size;
3326 }
3327 
3328 size_t os::Linux::setup_large_page_size() {
3329   _large_page_size = Linux::find_large_page_size();
3330   const size_t default_page_size = (size_t)Linux::page_size();
3331   if (_large_page_size > default_page_size) {
3332     _page_sizes[0] = _large_page_size;
3333     _page_sizes[1] = default_page_size;
3334     _page_sizes[2] = 0;
3335   }
3336 
3337   return _large_page_size;
3338 }
3339 
3340 bool os::Linux::setup_large_page_type(size_t page_size) {
3341   if (FLAG_IS_DEFAULT(UseHugeTLBFS) &&
3342       FLAG_IS_DEFAULT(UseSHM) &&
3343       FLAG_IS_DEFAULT(UseTransparentHugePages)) {
3344 
3345     // The type of large pages has not been specified by the user.
3346 
3347     // Try UseHugeTLBFS and then UseSHM.
3348     UseHugeTLBFS = UseSHM = true;
3349 
3350     // Don't try UseTransparentHugePages since there are known
3351     // performance issues with it turned on. This might change in the future.
3352     UseTransparentHugePages = false;
3353   }
3354 
3355   if (UseTransparentHugePages) {
3356     bool warn_on_failure = !FLAG_IS_DEFAULT(UseTransparentHugePages);
3357     if (transparent_huge_pages_sanity_check(warn_on_failure, page_size)) {
3358       UseHugeTLBFS = false;
3359       UseSHM = false;
3360       return true;
3361     }
3362     UseTransparentHugePages = false;
3363   }
3364 
3365   if (UseHugeTLBFS) {
3366     bool warn_on_failure = !FLAG_IS_DEFAULT(UseHugeTLBFS);
3367     if (hugetlbfs_sanity_check(warn_on_failure, page_size)) {
3368       UseSHM = false;
3369       return true;
3370     }
3371     UseHugeTLBFS = false;
3372   }
3373 
3374   return UseSHM;
3375 }
3376 
3377 void os::large_page_init() {
3378   if (!UseLargePages &&
3379       !UseTransparentHugePages &&
3380       !UseHugeTLBFS &&
3381       !UseSHM) {
3382     // Not using large pages.
3383     return;
3384   }
3385 
3386   if (!FLAG_IS_DEFAULT(UseLargePages) && !UseLargePages) {
3387     // The user explicitly turned off large pages.
3388     // Ignore the rest of the large pages flags.
3389     UseTransparentHugePages = false;
3390     UseHugeTLBFS = false;
3391     UseSHM = false;
3392     return;
3393   }
3394 
3395   size_t large_page_size = Linux::setup_large_page_size();
3396   UseLargePages          = Linux::setup_large_page_type(large_page_size);
3397 
3398   set_coredump_filter();
3399 }
3400 
3401 #ifndef SHM_HUGETLB
3402   #define SHM_HUGETLB 04000
3403 #endif
3404 
3405 #define shm_warning_format(format, ...)              \
3406   do {                                               \
3407     if (UseLargePages &&                             \
3408         (!FLAG_IS_DEFAULT(UseLargePages) ||          \
3409          !FLAG_IS_DEFAULT(UseSHM) ||                 \
3410          !FLAG_IS_DEFAULT(LargePageSizeInBytes))) {  \
3411       warning(format, __VA_ARGS__);                  \
3412     }                                                \
3413   } while (0)
3414 
3415 #define shm_warning(str) shm_warning_format("%s", str)
3416 
3417 #define shm_warning_with_errno(str)                \
3418   do {                                             \
3419     int err = errno;                               \
3420     shm_warning_format(str " (error = %d)", err);  \
3421   } while (0)
3422 
3423 static char* shmat_with_alignment(int shmid, size_t bytes, size_t alignment) {
3424   assert(is_size_aligned(bytes, alignment), "Must be divisible by the alignment");
3425 
3426   if (!is_size_aligned(alignment, SHMLBA)) {
3427     assert(false, "Code below assumes that alignment is at least SHMLBA aligned");
3428     return NULL;
3429   }
3430 
3431   // To ensure that we get 'alignment' aligned memory from shmat,
3432   // we pre-reserve aligned virtual memory and then attach to that.
3433 
3434   char* pre_reserved_addr = anon_mmap_aligned(bytes, alignment, NULL);
3435   if (pre_reserved_addr == NULL) {
3436     // Couldn't pre-reserve aligned memory.
3437     shm_warning("Failed to pre-reserve aligned memory for shmat.");
3438     return NULL;
3439   }
3440 
3441   // SHM_REMAP is needed to allow shmat to map over an existing mapping.
3442   char* addr = (char*)shmat(shmid, pre_reserved_addr, SHM_REMAP);
3443 
3444   if ((intptr_t)addr == -1) {
3445     int err = errno;
3446     shm_warning_with_errno("Failed to attach shared memory.");
3447 
3448     assert(err != EACCES, "Unexpected error");
3449     assert(err != EIDRM,  "Unexpected error");
3450     assert(err != EINVAL, "Unexpected error");
3451 
3452     // Since we don't know whether the kernel unmapped the pre-reserved memory
3453     // area, we can't unmap it ourselves: that would potentially unmap memory
3454     // that was mapped by other threads.
3455     return NULL;
3456   }
3457 
3458   return addr;
3459 }
3460 
3461 static char* shmat_at_address(int shmid, char* req_addr) {
3462   if (!is_ptr_aligned(req_addr, SHMLBA)) {
3463     assert(false, "Requested address needs to be SHMLBA aligned");
3464     return NULL;
3465   }
3466 
3467   char* addr = (char*)shmat(shmid, req_addr, 0);
3468 
3469   if ((intptr_t)addr == -1) {
3470     shm_warning_with_errno("Failed to attach shared memory.");
3471     return NULL;
3472   }
3473 
3474   return addr;
3475 }
3476 
3477 static char* shmat_large_pages(int shmid, size_t bytes, size_t alignment, char* req_addr) {
3478   // If a req_addr has been provided, we assume that the caller has already aligned the address.
3479   if (req_addr != NULL) {
3480     assert(is_ptr_aligned(req_addr, os::large_page_size()), "Must be divisible by the large page size");
3481     assert(is_ptr_aligned(req_addr, alignment), "Must be divisible by given alignment");
3482     return shmat_at_address(shmid, req_addr);
3483   }
3484 
3485   // Since shmid has been setup with SHM_HUGETLB, shmat will automatically
3486   // return large page size aligned memory addresses when req_addr == NULL.
3487   // However, if the alignment is larger than the large page size, we have
3488   // to manually ensure that the memory returned is 'alignment' aligned.
3489   if (alignment > os::large_page_size()) {
3490     assert(is_size_aligned(alignment, os::large_page_size()), "Must be divisible by the large page size");
3491     return shmat_with_alignment(shmid, bytes, alignment);
3492   } else {
3493     return shmat_at_address(shmid, NULL);
3494   }
3495 }
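
// For illustration, a self-contained sketch (not from the original source;
// "bytes" is an assumed, large-page-aligned size) of the SysV lifecycle used
// by reserve_memory_special_shm() below: create, attach, and mark for
// deletion so the segment disappears at the last detach.
//
//   #include <sys/ipc.h>
//   #include <sys/shm.h>
//   int id = shmget(IPC_PRIVATE, bytes, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
//   if (id != -1) {
//     char* p = (char*)shmat(id, NULL, 0);  // kernel picks an aligned address
//     shmctl(id, IPC_RMID, NULL);           // delete at last detach
//     if ((intptr_t)p != -1) {
//       // ... use p ...
//       shmdt(p);                           // final detach frees the segment
//     }
//   }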
3496 
3497 char* os::Linux::reserve_memory_special_shm(size_t bytes, size_t alignment,
3498                                             char* req_addr, bool exec) {
3499   // "exec" is passed in but not used.  Creating the shared image for
3500   // the code cache doesn't have an SHM_X executable permission to check.
3501   assert(UseLargePages && UseSHM, "only for SHM large pages");
3502   assert(is_ptr_aligned(req_addr, os::large_page_size()), "Unaligned address");
3503   assert(is_ptr_aligned(req_addr, alignment), "Unaligned address");
3504 
3505   if (!is_size_aligned(bytes, os::large_page_size())) {
3506     return NULL; // Fallback to small pages.
3507   }
3508 
3509   // Create a large shared memory region to attach to based on size.
3510   // Currently, size is the total size of the heap.
3511   int shmid = shmget(IPC_PRIVATE, bytes, SHM_HUGETLB|IPC_CREAT|SHM_R|SHM_W);
3512   if (shmid == -1) {
3513     // Possible reasons for shmget failure:
3514     // 1. shmmax is too small for Java heap.
3515     //    > check shmmax value: cat /proc/sys/kernel/shmmax
3516     //    > increase shmmax value: echo "0xffffffff" > /proc/sys/kernel/shmmax
3517     // 2. not enough large page memory.
3518     //    > check available large pages: cat /proc/meminfo
3519     //    > increase amount of large pages:
3520     //          echo new_value > /proc/sys/vm/nr_hugepages
3521     //      Note 1: different Linux distributions may use a different name for
3522     //            this property, e.g. on Redhat AS-3 it is "hugetlb_pool".
3523     //      Note 2: it's possible there's enough physical memory available but
3524     //            it is so fragmented after a long run that it can't be
3525     //            coalesced into large pages. Try to reserve large pages when
3526     //            the system is still "fresh".
3527     shm_warning_with_errno("Failed to reserve shared memory.");
3528     return NULL;
3529   }
3530 
3531   // Attach to the region.
3532   char* addr = shmat_large_pages(shmid, bytes, alignment, req_addr);
3533 
3534   // Remove shmid. If shmat() is successful, the actual shared memory segment
3535   // will be deleted when it's detached by shmdt() or when the process
3536   // terminates. If shmat() is not successful this will remove the shared
3537   // segment immediately.
3538   shmctl(shmid, IPC_RMID, NULL);
3539 
3540   return addr;
3541 }
3542 
3543 static void warn_on_large_pages_failure(char* req_addr, size_t bytes,
3544                                         int error) {
3545   assert(error == ENOMEM, "Only expect to fail if no memory is available");
3546 
3547   bool warn_on_failure = UseLargePages &&
3548       (!FLAG_IS_DEFAULT(UseLargePages) ||
3549        !FLAG_IS_DEFAULT(UseHugeTLBFS) ||
3550        !FLAG_IS_DEFAULT(LargePageSizeInBytes));
3551 
3552   if (warn_on_failure) {
3553     char msg[128];
3554     jio_snprintf(msg, sizeof(msg), "Failed to reserve large pages memory req_addr: "
3555                  PTR_FORMAT " bytes: " SIZE_FORMAT " (errno = %d).", req_addr, bytes, error);
3556     warning("%s", msg);
3557   }
3558 }
3559 
3560 char* os::Linux::reserve_memory_special_huge_tlbfs_only(size_t bytes,
3561                                                         char* req_addr,
3562                                                         bool exec) {
3563   assert(UseLargePages && UseHugeTLBFS, "only for Huge TLBFS large pages");
3564   assert(is_size_aligned(bytes, os::large_page_size()), "Unaligned size");
3565   assert(is_ptr_aligned(req_addr, os::large_page_size()), "Unaligned address");
3566 
3567   int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
3568   char* addr = (char*)::mmap(req_addr, bytes, prot,
3569                              MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB,
3570                              -1, 0);
3571 
3572   if (addr == MAP_FAILED) {
3573     warn_on_large_pages_failure(req_addr, bytes, errno);
3574     return NULL;
3575   }
3576 
3577   assert(is_ptr_aligned(addr, os::large_page_size()), "Must be");
3578 
3579   return addr;
3580 }
3581 
3582 // Reserve memory using mmap(MAP_HUGETLB).
3583 //  - bytes shall be a multiple of alignment.
3584 //  - req_addr can be NULL. If not NULL, it must be a multiple of alignment.
3585 //  - alignment sets the alignment at which memory shall be allocated.
3586 //     It must be a multiple of allocation granularity.
3587 // Returns address of memory or NULL. If req_addr was not NULL, will only return
3588 //  req_addr or NULL.
3589 char* os::Linux::reserve_memory_special_huge_tlbfs_mixed(size_t bytes,
3590                                                          size_t alignment,
3591                                                          char* req_addr,
3592                                                          bool exec) {
3593   size_t large_page_size = os::large_page_size();
3594   assert(bytes >= large_page_size, "Shouldn't allocate large pages for small sizes");
3595 
3596   assert(is_ptr_aligned(req_addr, alignment), "Must be");
3597   assert(is_size_aligned(bytes, alignment), "Must be");
3598 
3599   // First reserve - but not commit - the address range in small pages.
3600   char* const start = anon_mmap_aligned(bytes, alignment, req_addr);
3601 
3602   if (start == NULL) {
3603     return NULL;
3604   }
3605 
3606   assert(is_ptr_aligned(start, alignment), "Must be");
3607 
3608   char* end = start + bytes;
3609 
3610   // Find the regions of the allocated chunk that can be promoted to large pages.
3611   char* lp_start = (char*)align_ptr_up(start, large_page_size);
3612   char* lp_end   = (char*)align_ptr_down(end, large_page_size);
3613 
3614   size_t lp_bytes = lp_end - lp_start;
3615 
3616   assert(is_size_aligned(lp_bytes, large_page_size), "Must be");
3617 
3618   if (lp_bytes == 0) {
3619     // The mapped region doesn't even span the start and the end of a large page.
3620     // Fall back to allocate a non-special area.
3621     ::munmap(start, end - start);
3622     return NULL;
3623   }
3624 
3625   int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
3626 
3627   void* result;
3628 
3629   // Commit small-paged leading area.
3630   if (start != lp_start) {
3631     result = ::mmap(start, lp_start - start, prot,
3632                     MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED,
3633                     -1, 0);
3634     if (result == MAP_FAILED) {
3635       ::munmap(lp_start, end - lp_start);
3636       return NULL;
3637     }
3638   }
3639 
3640   // Commit large-paged area.
3641   result = ::mmap(lp_start, lp_bytes, prot,
3642                   MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED|MAP_HUGETLB,
3643                   -1, 0);
3644   if (result == MAP_FAILED) {
3645     warn_on_large_pages_failure(lp_start, lp_bytes, errno);
3646     // If the mmap above fails, the large pages region will be unmapped and we
3647     // have regions before and after with small pages. Release these regions.
3648     //
3649     // |  mapped  |  unmapped  |  mapped  |
3650     // ^          ^            ^          ^
3651     // start      lp_start     lp_end     end
3652     //
3653     ::munmap(start, lp_start - start);
3654     ::munmap(lp_end, end - lp_end);
3655     return NULL;
3656   }
3657 
3658   // Commit small-paged trailing area.
3659   if (lp_end != end) {
3660     result = ::mmap(lp_end, end - lp_end, prot,
3661                     MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED,
3662                     -1, 0);
3663     if (result == MAP_FAILED) {
3664       ::munmap(start, lp_end - start);
3665       return NULL;
3666     }
3667   }
3668 
3669   return start;
3670 }
3671 
3672 char* os::Linux::reserve_memory_special_huge_tlbfs(size_t bytes,
3673                                                    size_t alignment,
3674                                                    char* req_addr,
3675                                                    bool exec) {
3676   assert(UseLargePages && UseHugeTLBFS, "only for Huge TLBFS large pages");
3677   assert(is_ptr_aligned(req_addr, alignment), "Must be");
3678   assert(is_size_aligned(alignment, os::vm_allocation_granularity()), "Must be");
3679   assert(is_power_of_2(os::large_page_size()), "Must be");
3680   assert(bytes >= os::large_page_size(), "Shouldn't allocate large pages for small sizes");
3681 
3682   if (is_size_aligned(bytes, os::large_page_size()) && alignment <= os::large_page_size()) {
3683     return reserve_memory_special_huge_tlbfs_only(bytes, req_addr, exec);
3684   } else {
3685     return reserve_memory_special_huge_tlbfs_mixed(bytes, alignment, req_addr, exec);
3686   }
3687 }
3688 
3689 char* os::reserve_memory_special(size_t bytes, size_t alignment,
3690                                  char* req_addr, bool exec) {
3691   assert(UseLargePages, "only for large pages");
3692 
3693   char* addr;
3694   if (UseSHM) {
3695     addr = os::Linux::reserve_memory_special_shm(bytes, alignment, req_addr, exec);
3696   } else {
3697     assert(UseHugeTLBFS, "must be");
3698     addr = os::Linux::reserve_memory_special_huge_tlbfs(bytes, alignment, req_addr, exec);
3699   }
3700 
3701   if (addr != NULL) {
3702     if (UseNUMAInterleaving) {
3703       numa_make_global(addr, bytes);
3704     }
3705 
3706     // The memory is committed
3707     MemTracker::record_virtual_memory_reserve_and_commit((address)addr, bytes, CALLER_PC);
3708   }
3709 
3710   return addr;
3711 }
3712 
3713 bool os::Linux::release_memory_special_shm(char* base, size_t bytes) {
3714   // detaching the SHM segment will also delete it, see reserve_memory_special_shm()
3715   return shmdt(base) == 0;
3716 }
3717 
3718 bool os::Linux::release_memory_special_huge_tlbfs(char* base, size_t bytes) {
3719   return pd_release_memory(base, bytes);
3720 }
3721 
3722 bool os::release_memory_special(char* base, size_t bytes) {
3723   bool res;
3724   if (MemTracker::tracking_level() > NMT_minimal) {
3725     Tracker tkr = MemTracker::get_virtual_memory_release_tracker();
3726     res = os::Linux::release_memory_special_impl(base, bytes);
3727     if (res) {
3728       tkr.record((address)base, bytes);
3729     }
3730 
3731   } else {
3732     res = os::Linux::release_memory_special_impl(base, bytes);
3733   }
3734   return res;
3735 }
3736 
3737 bool os::Linux::release_memory_special_impl(char* base, size_t bytes) {
3738   assert(UseLargePages, "only for large pages");
3739   bool res;
3740 
3741   if (UseSHM) {
3742     res = os::Linux::release_memory_special_shm(base, bytes);
3743   } else {
3744     assert(UseHugeTLBFS, "must be");
3745     res = os::Linux::release_memory_special_huge_tlbfs(base, bytes);
3746   }
3747   return res;
3748 }
3749 
3750 size_t os::large_page_size() {
3751   return _large_page_size;
3752 }
3753 
3754 // With SysV SHM the entire memory region must be allocated as shared
3755 // memory.
3756 // HugeTLBFS allows the application to commit large page memory on demand.
3757 // However, when committing memory with HugeTLBFS fails, the region
3758 // that was supposed to be committed will lose the old reservation
3759 // and allow other threads to steal that memory region. Because of this
3760 // behavior we can't commit HugeTLBFS memory.
3761 bool os::can_commit_large_page_memory() {
3762   return UseTransparentHugePages;
3763 }
3764 
3765 bool os::can_execute_large_page_memory() {
3766   return UseTransparentHugePages || UseHugeTLBFS;
3767 }
3768 
3769 // Reserve memory at an arbitrary address, only if that area is
3770 // available (and not reserved for something else).
3771 
3772 char* os::pd_attempt_reserve_memory_at(size_t bytes, char* requested_addr) {
3773   const int max_tries = 10;
3774   char* base[max_tries];
3775   size_t size[max_tries];
3776   const size_t gap = 0x000000;
3777 
3778   // Assert only that the size is a multiple of the page size, since
3779   // that's all that mmap requires, and since that's all we really know
3780   // about at this low abstraction level.  If we need higher alignment,
3781   // we can either pass an alignment to this method or verify alignment
3782   // in one of the methods further up the call chain.  See bug 5044738.
3783   assert(bytes % os::vm_page_size() == 0, "reserving unexpected size block");
3784 
3785   // Repeatedly allocate blocks until the block is allocated at the
3786   // right spot.
3787 
3788   // Linux mmap allows the caller to pass an address as a hint; give it a try
3789   // first. If the kernel honors the hint, then we can return immediately.
3790   char * addr = anon_mmap(requested_addr, bytes, false);
3791   if (addr == requested_addr) {
3792     return requested_addr;
3793   }
3794 
3795   if (addr != NULL) {
3796     // mmap() succeeded, but it failed to reserve at the requested address
3797     anon_munmap(addr, bytes);
3798   }
3799 
3800   int i;
3801   for (i = 0; i < max_tries; ++i) {
3802     base[i] = reserve_memory(bytes);
3803 
3804     if (base[i] != NULL) {
3805       // Is this the block we wanted?
3806       if (base[i] == requested_addr) {
3807         size[i] = bytes;
3808         break;
3809       }
3810 
3811       // Does this overlap the block we wanted? Give back the overlapped
3812       // parts and try again.
3813 
3814       ptrdiff_t top_overlap = requested_addr + (bytes + gap) - base[i];
3815       if (top_overlap >= 0 && (size_t)top_overlap < bytes) {
3816         unmap_memory(base[i], top_overlap);
3817         base[i] += top_overlap;
3818         size[i] = bytes - top_overlap;
3819       } else {
3820         ptrdiff_t bottom_overlap = base[i] + bytes - requested_addr;
3821         if (bottom_overlap >= 0 && (size_t)bottom_overlap < bytes) {
3822           unmap_memory(requested_addr, bottom_overlap);
3823           size[i] = bytes - bottom_overlap;
3824         } else {
3825           size[i] = bytes;
3826         }
3827       }
3828     }
3829   }
3830 
3831   // Give back the unused reserved pieces.
3832 
3833   for (int j = 0; j < i; ++j) {
3834     if (base[j] != NULL) {
3835       unmap_memory(base[j], size[j]);
3836     }
3837   }
3838 
3839   if (i < max_tries) {
3840     return requested_addr;
3841   } else {
3842     return NULL;
3843   }
3844 }
3845 
3846 size_t os::read(int fd, void *buf, unsigned int nBytes) {
3847   return ::read(fd, buf, nBytes);
3848 }
3849 
3850 size_t os::read_at(int fd, void *buf, unsigned int nBytes, jlong offset) {
3851   return ::pread(fd, buf, nBytes, offset);
3852 }
3853 
3854 // Short sleep, direct OS call.
3855 //
3856 // Note: certain versions of the Linux CFS scheduler (since 2.6.23) do not
3857 // guarantee that sched_yield(2) will actually give up the CPU:
3858 //
3859 //   * A thread alone on its particular CPU keeps running.
3860 //   * Before the introduction of "skip_buddy" with "compat_yield" disabled
3861 //     (pre 2.6.39).
3862 //
3863 // So calling this with 0 is an alternative.
3864 //
3865 void os::naked_short_sleep(jlong ms) {
3866   struct timespec req;
3867 
3868   assert(ms < 1000, "Uninterruptible sleep, short time use only");
3869   req.tv_sec = 0;
3870   if (ms > 0) {
3871     req.tv_nsec = (ms % 1000) * 1000000;
3872   } else {
3873     req.tv_nsec = 1;
3874   }
3875 
3876   nanosleep(&req, NULL);
3877 
3878   return;
3879 }
3880 
3881 // Sleep forever; naked call to OS-specific sleep; use with CAUTION
3882 void os::infinite_sleep() {
3883   while (true) {    // sleep forever ...
3884     ::sleep(100);   // ... 100 seconds at a time
3885   }
3886 }
3887 
3888 // Used to convert frequent JVM_Yield() to nops
3889 bool os::dont_yield() {
3890   return DontYieldALot;
3891 }
3892 
3893 void os::naked_yield() {
3894   sched_yield();
3895 }
3896 
3897 ////////////////////////////////////////////////////////////////////////////////
3898 // thread priority support
3899 
3900 // Note: Normal Linux applications are run with SCHED_OTHER policy. SCHED_OTHER
3901 // only supports dynamic priority, static priority must be zero. For real-time
3902 // applications, Linux supports SCHED_RR which allows static priority (1-99).
3903 // However, for large multi-threaded applications, SCHED_RR is not only slower
3904 // than SCHED_OTHER, but also very unstable (my volano tests hang hard 4 out
3905 // of 5 runs - Sep 2005).
3906 //
3907 // The following code actually changes the niceness of kernel-thread/LWP. It
3908 // has an assumption that setpriority() only modifies one kernel-thread/LWP,
3909 // not the entire user process, and user level threads are 1:1 mapped to kernel
3910 // threads. It has always been the case, but could change in the future. For
3911 // this reason, the code should not be used as default (ThreadPriorityPolicy=0).
3912 // It is only used when ThreadPriorityPolicy=1 and requires root privilege.
3913 
3914 int os::java_to_os_priority[CriticalPriority + 1] = {
3915   19,              // 0 Entry should never be used
3916 
3917    4,              // 1 MinPriority
3918    3,              // 2
3919    2,              // 3
3920 
3921    1,              // 4
3922    0,              // 5 NormPriority
3923   -1,              // 6
3924 
3925   -2,              // 7
3926   -3,              // 8
3927   -4,              // 9 NearMaxPriority
3928 
3929   -5,              // 10 MaxPriority
3930 
3931   -5               // 11 CriticalPriority
3932 };
3933 
3934 static int prio_init() {
3935   if (ThreadPriorityPolicy == 1) {
3936     // Only root can raise thread priority. Don't allow ThreadPriorityPolicy=1
3937     // if the effective uid is not root. Perhaps a more elegant way of doing
3938     // this is to test the CAP_SYS_NICE capability, but that would require libcap.so.
3939     if (geteuid() != 0) {
3940       if (!FLAG_IS_DEFAULT(ThreadPriorityPolicy)) {
3941         warning("-XX:ThreadPriorityPolicy requires root privilege on Linux");
3942       }
3943       ThreadPriorityPolicy = 0;
3944     }
3945   }
3946   if (UseCriticalJavaThreadPriority) {
3947     os::java_to_os_priority[MaxPriority] = os::java_to_os_priority[CriticalPriority];
3948   }
3949   return 0;
3950 }
3951 
3952 OSReturn os::set_native_priority(Thread* thread, int newpri) {
3953   if (!UseThreadPriorities || ThreadPriorityPolicy == 0) return OS_OK;
3954 
3955   int ret = setpriority(PRIO_PROCESS, thread->osthread()->thread_id(), newpri);
3956   return (ret == 0) ? OS_OK : OS_ERR;
3957 }
3958 
3959 OSReturn os::get_native_priority(const Thread* const thread,
3960                                  int *priority_ptr) {
3961   if (!UseThreadPriorities || ThreadPriorityPolicy == 0) {
3962     *priority_ptr = java_to_os_priority[NormPriority];
3963     return OS_OK;
3964   }
3965 
3966   errno = 0;
3967   *priority_ptr = getpriority(PRIO_PROCESS, thread->osthread()->thread_id());
3968   return (*priority_ptr != -1 || errno == 0 ? OS_OK : OS_ERR);
3969 }
3970 
3971 // Hint to the underlying OS that a task switch would not be good.
3972 // Void return because it's a hint and can fail.
3973 void os::hint_no_preempt() {}
3974 
3975 ////////////////////////////////////////////////////////////////////////////////
3976 // suspend/resume support
3977 
3978 //  the low-level signal-based suspend/resume support is a remnant of the
3979 //  old VM-suspension mechanism that used to serve java-suspension, safepoints,
3980 //  etc. within hotspot. Now there is a single use-case for this:
3981 //    - calling get_thread_pc() on the VMThread by the flat-profiler task
3982 //      that runs in the watcher thread.
3983 //  The remaining code is greatly simplified from the more general suspension
3984 //  code that used to be used.
3985 //
3986 //  The protocol is quite simple:
3987 //  - suspend:
3988 //      - sends a signal to the target thread
3989 //      - polls the suspend state of the osthread using a yield loop
3990 //      - target thread signal handler (SR_handler) sets suspend state
3991 //        and blocks in sigsuspend until continued
3992 //  - resume:
3993 //      - sets target osthread state to continue
3994 //      - sends signal to end the sigsuspend loop in the SR_handler
3995 //
3996 //  Note that the SR_lock plays no role in this suspend/resume protocol,
3997 //  but is checked for NULL in SR_handler as a thread termination indicator.
3998 
3999 static void resume_clear_context(OSThread *osthread) {
4000   osthread->set_ucontext(NULL);
4001   osthread->set_siginfo(NULL);
4002 }
4003 
4004 static void suspend_save_context(OSThread *osthread, siginfo_t* siginfo,
4005                                  ucontext_t* context) {
4006   osthread->set_ucontext(context);
4007   osthread->set_siginfo(siginfo);
4008 }
4009 
4010 // Handler function invoked when a thread's execution is suspended or
4011 // resumed. We have to be careful that only async-safe functions are
4012 // called here (Note: most pthread functions are not async safe and
4013 // should be avoided.)
4014 //
4015 // Note: sigwait() is a more natural fit than sigsuspend() from an
4016 // interface point of view, but sigwait() prevents the signal handler
4017 // from being run. libpthread would get very confused by not having
4018 // its signal handlers run and prevents sigwait()'s use with the
4019 // mutex granting signal.
4020 //
4021 // Currently only ever called on the VMThread and JavaThreads (PC sampling)
4022 //
4023 static void SR_handler(int sig, siginfo_t* siginfo, ucontext_t* context) {
4024   // Save and restore errno to avoid confusing native code with EINTR
4025   // after sigsuspend.
4026   int old_errno = errno;
4027 
4028   Thread* thread = Thread::current_or_null_safe();
4029   assert(thread != NULL, "Missing current thread in SR_handler");
4030 
4031   // On some systems we have seen signal delivery get "stuck" until the signal
4032   // mask is changed as part of thread termination. Check that the current thread
4033   // has not already terminated (via SR_lock()) - else the following assertion
4034   // will fail because the thread is no longer a JavaThread as the ~JavaThread
4035   // destructor has completed.
4036 
4037   if (thread->SR_lock() == NULL) {
4038     return;
4039   }
4040 
4041   assert(thread->is_VM_thread() || thread->is_Java_thread(), "Must be VMThread or JavaThread");
4042 
4043   OSThread* osthread = thread->osthread();
4044 
4045   os::SuspendResume::State current = osthread->sr.state();
4046   if (current == os::SuspendResume::SR_SUSPEND_REQUEST) {
4047     suspend_save_context(osthread, siginfo, context);
4048 
4049     // attempt to switch the state, we assume we had a SUSPEND_REQUEST
4050     os::SuspendResume::State state = osthread->sr.suspended();
4051     if (state == os::SuspendResume::SR_SUSPENDED) {
4052       sigset_t suspend_set;  // signals for sigsuspend()
4053       sigemptyset(&suspend_set);
4054       // get current set of blocked signals and unblock resume signal
4055       pthread_sigmask(SIG_BLOCK, NULL, &suspend_set);
4056       sigdelset(&suspend_set, SR_signum);
4057 
4058       sr_semaphore.signal();
4059       // wait here until we are resumed
4060       while (1) {
4061         sigsuspend(&suspend_set);
4062 
4063         os::SuspendResume::State result = osthread->sr.running();
4064         if (result == os::SuspendResume::SR_RUNNING) {
4065           sr_semaphore.signal();
4066           break;
4067         }
4068       }
4069 
4070     } else if (state == os::SuspendResume::SR_RUNNING) {
4071       // request was cancelled, continue
4072     } else {
4073       ShouldNotReachHere();
4074     }
4075 
4076     resume_clear_context(osthread);
4077   } else if (current == os::SuspendResume::SR_RUNNING) {
4078     // request was cancelled, continue
4079   } else if (current == os::SuspendResume::SR_WAKEUP_REQUEST) {
4080     // ignore
4081   } else {
4082     // ignore
4083   }
4084 
4085   errno = old_errno;
4086 }
4087 
4088 static int SR_initialize() {
4089   struct sigaction act;
4090   char *s;
4091 
4092   // Get signal number to use for suspend/resume
4093   if ((s = ::getenv("_JAVA_SR_SIGNUM")) != 0) {
4094     int sig = ::strtol(s, 0, 10);
4095     if (sig > MAX2(SIGSEGV, SIGBUS) &&  // See 4355769.
4096         sig < NSIG) {                   // Must be legal signal and fit into sigflags[].
4097       SR_signum = sig;
4098     } else {
4099       warning("You set _JAVA_SR_SIGNUM=%d. It must be in range [%d, %d]. Using %d instead.",
4100               sig, MAX2(SIGSEGV, SIGBUS)+1, NSIG-1, SR_signum);
4101     }
4102   }
4103 
4104   assert(SR_signum > SIGSEGV && SR_signum > SIGBUS,
4105          "SR_signum must be greater than max(SIGSEGV, SIGBUS), see 4355769");
4106 
4107   sigemptyset(&SR_sigset);
4108   sigaddset(&SR_sigset, SR_signum);
4109 
4110   // Set up signal handler for suspend/resume
4111   act.sa_flags = SA_RESTART|SA_SIGINFO;
4112   act.sa_handler = (void (*)(int)) SR_handler;
4113 
4114   // SR_signum is blocked by default.
4115   // 4528190 - We also need to block the pthread restart signal (32 on all
4116   // supported Linux platforms). Note that LinuxThreads needs to block
4117   // this signal for all threads to work properly, so we don't have
4118   // to use a hard-coded signal number when setting up the mask.
4119   pthread_sigmask(SIG_BLOCK, NULL, &act.sa_mask);
4120 
4121   if (sigaction(SR_signum, &act, 0) == -1) {
4122     return -1;
4123   }
4124 
4125   // Save signal flag
4126   os::Linux::set_our_sigflags(SR_signum, act.sa_flags);
4127   return 0;
4128 }
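
// An illustrative example (assumed launcher code, not part of this file) of
// overriding the suspend/resume signal before the VM starts. The value must
// be above max(SIGSEGV, SIGBUS) and below NSIG, as validated above:
//
//   #include <stdlib.h>
//   // e.g. in a launcher, before JNI_CreateJavaVM:
//   setenv("_JAVA_SR_SIGNUM", "50", 1);  // use signal 50 for suspend/resume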
4129 
4130 static int sr_notify(OSThread* osthread) {
4131   int status = pthread_kill(osthread->pthread_id(), SR_signum);
4132   assert_status(status == 0, status, "pthread_kill");
4133   return status;
4134 }
4135 
4136 // "Randomly" selected value for how long we want to spin
4137 // before bailing out on suspending a thread, also how often
4138 // we send a signal to a thread we want to resume
4139 static const int RANDOMLY_LARGE_INTEGER = 1000000;
4140 static const int RANDOMLY_LARGE_INTEGER2 = 100;
4141 
4142 // Returns true on success and false on error - really an error is fatal,
4143 // but this seems to be the normal response to library errors.
4144 static bool do_suspend(OSThread* osthread) {
4145   assert(osthread->sr.is_running(), "thread should be running");
4146   assert(!sr_semaphore.trywait(), "semaphore has invalid state");
4147 
4148   // mark as suspended and send signal
4149   if (osthread->sr.request_suspend() != os::SuspendResume::SR_SUSPEND_REQUEST) {
4150     // failed to switch, state wasn't running?
4151     ShouldNotReachHere();
4152     return false;
4153   }
4154 
4155   if (sr_notify(osthread) != 0) {
4156     ShouldNotReachHere();
4157   }
4158 
4159   // managed to send the signal and switch to SUSPEND_REQUEST, now wait for SUSPENDED
4160   while (true) {
4161     if (sr_semaphore.timedwait(0, 2 * NANOSECS_PER_MILLISEC)) {
4162       break;
4163     } else {
4164       // timeout
4165       os::SuspendResume::State cancelled = osthread->sr.cancel_suspend();
4166       if (cancelled == os::SuspendResume::SR_RUNNING) {
4167         return false;
4168       } else if (cancelled == os::SuspendResume::SR_SUSPENDED) {
4169         // make sure that we consume the signal on the semaphore as well
4170         sr_semaphore.wait();
4171         break;
4172       } else {
4173         ShouldNotReachHere();
4174         return false;
4175       }
4176     }
4177   }
4178 
4179   guarantee(osthread->sr.is_suspended(), "Must be suspended");
4180   return true;
4181 }
4182 
4183 static void do_resume(OSThread* osthread) {
4184   assert(osthread->sr.is_suspended(), "thread should be suspended");
4185   assert(!sr_semaphore.trywait(), "invalid semaphore state");
4186 
4187   if (osthread->sr.request_wakeup() != os::SuspendResume::SR_WAKEUP_REQUEST) {
4188     // failed to switch to WAKEUP_REQUEST
4189     ShouldNotReachHere();
4190     return;
4191   }
4192 
4193   while (true) {
4194     if (sr_notify(osthread) == 0) {
4195       if (sr_semaphore.timedwait(0, 2 * NANOSECS_PER_MILLISEC)) {
4196         if (osthread->sr.is_running()) {
4197           return;
4198         }
4199       }
4200     } else {
4201       ShouldNotReachHere();
4202     }
4203   }
4204 
4205   guarantee(osthread->sr.is_running(), "Must be running!");
4206 }
4207 
4208 ///////////////////////////////////////////////////////////////////////////////////
4209 // signal handling (except suspend/resume)
4210 
4211 // This routine may be used by user applications as a "hook" to catch signals.
4212 // The user-defined signal handler must pass unrecognized signals to this
4213 // routine, and if it returns true (non-zero), then the signal handler must
4214 // return immediately.  If the flag "abort_if_unrecognized" is true, then this
4215 // routine will never return false (zero), but instead will execute a VM panic
4216 // routine that kills the process.
4217 //
4218 // If this routine returns false, it is OK to call it again.  This allows
4219 // the user-defined signal handler to perform checks either before or after
4220 // the VM performs its own checks.  Naturally, the user code would be making
4221 // a serious error if it tried to handle an exception (such as a null check
4222 // or breakpoint) that the VM was generating for its own correct operation.
4223 //
4224 // This routine may recognize any of the following kinds of signals:
4225 //    SIGBUS, SIGSEGV, SIGILL, SIGFPE, SIGQUIT, SIGPIPE, SIGXFSZ, SIGUSR1.
4226 // It should be consulted by handlers for any of those signals.
4227 //
4228 // The caller of this routine must pass in the three arguments supplied
4229 // to the function referred to in the "sa_sigaction" (not the "sa_handler")
4230 // field of the structure passed to sigaction().  This routine assumes that
4231 // the sa_flags field passed to sigaction() includes SA_SIGINFO and SA_RESTART.
4232 //
4233 // Note that the VM will print warnings if it detects conflicting signal
4234 // handlers, unless invoked with the option "-XX:+AllowUserSignalHandlers".
4235 //
4236 extern "C" JNIEXPORT int JVM_handle_linux_signal(int signo,
4237                                                  siginfo_t* siginfo,
4238                                                  void* ucontext,
4239                                                  int abort_if_unrecognized);
4240 
4241 void signalHandler(int sig, siginfo_t* info, void* uc) {
4242   assert(info != NULL && uc != NULL, "it must be old kernel");
4243   int orig_errno = errno;  // Preserve errno value over signal handler.
4244   JVM_handle_linux_signal(sig, info, uc, true);
4245   errno = orig_errno;
4246 }
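
// A sketch (hypothetical application code, not from this file) of the
// user-defined "hook" handler described above: forward unrecognized signals
// to JVM_handle_linux_signal and only act when the VM declines them.
//
//   static void user_handler(int sig, siginfo_t* info, void* uc) {
//     if (JVM_handle_linux_signal(sig, info, uc,
//                                 0 /* abort_if_unrecognized */)) {
//       return;  // the VM consumed the signal
//     }
//     // ... application-specific handling ...
//   }
//   // Install with sa_flags including SA_SIGINFO and SA_RESTART, per above.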
4247 
4248 
4249 // This boolean allows users to forward their own non-matching signals
4250 // to JVM_handle_linux_signal, harmlessly.
4251 bool os::Linux::signal_handlers_are_installed = false;
4252 
4253 // For signal-chaining
4254 struct sigaction sigact[NSIG];
4255 uint64_t sigs = 0;
4256 #if (64 < NSIG-1)
4257 #error "Not all signals can be encoded in sigs. Adapt its type!"
4258 #endif
4259 bool os::Linux::libjsig_is_loaded = false;
4260 typedef struct sigaction *(*get_signal_t)(int);
4261 get_signal_t os::Linux::get_signal_action = NULL;
4262 
4263 struct sigaction* os::Linux::get_chained_signal_action(int sig) {
4264   struct sigaction *actp = NULL;
4265 
4266   if (libjsig_is_loaded) {
4267     // Retrieve the old signal handler from libjsig
4268     actp = (*get_signal_action)(sig);
4269   }
4270   if (actp == NULL) {
4271     // Retrieve the preinstalled signal handler from jvm
4272     actp = get_preinstalled_handler(sig);
4273   }
4274 
4275   return actp;
4276 }
4277 
4278 static bool call_chained_handler(struct sigaction *actp, int sig,
4279                                  siginfo_t *siginfo, void *context) {
4280   // Call the old signal handler
4281   if (actp->sa_handler == SIG_DFL) {
4282     // It's more reasonable to let jvm treat it as an unexpected exception
4283     // instead of taking the default action.
4284     return false;
4285   } else if (actp->sa_handler != SIG_IGN) {
4286     if ((actp->sa_flags & SA_NODEFER) == 0) {
4287       // automatically block the signal
4288       sigaddset(&(actp->sa_mask), sig);
4289     }
4290 
4291     sa_handler_t hand = NULL;
4292     sa_sigaction_t sa = NULL;
4293     bool siginfo_flag_set = (actp->sa_flags & SA_SIGINFO) != 0;
4294     // retrieve the chained handler
4295     if (siginfo_flag_set) {
4296       sa = actp->sa_sigaction;
4297     } else {
4298       hand = actp->sa_handler;
4299     }
4300 
4301     if ((actp->sa_flags & SA_RESETHAND) != 0) {
4302       actp->sa_handler = SIG_DFL;
4303     }
4304 
4305     // try to honor the signal mask
4306     sigset_t oset;
4307     sigemptyset(&oset);
4308     pthread_sigmask(SIG_SETMASK, &(actp->sa_mask), &oset);
4309 
4310     // call into the chained handler
4311     if (siginfo_flag_set) {
4312       (*sa)(sig, siginfo, context);
4313     } else {
4314       (*hand)(sig);
4315     }
4316 
4317     // restore the signal mask
4318     pthread_sigmask(SIG_SETMASK, &oset, NULL);
4319   }
4320   // Tell jvm's signal handler the signal is taken care of.
4321   return true;
4322 }
4323 
4324 bool os::Linux::chained_handler(int sig, siginfo_t* siginfo, void* context) {
4325   bool chained = false;
4326   // signal-chaining
4327   if (UseSignalChaining) {
4328     struct sigaction *actp = get_chained_signal_action(sig);
4329     if (actp != NULL) {
4330       chained = call_chained_handler(actp, sig, siginfo, context);
4331     }
4332   }
4333   return chained;
4334 }
4335 
4336 struct sigaction* os::Linux::get_preinstalled_handler(int sig) {
4337   if ((((uint64_t)1 << (sig-1)) & sigs) != 0) {
4338     return &sigact[sig];
4339   }
4340   return NULL;
4341 }
4342 
4343 void os::Linux::save_preinstalled_handler(int sig, struct sigaction& oldAct) {
4344   assert(sig > 0 && sig < NSIG, "vm signal out of expected range");
4345   sigact[sig] = oldAct;
4346   sigs |= (uint64_t)1 << (sig-1);
4347 }
4348 
4349 // for diagnostics
4350 int sigflags[NSIG];
4351 
4352 int os::Linux::get_our_sigflags(int sig) {
4353   assert(sig > 0 && sig < NSIG, "vm signal out of expected range");
4354   return sigflags[sig];
4355 }
4356 
4357 void os::Linux::set_our_sigflags(int sig, int flags) {
4358   assert(sig > 0 && sig < NSIG, "vm signal out of expected range");
4359   if (sig > 0 && sig < NSIG) {
4360     sigflags[sig] = flags;
4361   }
4362 }
4363 
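     // Install the VM signal handler for 'sig' (set_installed == true) or
     // reset the disposition to SIG_DFL (set_installed == false), honoring
     // AllowUserSignalHandlers and UseSignalChaining when a third-party
     // handler is already installed for that signal.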
4364 void os::Linux::set_signal_handler(int sig, bool set_installed) {
4365   // Check for overwrite.
4366   struct sigaction oldAct;
4367   sigaction(sig, (struct sigaction*)NULL, &oldAct);
4368 
4369   void* oldhand = oldAct.sa_sigaction
4370                 ? CAST_FROM_FN_PTR(void*,  oldAct.sa_sigaction)
4371                 : CAST_FROM_FN_PTR(void*,  oldAct.sa_handler);
4372   if (oldhand != CAST_FROM_FN_PTR(void*, SIG_DFL) &&
4373       oldhand != CAST_FROM_FN_PTR(void*, SIG_IGN) &&
4374       oldhand != CAST_FROM_FN_PTR(void*, (sa_sigaction_t)signalHandler)) {
4375     if (AllowUserSignalHandlers || !set_installed) {
4376       // Do not overwrite; user takes responsibility to forward to us.
4377       return;
4378     } else if (UseSignalChaining) {
4379       // save the old handler in jvm
4380       save_preinstalled_handler(sig, oldAct);
4381       // libjsig also interposes the sigaction() call below and saves the
4382       // old sigaction on its own.
4383     } else {
4384       fatal("Encountered unexpected pre-existing sigaction handler "
4385             "%#lx for signal %d.", (long)oldhand, sig);
4386     }
4387   }
4388 
4389   struct sigaction sigAct;
4390   sigfillset(&(sigAct.sa_mask));
4391   sigAct.sa_handler = SIG_DFL;
4392   if (!set_installed) {
4393     sigAct.sa_flags = SA_SIGINFO|SA_RESTART;
4394   } else {
4395     sigAct.sa_sigaction = signalHandler;
4396     sigAct.sa_flags = SA_SIGINFO|SA_RESTART;
4397   }
4398   // Save the flags we are about to set, for later diagnostic checks.
4399   assert(sig > 0 && sig < NSIG, "vm signal out of expected range");
4400   sigflags[sig] = sigAct.sa_flags;
4401 
4402   int ret = sigaction(sig, &sigAct, &oldAct);
4403   assert(ret == 0, "check");
4404 
4405   void* oldhand2  = oldAct.sa_sigaction
4406                   ? CAST_FROM_FN_PTR(void*, oldAct.sa_sigaction)
4407                   : CAST_FROM_FN_PTR(void*, oldAct.sa_handler);
4408   assert(oldhand2 == oldhand, "no concurrent signal handler installation");
4409 }
4410 
4411 // install signal handlers for signals that HotSpot needs to
4412 // handle in order to support Java-level exception handling.
4413 
4414 void os::Linux::install_signal_handlers() {
4415   if (!signal_handlers_are_installed) {
4416     signal_handlers_are_installed = true;
4417 
4418     // signal-chaining
4419     typedef void (*signal_setting_t)();
4420     signal_setting_t begin_signal_setting = NULL;
4421     signal_setting_t end_signal_setting = NULL;
4422     begin_signal_setting = CAST_TO_FN_PTR(signal_setting_t,
4423                                           dlsym(RTLD_DEFAULT, "JVM_begin_signal_setting"));
4424     if (begin_signal_setting != NULL) {
4425       end_signal_setting = CAST_TO_FN_PTR(signal_setting_t,
4426                                           dlsym(RTLD_DEFAULT, "JVM_end_signal_setting"));
4427       get_signal_action = CAST_TO_FN_PTR(get_signal_t,
4428                                          dlsym(RTLD_DEFAULT, "JVM_get_signal_action"));
4429       libjsig_is_loaded = true;
4430       assert(UseSignalChaining, "should enable signal-chaining");
4431     }
4432     if (libjsig_is_loaded) {
4433       // Tell libjsig that the jvm is setting signal handlers
4434       (*begin_signal_setting)();
4435     }
4436 
4437     set_signal_handler(SIGSEGV, true);
4438     set_signal_handler(SIGPIPE, true);
4439     set_signal_handler(SIGBUS, true);
4440     set_signal_handler(SIGILL, true);
4441     set_signal_handler(SIGFPE, true);
4442 #if defined(PPC64)
4443     set_signal_handler(SIGTRAP, true);
4444 #endif
4445     set_signal_handler(SIGXFSZ, true);
4446 
4447     if (libjsig_is_loaded) {
4448       // Tell libjsig that the jvm has finished setting signal handlers
4449       (*end_signal_setting)();
4450     }
4451 
4452     // We don't activate the signal checker if libjsig is in place (we trust
4453     // ourselves), and if AllowUserSignalHandlers is set all bets are off.
4454     // Log that signal checking is off only if -verbose:jni is specified.
4455     if (CheckJNICalls) {
4456       if (libjsig_is_loaded) {
4457         if (PrintJNIResolving) {
4458           tty->print_cr("Info: libjsig is activated, all active signal checking is disabled");
4459         }
4460         check_signals = false;
4461       }
4462       if (AllowUserSignalHandlers) {
4463         if (PrintJNIResolving) {
4464           tty->print_cr("Info: AllowUserSignalHandlers is activated, all active signal checking is disabled");
4465         }
4466         check_signals = false;
4467       }
4468     }
4469   }
4470 }
4471 
4472 // This is the fastest way to get thread cpu time on Linux.
4473 // Returns cpu time (user+sys) for any thread, not only for current.
4474 // POSIX compliant clocks are implemented in kernels 2.6.16+.
4475 // It might work on 2.6.10+ with a special kernel/glibc patch.
4476 // For reference, see IEEE Std 1003.1-2004:
4477 //   http://www.unix.org/single_unix_specification
4478 
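     // The clockid argument is CLOCK_THREAD_CPUTIME_ID for the calling thread,
     // or the id obtained via pthread_getcpuclockid() for another thread; see
     // thread_cpu_clockid() further below.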
4479 jlong os::Linux::fast_thread_cpu_time(clockid_t clockid) {
4480   struct timespec tp;
4481   int rc = os::Linux::clock_gettime(clockid, &tp);
4482   assert(rc == 0, "clock_gettime is expected to return 0 code");
4483 
4484   return (tp.tv_sec * NANOSECS_PER_SEC) + tp.tv_nsec;
4485 }
4486 
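     // Parse the kernel version out of uname() and pack it into _os_version.
     // Example: a release string of "4.4.0-57-generic" parses as major == 4,
     // minor == 4, fix == 0 and is encoded as _os_version == 0x00040400; if
     // parsing fails, the "unknown" marker bit 0x01000000 remains set.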
4487 void os::Linux::initialize_os_info() {
4488   assert(_os_version == 0, "OS info already initialized");
4489 
4490   struct utsname _uname;
4491 
4492   uint32_t major;
4493   uint32_t minor;
4494   uint32_t fix;
4495 
4496   int rc;
4497 
4498   // Kernel version is unknown if
4499   // verification below fails.
4500   _os_version = 0x01000000;
4501 
4502   rc = uname(&_uname);
4503   if (rc != -1) {
4504 
4505     rc = sscanf(_uname.release,"%u.%u.%u", &major, &minor, &fix); // %u matches the uint32_t fields
4506     if (rc == 3) {
4507 
4508       if (major < 256 && minor < 256 && fix < 256) {
4509         // Kernel version format is as expected,
4510         // set it, overriding the unknown state.
4511         _os_version = (major << 16) |
4512                       (minor << 8 ) |
4513                       (fix   << 0 ) ;
4514       }
4515     }
4516   }
4517 }
4518 
4519 uint32_t os::Linux::os_version() {
4520   assert(_os_version != 0, "not initialized");
4521   return _os_version & 0x00FFFFFF;
4522 }
4523 
4524 bool os::Linux::os_version_is_known() {
4525   assert(_os_version != 0, "not initialized");
4526   return _os_version & 0x01000000 ? false : true;
4527 }
4528 
4529 /////
4530 // glibc on Linux uses an undocumented flag in sa_flags
4531 // to indicate that a special sort of signal
4532 // trampoline is in use.
4533 // We never set this flag, and we should
4534 // ignore it in our diagnostics.
4535 #ifdef SIGNIFICANT_SIGNAL_MASK
4536   #undef SIGNIFICANT_SIGNAL_MASK
4537 #endif
4538 #define SIGNIFICANT_SIGNAL_MASK (~0x04000000)
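     // (Bit 0x04000000 corresponds to the kernel's SA_RESTORER flag, which
     // glibc sets when it installs its own signal trampoline as the restorer.)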
4539 
4540 static const char* get_signal_handler_name(address handler,
4541                                            char* buf, int buflen) {
4542   int offset = 0;
4543   bool found = os::dll_address_to_library_name(handler, buf, buflen, &offset);
4544   if (found) {
4545     // skip directory names
4546     const char *p1, *p2;
4547     p1 = buf;
4548     size_t len = strlen(os::file_separator());
4549     while ((p2 = strstr(p1, os::file_separator())) != NULL) p1 = p2 + len;
4550     jio_snprintf(buf, buflen, "%s+0x%x", p1, offset);
4551   } else {
4552     jio_snprintf(buf, buflen, PTR_FORMAT, handler);
4553   }
4554   return buf;
4555 }
4556 
4557 static void print_signal_handler(outputStream* st, int sig,
4558                                  char* buf, size_t buflen) {
4559   struct sigaction sa;
4560 
4561   sigaction(sig, NULL, &sa);
4562 
4563   // See comment for SIGNIFICANT_SIGNAL_MASK define
4564   sa.sa_flags &= SIGNIFICANT_SIGNAL_MASK;
4565 
4566   st->print("%s: ", os::exception_name(sig, buf, buflen));
4567 
4568   address handler = (sa.sa_flags & SA_SIGINFO)
4569     ? CAST_FROM_FN_PTR(address, sa.sa_sigaction)
4570     : CAST_FROM_FN_PTR(address, sa.sa_handler);
4571 
4572   if (handler == CAST_FROM_FN_PTR(address, SIG_DFL)) {
4573     st->print("SIG_DFL");
4574   } else if (handler == CAST_FROM_FN_PTR(address, SIG_IGN)) {
4575     st->print("SIG_IGN");
4576   } else {
4577     st->print("[%s]", get_signal_handler_name(handler, buf, buflen));
4578   }
4579 
4580   st->print(", sa_mask[0]=");
4581   os::Posix::print_signal_set_short(st, &sa.sa_mask);
4582 
4583   address rh = VMError::get_resetted_sighandler(sig);
4584   // The handler may have been reset by VMError.
4585   if (rh != NULL) {
4586     handler = rh;
4587     sa.sa_flags = VMError::get_resetted_sigflags(sig) & SIGNIFICANT_SIGNAL_MASK;
4588   }
4589 
4590   st->print(", sa_flags=");
4591   os::Posix::print_sa_flags(st, sa.sa_flags);
4592 
4593   // Check: is it our handler?
4594   if (handler == CAST_FROM_FN_PTR(address, (sa_sigaction_t)signalHandler) ||
4595       handler == CAST_FROM_FN_PTR(address, (sa_sigaction_t)SR_handler)) {
4596     // It is our signal handler
4597     // check the flags (the system-used bits were masked out above)
4598     if ((int)sa.sa_flags != os::Linux::get_our_sigflags(sig)) {
4599       st->print(
4600                 ", flags were changed from " PTR32_FORMAT ", consider using jsig library",
4601                 os::Linux::get_our_sigflags(sig));
4602     }
4603   }
4604   st->cr();
4605 }
4606 
4607 
4608 #define DO_SIGNAL_CHECK(sig)                      \
4609   do {                                            \
4610     if (!sigismember(&check_signal_done, sig)) {  \
4611       os::Linux::check_signal_handler(sig);       \
4612     }                                             \
4613   } while (0)
4614 
4615 // This method is a periodic task that checks for misbehaving JNI applications
4616 // under CheckJNI. Any other periodic checks can be added here as well.
4617 
4618 void os::run_periodic_checks() {
4619   if (check_signals == false) return;
4620 
4621   // If SEGV or BUS handlers are overridden, generation of the hs*.log file
4622   // in the event of a crash could be prevented; debugging
4623   // such a case can be very challenging, so we check the
4624   // following for good measure:
4625   DO_SIGNAL_CHECK(SIGSEGV);
4626   DO_SIGNAL_CHECK(SIGILL);
4627   DO_SIGNAL_CHECK(SIGFPE);
4628   DO_SIGNAL_CHECK(SIGBUS);
4629   DO_SIGNAL_CHECK(SIGPIPE);
4630   DO_SIGNAL_CHECK(SIGXFSZ);
4631 #if defined(PPC64)
4632   DO_SIGNAL_CHECK(SIGTRAP);
4633 #endif
4634 
4635   // ReduceSignalUsage allows the user to override these handlers
4636   // see comments at the very top and jvm_linux.h
4637   if (!ReduceSignalUsage) {
4638     DO_SIGNAL_CHECK(SHUTDOWN1_SIGNAL);
4639     DO_SIGNAL_CHECK(SHUTDOWN2_SIGNAL);
4640     DO_SIGNAL_CHECK(SHUTDOWN3_SIGNAL);
4641     DO_SIGNAL_CHECK(BREAK_SIGNAL);
4642   }
4643 
4644   DO_SIGNAL_CHECK(SR_signum);
4645 }
4646 
4647 typedef int (*os_sigaction_t)(int, const struct sigaction *, struct sigaction *);
4648 
4649 static os_sigaction_t os_sigaction = NULL;
4650 
4651 void os::Linux::check_signal_handler(int sig) {
4652   char buf[O_BUFLEN];
4653   address jvmHandler = NULL;
4654 
4655 
4656   struct sigaction act;
4657   if (os_sigaction == NULL) {
4658     // only trust the default sigaction, in case it has been interposed
4659     os_sigaction = (os_sigaction_t)dlsym(RTLD_DEFAULT, "sigaction");
4660     if (os_sigaction == NULL) return;
4661   }
4662 
4663   os_sigaction(sig, (struct sigaction*)NULL, &act);
4664 
4665 
4666   act.sa_flags &= SIGNIFICANT_SIGNAL_MASK;
4667 
4668   address thisHandler = (act.sa_flags & SA_SIGINFO)
4669     ? CAST_FROM_FN_PTR(address, act.sa_sigaction)
4670     : CAST_FROM_FN_PTR(address, act.sa_handler);
4671 
4672 
4673   switch (sig) {
4674   case SIGSEGV:
4675   case SIGBUS:
4676   case SIGFPE:
4677   case SIGPIPE:
4678   case SIGILL:
4679   case SIGXFSZ:
4680     jvmHandler = CAST_FROM_FN_PTR(address, (sa_sigaction_t)signalHandler);
4681     break;
4682 
4683   case SHUTDOWN1_SIGNAL:
4684   case SHUTDOWN2_SIGNAL:
4685   case SHUTDOWN3_SIGNAL:
4686   case BREAK_SIGNAL:
4687     jvmHandler = (address)user_handler();
4688     break;
4689 
4690   default:
4691     if (sig == SR_signum) {
4692       jvmHandler = CAST_FROM_FN_PTR(address, (sa_sigaction_t)SR_handler);
4693     } else {
4694       return;
4695     }
4696     break;
4697   }
4698 
4699   if (thisHandler != jvmHandler) {
4700     tty->print("Warning: %s handler ", exception_name(sig, buf, O_BUFLEN));
4701     tty->print("expected:%s", get_signal_handler_name(jvmHandler, buf, O_BUFLEN));
4702     tty->print_cr("  found:%s", get_signal_handler_name(thisHandler, buf, O_BUFLEN));
4703     // No need to check this sig any longer
4704     sigaddset(&check_signal_done, sig);
4705     // When running under a non-interactive shell, SHUTDOWN2_SIGNAL will be reassigned to SIG_IGN
4706     if (sig == SHUTDOWN2_SIGNAL && !isatty(fileno(stdin))) {
4707       tty->print_cr("Running in non-interactive shell, %s handler is replaced by shell",
4708                     exception_name(sig, buf, O_BUFLEN));
4709     }
4710   } else if(os::Linux::get_our_sigflags(sig) != 0 && (int)act.sa_flags != os::Linux::get_our_sigflags(sig)) {
4711     tty->print("Warning: %s handler flags ", exception_name(sig, buf, O_BUFLEN));
4712     tty->print("expected:");
4713     os::Posix::print_sa_flags(tty, os::Linux::get_our_sigflags(sig));
4714     tty->cr();
4715     tty->print("  found:");
4716     os::Posix::print_sa_flags(tty, act.sa_flags);
4717     tty->cr();
4718     // No need to check this sig any longer
4719     sigaddset(&check_signal_done, sig);
4720   }
4721 
4722   // Dump all the signal handlers
4723   if (sigismember(&check_signal_done, sig)) {
4724     print_signal_handlers(tty, buf, O_BUFLEN);
4725   }
4726 }
4727 
4728 extern void report_error(char* file_name, int line_no, char* title,
4729                          char* format, ...);
4730 
4731 // Some Linux distributions (notably: Alpine Linux) include
4732 // grsecurity in the kernel by default. Of particular interest from a
4733 // JVM perspective is PaX (https://pax.grsecurity.net/), which adds
4734 // some security features related to page attributes. Specifically,
4735 // the MPROTECT PaX functionality
4736 // (https://pax.grsecurity.net/docs/mprotect.txt) prevents dynamic
4737 // code generation by disallowing a (previously) writable page to be
4738 // marked as executable. This is, of course, exactly what HotSpot does
4739 // for JIT compiled methods, as well as for stubs, adapters, etc.
4740 //
4741 // Instead of crashing "lazily" when trying to make a page executable,
4742 // this code probes for the presence of PaX and reports the failure
4743 // eagerly.
4744 static void check_pax(void) {
4745   // Zero doesn't generate code dynamically, so no need to perform the PaX check
4746 #ifndef ZERO
4747   size_t size = os::Linux::page_size();
4748 
4749   void* p = ::mmap(NULL, size, PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
4750   if (p == MAP_FAILED) {
4751     vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "failed to allocate memory for PaX check.");
4752   }
4753 
4754   int res = ::mprotect(p, size, PROT_WRITE|PROT_EXEC);
4755   if (res == -1) {
4756     vm_exit_during_initialization("Failed to mark memory page as executable",
4757                                   "Please check if grsecurity/PaX is enabled in your kernel.\n"
4758                                   "\n"
4759                                   "For example, you can do this by running (note: you may need root privileges):\n"
4760                                   "\n"
4761                                   "    sysctl kernel.pax.softmode\n"
4762                                   "\n"
4763                                   "If PaX is included in the kernel you will see something like this:\n"
4764                                   "\n"
4765                                   "    kernel.pax.softmode = 0\n"
4766                                   "\n"
4767                                   "In particular, if the value is 0 (zero), then PaX is enabled.\n"
4768                                   "\n"
4769                                   "PaX includes security functionality which interferes with the dynamic code\n"
4770                                   "generation the JVM relies on. Specifically, the MPROTECT functionality as\n"
4771                                   "described on https://pax.grsecurity.net/docs/mprotect.txt is not compatible\n"
4772                                   "with the JVM. If you want to allow the JVM to run you will have to disable PaX.\n"
4773                                   "You can do this on a per-executable basis using the paxctl tool.\n");
4774 
4775   }
4776 
4777   ::munmap(p, size);
4778 #endif
4779 }
4780 
4781 // this is called _before_ most of the global arguments have been parsed
4782 void os::init(void) {
4783   char dummy;   // used to get a guess on initial stack address
4784 //  first_hrtime = gethrtime();
4785 
4786   clock_tics_per_sec = sysconf(_SC_CLK_TCK);
4787 
4788   init_random(1234567);
4789 
4790   ThreadCritical::initialize();
4791 
4792   Linux::set_page_size(sysconf(_SC_PAGESIZE));
4793   if (Linux::page_size() == -1) {
4794     fatal("os_linux.cpp: os::init: sysconf failed (%s)",
4795           os::strerror(errno));
4796   }
4797   init_page_sizes((size_t) Linux::page_size());
4798 
4799   Linux::initialize_system_info();
4800 
4801   Linux::initialize_os_info();
4802 
4803   // main_thread points to the aboriginal thread
4804   Linux::_main_thread = pthread_self();
4805 
4806   Linux::clock_init();
4807   initial_time_count = javaTimeNanos();
4808 
4809   // pthread_condattr initialization for monotonic clock
4810   int status;
4811   pthread_condattr_t* _condattr = os::Linux::condAttr();
4812   if ((status = pthread_condattr_init(_condattr)) != 0) {
4813     fatal("pthread_condattr_init: %s", os::strerror(status));
4814   }
4815   // Only set the clock if CLOCK_MONOTONIC is available
4816   if (os::supports_monotonic_clock()) {
4817     if ((status = pthread_condattr_setclock(_condattr, CLOCK_MONOTONIC)) != 0) {
4818       if (status == EINVAL) {
4819         warning("Unable to use monotonic clock with relative timed-waits" \
4820                 " - changes to the time-of-day clock may have adverse effects");
4821       } else {
4822         fatal("pthread_condattr_setclock: %s", os::strerror(status));
4823       }
4824     }
4825   }
4826   // else it defaults to CLOCK_REALTIME
4827 
4828   // retrieve entry point for pthread_setname_np
4829   Linux::_pthread_setname_np =
4830     (int(*)(pthread_t, const char*))dlsym(RTLD_DEFAULT, "pthread_setname_np");
4831 
4832   check_pax();
4833 }
4834 
4835 // To install functions for atexit system call
4836 extern "C" {
4837   static void perfMemory_exit_helper() {
4838     perfMemory_exit();
4839   }
4840 }
4841 
4842 // this is called _after_ the global arguments have been parsed
4843 jint os::init_2(void) {
4844   Linux::fast_thread_clock_init();
4845 
4846   // Allocate a single page and mark it as readable for safepoint polling
4847   address polling_page = (address) ::mmap(NULL, Linux::page_size(), PROT_READ, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
4848   guarantee(polling_page != MAP_FAILED, "os::init_2: failed to allocate polling page");
4849 
4850   os::set_polling_page(polling_page);
4851   log_info(os)("SafePoint Polling address: " INTPTR_FORMAT, p2i(polling_page));
4852 
4853   if (!UseMembar) {
4854     address mem_serialize_page = (address) ::mmap(NULL, Linux::page_size(), PROT_READ | PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
4855     guarantee(mem_serialize_page != MAP_FAILED, "mmap Failed for memory serialize page");
4856     os::set_memory_serialize_page(mem_serialize_page);
4857     log_info(os)("Memory Serialize Page address: " INTPTR_FORMAT, p2i(mem_serialize_page));
4858   }
4859 
4860   // initialize suspend/resume support - must do this before signal_sets_init()
4861   if (SR_initialize() != 0) {
4862     perror("SR_initialize failed");
4863     return JNI_ERR;
4864   }
4865 
4866   Linux::signal_sets_init();
4867   Linux::install_signal_handlers();
4868 
4869   // Check and sets minimum stack sizes against command line options
4870   if (Posix::set_minimum_stack_sizes() == JNI_ERR) {
4871     return JNI_ERR;
4872   }
4873   Linux::capture_initial_stack(JavaThread::stack_size_at_create());
4874 
4875 #if defined(IA32)
4876   workaround_expand_exec_shield_cs_limit();
4877 #endif
4878 
4879   Linux::libpthread_init();
4880   log_info(os)("HotSpot is running with %s, %s",
4881                Linux::glibc_version(), Linux::libpthread_version());
4882 
4883   if (UseNUMA) {
4884     if (!Linux::libnuma_init()) {
4885       UseNUMA = false;
4886     } else {
4887       if ((Linux::numa_max_node() < 1)) {
4888         // There's only one node (they start from 0), disable NUMA.
4889         UseNUMA = false;
4890       }
4891     }
4892     // With SHM and HugeTLBFS large pages we cannot uncommit a page, so there's no way
4893     // we can make the adaptive lgrp chunk resizing work. If the user specified
4894     // both UseNUMA and UseLargePages (or UseSHM/UseHugeTLBFS) on the command line - warn and
4895     // disable adaptive resizing.
4896     if (UseNUMA && UseLargePages && !can_commit_large_page_memory()) {
4897       if (FLAG_IS_DEFAULT(UseNUMA)) {
4898         UseNUMA = false;
4899       } else {
4900         if (FLAG_IS_DEFAULT(UseLargePages) &&
4901             FLAG_IS_DEFAULT(UseSHM) &&
4902             FLAG_IS_DEFAULT(UseHugeTLBFS)) {
4903           UseLargePages = false;
4904         } else if (UseAdaptiveSizePolicy || UseAdaptiveNUMAChunkSizing) {
4905           warning("UseNUMA is not fully compatible with SHM/HugeTLBFS large pages, disabling adaptive resizing (-XX:-UseAdaptiveSizePolicy -XX:-UseAdaptiveNUMAChunkSizing)");
4906           UseAdaptiveSizePolicy = false;
4907           UseAdaptiveNUMAChunkSizing = false;
4908         }
4909       }
4910     }
4911     if (!UseNUMA && ForceNUMA) {
4912       UseNUMA = true;
4913     }
4914   }
4915 
4916   if (MaxFDLimit) {
4917     // set the number of file descriptors to the maximum; print out an error
4918     // if getrlimit/setrlimit fails, but continue regardless.
4919     struct rlimit nbr_files;
4920     int status = getrlimit(RLIMIT_NOFILE, &nbr_files);
4921     if (status != 0) {
4922       log_info(os)("os::init_2 getrlimit failed: %s", os::strerror(errno));
4923     } else {
4924       nbr_files.rlim_cur = nbr_files.rlim_max;
4925       status = setrlimit(RLIMIT_NOFILE, &nbr_files);
4926       if (status != 0) {
4927         log_info(os)("os::init_2 setrlimit failed: %s", os::strerror(errno));
4928       }
4929     }
4930   }
4931 
4932   // Initialize lock used to serialize thread creation (see os::create_thread)
4933   Linux::set_createThread_lock(new Mutex(Mutex::leaf, "createThread_lock", false));
4934 
4935   // at-exit methods are called in the reverse order of their registration.
4936   // atexit functions are called on return from main or as a result of a
4937   // call to exit(3C). There can be only 32 of these functions registered
4938   // and atexit() does not set errno.
4939 
4940   if (PerfAllowAtExitRegistration) {
4941     // only register atexit functions if PerfAllowAtExitRegistration is set.
4942     // atexit functions can be delayed until process exit time, which
4943     // can be problematic for embedded VM situations. Embedded VMs should
4944     // call DestroyJavaVM() to assure that VM resources are released.
4945 
4946     // note: perfMemory_exit_helper atexit function may be removed in
4947     // the future if the appropriate cleanup code can be added to the
4948     // VM_Exit VMOperation's doit method.
4949     if (atexit(perfMemory_exit_helper) != 0) {
4950       warning("os::init_2 atexit(perfMemory_exit_helper) failed");
4951     }
4952   }
4953 
4954   // initialize thread priority policy
4955   prio_init();
4956 
4957   return JNI_OK;
4958 }
4959 
4960 // Mark the polling page as unreadable
4961 void os::make_polling_page_unreadable(void) {
4962   if (!guard_memory((char*)_polling_page, Linux::page_size())) {
4963     fatal("Could not disable polling page");
4964   }
4965 }
4966 
4967 // Mark the polling page as readable
4968 void os::make_polling_page_readable(void) {
4969   if (!linux_mprotect((char *)_polling_page, Linux::page_size(), PROT_READ)) {
4970     fatal("Could not enable polling page");
4971   }
4972 }
4973 
4974 // older glibc versions don't have this macro (which expands to
4975 // an optimized bit-counting function) so we have to roll our own
4976 #ifndef CPU_COUNT
4977 
4978 static int _cpu_count(const cpu_set_t* cpus) {
4979   int count = 0;
4980   // only look up to the number of configured processors
4981   for (int i = 0; i < os::processor_count(); i++) {
4982     if (CPU_ISSET(i, cpus)) {
4983       count++;
4984     }
4985   }
4986   return count;
4987 }
4988 
4989 #define CPU_COUNT(cpus) _cpu_count(cpus)
4990 
4991 #endif // CPU_COUNT
4992 
4993 // Get the current number of available processors for this process.
4994 // This value can change at any time during a process's lifetime.
4995 // sched_getaffinity gives an accurate answer as it accounts for cpusets.
4996 // If it appears there may be more than 1024 processors then we do a
4997 // dynamic check - see 6515172 for details.
4998 // If anything goes wrong we fallback to returning the number of online
4999 // processors - which can be greater than the number available to the process.
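     // For example, in a cpuset that limits the process to 4 of 64 online
     // CPUs, sched_getaffinity() reports 4, whereas the fallback
     // sysconf(_SC_NPROCESSORS_ONLN) would report 64.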
5000 int os::active_processor_count() {
5001   cpu_set_t cpus;  // can represent at most 1024 (CPU_SETSIZE) processors
5002   cpu_set_t* cpus_p = &cpus;
5003   int cpus_size = sizeof(cpu_set_t);
5004 
5005   int configured_cpus = processor_count();  // upper bound on available cpus
5006   int cpu_count = 0;
5007 
5008 // old build platforms may not support dynamic cpu sets
5009 #ifdef CPU_ALLOC
5010 
5011   // To enable easy testing of the dynamic path on different platforms we
5012   // introduce a diagnostic flag: UseCpuAllocPath
5013   if (configured_cpus >= CPU_SETSIZE || UseCpuAllocPath) {
5014     // kernel may use a mask bigger than cpu_set_t
5015     log_trace(os)("active_processor_count: using dynamic path %s"
5016                   "- configured processors: %d",
5017                   UseCpuAllocPath ? "(forced) " : "",
5018                   configured_cpus);
5019     cpus_p = CPU_ALLOC(configured_cpus);
5020     if (cpus_p != NULL) {
5021       cpus_size = CPU_ALLOC_SIZE(configured_cpus);
5022       // zero it just to be safe
5023       CPU_ZERO_S(cpus_size, cpus_p);
5024     }
5025     else {
5026        // failed to allocate so fallback to online cpus
5027        int online_cpus = ::sysconf(_SC_NPROCESSORS_ONLN);
5028        log_trace(os)("active_processor_count: "
5029                      "CPU_ALLOC failed (%s) - using "
5030                      "online processor count: %d",
5031                      os::strerror(errno), online_cpus);
5032        return online_cpus;
5033     }
5034   }
5035   else {
5036     log_trace(os)("active_processor_count: using static path - configured processors: %d",
5037                   configured_cpus);
5038   }
5039 #else // CPU_ALLOC
5040 // these stubs won't be executed
5041 #define CPU_COUNT_S(size, cpus) -1
5042 #define CPU_FREE(cpus)
5043 
5044   log_trace(os)("active_processor_count: only static path available - configured processors: %d",
5045                 configured_cpus);
5046 #endif // CPU_ALLOC
5047 
5048   // pid 0 means the current thread - which we have to assume represents the process
5049   if (sched_getaffinity(0, cpus_size, cpus_p) == 0) {
5050     if (cpus_p != &cpus) { // can only be true when CPU_ALLOC used
5051       cpu_count = CPU_COUNT_S(cpus_size, cpus_p);
5052     }
5053     else {
5054       cpu_count = CPU_COUNT(cpus_p);
5055     }
5056     log_trace(os)("active_processor_count: sched_getaffinity processor count: %d", cpu_count);
5057   }
5058   else {
5059     cpu_count = ::sysconf(_SC_NPROCESSORS_ONLN);
5060     warning("sched_getaffinity failed (%s) - using online processor count (%d) "
5061             "which may exceed available processors", os::strerror(errno), cpu_count);
5062   }
5063 
5064   if (cpus_p != &cpus) { // can only be true when CPU_ALLOC used
5065     CPU_FREE(cpus_p);
5066   }
5067 
5068   assert(cpu_count > 0 && cpu_count <= processor_count(), "sanity check");
5069   return cpu_count;
5070 }
5071 
5072 void os::set_native_thread_name(const char *name) {
5073   if (Linux::_pthread_setname_np) {
5074     char buf[16]; // according to the glibc manpage, 16 chars incl. '\0'
5075     snprintf(buf, sizeof(buf), "%s", name);
5076     buf[sizeof(buf) - 1] = '\0';
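         // Note: snprintf above silently truncates longer names; e.g.
         // "C2 CompilerThread0" becomes "C2 CompilerThre" (15 chars + '\0').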
5077     const int rc = Linux::_pthread_setname_np(pthread_self(), buf);
5078     // ERANGE should not happen; all other errors should just be ignored.
5079     assert(rc != ERANGE, "pthread_setname_np failed");
5080   }
5081 }
5082 
5083 bool os::distribute_processes(uint length, uint* distribution) {
5084   // Not yet implemented.
5085   return false;
5086 }
5087 
5088 bool os::bind_to_processor(uint processor_id) {
5089   // Not yet implemented.
5090   return false;
5091 }
5092 
5093 ///
5094 
5095 void os::SuspendedThreadTask::internal_do_task() {
5096   if (do_suspend(_thread->osthread())) {
5097     SuspendedThreadTaskContext context(_thread, _thread->osthread()->ucontext());
5098     do_task(context);
5099     do_resume(_thread->osthread());
5100   }
5101 }
5102 
5103 class PcFetcher : public os::SuspendedThreadTask {
5104  public:
5105   PcFetcher(Thread* thread) : os::SuspendedThreadTask(thread) {}
5106   ExtendedPC result();
5107  protected:
5108   void do_task(const os::SuspendedThreadTaskContext& context);
5109  private:
5110   ExtendedPC _epc;
5111 };
5112 
5113 ExtendedPC PcFetcher::result() {
5114   guarantee(is_done(), "task is not done yet.");
5115   return _epc;
5116 }
5117 
5118 void PcFetcher::do_task(const os::SuspendedThreadTaskContext& context) {
5119   Thread* thread = context.thread();
5120   OSThread* osthread = thread->osthread();
5121   if (osthread->ucontext() != NULL) {
5122     _epc = os::Linux::ucontext_get_pc((const ucontext_t *) context.ucontext());
5123   } else {
5124     // NULL context is unexpected, double-check this is the VMThread
5125     guarantee(thread->is_VM_thread(), "can only be called for VMThread");
5126   }
5127 }
5128 
5129 // Suspends the target using the signal mechanism and then grabs the PC before
5130 // resuming the target. Used by the flat-profiler only
5131 ExtendedPC os::get_thread_pc(Thread* thread) {
5132   // Make sure that it is called by the watcher for the VMThread
5133   assert(Thread::current()->is_Watcher_thread(), "Must be watcher");
5134   assert(thread->is_VM_thread(), "Can only be called for VMThread");
5135 
5136   PcFetcher fetcher(thread);
5137   fetcher.run();
5138   return fetcher.result();
5139 }
5140 
5141 ////////////////////////////////////////////////////////////////////////////////
5142 // debug support
5143 
5144 bool os::find(address addr, outputStream* st) {
5145   Dl_info dlinfo;
5146   memset(&dlinfo, 0, sizeof(dlinfo));
5147   if (dladdr(addr, &dlinfo) != 0) {
5148     st->print(PTR_FORMAT ": ", p2i(addr));
5149     if (dlinfo.dli_sname != NULL && dlinfo.dli_saddr != NULL) {
5150       st->print("%s+" PTR_FORMAT, dlinfo.dli_sname,
5151                 p2i(addr) - p2i(dlinfo.dli_saddr));
5152     } else if (dlinfo.dli_fbase != NULL) {
5153       st->print("<offset " PTR_FORMAT ">", p2i(addr) - p2i(dlinfo.dli_fbase));
5154     } else {
5155       st->print("<absolute address>");
5156     }
5157     if (dlinfo.dli_fname != NULL) {
5158       st->print(" in %s", dlinfo.dli_fname);
5159     }
5160     if (dlinfo.dli_fbase != NULL) {
5161       st->print(" at " PTR_FORMAT, p2i(dlinfo.dli_fbase));
5162     }
5163     st->cr();
5164 
5165     if (Verbose) {
5166       // decode some bytes around the PC
5167       address begin = clamp_address_in_page(addr-40, addr, os::vm_page_size());
5168       address end   = clamp_address_in_page(addr+40, addr, os::vm_page_size());
5169       address       lowest = (address) dlinfo.dli_sname;
5170       if (!lowest)  lowest = (address) dlinfo.dli_fbase;
5171       if (begin < lowest)  begin = lowest;
5172       Dl_info dlinfo2;
5173       if (dladdr(end, &dlinfo2) != 0 && dlinfo2.dli_saddr != dlinfo.dli_saddr
5174           && end > dlinfo2.dli_saddr && dlinfo2.dli_saddr > begin) {
5175         end = (address) dlinfo2.dli_saddr;
5176       }
5177       Disassembler::decode(begin, end, st);
5178     }
5179     return true;
5180   }
5181   return false;
5182 }
5183 
5184 ////////////////////////////////////////////////////////////////////////////////
5185 // misc
5186 
5187 // This does not do anything on Linux. This is basically a hook for being
5188 // able to use structured exception handling (thread-local exception filters)
5189 // on, e.g., Win32.
5190 void
5191 os::os_exception_wrapper(java_call_t f, JavaValue* value, const methodHandle& method,
5192                          JavaCallArguments* args, Thread* thread) {
5193   f(value, method, args, thread);
5194 }
5195 
5196 void os::print_statistics() {
5197 }
5198 
5199 bool os::message_box(const char* title, const char* message) {
5200   int i;
5201   fdStream err(defaultStream::error_fd());
5202   for (i = 0; i < 78; i++) err.print_raw("=");
5203   err.cr();
5204   err.print_raw_cr(title);
5205   for (i = 0; i < 78; i++) err.print_raw("-");
5206   err.cr();
5207   err.print_raw_cr(message);
5208   for (i = 0; i < 78; i++) err.print_raw("=");
5209   err.cr();
5210 
5211   char buf[16];
5212   // Prevent process from exiting upon "read error" without consuming all CPU
5213   while (::read(0, buf, sizeof(buf)) <= 0) { ::sleep(100); }
5214 
5215   return buf[0] == 'y' || buf[0] == 'Y';
5216 }
5217 
5218 int os::stat(const char *path, struct stat *sbuf) {
5219   char pathbuf[MAX_PATH];
5220   if (strlen(path) > MAX_PATH - 1) {
5221     errno = ENAMETOOLONG;
5222     return -1;
5223   }
5224   os::native_path(strcpy(pathbuf, path));
5225   return ::stat(pathbuf, sbuf);
5226 }
5227 
5228 // Is a (classpath) directory empty?
5229 bool os::dir_is_empty(const char* path) {
5230   DIR *dir = NULL;
5231   struct dirent *ptr;
5232 
5233   dir = opendir(path);
5234   if (dir == NULL) return true;
5235 
5236   // Scan the directory
5237   bool result = true;
5238   char buf[sizeof(struct dirent) + MAX_PATH]; // apparently unused; left over from an earlier readdir_r() based version
5239   while (result && (ptr = ::readdir(dir)) != NULL) {
5240     if (strcmp(ptr->d_name, ".") != 0 && strcmp(ptr->d_name, "..") != 0) {
5241       result = false;
5242     }
5243   }
5244   closedir(dir);
5245   return result;
5246 }
5247 
5248 // This code originates from JDK's sysOpen and open64_w
5249 // from src/solaris/hpi/src/system_md.c
5250 
5251 int os::open(const char *path, int oflag, int mode) {
5252   if (strlen(path) > MAX_PATH - 1) {
5253     errno = ENAMETOOLONG;
5254     return -1;
5255   }
5256 
5257   // All file descriptors that are opened in the Java process and not
5258   // specifically destined for a subprocess should have the close-on-exec
5259   // flag set.  If we don't set it, then careless 3rd party native code
5260   // might fork and exec without closing all appropriate file descriptors
5261   // (e.g. as we do in closeDescriptors in UNIXProcess.c), and this in
5262   // turn might:
5263   //
5264   // - cause end-of-file to fail to be detected on some file
5265   //   descriptors, resulting in mysterious hangs, or
5266   //
5267   // - might cause an fopen in the subprocess to fail on a system
5268   //   suffering from bug 1085341.
5269   //
5270   // (Yes, the default setting of the close-on-exec flag is a Unix
5271   // design flaw)
5272   //
5273   // See:
5274   // 1085341: 32-bit stdio routines should support file descriptors >255
5275   // 4843136: (process) pipe file descriptor from Runtime.exec not being closed
5276   // 6339493: (process) Runtime.exec does not close all file descriptors on Solaris 9
5277   //
5278   // Modern Linux kernels (after 2.6.23 2007) support O_CLOEXEC with open().
5279   // O_CLOEXEC is preferable to using FD_CLOEXEC on an open file descriptor
5280   // because it saves a system call and removes a small window where the flag
5281   // is unset.  On ancient Linux kernels the O_CLOEXEC flag will be ignored
5282   // and we fall back to using FD_CLOEXEC (see below).
5283 #ifdef O_CLOEXEC
5284   oflag |= O_CLOEXEC;
5285 #endif
5286 
5287   int fd = ::open64(path, oflag, mode);
5288   if (fd == -1) return -1;
5289 
5290   // If the open succeeded, the file might still be a directory
5291   {
5292     struct stat64 buf64;
5293     int ret = ::fstat64(fd, &buf64);
5294     int st_mode = buf64.st_mode;
5295 
5296     if (ret != -1) {
5297       if ((st_mode & S_IFMT) == S_IFDIR) {
5298         errno = EISDIR;
5299         ::close(fd);
5300         return -1;
5301       }
5302     } else {
5303       ::close(fd);
5304       return -1;
5305     }
5306   }
5307 
5308 #ifdef FD_CLOEXEC
5309   // Validate that the use of the O_CLOEXEC flag on open above worked.
5310   // With recent kernels, we will perform this check exactly once.
5311   static sig_atomic_t O_CLOEXEC_is_known_to_work = 0;
5312   if (!O_CLOEXEC_is_known_to_work) {
5313     int flags = ::fcntl(fd, F_GETFD);
5314     if (flags != -1) {
5315       if ((flags & FD_CLOEXEC) != 0)
5316         O_CLOEXEC_is_known_to_work = 1;
5317       else
5318         ::fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
5319     }
5320   }
5321 #endif
5322 
5323   return fd;
5324 }
5325 
5326 
5327 // create binary file, rewriting existing file if required
5328 int os::create_binary_file(const char* path, bool rewrite_existing) {
5329   int oflags = O_WRONLY | O_CREAT;
5330   if (!rewrite_existing) {
5331     oflags |= O_EXCL;
5332   }
5333   return ::open64(path, oflags, S_IREAD | S_IWRITE);
5334 }
5335 
5336 // return current position of file pointer
5337 jlong os::current_file_offset(int fd) {
5338   return (jlong)::lseek64(fd, (off64_t)0, SEEK_CUR);
5339 }
5340 
5341 // move file pointer to the specified offset
5342 jlong os::seek_to_file_offset(int fd, jlong offset) {
5343   return (jlong)::lseek64(fd, (off64_t)offset, SEEK_SET);
5344 }
5345 
5346 // This code originates from JDK's sysAvailable
5347 // from src/solaris/hpi/src/native_threads/src/sys_api_td.c
5348 
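     // Returns 1 on success, with *bytes set to the number of bytes that can
     // be read without blocking, and 0 on failure.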
5349 int os::available(int fd, jlong *bytes) {
5350   jlong cur, end;
5351   int mode;
5352   struct stat64 buf64;
5353 
5354   if (::fstat64(fd, &buf64) >= 0) {
5355     mode = buf64.st_mode;
5356     if (S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) {
5357       int n;
5358       if (::ioctl(fd, FIONREAD, &n) >= 0) {
5359         *bytes = n;
5360         return 1;
5361       }
5362     }
5363   }
5364   if ((cur = ::lseek64(fd, 0L, SEEK_CUR)) == -1) {
5365     return 0;
5366   } else if ((end = ::lseek64(fd, 0L, SEEK_END)) == -1) {
5367     return 0;
5368   } else if (::lseek64(fd, cur, SEEK_SET) == -1) {
5369     return 0;
5370   }
5371   *bytes = end - cur;
5372   return 1;
5373 }
5374 
5375 // Map a block of memory.
5376 char* os::pd_map_memory(int fd, const char* file_name, size_t file_offset,
5377                         char *addr, size_t bytes, bool read_only,
5378                         bool allow_exec) {
5379   int prot;
5380   int flags = MAP_PRIVATE;
5381 
5382   if (read_only) {
5383     prot = PROT_READ;
5384   } else {
5385     prot = PROT_READ | PROT_WRITE;
5386   }
5387 
5388   if (allow_exec) {
5389     prot |= PROT_EXEC;
5390   }
5391 
5392   if (addr != NULL) {
5393     flags |= MAP_FIXED;
5394   }
5395 
5396   char* mapped_address = (char*)mmap(addr, (size_t)bytes, prot, flags,
5397                                      fd, file_offset);
5398   if (mapped_address == MAP_FAILED) {
5399     return NULL;
5400   }
5401   return mapped_address;
5402 }
5403 
5404 
5405 // Remap a block of memory.
5406 char* os::pd_remap_memory(int fd, const char* file_name, size_t file_offset,
5407                           char *addr, size_t bytes, bool read_only,
5408                           bool allow_exec) {
5409   // same as map_memory() on this OS
5410   return os::map_memory(fd, file_name, file_offset, addr, bytes, read_only,
5411                         allow_exec);
5412 }
5413 
5414 
5415 // Unmap a block of memory.
5416 bool os::pd_unmap_memory(char* addr, size_t bytes) {
5417   return munmap(addr, bytes) == 0;
5418 }
5419 
5420 static jlong slow_thread_cpu_time(Thread *thread, bool user_sys_cpu_time);
5421 
5422 static clockid_t thread_cpu_clockid(Thread* thread) {
5423   pthread_t tid = thread->osthread()->pthread_id();
5424   clockid_t clockid;
5425 
5426   // Get thread clockid
5427   int rc = os::Linux::pthread_getcpuclockid(tid, &clockid);
5428   assert(rc == 0, "pthread_getcpuclockid is expected to return 0 code");
5429   return clockid;
5430 }
5431 
5432 // current_thread_cpu_time(bool) and thread_cpu_time(Thread*, bool)
5433 // are used by JVM M&M and JVMTI to get user+sys or user CPU time
5434 // of a thread.
5435 //
5436 // current_thread_cpu_time() and thread_cpu_time(Thread*) returns
5437 // the fast estimate available on the platform.
5438 
5439 jlong os::current_thread_cpu_time() {
5440   if (os::Linux::supports_fast_thread_cpu_time()) {
5441     return os::Linux::fast_thread_cpu_time(CLOCK_THREAD_CPUTIME_ID);
5442   } else {
5443     // return user + sys since the cost is the same
5444     return slow_thread_cpu_time(Thread::current(), true /* user + sys */);
5445   }
5446 }
5447 
5448 jlong os::thread_cpu_time(Thread* thread) {
5449   // consistent with what current_thread_cpu_time() returns
5450   if (os::Linux::supports_fast_thread_cpu_time()) {
5451     return os::Linux::fast_thread_cpu_time(thread_cpu_clockid(thread));
5452   } else {
5453     return slow_thread_cpu_time(thread, true /* user + sys */);
5454   }
5455 }
5456 
5457 jlong os::current_thread_cpu_time(bool user_sys_cpu_time) {
5458   if (user_sys_cpu_time && os::Linux::supports_fast_thread_cpu_time()) {
5459     return os::Linux::fast_thread_cpu_time(CLOCK_THREAD_CPUTIME_ID);
5460   } else {
5461     return slow_thread_cpu_time(Thread::current(), user_sys_cpu_time);
5462   }
5463 }
5464 
5465 jlong os::thread_cpu_time(Thread *thread, bool user_sys_cpu_time) {
5466   if (user_sys_cpu_time && os::Linux::supports_fast_thread_cpu_time()) {
5467     return os::Linux::fast_thread_cpu_time(thread_cpu_clockid(thread));
5468   } else {
5469     return slow_thread_cpu_time(thread, user_sys_cpu_time);
5470   }
5471 }
5472 
5473 //  -1 on error.
5474 static jlong slow_thread_cpu_time(Thread *thread, bool user_sys_cpu_time) {
5475   pid_t  tid = thread->osthread()->thread_id();
5476   char *s;
5477   char stat[2048];
5478   int statlen;
5479   char proc_name[64];
5480   int count;
5481   long sys_time, user_time;
5482   char cdummy;
5483   int idummy;
5484   long ldummy;
5485   FILE *fp;
5486 
5487   snprintf(proc_name, 64, "/proc/self/task/%d/stat", tid);
5488   fp = fopen(proc_name, "r");
5489   if (fp == NULL) return -1;
5490   statlen = fread(stat, 1, 2047, fp);
5491   stat[statlen] = '\0';
5492   fclose(fp);
5493 
5494   // Skip pid and the command string. Note that we could be dealing with
5495   // weird command names, e.g. user could decide to rename java launcher
5496   // to "java 1.4.2 :)", then the stat file would look like
5497   //                1234 (java 1.4.2 :)) R ... ...
5498   // We don't really need to know the command string, just find the last
5499   // occurrence of ")" and then start parsing from there. See bug 4726580.
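       // After the ")", the fields are: state, ppid, pgrp, session, tty_nr,
       // tpgid, flags, minflt, cminflt, majflt, cmajflt, utime, stime; the
       // last two items parsed below are therefore utime and stime (fields 14
       // and 15 of /proc/<tid>/stat), both measured in clock ticks.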
5500   s = strrchr(stat, ')');
5501   if (s == NULL) return -1;
5502 
5503   // Skip blank chars
5504   do { s++; } while (isspace(*s));  // 's' cannot be NULL here
5505 
5506   count = sscanf(s,"%c %d %d %d %d %d %lu %lu %lu %lu %lu %lu %lu",
5507                  &cdummy, &idummy, &idummy, &idummy, &idummy, &idummy,
5508                  &ldummy, &ldummy, &ldummy, &ldummy, &ldummy,
5509                  &user_time, &sys_time);
5510   if (count != 13) return -1;
5511   if (user_sys_cpu_time) {
5512     return ((jlong)sys_time + (jlong)user_time) * (1000000000 / clock_tics_per_sec);
5513   } else {
5514     return (jlong)user_time * (1000000000 / clock_tics_per_sec);
5515   }
5516 }
5517 
5518 void os::current_thread_cpu_time_info(jvmtiTimerInfo *info_ptr) {
5519   info_ptr->max_value = ALL_64_BITS;       // will not wrap in less than 64 bits
5520   info_ptr->may_skip_backward = false;     // elapsed time not wall time
5521   info_ptr->may_skip_forward = false;      // elapsed time not wall time
5522   info_ptr->kind = JVMTI_TIMER_TOTAL_CPU;  // user+system time is returned
5523 }
5524 
5525 void os::thread_cpu_time_info(jvmtiTimerInfo *info_ptr) {
5526   info_ptr->max_value = ALL_64_BITS;       // will not wrap in less than 64 bits
5527   info_ptr->may_skip_backward = false;     // elapsed time not wall time
5528   info_ptr->may_skip_forward = false;      // elapsed time not wall time
5529   info_ptr->kind = JVMTI_TIMER_TOTAL_CPU;  // user+system time is returned
5530 }
5531 
5532 bool os::is_thread_cpu_time_supported() {
5533   return true;
5534 }
5535 
5536 // System loadavg support.  Returns -1 if load average cannot be obtained.
5537 // Linux doesn't yet have an (official) notion of processor sets,
5538 // so just return the system wide load average.
5539 int os::loadavg(double loadavg[], int nelem) {
5540   return ::getloadavg(loadavg, nelem);
5541 }
5542 
5543 void os::pause() {
5544   char filename[MAX_PATH];
5545   if (PauseAtStartupFile && PauseAtStartupFile[0]) {
5546     jio_snprintf(filename, MAX_PATH, "%s", PauseAtStartupFile);
5547   } else {
5548     jio_snprintf(filename, MAX_PATH, "./vm.paused.%d", current_process_id());
5549   }
5550 
5551   int fd = ::open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0666);
5552   if (fd != -1) {
5553     struct stat buf;
5554     ::close(fd);
5555     while (::stat(filename, &buf) == 0) {
5556       (void)::poll(NULL, 0, 100);
5557     }
5558   } else {
5559     jio_fprintf(stderr,
5560                 "Could not open pause file '%s', continuing immediately.\n", filename);
5561   }
5562 }
5563 
5564 
5565 // Refer to the comments in os_solaris.cpp park-unpark. The next two
5566 // comment paragraphs are worth repeating here:
5567 //
5568 // Assumption:
5569 //    Only one parker can exist on an event, which is why we allocate
5570 //    them per-thread. Multiple unparkers can coexist.
5571 //
5572 // _Event serves as a restricted-range semaphore.
5573 //   -1 : thread is blocked, i.e. there is a waiter
5574 //    0 : neutral: thread is running or ready,
5575 //        could have been signaled after a wait started
5576 //    1 : signaled - thread is running or ready
5577 //
5578 
5579 // utility to compute the abstime argument to timedwait:
5580 // millis is the relative timeout time
5581 // abstime will be the absolute timeout time
5582 // TODO: replace compute_abstime() with unpackTime()
5583 
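     // Example: millis == 1500 splits into seconds == 1 and millis == 500,
     // producing abstime == now + 1.5s on whichever clock is selected below.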
5584 static struct timespec* compute_abstime(timespec* abstime, jlong millis) {
5585   if (millis < 0)  millis = 0;
5586 
5587   jlong seconds = millis / 1000;
5588   millis %= 1000;
5589   if (seconds > 50000000) { // see man cond_timedwait(3T)
5590     seconds = 50000000;
5591   }
5592 
5593   if (os::supports_monotonic_clock()) {
5594     struct timespec now;
5595     int status = os::Linux::clock_gettime(CLOCK_MONOTONIC, &now);
5596     assert_status(status == 0, status, "clock_gettime");
5597     abstime->tv_sec = now.tv_sec  + seconds;
5598     long nanos = now.tv_nsec + millis * NANOSECS_PER_MILLISEC;
5599     if (nanos >= NANOSECS_PER_SEC) {
5600       abstime->tv_sec += 1;
5601       nanos -= NANOSECS_PER_SEC;
5602     }
5603     abstime->tv_nsec = nanos;
5604   } else {
5605     struct timeval now;
5606     int status = gettimeofday(&now, NULL);
5607     assert(status == 0, "gettimeofday");
5608     abstime->tv_sec = now.tv_sec  + seconds;
5609     long usec = now.tv_usec + millis * 1000;
5610     if (usec >= 1000000) {
5611       abstime->tv_sec += 1;
5612       usec -= 1000000;
5613     }
5614     abstime->tv_nsec = usec * 1000;
5615   }
5616   return abstime;
5617 }
5618 
5619 void os::PlatformEvent::park() {       // AKA "down()"
5620   // Transitions for _Event:
5621   //   -1 => -1 : illegal
5622   //    1 =>  0 : pass - return immediately
5623   //    0 => -1 : block; then set _Event to 0 before returning
5624 
5625   // Invariant: Only the thread associated with the Event/PlatformEvent
5626   // may call park().
5627   // TODO: assert that _Assoc != NULL or _Assoc == Self
5628   assert(_nParked == 0, "invariant");
5629 
5630   int v;
5631   for (;;) {
5632     v = _Event;
5633     if (Atomic::cmpxchg(v-1, &_Event, v) == v) break;
5634   }
5635   guarantee(v >= 0, "invariant");
5636   if (v == 0) {
5637     // Do this the hard way by blocking ...
5638     int status = pthread_mutex_lock(_mutex);
5639     assert_status(status == 0, status, "mutex_lock");
5640     guarantee(_nParked == 0, "invariant");
5641     ++_nParked;
5642     while (_Event < 0) {
5643       status = pthread_cond_wait(_cond, _mutex);
5644       // For some reason, under Solaris 2.7 lwp_cond_wait() may return ETIME.
5645       // Treat this the same as if the wait was interrupted
5646       if (status == ETIME) { status = EINTR; }
5647       assert_status(status == 0 || status == EINTR, status, "cond_wait");
5648     }
5649     --_nParked;
5650 
5651     _Event = 0;
5652     status = pthread_mutex_unlock(_mutex);
5653     assert_status(status == 0, status, "mutex_unlock");
5654     // Paranoia to ensure our locked and lock-free paths interact
5655     // correctly with each other.
5656     OrderAccess::fence();
5657   }
5658   guarantee(_Event >= 0, "invariant");
5659 }
5660 
5661 int os::PlatformEvent::park(jlong millis) {
5662   // Transitions for _Event:
5663   //   -1 => -1 : illegal
5664   //    1 =>  0 : pass - return immediately
5665   //    0 => -1 : block; then set _Event to 0 before returning
5666 
5667   guarantee(_nParked == 0, "invariant");
5668 
5669   int v;
5670   for (;;) {
5671     v = _Event;
5672     if (Atomic::cmpxchg(v-1, &_Event, v) == v) break;
5673   }
5674   guarantee(v >= 0, "invariant");
5675   if (v != 0) return OS_OK;
5676 
5677   // We do this the hard way, by blocking the thread.
5678   // Consider enforcing a minimum timeout value.
5679   struct timespec abst;
5680   compute_abstime(&abst, millis);
5681 
5682   int ret = OS_TIMEOUT;
5683   int status = pthread_mutex_lock(_mutex);
5684   assert_status(status == 0, status, "mutex_lock");
5685   guarantee(_nParked == 0, "invariant");
5686   ++_nParked;
5687 
5688   // Object.wait(timo) will return because of
5689   // (a) notification
5690   // (b) timeout
5691   // (c) thread.interrupt
5692   //
5693   // Thread.interrupt and object.notify{All} both call Event::set.
5694   // That is, we treat thread.interrupt as a special case of notification.
5695   // We ignore spurious OS wakeups unless FilterSpuriousWakeups is false.
5696   // We assume all ETIME returns are valid.
5697   //
5698   // TODO: properly differentiate simultaneous notify+interrupt.
5699   // In that case, we should propagate the notify to another waiter.
5700 
5701   while (_Event < 0) {
5702     status = pthread_cond_timedwait(_cond, _mutex, &abst);
5703     assert_status(status == 0 || status == EINTR ||
5704                   status == ETIME || status == ETIMEDOUT,
5705                   status, "cond_timedwait");
5706     if (!FilterSpuriousWakeups) break;                 // previous semantics
5707     if (status == ETIME || status == ETIMEDOUT) break;
5708     // We consume and ignore EINTR and spurious wakeups.
5709   }
5710   --_nParked;
5711   if (_Event >= 0) {
5712     ret = OS_OK;
5713   }
5714   _Event = 0;
5715   status = pthread_mutex_unlock(_mutex);
5716   assert_status(status == 0, status, "mutex_unlock");
5717   assert(_nParked == 0, "invariant");
5718   // Paranoia to ensure our locked and lock-free paths interact
5719   // correctly with each other.
5720   OrderAccess::fence();
5721   return ret;
5722 }
5723 
5724 void os::PlatformEvent::unpark() {
5725   // Transitions for _Event:
5726   //    0 => 1 : just return
5727   //    1 => 1 : just return
5728   //   -1 => either 0 or 1; must signal target thread
5729   //         That is, we can safely transition _Event from -1 to either
5730   //         0 or 1.
5731   // See also: "Semaphores in Plan 9" by Mullender & Cox
5732   //
5733   // Note: Forcing a transition from "-1" to "1" on an unpark() means
5734   // that it will take two back-to-back park() calls for the owning
5735   // thread to block. This has the benefit of forcing a spurious return
5736   // from the first park() call after an unpark() call which will help
5737   // shake out uses of park() and unpark() without condition variables.
5738 
5739   if (Atomic::xchg(1, &_Event) >= 0) return;
5740 
5741   // Wait for the thread associated with the event to vacate
5742   int status = pthread_mutex_lock(_mutex);
5743   assert_status(status == 0, status, "mutex_lock");
5744   int AnyWaiters = _nParked;
5745   assert(AnyWaiters == 0 || AnyWaiters == 1, "invariant");
5746   status = pthread_mutex_unlock(_mutex);
5747   assert_status(status == 0, status, "mutex_unlock");
5748   if (AnyWaiters != 0) {
5749     // Note that we signal() *after* dropping the lock for "immortal" Events.
5750     // This is safe and avoids a common class of futile wakeups.  In rare
5751     // circumstances this can cause a thread to return prematurely from
5752     // cond_{timed}wait() but the spurious wakeup is benign and the victim
5753     // will simply re-test the condition and re-park itself.
5754     // This provides particular benefit if the underlying platform does not
5755     // provide wait morphing.
5756     status = pthread_cond_signal(_cond);
5757     assert_status(status == 0, status, "cond_signal");
5758   }
5759 }
5760 
5761 
5762 // JSR166
5763 // -------------------------------------------------------
5764 
5765 // The Solaris and Linux implementations of park/unpark are fairly
5766 // conservative for now, but can be improved. They currently use a
5767 // mutex/condvar pair, plus a count.
5768 // Park decrements count if > 0, else does a condvar wait.  Unpark
5769 // sets count to 1 and signals condvar.  Only one thread ever waits
5770 // on the condvar. Contention seen when trying to park implies that someone
5771 // is unparking you, so don't wait. And spurious returns are fine, so there
5772 // is no need to track notifications.
5773 
5774 // This code is common to Linux and Solaris and will be moved to a
5775 // common place in dolphin.
5776 //
5777 // The passed in time value is either a relative time in nanoseconds
5778 // or an absolute time in milliseconds. Either way it has to be unpacked
5779 // into suitable seconds and nanoseconds components and stored in the
5780 // given timespec structure.
5781 // Given that time is a 64-bit value and the time_t used in the timespec is
5782 // only a signed-32-bit value (except on 64-bit Linux), we have to watch for
5783 // overflow if times far in the future are given. Further, on Solaris versions
5784 // prior to 10 there is a restriction (see cond_timedwait) that the specified
5785 // number of seconds, in abstime, is less than current_time  + 100,000,000.
5786 // As it will be 28 years before "now + 100000000" will overflow we can
5787 // ignore overflow and just impose a hard-limit on seconds using the value
5788 // of "now + 100,000,000". This places a limit on the timeout of about 3.17
5789 // years from "now".
5790 
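     // Example: a relative timeout of 2500000000 ns (2.5s) yields secs == 2,
     // with the remaining 500000000 ns added to the current nanosecond count
     // and normalized into tv_sec if the sum overflows one second.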
5791 static void unpackTime(timespec* absTime, bool isAbsolute, jlong time) {
5792   assert(time > 0, "convertTime");
5793   time_t max_secs = 0;
5794 
5795   if (!os::supports_monotonic_clock() || isAbsolute) {
5796     struct timeval now;
5797     int status = gettimeofday(&now, NULL);
5798     assert(status == 0, "gettimeofday");
5799 
5800     max_secs = now.tv_sec + MAX_SECS;
5801 
5802     if (isAbsolute) {
5803       jlong secs = time / 1000;
5804       if (secs > max_secs) {
5805         absTime->tv_sec = max_secs;
5806       } else {
5807         absTime->tv_sec = secs;
5808       }
5809       absTime->tv_nsec = (time % 1000) * NANOSECS_PER_MILLISEC;
5810     } else {
5811       jlong secs = time / NANOSECS_PER_SEC;
5812       if (secs >= MAX_SECS) {
5813         absTime->tv_sec = max_secs;
5814         absTime->tv_nsec = 0;
5815       } else {
5816         absTime->tv_sec = now.tv_sec + secs;
5817         absTime->tv_nsec = (time % NANOSECS_PER_SEC) + now.tv_usec*1000;
5818         if (absTime->tv_nsec >= NANOSECS_PER_SEC) {
5819           absTime->tv_nsec -= NANOSECS_PER_SEC;
5820           ++absTime->tv_sec; // note: this must be <= max_secs
5821         }
5822       }
5823     }
5824   } else {
5825     // must be relative using monotonic clock
5826     struct timespec now;
5827     int status = os::Linux::clock_gettime(CLOCK_MONOTONIC, &now);
5828     assert_status(status == 0, status, "clock_gettime");
5829     max_secs = now.tv_sec + MAX_SECS;
5830     jlong secs = time / NANOSECS_PER_SEC;
5831     if (secs >= MAX_SECS) {
5832       absTime->tv_sec = max_secs;
5833       absTime->tv_nsec = 0;
5834     } else {
5835       absTime->tv_sec = now.tv_sec + secs;
5836       absTime->tv_nsec = (time % NANOSECS_PER_SEC) + now.tv_nsec;
5837       if (absTime->tv_nsec >= NANOSECS_PER_SEC) {
5838         absTime->tv_nsec -= NANOSECS_PER_SEC;
5839         ++absTime->tv_sec; // note: this must be <= max_secs
5840       }
5841     }
5842   }
5843   assert(absTime->tv_sec >= 0, "tv_sec < 0");
5844   assert(absTime->tv_sec <= max_secs, "tv_sec > max_secs");
5845   assert(absTime->tv_nsec >= 0, "tv_nsec < 0");
5846   assert(absTime->tv_nsec < NANOSECS_PER_SEC, "tv_nsec >= nanos_per_sec");
5847 }
5848 
5849 void Parker::park(bool isAbsolute, jlong time) {
5850   // Ideally we'd do something useful while spinning, such
5851   // as calling unpackTime().
5852 
5853   // Optional fast-path check:
5854   // Return immediately if a permit is available.
5855   // We depend on Atomic::xchg() having full barrier semantics
5856   // since we are doing a lock-free update to _counter.
5857   if (Atomic::xchg(0, &_counter) > 0) return;
5858 
5859   Thread* thread = Thread::current();
5860   assert(thread->is_Java_thread(), "Must be JavaThread");
5861   JavaThread *jt = (JavaThread *)thread;
5862 
5863   // Optional optimization -- avoid state transitions if there's an interrupt pending.
5864   // Check interrupt before trying to wait
5865   if (Thread::is_interrupted(thread, false)) {
5866     return;
5867   }
5868 
5869   // Next, demultiplex/decode time arguments
5870   timespec absTime;
5871   if (time < 0 || (isAbsolute && time == 0)) { // don't wait at all
5872     return;
5873   }
5874   if (time > 0) {
5875     unpackTime(&absTime, isAbsolute, time);
5876   }
5877 
5878 
5879   // Enter safepoint region
5880   // Beware of deadlocks such as 6317397.
5881   // The per-thread Parker:: mutex is a classic leaf-lock.
5882   // In particular a thread must never block on the Threads_lock while
  // holding the Parker:: mutex.  If safepoints are pending, both the
  // ThreadBlockInVM() CTOR and DTOR may grab Threads_lock.
5885   ThreadBlockInVM tbivm(jt);
5886 
  // Don't wait if we cannot get the lock, since interference arises from
  // unblocking.  Also, check for a pending interrupt before trying to wait.
5889   if (Thread::is_interrupted(thread, false) || pthread_mutex_trylock(_mutex) != 0) {
5890     return;
5891   }
5892 
5893   int status;
5894   if (_counter > 0)  { // no wait needed
5895     _counter = 0;
5896     status = pthread_mutex_unlock(_mutex);
5897     assert_status(status == 0, status, "invariant");
5898     // Paranoia to ensure our locked and lock-free paths interact
5899     // correctly with each other and Java-level accesses.
5900     OrderAccess::fence();
5901     return;
5902   }
5903 
5904 #ifdef ASSERT
5905   // Don't catch signals while blocked; let the running threads have the signals.
5906   // (This allows a debugger to break into the running thread.)
5907   sigset_t oldsigs;
5908   sigemptyset(&oldsigs);
5909   sigset_t* allowdebug_blocked = os::Linux::allowdebug_blocked_signals();
5910   pthread_sigmask(SIG_BLOCK, allowdebug_blocked, &oldsigs);
5911 #endif
5912 
5913   OSThreadWaitState osts(thread->osthread(), false /* not Object.wait() */);
5914   jt->set_suspend_equivalent();
5915   // cleared by handle_special_suspend_equivalent_condition() or java_suspend_self()
5916 
5917   assert(_cur_index == -1, "invariant");
5918   if (time == 0) {
5919     _cur_index = REL_INDEX; // arbitrary choice when not timed
5920     status = pthread_cond_wait(&_cond[_cur_index], _mutex);
5921   } else {
5922     _cur_index = isAbsolute ? ABS_INDEX : REL_INDEX;
5923     status = pthread_cond_timedwait(&_cond[_cur_index], _mutex, &absTime);
5924   }
5925   _cur_index = -1;
5926   assert_status(status == 0 || status == EINTR ||
5927                 status == ETIME || status == ETIMEDOUT,
5928                 status, "cond_timedwait");
5929 
5930 #ifdef ASSERT
5931   pthread_sigmask(SIG_SETMASK, &oldsigs, NULL);
5932 #endif
5933 
5934   _counter = 0;
5935   status = pthread_mutex_unlock(_mutex);
5936   assert_status(status == 0, status, "invariant");
5937   // Paranoia to ensure our locked and lock-free paths interact
5938   // correctly with each other and Java-level accesses.
5939   OrderAccess::fence();
5940 
5941   // If externally suspended while waiting, re-suspend
5942   if (jt->handle_special_suspend_equivalent_condition()) {
5943     jt->java_suspend_self();
5944   }
5945 }
5946 
5947 void Parker::unpark() {
5948   int status = pthread_mutex_lock(_mutex);
5949   assert_status(status == 0, status, "invariant");
5950   const int s = _counter;
5951   _counter = 1;
5952   // must capture correct index before unlocking
5953   int index = _cur_index;
5954   status = pthread_mutex_unlock(_mutex);
5955   assert_status(status == 0, status, "invariant");
5956   if (s < 1 && index != -1) {
5957     // thread is definitely parked
5958     status = pthread_cond_signal(&_cond[index]);
5959     assert_status(status == 0, status, "invariant");
5960   }
5961 }
5962 
5963 
5964 extern char** environ;
5965 
// Run the specified command in a separate process. Return its exit value,
// or -1 on failure (e.g. can't fork a new process).
// Unlike system(), this function can be called from a signal handler. It
// doesn't block SIGINT et al.
5970 int os::fork_and_exec(char* cmd) {
5971   const char * argv[4] = {"sh", "-c", cmd, NULL};
5972 
5973   pid_t pid = fork();
5974 
5975   if (pid < 0) {
5976     // fork failed
5977     return -1;
5978 
5979   } else if (pid == 0) {
5980     // child process
5981 
5982     execve("/bin/sh", (char* const*)argv, environ);
5983 
5984     // execve failed
5985     _exit(-1);
5986 
5987   } else  {
5988     // copied from J2SE ..._waitForProcessExit() in UNIXProcess_md.c; we don't
5989     // care about the actual exit code, for now.
5990 
5991     int status;
5992 
    // Wait for the child process to exit.  This returns immediately if
    // the child has already exited.
5995     while (waitpid(pid, &status, 0) < 0) {
5996       switch (errno) {
5997       case ECHILD: return 0;
5998       case EINTR: break;
5999       default: return -1;
6000       }
6001     }
6002 
6003     if (WIFEXITED(status)) {
6004       // The child exited normally; get its exit code.
6005       return WEXITSTATUS(status);
6006     } else if (WIFSIGNALED(status)) {
6007       // The child exited because of a signal
6008       // The best value to return is 0x80 + signal number,
6009       // because that is what all Unix shells do, and because
6010       // it allows callers to distinguish between process exit and
6011       // process death by signal.
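      // For example, a child killed by SIGKILL (signal number 9) produces
      // 0x80 + 9 = 137, the same value a POSIX shell reports in $?.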
6012       return 0x80 + WTERMSIG(status);
6013     } else {
6014       // Unknown exit code; pass it through
6015       return status;
6016     }
6017   }
6018 }
6019 
// is_headless_jre()
//
// Test for the existence of xawt/libmawt.so or libawt_xawt.so
// in order to report whether we are running in a headless JRE.
//
// Since JDK8, xawt/libmawt.so has been moved into the same directory
// as libawt.so and renamed libawt_xawt.so.
//
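// For example (illustrative layout), with libjvm.so located at
// <jre>/lib/amd64/server/libjvm.so, the two trailing path components are
// stripped and the paths probed are <jre>/lib/amd64/xawt/libmawt.so and
// <jre>/lib/amd64/libawt_xawt.so.
//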
6028 bool os::is_headless_jre() {
6029   struct stat statbuf;
6030   char buf[MAXPATHLEN];
6031   char libmawtpath[MAXPATHLEN];
6032   const char *xawtstr  = "/xawt/libmawt.so";
6033   const char *new_xawtstr = "/libawt_xawt.so";
6034   char *p;
6035 
6036   // Get path to libjvm.so
6037   os::jvm_path(buf, sizeof(buf));
6038 
6039   // Get rid of libjvm.so
6040   p = strrchr(buf, '/');
6041   if (p == NULL) {
6042     return false;
6043   } else {
6044     *p = '\0';
6045   }
6046 
6047   // Get rid of client or server
6048   p = strrchr(buf, '/');
6049   if (p == NULL) {
6050     return false;
6051   } else {
6052     *p = '\0';
6053   }
6054 
6055   // check xawt/libmawt.so
6056   strcpy(libmawtpath, buf);
6057   strcat(libmawtpath, xawtstr);
6058   if (::stat(libmawtpath, &statbuf) == 0) return false;
6059 
6060   // check libawt_xawt.so
6061   strcpy(libmawtpath, buf);
6062   strcat(libmawtpath, new_xawtstr);
6063   if (::stat(libmawtpath, &statbuf) == 0) return false;
6064 
6065   return true;
6066 }
6067 
// Get the default path to the core file.
// Returns the length of the string, or -1 on failure.
6070 int os::get_core_path(char* buffer, size_t bufferSize) {
6071   /*
6072    * Max length of /proc/sys/kernel/core_pattern is 128 characters.
6073    * See https://www.kernel.org/doc/Documentation/sysctl/kernel.txt
6074    */
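  // Examples of what this function reports (illustrative; the exact output
  // depends on the system configuration):
  //   core_pattern "core"          -> "<cwd>/core", plus a ".<pid>" suffix
  //                                   if core_uses_pid is 1
  //   core_pattern "/cores/%e.%p"  -> "/cores/%e.%p" (the kernel expands the
  //                                   specifiers when it writes the dump)
  //   core_pattern "|/path/helper" -> "\"/path/helper\" (or dumping to
  //                                   <cwd>/core.<pid>)"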
6075   const int core_pattern_len = 129;
6076   char core_pattern[core_pattern_len] = {0};
6077 
6078   int core_pattern_file = ::open("/proc/sys/kernel/core_pattern", O_RDONLY);
6079   if (core_pattern_file == -1) {
6080     return -1;
6081   }
6082 
6083   ssize_t ret = ::read(core_pattern_file, core_pattern, core_pattern_len);
6084   ::close(core_pattern_file);
6085   if (ret <= 0 || ret >= core_pattern_len || core_pattern[0] == '\n') {
6086     return -1;
6087   }
6088   if (core_pattern[ret-1] == '\n') {
6089     core_pattern[ret-1] = '\0';
6090   } else {
6091     core_pattern[ret] = '\0';
6092   }
6093 
6094   char *pid_pos = strstr(core_pattern, "%p");
6095   int written;
6096 
6097   if (core_pattern[0] == '/') {
6098     written = jio_snprintf(buffer, bufferSize, "%s", core_pattern);
6099   } else {
6100     char cwd[PATH_MAX];
6101 
6102     const char* p = get_current_directory(cwd, PATH_MAX);
6103     if (p == NULL) {
6104       return -1;
6105     }
6106 
6107     if (core_pattern[0] == '|') {
6108       written = jio_snprintf(buffer, bufferSize,
6109                              "\"%s\" (or dumping to %s/core.%d)",
6110                              &core_pattern[1], p, current_process_id());
6111     } else {
6112       written = jio_snprintf(buffer, bufferSize, "%s/%s", p, core_pattern);
6113     }
6114   }
6115 
6116   if (written < 0) {
6117     return -1;
6118   }
6119 
6120   if (((size_t)written < bufferSize) && (pid_pos == NULL) && (core_pattern[0] != '|')) {
6121     int core_uses_pid_file = ::open("/proc/sys/kernel/core_uses_pid", O_RDONLY);
6122 
6123     if (core_uses_pid_file != -1) {
6124       char core_uses_pid = 0;
6125       ssize_t ret = ::read(core_uses_pid_file, &core_uses_pid, 1);
6126       ::close(core_uses_pid_file);
6127 
6128       if (core_uses_pid == '1') {
6129         jio_snprintf(buffer + written, bufferSize - written,
6130                                           ".%d", current_process_id());
6131       }
6132     }
6133   }
6134 
6135   return strlen(buffer);
6136 }
6137 
6138 bool os::start_debugging(char *buf, int buflen) {
6139   int len = (int)strlen(buf);
6140   char *p = &buf[len];
6141 
6142   jio_snprintf(p, buflen-len,
6143                "\n\n"
6144                "Do you want to debug the problem?\n\n"
6145                "To debug, run 'gdb /proc/%d/exe %d'; then switch to thread " UINTX_FORMAT " (" INTPTR_FORMAT ")\n"
6146                "Enter 'yes' to launch gdb automatically (PATH must include gdb)\n"
6147                "Otherwise, press RETURN to abort...",
6148                os::current_process_id(), os::current_process_id(),
6149                os::current_thread_id(), os::current_thread_id());
6150 
6151   bool yes = os::message_box("Unexpected Error", buf);
6152 
6153   if (yes) {
6154     // yes, user asked VM to launch debugger
    jio_snprintf(buf, buflen, "gdb /proc/%d/exe %d",
6156                  os::current_process_id(), os::current_process_id());
6157 
6158     os::fork_and_exec(buf);
6159     yes = false;
6160   }
6161   return yes;
6162 }
6163 
6164 
6165 // Java/Compiler thread:
6166 //
6167 //   Low memory addresses
6168 // P0 +------------------------+
6169 //    |                        |\  Java thread created by VM does not have glibc
6170 //    |    glibc guard page    | - guard page, attached Java thread usually has
6171 //    |                        |/  1 glibc guard page.
6172 // P1 +------------------------+ Thread::stack_base() - Thread::stack_size()
6173 //    |                        |\
6174 //    |  HotSpot Guard Pages   | - red, yellow and reserved pages
6175 //    |                        |/
6176 //    +------------------------+ JavaThread::stack_reserved_zone_base()
6177 //    |                        |\
6178 //    |      Normal Stack      | -
6179 //    |                        |/
6180 // P2 +------------------------+ Thread::stack_base()
6181 //
6182 // Non-Java thread:
6183 //
6184 //   Low memory addresses
6185 // P0 +------------------------+
6186 //    |                        |\
6187 //    |  glibc guard page      | - usually 1 page
6188 //    |                        |/
6189 // P1 +------------------------+ Thread::stack_base() - Thread::stack_size()
6190 //    |                        |\
6191 //    |      Normal Stack      | -
6192 //    |                        |/
6193 // P2 +------------------------+ Thread::stack_base()
6194 //
// ** P1 (aka bottom) and size (P2 = P1 + size) are the address and stack size
//    returned from pthread_attr_getstack().
// ** Due to an NPTL implementation error, Linux takes the glibc guard page out
//    of the stack size given in pthread_attr. We work around this for
//    threads created by the VM. (We adapt bottom to be P1 and size accordingly.)
6200 //
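// Worked example (hypothetical numbers): if pthread_attr_getstack() reports
// bottom = 0x7f0000000000 and size = 0x800000 (8M) for a VM-created thread,
// and pthread_attr_getguardsize() reports 0x1000 (4K), current_stack_region()
// below returns bottom = 0x7f0000001000 and size = 0x7ff000, so the glibc
// guard page is excluded from the usable stack.
//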
6201 #ifndef ZERO
6202 static void current_stack_region(address * bottom, size_t * size) {
6203   if (os::Linux::is_initial_thread()) {
    // The initial thread needs special handling because pthread_getattr_np()
    // may return a bogus value.
6206     *bottom = os::Linux::initial_thread_stack_bottom();
6207     *size   = os::Linux::initial_thread_stack_size();
6208   } else {
6209     pthread_attr_t attr;
6210 
6211     int rslt = pthread_getattr_np(pthread_self(), &attr);
6212 
    // The JVM needs to know the exact stack location; abort if pthread_getattr_np() fails.
6214     if (rslt != 0) {
6215       if (rslt == ENOMEM) {
6216         vm_exit_out_of_memory(0, OOM_MMAP_ERROR, "pthread_getattr_np");
6217       } else {
6218         fatal("pthread_getattr_np failed with error = %d", rslt);
6219       }
6220     }
6221 
6222     if (pthread_attr_getstack(&attr, (void **)bottom, size) != 0) {
6223       fatal("Cannot locate current stack attributes!");
6224     }
6225 
6226     // Work around NPTL stack guard error.
6227     size_t guard_size = 0;
6228     rslt = pthread_attr_getguardsize(&attr, &guard_size);
6229     if (rslt != 0) {
6230       fatal("pthread_attr_getguardsize failed with error = %d", rslt);
6231     }
6232     *bottom += guard_size;
6233     *size   -= guard_size;
6234 
6235     pthread_attr_destroy(&attr);
6236 
6237   }
6238   assert(os::current_stack_pointer() >= *bottom &&
6239          os::current_stack_pointer() < *bottom + *size, "just checking");
6240 }
6241 
6242 address os::current_stack_base() {
6243   address bottom;
6244   size_t size;
6245   current_stack_region(&bottom, &size);
6246   return (bottom + size);
6247 }
6248 
6249 size_t os::current_stack_size() {
6250   // This stack size includes the usable stack and HotSpot guard pages
  // (for the threads that have HotSpot guard pages).
6252   address bottom;
6253   size_t size;
6254   current_stack_region(&bottom, &size);
6255   return size;
6256 }
6257 #endif
6258 
6259 static inline struct timespec get_mtime(const char* filename) {
6260   struct stat st;
6261   int ret = os::stat(filename, &st);
6262   assert(ret == 0, "failed to stat() file '%s': %s", filename, strerror(errno));
6263   return st.st_mtim;
6264 }
6265 
6266 int os::compare_file_modified_times(const char* file1, const char* file2) {
6267   struct timespec filetime1 = get_mtime(file1);
6268   struct timespec filetime2 = get_mtime(file2);
6269   int diff = filetime1.tv_sec - filetime2.tv_sec;
6270   if (diff == 0) {
6271     return filetime1.tv_nsec - filetime2.tv_nsec;
6272   }
6273   return diff;
6274 }
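// Usage sketch (hypothetical file names): the result is negative when file1
// is older, e.g. os::compare_file_modified_times("classes.jsa", "rt.jar") < 0
// means the shared archive predates the jar.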
6275 
6276 /////////////// Unit tests ///////////////
6277 
6278 #ifndef PRODUCT
6279 
6280 #define test_log(...)              \
6281   do {                             \
6282     if (VerboseInternalVMTests) {  \
6283       tty->print_cr(__VA_ARGS__);  \
6284       tty->flush();                \
6285     }                              \
6286   } while (false)
6287 
6288 class TestReserveMemorySpecial : AllStatic {
6289  public:
6290   static void small_page_write(void* addr, size_t size) {
6291     size_t page_size = os::vm_page_size();
6292 
6293     char* end = (char*)addr + size;
6294     for (char* p = (char*)addr; p < end; p += page_size) {
6295       *p = 1;
6296     }
6297   }
6298 
6299   static void test_reserve_memory_special_huge_tlbfs_only(size_t size) {
6300     if (!UseHugeTLBFS) {
6301       return;
6302     }
6303 
6304     test_log("test_reserve_memory_special_huge_tlbfs_only(" SIZE_FORMAT ")", size);
6305 
6306     char* addr = os::Linux::reserve_memory_special_huge_tlbfs_only(size, NULL, false);
6307 
6308     if (addr != NULL) {
6309       small_page_write(addr, size);
6310 
6311       os::Linux::release_memory_special_huge_tlbfs(addr, size);
6312     }
6313   }
6314 
6315   static void test_reserve_memory_special_huge_tlbfs_only() {
6316     if (!UseHugeTLBFS) {
6317       return;
6318     }
6319 
6320     size_t lp = os::large_page_size();
6321 
6322     for (size_t size = lp; size <= lp * 10; size += lp) {
6323       test_reserve_memory_special_huge_tlbfs_only(size);
6324     }
6325   }
6326 
6327   static void test_reserve_memory_special_huge_tlbfs_mixed() {
6328     size_t lp = os::large_page_size();
6329     size_t ag = os::vm_allocation_granularity();
6330 
6331     // sizes to test
6332     const size_t sizes[] = {
6333       lp, lp + ag, lp + lp / 2, lp * 2,
6334       lp * 2 + ag, lp * 2 - ag, lp * 2 + lp / 2,
6335       lp * 10, lp * 10 + lp / 2
6336     };
6337     const int num_sizes = sizeof(sizes) / sizeof(size_t);
6338 
6339     // For each size/alignment combination, we test three scenarios:
6340     // 1) with req_addr == NULL
6341     // 2) with a non-null req_addr at which we expect to successfully allocate
6342     // 3) with a non-null req_addr which contains a pre-existing mapping, at which we
6343     //    expect the allocation to either fail or to ignore req_addr
6344 
6345     // Pre-allocate two areas; they shall be as large as the largest allocation
6346     //  and aligned to the largest alignment we will be testing.
6347     const size_t mapping_size = sizes[num_sizes - 1] * 2;
6348     char* const mapping1 = (char*) ::mmap(NULL, mapping_size,
6349       PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE,
6350       -1, 0);
6351     assert(mapping1 != MAP_FAILED, "should work");
6352 
6353     char* const mapping2 = (char*) ::mmap(NULL, mapping_size,
6354       PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE,
6355       -1, 0);
6356     assert(mapping2 != MAP_FAILED, "should work");
6357 
6358     // Unmap the first mapping, but leave the second mapping intact: the first
6359     // mapping will serve as a value for a "good" req_addr (case 2). The second
    // mapping, still intact, will serve as the "bad" req_addr (case 3).
6361     ::munmap(mapping1, mapping_size);
6362 
6363     // Case 1
6364     test_log("%s, req_addr NULL:", __FUNCTION__);
6365     test_log("size            align           result");
6366 
6367     for (int i = 0; i < num_sizes; i++) {
6368       const size_t size = sizes[i];
6369       for (size_t alignment = ag; is_size_aligned(size, alignment); alignment *= 2) {
6370         char* p = os::Linux::reserve_memory_special_huge_tlbfs_mixed(size, alignment, NULL, false);
6371         test_log(SIZE_FORMAT_HEX " " SIZE_FORMAT_HEX " ->  " PTR_FORMAT " %s",
6372                  size, alignment, p2i(p), (p != NULL ? "" : "(failed)"));
6373         if (p != NULL) {
6374           assert(is_ptr_aligned(p, alignment), "must be");
6375           small_page_write(p, size);
6376           os::Linux::release_memory_special_huge_tlbfs(p, size);
6377         }
6378       }
6379     }
6380 
6381     // Case 2
6382     test_log("%s, req_addr non-NULL:", __FUNCTION__);
6383     test_log("size            align           req_addr         result");
6384 
6385     for (int i = 0; i < num_sizes; i++) {
6386       const size_t size = sizes[i];
6387       for (size_t alignment = ag; is_size_aligned(size, alignment); alignment *= 2) {
6388         char* const req_addr = (char*) align_ptr_up(mapping1, alignment);
6389         char* p = os::Linux::reserve_memory_special_huge_tlbfs_mixed(size, alignment, req_addr, false);
6390         test_log(SIZE_FORMAT_HEX " " SIZE_FORMAT_HEX " " PTR_FORMAT " ->  " PTR_FORMAT " %s",
6391                  size, alignment, p2i(req_addr), p2i(p),
6392                  ((p != NULL ? (p == req_addr ? "(exact match)" : "") : "(failed)")));
6393         if (p != NULL) {
6394           assert(p == req_addr, "must be");
6395           small_page_write(p, size);
6396           os::Linux::release_memory_special_huge_tlbfs(p, size);
6397         }
6398       }
6399     }
6400 
6401     // Case 3
6402     test_log("%s, req_addr non-NULL with preexisting mapping:", __FUNCTION__);
6403     test_log("size            align           req_addr         result");
6404 
6405     for (int i = 0; i < num_sizes; i++) {
6406       const size_t size = sizes[i];
6407       for (size_t alignment = ag; is_size_aligned(size, alignment); alignment *= 2) {
6408         char* const req_addr = (char*) align_ptr_up(mapping2, alignment);
6409         char* p = os::Linux::reserve_memory_special_huge_tlbfs_mixed(size, alignment, req_addr, false);
6410         test_log(SIZE_FORMAT_HEX " " SIZE_FORMAT_HEX " " PTR_FORMAT " ->  " PTR_FORMAT " %s",
6411                  size, alignment, p2i(req_addr), p2i(p), ((p != NULL ? "" : "(failed)")));
        // As the area around req_addr contains pre-existing mappings, the API should always
        // return NULL (per its contract, it cannot return some other address).
6414         assert(p == NULL, "must be");
6415       }
6416     }
6417 
6418     ::munmap(mapping2, mapping_size);
6419 
6420   }
6421 
6422   static void test_reserve_memory_special_huge_tlbfs() {
6423     if (!UseHugeTLBFS) {
6424       return;
6425     }
6426 
6427     test_reserve_memory_special_huge_tlbfs_only();
6428     test_reserve_memory_special_huge_tlbfs_mixed();
6429   }
6430 
6431   static void test_reserve_memory_special_shm(size_t size, size_t alignment) {
6432     if (!UseSHM) {
6433       return;
6434     }
6435 
6436     test_log("test_reserve_memory_special_shm(" SIZE_FORMAT ", " SIZE_FORMAT ")", size, alignment);
6437 
6438     char* addr = os::Linux::reserve_memory_special_shm(size, alignment, NULL, false);
6439 
6440     if (addr != NULL) {
6441       assert(is_ptr_aligned(addr, alignment), "Check");
6442       assert(is_ptr_aligned(addr, os::large_page_size()), "Check");
6443 
6444       small_page_write(addr, size);
6445 
6446       os::Linux::release_memory_special_shm(addr, size);
6447     }
6448   }
6449 
6450   static void test_reserve_memory_special_shm() {
6451     size_t lp = os::large_page_size();
6452     size_t ag = os::vm_allocation_granularity();
6453 
6454     for (size_t size = ag; size < lp * 3; size += ag) {
6455       for (size_t alignment = ag; is_size_aligned(size, alignment); alignment *= 2) {
6456         test_reserve_memory_special_shm(size, alignment);
6457       }
6458     }
6459   }
6460 
6461   static void test() {
6462     test_reserve_memory_special_huge_tlbfs();
6463     test_reserve_memory_special_shm();
6464   }
6465 };
6466 
6467 void TestReserveMemorySpecial_test() {
6468   TestReserveMemorySpecial::test();
6469 }
6470 
6471 #endif