1 /*
   2  * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 #include "precompiled.hpp"
  25 #include "gc/z/zArray.inline.hpp"
  26 #include "gc/z/zErrno.hpp"
  27 #include "gc/z/zGlobals.hpp"
  28 #include "gc/z/zLargePages.inline.hpp"
  29 #include "gc/z/zMountPoint_linux.hpp"
  30 #include "gc/z/zNUMA.inline.hpp"
  31 #include "gc/z/zPhysicalMemoryBacking_linux.hpp"
  32 #include "gc/z/zSyscall_linux.hpp"
  33 #include "logging/log.hpp"
  34 #include "runtime/init.hpp"
  35 #include "runtime/os.hpp"
  36 #include "runtime/stubRoutines.hpp"
  37 #include "utilities/align.hpp"
  38 #include "utilities/debug.hpp"
  39 #include "utilities/growableArray.hpp"
  40 
  41 #include <fcntl.h>
  42 #include <stdio.h>
  43 #include <sys/mman.h>
  44 #include <sys/stat.h>
  45 #include <sys/statfs.h>
  46 #include <sys/types.h>
  47 #include <unistd.h>
  48 
  49 //
  50 // Support for building on older Linux systems
  51 //
  52 
  53 // memfd_create(2) flags
  54 #ifndef MFD_CLOEXEC
  55 #define MFD_CLOEXEC                      0x0001U
  56 #endif
  57 #ifndef MFD_HUGETLB
  58 #define MFD_HUGETLB                      0x0004U
  59 #endif
  60 
  61 // open(2) flags
  62 #ifndef O_CLOEXEC
  63 #define O_CLOEXEC                        02000000
  64 #endif
  65 #ifndef O_TMPFILE
  66 #define O_TMPFILE                        (020000000 | O_DIRECTORY)
  67 #endif
  68 
  69 // fallocate(2) flags
  70 #ifndef FALLOC_FL_KEEP_SIZE
  71 #define FALLOC_FL_KEEP_SIZE              0x01
  72 #endif
  73 #ifndef FALLOC_FL_PUNCH_HOLE
  74 #define FALLOC_FL_PUNCH_HOLE             0x02
  75 #endif
  76 
  77 // Filesystem types, see statfs(2)
  78 #ifndef TMPFS_MAGIC
  79 #define TMPFS_MAGIC                      0x01021994
  80 #endif
  81 #ifndef HUGETLBFS_MAGIC
  82 #define HUGETLBFS_MAGIC                  0x958458f6
  83 #endif
  84 
  85 // Filesystem names
  86 #define ZFILESYSTEM_TMPFS                "tmpfs"
  87 #define ZFILESYSTEM_HUGETLBFS            "hugetlbfs"
  88 
// Proc file entry for max map count
  90 #define ZFILENAME_PROC_MAX_MAP_COUNT     "/proc/sys/vm/max_map_count"
  91 
  92 // Sysfs file for transparent huge page on tmpfs
  93 #define ZFILENAME_SHMEM_ENABLED          "/sys/kernel/mm/transparent_hugepage/shmem_enabled"
  94 
  95 // Java heap filename
  96 #define ZFILENAME_HEAP                   "java_heap"
  97 
// Preferred tmpfs mount points, ordered by priority
static const char* z_preferred_tmpfs_mountpoints[] = {
  "/dev/shm",
  "/run/shm",
  NULL  // Terminator
};

// Preferred hugetlbfs mount points, ordered by priority
static const char* z_preferred_hugetlbfs_mountpoints[] = {
  "/dev/hugepages",
  "/hugepages",
  NULL  // Terminator
};

// Number of remaining commit retries when hugetlbfs runs out of huge pages
// during initialization (see commit_inner())
static int z_fallocate_hugetlbfs_attempts = 3;

// Set to false the first time fallocate(2) reports ENOSYS/EOPNOTSUPP, after
// which the compat (mmap/pwrite) emulation is used instead
static bool z_fallocate_supported = true;
 114 
// Open the heap backing file and validate that the filesystem it lives on is
// compatible with the selected large page mode and with ZGranuleSize. On any
// failure an error is logged and _initialized remains false.
ZPhysicalMemoryBacking::ZPhysicalMemoryBacking() :
    _fd(-1),
    _size(0),
    _filesystem(0),
    _block_size(0),
    _available(0),
    _initialized(false) {

  // Create backing file
  _fd = create_fd(ZFILENAME_HEAP);
  if (_fd == -1) {
    return;
  }

  // Get filesystem statistics
  struct statfs buf;
  if (fstatfs(_fd, &buf) == -1) {
    ZErrno err;
    log_error(gc)("Failed to determine filesystem type for backing file (%s)", err.to_string());
    return;
  }

  _filesystem = buf.f_type;
  _block_size = buf.f_bsize;
  // Available space in bytes; zero if the mount has no size limit
  _available = buf.f_bavail * _block_size;

  log_info(gc, init)("Heap Backing Filesystem: %s (0x" UINT64_FORMAT_X ")",
                     is_tmpfs() ? ZFILESYSTEM_TMPFS : is_hugetlbfs() ? ZFILESYSTEM_HUGETLBFS : "other", _filesystem);

  // Make sure the filesystem type matches requested large page type
  if (ZLargePages::is_transparent() && !is_tmpfs()) {
    log_error(gc)("-XX:+UseTransparentHugePages can only be enabled when using a %s filesystem",
                  ZFILESYSTEM_TMPFS);
    return;
  }

  if (ZLargePages::is_transparent() && !tmpfs_supports_transparent_huge_pages()) {
    log_error(gc)("-XX:+UseTransparentHugePages on a %s filesystem not supported by kernel",
                  ZFILESYSTEM_TMPFS);
    return;
  }

  if (ZLargePages::is_explicit() && !is_hugetlbfs()) {
    log_error(gc)("-XX:+UseLargePages (without -XX:+UseTransparentHugePages) can only be enabled "
                  "when using a %s filesystem", ZFILESYSTEM_HUGETLBFS);
    return;
  }

  if (!ZLargePages::is_explicit() && is_hugetlbfs()) {
    log_error(gc)("-XX:+UseLargePages must be enabled when using a %s filesystem",
                  ZFILESYSTEM_HUGETLBFS);
    return;
  }

  // With explicit large pages, each ZGC granule must be exactly one huge page
  if (ZLargePages::is_explicit() && os::large_page_size() != ZGranuleSize) {
    log_error(gc)("Incompatible large page size configured " SIZE_FORMAT " (expected " SIZE_FORMAT ")",
                  os::large_page_size(), ZGranuleSize);
    return;
  }

  // Make sure the filesystem block size is compatible
  if (ZGranuleSize % _block_size != 0) {
    log_error(gc)("Filesystem backing the heap has incompatible block size (" SIZE_FORMAT ")",
                  _block_size);
    return;
  }

  if (is_hugetlbfs() && _block_size != ZGranuleSize) {
    log_error(gc)("%s filesystem has unexpected block size " SIZE_FORMAT " (expected " SIZE_FORMAT ")",
                  ZFILESYSTEM_HUGETLBFS, _block_size, ZGranuleSize);
    return;
  }

  // Successfully initialized
  _initialized = true;
}
 191 
 192 int ZPhysicalMemoryBacking::create_mem_fd(const char* name) const {
 193   // Create file name
 194   char filename[PATH_MAX];
 195   snprintf(filename, sizeof(filename), "%s%s", name, ZLargePages::is_explicit() ? ".hugetlb" : "");
 196 
 197   // Create file
 198   const int extra_flags = ZLargePages::is_explicit() ? MFD_HUGETLB : 0;
 199   const int fd = ZSyscall::memfd_create(filename, MFD_CLOEXEC | extra_flags);
 200   if (fd == -1) {
 201     ZErrno err;
 202     log_debug(gc, init)("Failed to create memfd file (%s)",
 203                         ((ZLargePages::is_explicit() && err == EINVAL) ? "Hugepages not supported" : err.to_string()));
 204     return -1;
 205   }
 206 
 207   log_info(gc, init)("Heap Backing File: /memfd:%s", filename);
 208 
 209   return fd;
 210 }
 211 
 212 int ZPhysicalMemoryBacking::create_file_fd(const char* name) const {
 213   const char* const filesystem = ZLargePages::is_explicit()
 214                                  ? ZFILESYSTEM_HUGETLBFS
 215                                  : ZFILESYSTEM_TMPFS;
 216   const char** const preferred_mountpoints = ZLargePages::is_explicit()
 217                                              ? z_preferred_hugetlbfs_mountpoints
 218                                              : z_preferred_tmpfs_mountpoints;
 219 
 220   // Find mountpoint
 221   ZMountPoint mountpoint(filesystem, preferred_mountpoints);
 222   if (mountpoint.get() == NULL) {
 223     log_error(gc)("Use -XX:AllocateHeapAt to specify the path to a %s filesystem", filesystem);
 224     return -1;
 225   }
 226 
 227   // Try to create an anonymous file using the O_TMPFILE flag. Note that this
 228   // flag requires kernel >= 3.11. If this fails we fall back to open/unlink.
 229   const int fd_anon = os::open(mountpoint.get(), O_TMPFILE|O_EXCL|O_RDWR|O_CLOEXEC, S_IRUSR|S_IWUSR);
 230   if (fd_anon == -1) {
 231     ZErrno err;
 232     log_debug(gc, init)("Failed to create anonymous file in %s (%s)", mountpoint.get(),
 233                         (err == EINVAL ? "Not supported" : err.to_string()));
 234   } else {
 235     // Get inode number for anonymous file
 236     struct stat stat_buf;
 237     if (fstat(fd_anon, &stat_buf) == -1) {
 238       ZErrno err;
 239       log_error(gc)("Failed to determine inode number for anonymous file (%s)", err.to_string());
 240       return -1;
 241     }
 242 
 243     log_info(gc, init)("Heap Backing File: %s/#" UINT64_FORMAT, mountpoint.get(), (uint64_t)stat_buf.st_ino);
 244 
 245     return fd_anon;
 246   }
 247 
 248   log_debug(gc, init)("Falling back to open/unlink");
 249 
 250   // Create file name
 251   char filename[PATH_MAX];
 252   snprintf(filename, sizeof(filename), "%s/%s.%d", mountpoint.get(), name, os::current_process_id());
 253 
 254   // Create file
 255   const int fd = os::open(filename, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC, S_IRUSR|S_IWUSR);
 256   if (fd == -1) {
 257     ZErrno err;
 258     log_error(gc)("Failed to create file %s (%s)", filename, err.to_string());
 259     return -1;
 260   }
 261 
 262   // Unlink file
 263   if (unlink(filename) == -1) {
 264     ZErrno err;
 265     log_error(gc)("Failed to unlink file %s (%s)", filename, err.to_string());
 266     return -1;
 267   }
 268 
 269   log_info(gc, init)("Heap Backing File: %s", filename);
 270 
 271   return fd;
 272 }
 273 
 274 int ZPhysicalMemoryBacking::create_fd(const char* name) const {
 275   if (AllocateHeapAt == NULL) {
 276     // If the path is not explicitly specified, then we first try to create a memfd file
 277     // instead of looking for a tmpfd/hugetlbfs mount point. Note that memfd_create() might
 278     // not be supported at all (requires kernel >= 3.17), or it might not support large
 279     // pages (requires kernel >= 4.14). If memfd_create() fails, then we try to create a
 280     // file on an accessible tmpfs or hugetlbfs mount point.
 281     const int fd = create_mem_fd(name);
 282     if (fd != -1) {
 283       return fd;
 284     }
 285 
 286     log_debug(gc, init)("Falling back to searching for an accessible mount point");
 287   }
 288 
 289   return create_file_fd(name);
 290 }
 291 
// Returns true if the constructor successfully created and validated the
// backing file.
bool ZPhysicalMemoryBacking::is_initialized() const {
  return _initialized;
}
 295 
// Log a warning if the backing filesystem's currently available space is
// smaller than the max heap size. Informational only; does not fail.
void ZPhysicalMemoryBacking::warn_available_space(size_t max) const {
  // Note that the available space on a tmpfs or a hugetlbfs filesystem
  // will be zero if no size limit was specified when it was mounted.
  if (_available == 0) {
    // No size limit set, skip check
    log_info(gc, init)("Available space on backing filesystem: N/A");
    return;
  }

  log_info(gc, init)("Available space on backing filesystem: " SIZE_FORMAT "M", _available / M);

  // Warn if the filesystem doesn't currently have enough space available to hold
  // the max heap size. The max heap size will be capped if we later hit this limit
  // when trying to expand the heap.
  if (_available < max) {
    log_warning(gc)("***** WARNING! INCORRECT SYSTEM CONFIGURATION DETECTED! *****");
    log_warning(gc)("Not enough space available on the backing filesystem to hold the current max Java heap");
    log_warning(gc)("size (" SIZE_FORMAT "M). Please adjust the size of the backing filesystem accordingly "
                    "(available", max / M);
    log_warning(gc)("space is currently " SIZE_FORMAT "M). Continuing execution with the current filesystem "
                    "size could", _available / M);
    log_warning(gc)("lead to a premature OutOfMemoryError being thrown, due to failure to map memory.");
  }
}
 320 
// Log a warning if /proc/sys/vm/max_map_count looks too low for the max heap
// size. Informational only; silently skipped if the file can't be read.
void ZPhysicalMemoryBacking::warn_max_map_count(size_t max) const {
  const char* const filename = ZFILENAME_PROC_MAX_MAP_COUNT;
  FILE* const file = fopen(filename, "r");
  if (file == NULL) {
    // Failed to open file, skip check
    log_debug(gc, init)("Failed to open %s", filename);
    return;
  }

  size_t actual_max_map_count = 0;
  const int result = fscanf(file, SIZE_FORMAT, &actual_max_map_count);
  fclose(file);
  if (result != 1) {
    // Failed to read file, skip check
    log_debug(gc, init)("Failed to read %s", filename);
    return;
  }

  // The required max map count is impossible to calculate exactly since subsystems
  // other than ZGC are also creating memory mappings, and we have no control over that.
  // However, ZGC tends to create the most mappings and dominate the total count.
  // In the worst cases, ZGC will map each granule three times, i.e. once per heap view.
  // We speculate that we need another 20% to allow for non-ZGC subsystems to map memory.
  const size_t required_max_map_count = (max / ZGranuleSize) * 3 * 1.2;
  if (actual_max_map_count < required_max_map_count) {
    log_warning(gc)("***** WARNING! INCORRECT SYSTEM CONFIGURATION DETECTED! *****");
    log_warning(gc)("The system limit on number of memory mappings per process might be too low for the given");
    log_warning(gc)("max Java heap size (" SIZE_FORMAT "M). Please adjust %s to allow for at",
                    max / M, filename);
    log_warning(gc)("least " SIZE_FORMAT " mappings (current limit is " SIZE_FORMAT "). Continuing execution "
                    "with the current", required_max_map_count, actual_max_map_count);
    log_warning(gc)("limit could lead to a fatal error, due to failure to map memory.");
  }
}
 355 
// Emit warnings about system limits (filesystem space, max map count) that
// could prevent committing the max heap size.
void ZPhysicalMemoryBacking::warn_commit_limits(size_t max) const {
  // Warn if available space is too low
  warn_available_space(max);

  // Warn if max map count is too low
  warn_max_map_count(max);
}
 363 
// Returns the current size of the backing file, in bytes.
size_t ZPhysicalMemoryBacking::size() const {
  return _size;
}
 367 
// Returns true if the backing file lives on a tmpfs filesystem.
bool ZPhysicalMemoryBacking::is_tmpfs() const {
  return _filesystem == TMPFS_MAGIC;
}
 371 
// Returns true if the backing file lives on a hugetlbfs filesystem.
bool ZPhysicalMemoryBacking::is_hugetlbfs() const {
  return _filesystem == HUGETLBFS_MAGIC;
}
 375 
// Returns true if the kernel supports transparent huge pages for tmpfs.
bool ZPhysicalMemoryBacking::tmpfs_supports_transparent_huge_pages() const {
  // If the shmem_enabled file exists and is readable then we
  // know the kernel supports transparent huge pages for tmpfs.
  return access(ZFILENAME_SHMEM_ENABLED, R_OK) == 0;
}
 381 
 382 ZErrno ZPhysicalMemoryBacking::fallocate_compat_ftruncate(size_t size) const {
 383   while (ftruncate(_fd, size) == -1) {
 384     if (errno != EINTR) {
 385       // Failed
 386       return errno;
 387     }
 388   }
 389 
 390   // Success
 391   return 0;
 392 }
 393 
// Emulate fallocate(2) on hugetlbfs by mapping the file segment (and
// optionally touching it) and then unmapping it again. Returns 0 on success,
// or the errno on failure.
ZErrno ZPhysicalMemoryBacking::fallocate_compat_mmap_hugetlbfs(size_t offset, size_t length, bool touch) const {
  // On hugetlbfs, mapping a file segment will fail immediately, without
  // the need to touch the mapped pages first, if there aren't enough huge
  // pages available to back the mapping.
  void* const addr = mmap(0, length, PROT_READ|PROT_WRITE, MAP_SHARED, _fd, offset);
  if (addr == MAP_FAILED) {
    // Failed
    return errno;
  }

  // Once mapped, the huge pages are only reserved. We need to touch them
  // to associate them with the file segment. Note that we can not punch
  // hole in file segments which only have reserved pages.
  if (touch) {
    char* const start = (char*)addr;
    char* const end = start + length;
    os::pretouch_memory(start, end, _block_size);
  }

  // Unmap again. From now on, the huge pages that were mapped are allocated
  // to this file. There's no risk of getting a SIGBUS when mapping and
  // touching them again.
  if (munmap(addr, length) == -1) {
    // Failed
    return errno;
  }

  // Success
  return 0;
}
 424 
 425 static bool safe_touch_mapping(void* addr, size_t length, size_t page_size) {
 426   char* const start = (char*)addr;
 427   char* const end = start + length;
 428 
 429   // Touching a mapping that can't be backed by memory will generate a
 430   // SIGBUS. By using SafeFetch32 any SIGBUS will be safely caught and
 431   // handled. On tmpfs, doing a fetch (rather than a store) is enough
 432   // to cause backing pages to be allocated (there's no zero-page to
 433   // worry about).
 434   for (char *p = start; p < end; p += page_size) {
 435     if (SafeFetch32((int*)p, -1) == -1) {
 436       // Failed
 437       return false;
 438     }
 439   }
 440 
 441   // Success
 442   return true;
 443 }
 444 
// Emulate fallocate(2) on tmpfs with transparent huge pages by mapping the
// file segment, touching it to force page allocation, and unmapping it again.
// Returns 0 on success, ENOMEM if the pages could not be backed, or another
// errno on mmap/munmap failure.
ZErrno ZPhysicalMemoryBacking::fallocate_compat_mmap_tmpfs(size_t offset, size_t length) const {
  // On tmpfs, we need to touch the mapped pages to figure out
  // if there are enough pages available to back the mapping.
  void* const addr = mmap(0, length, PROT_READ|PROT_WRITE, MAP_SHARED, _fd, offset);
  if (addr == MAP_FAILED) {
    // Failed
    return errno;
  }

  // Advise mapping to use transparent huge pages
  os::realign_memory((char*)addr, length, os::large_page_size());

  // Touch the mapping (safely) to make sure it's backed by memory
  const bool backed = safe_touch_mapping(addr, length, _block_size);

  // Unmap again. If we successfully touched, the backing memory will
  // be allocated to this file. There's no risk of getting a SIGBUS
  // when mapping and touching them again.
  if (munmap(addr, length) == -1) {
    // Failed
    return errno;
  }

  // Success
  return backed ? 0 : ENOMEM;
}
 471 
 472 ZErrno ZPhysicalMemoryBacking::fallocate_compat_pwrite(size_t offset, size_t length) const {
 473   uint8_t data = 0;
 474 
 475   // Allocate backing memory by writing to each block
 476   for (size_t pos = offset; pos < offset + length; pos += _block_size) {
 477     if (pwrite(_fd, &data, sizeof(data), pos) == -1) {
 478       // Failed
 479       return errno;
 480     }
 481   }
 482 
 483   // Success
 484   return 0;
 485 }
 486 
// Allocate backing memory for [offset, offset + length) without using
// fallocate(2), growing the file first if needed. On failure the file size is
// restored. Returns 0 on success, or the errno on failure.
ZErrno ZPhysicalMemoryBacking::fallocate_fill_hole_compat(size_t offset, size_t length) {
  // fallocate(2) is only supported by tmpfs since Linux 3.5, and by hugetlbfs
  // since Linux 4.3. When fallocate(2) is not supported we emulate it using
  // mmap/munmap (for hugetlbfs and tmpfs with transparent huge pages) or pwrite
  // (for tmpfs without transparent huge pages and other filesystem types).

  const size_t end = offset + length;
  if (end > _size) {
    // Increase file size
    const ZErrno err = fallocate_compat_ftruncate(end);
    if (err) {
      // Failed
      return err;
    }
  }

  // Allocate backing memory, using the emulation strategy matching the
  // configured large page mode
  const ZErrno err = ZLargePages::is_explicit()
                     ? fallocate_compat_mmap_hugetlbfs(offset, length, false /* touch */)
                     : (ZLargePages::is_transparent()
                        ? fallocate_compat_mmap_tmpfs(offset, length)
                        : fallocate_compat_pwrite(offset, length));

  if (err) {
    if (end > _size) {
      // Restore file size
      fallocate_compat_ftruncate(_size);
    }

    // Failed
    return err;
  }

  if (end > _size) {
    // Record new file size
    _size = end;
  }

  // Success
  return 0;
}
 528 
 529 ZErrno ZPhysicalMemoryBacking::fallocate_fill_hole_syscall(size_t offset, size_t length) {
 530   const int mode = 0; // Allocate
 531   const int res = ZSyscall::fallocate(_fd, mode, offset, length);
 532   if (res == -1) {
 533     // Failed
 534     return errno;
 535   }
 536 
 537   const size_t end = offset + length;
 538   if (end > _size) {
 539     // Record new file size
 540     _size = end;
 541   }
 542 
 543   // Success
 544   return 0;
 545 }
 546 
 547 ZErrno ZPhysicalMemoryBacking::fallocate_fill_hole(size_t offset, size_t length) {
 548   // Using compat mode is more efficient when allocating space on hugetlbfs.
 549   // Note that allocating huge pages this way will only reserve them, and not
 550   // associate them with segments of the file. We must guarantee that we at
 551   // some point touch these segments, otherwise we can not punch hole in them.
 552   if (z_fallocate_supported && !ZLargePages::is_enabled()) {
 553      const ZErrno err = fallocate_fill_hole_syscall(offset, length);
 554      if (!err) {
 555        // Success
 556        return 0;
 557      }
 558 
 559      if (err != ENOSYS && err != EOPNOTSUPP) {
 560        // Failed
 561        return err;
 562      }
 563 
 564      // Not supported
 565      log_debug(gc)("Falling back to fallocate() compatibility mode");
 566      z_fallocate_supported = false;
 567   }
 568 
 569   return fallocate_fill_hole_compat(offset, length);
 570 }
 571 
// Deallocate backing memory for [offset, offset + length) by punching a hole
// in the file with fallocate(2). Returns 0 on success, or the errno on
// failure.
ZErrno ZPhysicalMemoryBacking::fallocate_punch_hole(size_t offset, size_t length) {
  if (ZLargePages::is_explicit()) {
    // We can only punch hole in pages that have been touched. Non-touched
    // pages are only reserved, and not associated with any specific file
    // segment. We don't know which pages have been previously touched, so
    // we always touch them here to guarantee that we can punch hole.
    const ZErrno err = fallocate_compat_mmap_hugetlbfs(offset, length, true /* touch */);
    if (err) {
      // Failed
      return err;
    }
  }

  // Punch the hole without changing the file size
  const int mode = FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE;
  if (ZSyscall::fallocate(_fd, mode, offset, length) == -1) {
    // Failed
    return errno;
  }

  // Success
  return 0;
}
 594 
 595 ZErrno ZPhysicalMemoryBacking::split_and_fallocate(bool punch_hole, size_t offset, size_t length) {
 596   // Try first half
 597   const size_t offset0 = offset;
 598   const size_t length0 = align_up(length / 2, _block_size);
 599   const ZErrno err0 = fallocate(punch_hole, offset0, length0);
 600   if (err0) {
 601     return err0;
 602   }
 603 
 604   // Try second half
 605   const size_t offset1 = offset0 + length0;
 606   const size_t length1 = length - length0;
 607   const ZErrno err1 = fallocate(punch_hole, offset1, length1);
 608   if (err1) {
 609     return err1;
 610   }
 611 
 612   // Success
 613   return 0;
 614 }
 615 
// Allocate (fill hole) or deallocate (punch hole) backing memory for
// [offset, offset + length). Both bounds must be block-size aligned. Returns
// 0 on success, or the errno on failure.
ZErrno ZPhysicalMemoryBacking::fallocate(bool punch_hole, size_t offset, size_t length) {
  assert(is_aligned(offset, _block_size), "Invalid offset");
  assert(is_aligned(length, _block_size), "Invalid length");

  const ZErrno err = punch_hole ? fallocate_punch_hole(offset, length) : fallocate_fill_hole(offset, length);
  if (err == EINTR && length > _block_size) {
    // Calling fallocate(2) with a large length can take a long time to
    // complete. When running profilers, such as VTune, this syscall will
    // be constantly interrupted by signals. Expanding the file in smaller
    // steps avoids this problem.
    return split_and_fallocate(punch_hole, offset, length);
  }

  return err;
}
 631 
 632 bool ZPhysicalMemoryBacking::commit_inner(size_t offset, size_t length) {
 633   log_trace(gc, heap)("Committing memory: " SIZE_FORMAT "M-" SIZE_FORMAT "M (" SIZE_FORMAT "M)",
 634                       offset / M, (offset + length) / M, length / M);
 635 
 636 retry:
 637   const ZErrno err = fallocate(false /* punch_hole */, offset, length);
 638   if (err) {
 639     if (err == ENOSPC && !is_init_completed() && ZLargePages::is_explicit() && z_fallocate_hugetlbfs_attempts-- > 0) {
 640       // If we fail to allocate during initialization, due to lack of space on
 641       // the hugetlbfs filesystem, then we wait and retry a few times before
 642       // giving up. Otherwise there is a risk that running JVMs back-to-back
 643       // will fail, since there is a delay between process termination and the
 644       // huge pages owned by that process being returned to the huge page pool
 645       // and made available for new allocations.
 646       log_debug(gc, init)("Failed to commit memory (%s), retrying", err.to_string());
 647 
 648       // Wait and retry in one second, in the hope that huge pages will be
 649       // available by then.
 650       sleep(1);
 651       goto retry;
 652     }
 653 
 654     // Failed
 655     log_error(gc)("Failed to commit memory (%s)", err.to_string());
 656     return false;
 657   }
 658 
 659   // Success
 660   return true;
 661 }
 662 
// Map a heap offset to a NUMA node id by striping granules round-robin over
// the node index mapping.
static int offset_to_node(size_t offset) {
  const GrowableArray<int>* mapping = os::Linux::numa_nindex_to_node();
  const size_t nindex = (offset >> ZGranuleSizeShift) % mapping->length();
  return mapping->at((int)nindex);
}
 668 
// Commit backing memory one granule at a time, steering each granule to its
// preferred NUMA node. Returns the number of bytes committed, which may be
// less than length on failure.
size_t ZPhysicalMemoryBacking::commit_numa_interleaved(size_t offset, size_t length) {
  size_t committed = 0;

  // Commit one granule at a time, so that each granule
  // can be allocated from a different preferred node.
  while (committed < length) {
    const size_t granule_offset = offset + committed;

    // Setup NUMA policy to allocate memory from a preferred node
    os::Linux::numa_set_preferred(offset_to_node(granule_offset));

    if (!commit_inner(granule_offset, ZGranuleSize)) {
      // Failed
      break;
    }

    committed += ZGranuleSize;
  }

  // Restore NUMA policy
  os::Linux::numa_set_preferred(-1);

  return committed;
}
 693 
// Commit backing memory for [offset, offset + length). If the full range
// can't be committed, halve the attempted size repeatedly to commit as much
// as possible from the start of the range. Returns the number of bytes
// committed.
size_t ZPhysicalMemoryBacking::commit_default(size_t offset, size_t length) {
  // Try to commit the whole region
  if (commit_inner(offset, length)) {
    // Success
    return length;
  }

  // Failed, try to commit as much as possible
  size_t start = offset;
  size_t end = offset + length;

  for (;;) {
    // Halve the attempted size, keeping it granule aligned
    length = align_down((end - start) / 2, ZGranuleSize);
    if (length < ZGranuleSize) {
      // Done, don't commit more
      return start - offset;
    }

    if (commit_inner(start, length)) {
      // Success, try commit more
      start += length;
    } else {
      // Failed, try commit less
      end -= length;
    }
  }
}
 721 
 722 size_t ZPhysicalMemoryBacking::commit(size_t offset, size_t length) {
 723   if (ZNUMA::is_enabled() && !ZLargePages::is_explicit()) {
 724     // To get granule-level NUMA interleaving when using non-large pages,
 725     // we must explicitly interleave the memory at commit/fallocate time.
 726     return commit_numa_interleaved(offset, length);
 727   }
 728 
 729   return commit_default(offset, length);
 730 }
 731 
// Uncommit backing memory for [offset, offset + length) by punching a hole
// in the file. Returns the number of bytes uncommitted (0 on failure).
size_t ZPhysicalMemoryBacking::uncommit(size_t offset, size_t length) {
  log_trace(gc, heap)("Uncommitting memory: " SIZE_FORMAT "M-" SIZE_FORMAT "M (" SIZE_FORMAT "M)",
                      offset / M, (offset + length) / M, length / M);

  const ZErrno err = fallocate(true /* punch_hole */, offset, length);
  if (err) {
    log_error(gc)("Failed to uncommit memory (%s)", err.to_string());
    return 0;
  }

  return length;
}
 744 
// Map the backing file segment [offset, offset + size) at the fixed virtual
// address addr. Fatal on failure.
void ZPhysicalMemoryBacking::map(uintptr_t addr, size_t size, uintptr_t offset) const {
  const void* const res = mmap((void*)addr, size, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED, _fd, offset);
  if (res == MAP_FAILED) {
    ZErrno err;
    fatal("Failed to map memory (%s)", err.to_string());
  }
}
 752 
// Detach the backing memory at [addr, addr + size) while keeping the address
// space reservation intact. Fatal on failure.
void ZPhysicalMemoryBacking::unmap(uintptr_t addr, size_t size) const {
  // Note that we must keep the address space reservation intact and just detach
  // the backing memory. For this reason we map a new anonymous, non-accessible
  // and non-reserved page over the mapping instead of actually unmapping.
  const void* const res = mmap((void*)addr, size, PROT_NONE, MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE, -1, 0);
  if (res == MAP_FAILED) {
    ZErrno err;
    fatal("Failed to map memory (%s)", err.to_string());
  }
}