1 /*
   2  * Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 #include "precompiled.hpp"
  25 #include "gc/z/zArray.inline.hpp"
  26 #include "gc/z/zBackingFile_linux_x86.hpp"
  27 #include "gc/z/zBackingPath_linux_x86.hpp"
  28 #include "gc/z/zErrno.hpp"
  29 #include "gc/z/zGlobals.hpp"
  30 #include "gc/z/zLargePages.inline.hpp"
  31 #include "logging/log.hpp"
  32 #include "runtime/init.hpp"
  33 #include "runtime/os.hpp"
  34 #include "utilities/align.hpp"
  35 #include "utilities/debug.hpp"
  36 
  37 #include <fcntl.h>
  38 #include <sys/mman.h>
  39 #include <sys/stat.h>
  40 #include <sys/statfs.h>
  41 #include <sys/syscall.h>
  42 #include <sys/types.h>
  43 #include <unistd.h>
  44 
  45 //
  46 // Support for building on older Linux systems
  47 //
  48 
  49 // System calls
  50 #ifndef SYS_fallocate
  51 #define SYS_fallocate                    285
  52 #endif
  53 #ifndef SYS_memfd_create
  54 #define SYS_memfd_create                 319
  55 #endif
  56 
  57 // memfd_create(2) flags
  58 #ifndef MFD_CLOEXEC
  59 #define MFD_CLOEXEC                      0x0001U
  60 #endif
  61 #ifndef MFD_HUGETLB
  62 #define MFD_HUGETLB                      0x0004U
  63 #endif
  64 
  65 // open(2) flags
  66 #ifndef O_CLOEXEC
  67 #define O_CLOEXEC                        02000000
  68 #endif
  69 #ifndef O_TMPFILE
  70 #define O_TMPFILE                        (020000000 | O_DIRECTORY)
  71 #endif
  72 
  73 // fallocate(2) flags
  74 #ifndef FALLOC_FL_KEEP_SIZE
  75 #define FALLOC_FL_KEEP_SIZE              0x01
  76 #endif
  77 #ifndef FALLOC_FL_PUNCH_HOLE
  78 #define FALLOC_FL_PUNCH_HOLE             0x02
  79 #endif
  80 
  81 // Filesystem types, see statfs(2)
  82 #ifndef TMPFS_MAGIC
  83 #define TMPFS_MAGIC                      0x01021994
  84 #endif
  85 #ifndef HUGETLBFS_MAGIC
  86 #define HUGETLBFS_MAGIC                  0x958458f6
  87 #endif
  88 
  89 // Filesystem names
  90 #define ZFILESYSTEM_TMPFS                "tmpfs"
  91 #define ZFILESYSTEM_HUGETLBFS            "hugetlbfs"
  92 
// Sysfs file controlling transparent huge pages on tmpfs
  94 #define ZFILENAME_SHMEM_ENABLED          "/sys/kernel/mm/transparent_hugepage/shmem_enabled"
  95 
  96 // Java heap filename
  97 #define ZFILENAME_HEAP                   "java_heap"
  98 
  99 // Preferred tmpfs mount points, ordered by priority
 100 static const char* z_preferred_tmpfs_mountpoints[] = {
 101   "/dev/shm",
 102   "/run/shm",
 103   NULL
 104 };
 105 
 106 // Preferred hugetlbfs mount points, ordered by priority
 107 static const char* z_preferred_hugetlbfs_mountpoints[] = {
 108   "/dev/hugepages",
 109   "/hugepages",
 110   NULL
 111 };
 112 
 113 static int z_fallocate_hugetlbfs_attempts = 3;
 114 static bool z_fallocate_supported = true;
 115 
 116 static int z_fallocate(int fd, int mode, size_t offset, size_t length) {
 117   return syscall(SYS_fallocate, fd, mode, offset, length);
 118 }
 119 
 120 static int z_memfd_create(const char *name, unsigned int flags) {
 121   return syscall(SYS_memfd_create, name, flags);
 122 }
 123 
 124 ZBackingFile::ZBackingFile() :
 125     _fd(-1),
 126     _size(0),
 127     _filesystem(0),
 128     _block_size(0),
 129     _available(0),
 130     _initialized(false) {
 131 
 132   // Create backing file
 133   _fd = create_fd(ZFILENAME_HEAP);
 134   if (_fd == -1) {
 135     return;
 136   }
 137 
 138   // Get filesystem statistics
 139   struct statfs buf;
 140   if (fstatfs(_fd, &buf) == -1) {
 141     ZErrno err;
 142     log_error(gc)("Failed to determine filesystem type for backing file (%s)", err.to_string());
 143     return;
 144   }
 145 
 146   _filesystem = buf.f_type;
 147   _block_size = buf.f_bsize;
 148   _available = buf.f_bavail * _block_size;
 149 
 150   // Make sure we're on a supported filesystem
 151   if (!is_tmpfs() && !is_hugetlbfs()) {
 152     log_error(gc)("Backing file must be located on a %s or a %s filesystem",
 153                   ZFILESYSTEM_TMPFS, ZFILESYSTEM_HUGETLBFS);
 154     return;
 155   }
 156 
 157   // Make sure the filesystem type matches requested large page type
 158   if (ZLargePages::is_transparent() && !is_tmpfs()) {
 159     log_error(gc)("-XX:+UseTransparentHugePages can only be enable when using a %s filesystem",
 160                   ZFILESYSTEM_TMPFS);
 161     return;
 162   }
 163 
 164   if (ZLargePages::is_transparent() && !tmpfs_supports_transparent_huge_pages()) {
 165     log_error(gc)("-XX:+UseTransparentHugePages on a %s filesystem not supported by kernel",
 166                   ZFILESYSTEM_TMPFS);
 167     return;
 168   }
 169 
 170   if (ZLargePages::is_explicit() && !is_hugetlbfs()) {
 171     log_error(gc)("-XX:+UseLargePages (without -XX:+UseTransparentHugePages) can only be enabled "
 172                   "when using a %s filesystem", ZFILESYSTEM_HUGETLBFS);
 173     return;
 174   }
 175 
 176   if (!ZLargePages::is_explicit() && is_hugetlbfs()) {
 177     log_error(gc)("-XX:+UseLargePages must be enabled when using a %s filesystem",
 178                   ZFILESYSTEM_HUGETLBFS);
 179     return;
 180   }
 181 
 182   const size_t expected_block_size = is_tmpfs() ? os::vm_page_size() : os::large_page_size();
 183   if (expected_block_size != _block_size) {
 184     log_error(gc)("%s filesystem has unexpected block size " SIZE_FORMAT " (expected " SIZE_FORMAT ")",
 185                   is_tmpfs() ? ZFILESYSTEM_TMPFS : ZFILESYSTEM_HUGETLBFS, _block_size, expected_block_size);
 186     return;
 187   }
 188 
 189   // Successfully initialized
 190   _initialized = true;
 191 }
 192 
 193 int ZBackingFile::create_mem_fd(const char* name) const {
 194   // Create file name
 195   char filename[PATH_MAX];
 196   snprintf(filename, sizeof(filename), "%s%s", name, ZLargePages::is_explicit() ? ".hugetlb" : "");
 197 
 198   // Create file
 199   const int extra_flags = ZLargePages::is_explicit() ? MFD_HUGETLB : 0;
 200   const int fd = z_memfd_create(filename, MFD_CLOEXEC | extra_flags);
 201   if (fd == -1) {
 202     ZErrno err;
 203     log_debug(gc, init)("Failed to create memfd file (%s)",
 204                         ((ZLargePages::is_explicit() && err == EINVAL) ? "Hugepages not supported" : err.to_string()));
 205     return -1;
 206   }
 207 
 208   log_info(gc, init)("Heap backed by file: /memfd:%s", filename);
 209 
 210   return fd;
 211 }
 212 
 213 int ZBackingFile::create_file_fd(const char* name) const {
 214   const char* const filesystem = ZLargePages::is_explicit()
 215                                  ? ZFILESYSTEM_HUGETLBFS
 216                                  : ZFILESYSTEM_TMPFS;
 217   const char** const preferred_mountpoints = ZLargePages::is_explicit()
 218                                              ? z_preferred_hugetlbfs_mountpoints
 219                                              : z_preferred_tmpfs_mountpoints;
 220 
 221   // Find mountpoint
 222   ZBackingPath path(filesystem, preferred_mountpoints);
 223   if (path.get() == NULL) {
 224     log_error(gc)("Use -XX:ZPath to specify the path to a %s filesystem", filesystem);
 225     return -1;
 226   }
 227 
 228   // Try to create an anonymous file using the O_TMPFILE flag. Note that this
 229   // flag requires kernel >= 3.11. If this fails we fall back to open/unlink.
 230   const int fd_anon = os::open(path.get(), O_TMPFILE|O_EXCL|O_RDWR|O_CLOEXEC, S_IRUSR|S_IWUSR);
 231   if (fd_anon == -1) {
 232     ZErrno err;
 233     log_debug(gc, init)("Failed to create anonymous file in %s (%s)", path.get(),
 234                         (err == EINVAL ? "Not supported" : err.to_string()));
 235   } else {
 236     // Get inode number for anonymous file
 237     struct stat stat_buf;
 238     if (fstat(fd_anon, &stat_buf) == -1) {
 239       ZErrno err;
 240       log_error(gc)("Failed to determine inode number for anonymous file (%s)", err.to_string());
 241       return -1;
 242     }
 243 
 244     log_info(gc, init)("Heap backed by file: %s/#" UINT64_FORMAT, path.get(), (uint64_t)stat_buf.st_ino);
 245 
 246     return fd_anon;
 247   }
 248 
 249   log_debug(gc, init)("Falling back to open/unlink");
 250 
 251   // Create file name
 252   char filename[PATH_MAX];
 253   snprintf(filename, sizeof(filename), "%s/%s.%d", path.get(), name, os::current_process_id());
 254 
 255   // Create file
 256   const int fd = os::open(filename, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC, S_IRUSR|S_IWUSR);
 257   if (fd == -1) {
 258     ZErrno err;
 259     log_error(gc)("Failed to create file %s (%s)", filename, err.to_string());
 260     return -1;
 261   }
 262 
 263   // Unlink file
 264   if (unlink(filename) == -1) {
 265     ZErrno err;
 266     log_error(gc)("Failed to unlink file %s (%s)", filename, err.to_string());
 267     return -1;
 268   }
 269 
 270   log_info(gc, init)("Heap backed by file: %s", filename);
 271 
 272   return fd;
 273 }
 274 
 275 int ZBackingFile::create_fd(const char* name) const {
 276   if (ZPath == NULL) {
 277     // If the path is not explicitly specified, then we first try to create a memfd file
 278     // instead of looking for a tmpfd/hugetlbfs mount point. Note that memfd_create() might
 279     // not be supported at all (requires kernel >= 3.17), or it might not support large
 280     // pages (requires kernel >= 4.14). If memfd_create() fails, then we try to create a
 281     // file on an accessible tmpfs or hugetlbfs mount point.
 282     const int fd = create_mem_fd(name);
 283     if (fd != -1) {
 284       return fd;
 285     }
 286 
 287     log_debug(gc, init)("Falling back to searching for an accessible mount point");
 288   }
 289 
 290   return create_file_fd(name);
 291 }
 292 
 293 bool ZBackingFile::is_initialized() const {
 294   return _initialized;
 295 }
 296 
 297 int ZBackingFile::fd() const {
 298   return _fd;
 299 }
 300 
 301 size_t ZBackingFile::size() const {
 302   return _size;
 303 }
 304 
 305 size_t ZBackingFile::available() const {
 306   return _available;
 307 }
 308 
 309 bool ZBackingFile::is_tmpfs() const {
 310   return _filesystem == TMPFS_MAGIC;
 311 }
 312 
 313 bool ZBackingFile::is_hugetlbfs() const {
 314   return _filesystem == HUGETLBFS_MAGIC;
 315 }
 316 
 317 bool ZBackingFile::tmpfs_supports_transparent_huge_pages() const {
 318   // If the shmem_enabled file exists and is readable then we
 319   // know the kernel supports transparent huge pages for tmpfs.
 320   return access(ZFILENAME_SHMEM_ENABLED, R_OK) == 0;
 321 }
 322 
 323 ZErrno ZBackingFile::fallocate_compat_ftruncate(size_t size) const {
 324   while (ftruncate(_fd, size) == -1) {
 325     if (errno != EINTR) {
 326       // Failed
 327       return errno;
 328     }
 329   }
 330 
 331   // Success
 332   return 0;
 333 }
 334 
 335 ZErrno ZBackingFile::fallocate_compat_mmap(size_t offset, size_t length, bool touch) const {
 336   // On hugetlbfs, mapping a file segment will fail immediately, without
 337   // the need to touch the mapped pages first, if there aren't enough huge
 338   // pages available to back the mapping.
 339   void* const addr = mmap(0, length, PROT_READ|PROT_WRITE, MAP_SHARED, _fd, offset);
 340   if (addr == MAP_FAILED) {
 341     // Failed
 342     return errno;
 343   }
 344 
 345   // Once mapped, the huge pages are only reserved. We need to touch them
 346   // to associate them with the file segment. Note that we can not punch
 347   // hole in file segments which only have reserved pages.
 348   if (touch) {
 349     char* const start = (char*)addr;
 350     char* const end = start + length;
 351     os::pretouch_memory(start, end, _block_size);
 352   }
 353 
 354   // Unmap again. From now on, the huge pages that were mapped are allocated
 355   // to this file. There's no risk in getting SIGBUS when touching them.
 356   if (munmap(addr, length) == -1) {
 357     // Failed
 358     return errno;
 359   }
 360 
 361   // Success
 362   return 0;
 363 }
 364 
 365 ZErrno ZBackingFile::fallocate_compat_pwrite(size_t offset, size_t length) const {
 366   uint8_t data = 0;
 367 
 368   // Allocate backing memory by writing to each block
 369   for (size_t pos = offset; pos < offset + length; pos += _block_size) {
 370     if (pwrite(_fd, &data, sizeof(data), pos) == -1) {
 371       // Failed
 372       return errno;
 373     }
 374   }
 375 
 376   // Success
 377   return 0;
 378 }
 379 
 380 ZErrno ZBackingFile::fallocate_fill_hole_compat(size_t offset, size_t length) {
 381   // fallocate(2) is only supported by tmpfs since Linux 3.5, and by hugetlbfs
 382   // since Linux 4.3. When fallocate(2) is not supported we emulate it using
 383   // ftruncate/pwrite (for tmpfs) or ftruncate/mmap/munmap (for hugetlbfs).
 384 
 385   const size_t end = offset + length;
 386   if (end > _size) {
 387     // Increase file size
 388     const ZErrno err = fallocate_compat_ftruncate(end);
 389     if (err) {
 390       // Failed
 391       return err;
 392     }
 393   }
 394 
 395   // Allocate backing memory
 396   const ZErrno err = is_hugetlbfs() ? fallocate_compat_mmap(offset, length, false /* touch */)
 397                                     : fallocate_compat_pwrite(offset, length);
 398   if (err) {
 399     if (end > _size) {
 400       // Restore file size
 401       fallocate_compat_ftruncate(_size);
 402     }
 403 
 404     // Failed
 405     return err;
 406   }
 407 
 408   if (end > _size) {
 409     // Record new file size
 410     _size = end;
 411   }
 412 
 413   // Success
 414   return 0;
 415 }
 416 
 417 ZErrno ZBackingFile::fallocate_fill_hole_syscall(size_t offset, size_t length) {
 418   const int mode = 0; // Allocate
 419   const int res = z_fallocate(_fd, mode, offset, length);
 420   if (res == -1) {
 421     // Failed
 422     return errno;
 423   }
 424 
 425   const size_t end = offset + length;
 426   if (end > _size) {
 427     // Record new file size
 428     _size = end;
 429   }
 430 
 431   // Success
 432   return 0;
 433 }
 434 
 435 ZErrno ZBackingFile::fallocate_fill_hole(size_t offset, size_t length) {
 436   // Using compat mode is more efficient when allocating space on hugetlbfs.
 437   // Note that allocating huge pages this way will only reserve them, and not
 438   // associate them with segments of the file. We must guarantee that we at
 439   // some point touch these segments, otherwise we can not punch hole in them.
 440   if (z_fallocate_supported && !is_hugetlbfs()) {
 441      const ZErrno err = fallocate_fill_hole_syscall(offset, length);
 442      if (!err) {
 443        // Success
 444        return 0;
 445      }
 446 
 447      if (err != ENOSYS && err != EOPNOTSUPP) {
 448        // Failed
 449        return err;
 450      }
 451 
 452      // Not supported
 453      log_debug(gc)("Falling back to fallocate() compatibility mode");
 454      z_fallocate_supported = false;
 455   }
 456 
 457   return fallocate_fill_hole_compat(offset, length);
 458 }
 459 
 460 ZErrno ZBackingFile::fallocate_punch_hole(size_t offset, size_t length) {
 461   if (is_hugetlbfs()) {
 462     // We can only punch hole in pages that have been touched. Non-touched
 463     // pages are only reserved, and not associated with any specific file
 464     // segment. We don't know which pages have been previously touched, so
 465     // we always touch them here to guarantee that we can punch hole.
 466     const ZErrno err = fallocate_compat_mmap(offset, length, true /* touch */);
 467     if (err) {
 468       // Failed
 469       return err;
 470     }
 471   }
 472 
 473   const int mode = FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE;
 474   if (z_fallocate(_fd, mode, offset, length) == -1) {
 475     // Failed
 476     return errno;
 477   }
 478 
 479   // Success
 480   return 0;
 481 }
 482 
 483 ZErrno ZBackingFile::split_and_fallocate(bool punch_hole, size_t offset, size_t length) {
 484   // Try first half
 485   const size_t offset0 = offset;
 486   const size_t length0 = align_up(length / 2, _block_size);
 487   const ZErrno err0 = fallocate(punch_hole, offset0, length0);
 488   if (err0) {
 489     return err0;
 490   }
 491 
 492   // Try second half
 493   const size_t offset1 = offset0 + length0;
 494   const size_t length1 = length - length0;
 495   const ZErrno err1 = fallocate(punch_hole, offset1, length1);
 496   if (err1) {
 497     return err1;
 498   }
 499 
 500   // Success
 501   return 0;
 502 }
 503 
 504 ZErrno ZBackingFile::fallocate(bool punch_hole, size_t offset, size_t length) {
 505   assert(is_aligned(offset, _block_size), "Invalid offset");
 506   assert(is_aligned(length, _block_size), "Invalid length");
 507 
 508   const ZErrno err = punch_hole ? fallocate_punch_hole(offset, length) : fallocate_fill_hole(offset, length);
 509   if (err == EINTR && length > _block_size) {
 510     // Calling fallocate(2) with a large length can take a long time to
 511     // complete. When running profilers, such as VTune, this syscall will
 512     // be constantly interrupted by signals. Expanding the file in smaller
 513     // steps avoids this problem.
 514     return split_and_fallocate(punch_hole, offset, length);
 515   }
 516 
 517   return err;
 518 }
 519 
 520 bool ZBackingFile::commit_inner(size_t offset, size_t length) {
 521   log_trace(gc, heap)("Committing memory: " SIZE_FORMAT "M-" SIZE_FORMAT "M (" SIZE_FORMAT "M)",
 522                       offset / M, (offset + length) / M, length / M);
 523 
 524 retry:
 525   const ZErrno err = fallocate(false /* punch_hole */, offset, length);
 526   if (err) {
 527     if (err == ENOSPC && !is_init_completed() && is_hugetlbfs() && z_fallocate_hugetlbfs_attempts-- > 0) {
 528       // If we fail to allocate during initialization, due to lack of space on
 529       // the hugetlbfs filesystem, then we wait and retry a few times before
 530       // giving up. Otherwise there is a risk that running JVMs back-to-back
 531       // will fail, since there is a delay between process termination and the
 532       // huge pages owned by that process being returned to the huge page pool
 533       // and made available for new allocations.
 534       log_debug(gc, init)("Failed to commit memory (%s), retrying", err.to_string());
 535 
 536       // Wait and retry in one second, in the hope that huge pages will be
 537       // available by then.
 538       sleep(1);
 539       goto retry;
 540     }
 541 
 542     // Failed
 543     log_error(gc)("Failed to commit memory (%s)", err.to_string());
 544     return false;
 545   }
 546 
 547   // Success
 548   return true;
 549 }
 550 
 551 size_t ZBackingFile::commit(size_t offset, size_t length) {
 552   // Try to commit the whole region
 553   if (commit_inner(offset, length)) {
 554     // Success
 555     return length;
 556   }
 557 
 558   // Failed, try to commit as much as possible
 559   size_t start = offset;
 560   size_t end = offset + length;
 561 
 562   for (;;) {
 563     length = align_down((end - start) / 2, ZGranuleSize);
 564     if (length < ZGranuleSize) {
 565       // Done, don't commit more
 566       return start - offset;
 567     }
 568 
 569     if (commit_inner(start, length)) {
 570       // Success, try commit more
 571       start += length;
 572     } else {
 573       // Failed, try commit less
 574       end -= length;
 575     }
 576   }
 577 }
 578 
 579 size_t ZBackingFile::uncommit(size_t offset, size_t length) {
 580   log_trace(gc, heap)("Uncommitting memory: " SIZE_FORMAT "M-" SIZE_FORMAT "M (" SIZE_FORMAT "M)",
 581                       offset / M, (offset + length) / M, length / M);
 582 
 583   const ZErrno err = fallocate(true /* punch_hole */, offset, length);
 584   if (err) {
 585     log_error(gc)("Failed to uncommit memory (%s)", err.to_string());
 586     return 0;
 587   }
 588 
 589   return length;
 590 }