1 /*
   2  * Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 #include "precompiled.hpp"
  25 #include "gc/z/zArray.inline.hpp"
  26 #include "gc/z/zBackingFile_linux.hpp"
  27 #include "gc/z/zBackingPath_linux.hpp"
  28 #include "gc/z/zErrno.hpp"
  29 #include "gc/z/zGlobals.hpp"
  30 #include "gc/z/zLargePages.inline.hpp"
  31 #include "gc/z/zSyscall_linux.hpp"
  32 #include "logging/log.hpp"
  33 #include "runtime/init.hpp"
  34 #include "runtime/os.hpp"
  35 #include "utilities/align.hpp"
  36 #include "utilities/debug.hpp"
  37 
  38 #include <fcntl.h>
  39 #include <sys/mman.h>
  40 #include <sys/stat.h>
  41 #include <sys/statfs.h>
  42 #include <sys/types.h>
  43 #include <unistd.h>
  44 
  45 //
  46 // Support for building on older Linux systems
  47 //
  48 
  49 // memfd_create(2) flags
  50 #ifndef MFD_CLOEXEC
  51 #define MFD_CLOEXEC                      0x0001U
  52 #endif
  53 #ifndef MFD_HUGETLB
  54 #define MFD_HUGETLB                      0x0004U
  55 #endif
  56 
  57 // open(2) flags
  58 #ifndef O_CLOEXEC
  59 #define O_CLOEXEC                        02000000
  60 #endif
  61 #ifndef O_TMPFILE
  62 #define O_TMPFILE                        (020000000 | O_DIRECTORY)
  63 #endif
  64 
  65 // fallocate(2) flags
  66 #ifndef FALLOC_FL_KEEP_SIZE
  67 #define FALLOC_FL_KEEP_SIZE              0x01
  68 #endif
  69 #ifndef FALLOC_FL_PUNCH_HOLE
  70 #define FALLOC_FL_PUNCH_HOLE             0x02
  71 #endif
  72 
  73 // Filesystem types, see statfs(2)
  74 #ifndef TMPFS_MAGIC
  75 #define TMPFS_MAGIC                      0x01021994
  76 #endif
  77 #ifndef HUGETLBFS_MAGIC
  78 #define HUGETLBFS_MAGIC                  0x958458f6
  79 #endif
  80 
  81 // Filesystem names
  82 #define ZFILESYSTEM_TMPFS                "tmpfs"
  83 #define ZFILESYSTEM_HUGETLBFS            "hugetlbfs"
  84 
  85 // Sysfs file for transparent huge page on tmpfs
  86 #define ZFILENAME_SHMEM_ENABLED          "/sys/kernel/mm/transparent_hugepage/shmem_enabled"
  87 
  88 // Java heap filename
  89 #define ZFILENAME_HEAP                   "java_heap"
  90 
  91 // Preferred tmpfs mount points, ordered by priority
  92 static const char* z_preferred_tmpfs_mountpoints[] = {
  93   "/dev/shm",
  94   "/run/shm",
  95   NULL
  96 };
  97 
  98 // Preferred hugetlbfs mount points, ordered by priority
  99 static const char* z_preferred_hugetlbfs_mountpoints[] = {
 100   "/dev/hugepages",
 101   "/hugepages",
 102   NULL
 103 };
 104 
 105 static int z_fallocate_hugetlbfs_attempts = 3;
 106 static bool z_fallocate_supported = true;
 107 
 108 ZBackingFile::ZBackingFile() :
 109     _fd(-1),
 110     _size(0),
 111     _filesystem(0),
 112     _block_size(0),
 113     _available(0),
 114     _initialized(false) {
 115 
 116   // Create backing file
 117   _fd = create_fd(ZFILENAME_HEAP);
 118   if (_fd == -1) {
 119     return;
 120   }
 121 
 122   // Get filesystem statistics
 123   struct statfs buf;
 124   if (fstatfs(_fd, &buf) == -1) {
 125     ZErrno err;
 126     log_error(gc)("Failed to determine filesystem type for backing file (%s)", err.to_string());
 127     return;
 128   }
 129 
 130   _filesystem = buf.f_type;
 131   _block_size = buf.f_bsize;
 132   _available = buf.f_bavail * _block_size;
 133 
 134   // Make sure we're on a supported filesystem
 135   if (!is_tmpfs() && !is_hugetlbfs()) {
 136     log_error(gc)("Backing file must be located on a %s or a %s filesystem",
 137                   ZFILESYSTEM_TMPFS, ZFILESYSTEM_HUGETLBFS);
 138     return;
 139   }
 140 
 141   // Make sure the filesystem type matches requested large page type
 142   if (ZLargePages::is_transparent() && !is_tmpfs()) {
 143     log_error(gc)("-XX:+UseTransparentHugePages can only be enable when using a %s filesystem",
 144                   ZFILESYSTEM_TMPFS);
 145     return;
 146   }
 147 
 148   if (ZLargePages::is_transparent() && !tmpfs_supports_transparent_huge_pages()) {
 149     log_error(gc)("-XX:+UseTransparentHugePages on a %s filesystem not supported by kernel",
 150                   ZFILESYSTEM_TMPFS);
 151     return;
 152   }
 153 
 154   if (ZLargePages::is_explicit() && !is_hugetlbfs()) {
 155     log_error(gc)("-XX:+UseLargePages (without -XX:+UseTransparentHugePages) can only be enabled "
 156                   "when using a %s filesystem", ZFILESYSTEM_HUGETLBFS);
 157     return;
 158   }
 159 
 160   if (!ZLargePages::is_explicit() && is_hugetlbfs()) {
 161     log_error(gc)("-XX:+UseLargePages must be enabled when using a %s filesystem",
 162                   ZFILESYSTEM_HUGETLBFS);
 163     return;
 164   }
 165 
 166   const size_t expected_block_size = is_tmpfs() ? os::vm_page_size() : os::large_page_size();
 167   if (expected_block_size != _block_size) {
 168     log_error(gc)("%s filesystem has unexpected block size " SIZE_FORMAT " (expected " SIZE_FORMAT ")",
 169                   is_tmpfs() ? ZFILESYSTEM_TMPFS : ZFILESYSTEM_HUGETLBFS, _block_size, expected_block_size);
 170     return;
 171   }
 172 
 173   // Successfully initialized
 174   _initialized = true;
 175 }
 176 
 177 int ZBackingFile::create_mem_fd(const char* name) const {
 178   // Create file name
 179   char filename[PATH_MAX];
 180   snprintf(filename, sizeof(filename), "%s%s", name, ZLargePages::is_explicit() ? ".hugetlb" : "");
 181 
 182   // Create file
 183   const int extra_flags = ZLargePages::is_explicit() ? MFD_HUGETLB : 0;
 184   const int fd = ZSyscall::memfd_create(filename, MFD_CLOEXEC | extra_flags);
 185   if (fd == -1) {
 186     ZErrno err;
 187     log_debug(gc, init)("Failed to create memfd file (%s)",
 188                         ((ZLargePages::is_explicit() && err == EINVAL) ? "Hugepages not supported" : err.to_string()));
 189     return -1;
 190   }
 191 
 192   log_info(gc, init)("Heap backed by file: /memfd:%s", filename);
 193 
 194   return fd;
 195 }
 196 
 197 int ZBackingFile::create_file_fd(const char* name) const {
 198   const char* const filesystem = ZLargePages::is_explicit()
 199                                  ? ZFILESYSTEM_HUGETLBFS
 200                                  : ZFILESYSTEM_TMPFS;
 201   const char** const preferred_mountpoints = ZLargePages::is_explicit()
 202                                              ? z_preferred_hugetlbfs_mountpoints
 203                                              : z_preferred_tmpfs_mountpoints;
 204 
 205   // Find mountpoint
 206   ZBackingPath path(filesystem, preferred_mountpoints);
 207   if (path.get() == NULL) {
 208     log_error(gc)("Use -XX:ZPath to specify the path to a %s filesystem", filesystem);
 209     return -1;
 210   }
 211 
 212   // Try to create an anonymous file using the O_TMPFILE flag. Note that this
 213   // flag requires kernel >= 3.11. If this fails we fall back to open/unlink.
 214   const int fd_anon = os::open(path.get(), O_TMPFILE|O_EXCL|O_RDWR|O_CLOEXEC, S_IRUSR|S_IWUSR);
 215   if (fd_anon == -1) {
 216     ZErrno err;
 217     log_debug(gc, init)("Failed to create anonymous file in %s (%s)", path.get(),
 218                         (err == EINVAL ? "Not supported" : err.to_string()));
 219   } else {
 220     // Get inode number for anonymous file
 221     struct stat stat_buf;
 222     if (fstat(fd_anon, &stat_buf) == -1) {
 223       ZErrno err;
 224       log_error(gc)("Failed to determine inode number for anonymous file (%s)", err.to_string());
 225       return -1;
 226     }
 227 
 228     log_info(gc, init)("Heap backed by file: %s/#" UINT64_FORMAT, path.get(), (uint64_t)stat_buf.st_ino);
 229 
 230     return fd_anon;
 231   }
 232 
 233   log_debug(gc, init)("Falling back to open/unlink");
 234 
 235   // Create file name
 236   char filename[PATH_MAX];
 237   snprintf(filename, sizeof(filename), "%s/%s.%d", path.get(), name, os::current_process_id());
 238 
 239   // Create file
 240   const int fd = os::open(filename, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC, S_IRUSR|S_IWUSR);
 241   if (fd == -1) {
 242     ZErrno err;
 243     log_error(gc)("Failed to create file %s (%s)", filename, err.to_string());
 244     return -1;
 245   }
 246 
 247   // Unlink file
 248   if (unlink(filename) == -1) {
 249     ZErrno err;
 250     log_error(gc)("Failed to unlink file %s (%s)", filename, err.to_string());
 251     return -1;
 252   }
 253 
 254   log_info(gc, init)("Heap backed by file: %s", filename);
 255 
 256   return fd;
 257 }
 258 
 259 int ZBackingFile::create_fd(const char* name) const {
 260   if (ZPath == NULL) {
 261     // If the path is not explicitly specified, then we first try to create a memfd file
 262     // instead of looking for a tmpfd/hugetlbfs mount point. Note that memfd_create() might
 263     // not be supported at all (requires kernel >= 3.17), or it might not support large
 264     // pages (requires kernel >= 4.14). If memfd_create() fails, then we try to create a
 265     // file on an accessible tmpfs or hugetlbfs mount point.
 266     const int fd = create_mem_fd(name);
 267     if (fd != -1) {
 268       return fd;
 269     }
 270 
 271     log_debug(gc, init)("Falling back to searching for an accessible mount point");
 272   }
 273 
 274   return create_file_fd(name);
 275 }
 276 
// Returns true if the constructor created the backing file and all of
// its filesystem checks passed.
bool ZBackingFile::is_initialized() const {
  return _initialized;
}

// Returns the file descriptor of the backing file, or -1 if the file
// could not be created.
int ZBackingFile::fd() const {
  return _fd;
}

// Returns the current size of the backing file, as recorded when holes
// are filled beyond the previous end of the file.
size_t ZBackingFile::size() const {
  return _size;
}

// Returns the number of bytes that were available on the filesystem,
// as sampled by the constructor.
size_t ZBackingFile::available() const {
  return _available;
}

// Returns true if the backing file is located on a tmpfs filesystem.
bool ZBackingFile::is_tmpfs() const {
  return _filesystem == TMPFS_MAGIC;
}

// Returns true if the backing file is located on a hugetlbfs filesystem.
bool ZBackingFile::is_hugetlbfs() const {
  return _filesystem == HUGETLBFS_MAGIC;
}
 300 
 301 bool ZBackingFile::tmpfs_supports_transparent_huge_pages() const {
 302   // If the shmem_enabled file exists and is readable then we
 303   // know the kernel supports transparent huge pages for tmpfs.
 304   return access(ZFILENAME_SHMEM_ENABLED, R_OK) == 0;
 305 }
 306 
 307 ZErrno ZBackingFile::fallocate_compat_ftruncate(size_t size) const {
 308   while (ftruncate(_fd, size) == -1) {
 309     if (errno != EINTR) {
 310       // Failed
 311       return errno;
 312     }
 313   }
 314 
 315   // Success
 316   return 0;
 317 }
 318 
 319 ZErrno ZBackingFile::fallocate_compat_mmap(size_t offset, size_t length, bool touch) const {
 320   // On hugetlbfs, mapping a file segment will fail immediately, without
 321   // the need to touch the mapped pages first, if there aren't enough huge
 322   // pages available to back the mapping.
 323   void* const addr = mmap(0, length, PROT_READ|PROT_WRITE, MAP_SHARED, _fd, offset);
 324   if (addr == MAP_FAILED) {
 325     // Failed
 326     return errno;
 327   }
 328 
 329   // Once mapped, the huge pages are only reserved. We need to touch them
 330   // to associate them with the file segment. Note that we can not punch
 331   // hole in file segments which only have reserved pages.
 332   if (touch) {
 333     char* const start = (char*)addr;
 334     char* const end = start + length;
 335     os::pretouch_memory(start, end, _block_size);
 336   }
 337 
 338   // Unmap again. From now on, the huge pages that were mapped are allocated
 339   // to this file. There's no risk in getting SIGBUS when touching them.
 340   if (munmap(addr, length) == -1) {
 341     // Failed
 342     return errno;
 343   }
 344 
 345   // Success
 346   return 0;
 347 }
 348 
 349 ZErrno ZBackingFile::fallocate_compat_pwrite(size_t offset, size_t length) const {
 350   uint8_t data = 0;
 351 
 352   // Allocate backing memory by writing to each block
 353   for (size_t pos = offset; pos < offset + length; pos += _block_size) {
 354     if (pwrite(_fd, &data, sizeof(data), pos) == -1) {
 355       // Failed
 356       return errno;
 357     }
 358   }
 359 
 360   // Success
 361   return 0;
 362 }
 363 
 364 ZErrno ZBackingFile::fallocate_fill_hole_compat(size_t offset, size_t length) {
 365   // fallocate(2) is only supported by tmpfs since Linux 3.5, and by hugetlbfs
 366   // since Linux 4.3. When fallocate(2) is not supported we emulate it using
 367   // ftruncate/pwrite (for tmpfs) or ftruncate/mmap/munmap (for hugetlbfs).
 368 
 369   const size_t end = offset + length;
 370   if (end > _size) {
 371     // Increase file size
 372     const ZErrno err = fallocate_compat_ftruncate(end);
 373     if (err) {
 374       // Failed
 375       return err;
 376     }
 377   }
 378 
 379   // Allocate backing memory
 380   const ZErrno err = is_hugetlbfs() ? fallocate_compat_mmap(offset, length, false /* touch */)
 381                                     : fallocate_compat_pwrite(offset, length);
 382   if (err) {
 383     if (end > _size) {
 384       // Restore file size
 385       fallocate_compat_ftruncate(_size);
 386     }
 387 
 388     // Failed
 389     return err;
 390   }
 391 
 392   if (end > _size) {
 393     // Record new file size
 394     _size = end;
 395   }
 396 
 397   // Success
 398   return 0;
 399 }
 400 
 401 ZErrno ZBackingFile::fallocate_fill_hole_syscall(size_t offset, size_t length) {
 402   const int mode = 0; // Allocate
 403   const int res = ZSyscall::fallocate(_fd, mode, offset, length);
 404   if (res == -1) {
 405     // Failed
 406     return errno;
 407   }
 408 
 409   const size_t end = offset + length;
 410   if (end > _size) {
 411     // Record new file size
 412     _size = end;
 413   }
 414 
 415   // Success
 416   return 0;
 417 }
 418 
 419 ZErrno ZBackingFile::fallocate_fill_hole(size_t offset, size_t length) {
 420   // Using compat mode is more efficient when allocating space on hugetlbfs.
 421   // Note that allocating huge pages this way will only reserve them, and not
 422   // associate them with segments of the file. We must guarantee that we at
 423   // some point touch these segments, otherwise we can not punch hole in them.
 424   if (z_fallocate_supported && !is_hugetlbfs()) {
 425      const ZErrno err = fallocate_fill_hole_syscall(offset, length);
 426      if (!err) {
 427        // Success
 428        return 0;
 429      }
 430 
 431      if (err != ENOSYS && err != EOPNOTSUPP) {
 432        // Failed
 433        return err;
 434      }
 435 
 436      // Not supported
 437      log_debug(gc)("Falling back to fallocate() compatibility mode");
 438      z_fallocate_supported = false;
 439   }
 440 
 441   return fallocate_fill_hole_compat(offset, length);
 442 }
 443 
 444 ZErrno ZBackingFile::fallocate_punch_hole(size_t offset, size_t length) {
 445   if (is_hugetlbfs()) {
 446     // We can only punch hole in pages that have been touched. Non-touched
 447     // pages are only reserved, and not associated with any specific file
 448     // segment. We don't know which pages have been previously touched, so
 449     // we always touch them here to guarantee that we can punch hole.
 450     const ZErrno err = fallocate_compat_mmap(offset, length, true /* touch */);
 451     if (err) {
 452       // Failed
 453       return err;
 454     }
 455   }
 456 
 457   const int mode = FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE;
 458   if (ZSyscall::fallocate(_fd, mode, offset, length) == -1) {
 459     // Failed
 460     return errno;
 461   }
 462 
 463   // Success
 464   return 0;
 465 }
 466 
 467 ZErrno ZBackingFile::split_and_fallocate(bool punch_hole, size_t offset, size_t length) {
 468   // Try first half
 469   const size_t offset0 = offset;
 470   const size_t length0 = align_up(length / 2, _block_size);
 471   const ZErrno err0 = fallocate(punch_hole, offset0, length0);
 472   if (err0) {
 473     return err0;
 474   }
 475 
 476   // Try second half
 477   const size_t offset1 = offset0 + length0;
 478   const size_t length1 = length - length0;
 479   const ZErrno err1 = fallocate(punch_hole, offset1, length1);
 480   if (err1) {
 481     return err1;
 482   }
 483 
 484   // Success
 485   return 0;
 486 }
 487 
 488 ZErrno ZBackingFile::fallocate(bool punch_hole, size_t offset, size_t length) {
 489   assert(is_aligned(offset, _block_size), "Invalid offset");
 490   assert(is_aligned(length, _block_size), "Invalid length");
 491 
 492   const ZErrno err = punch_hole ? fallocate_punch_hole(offset, length) : fallocate_fill_hole(offset, length);
 493   if (err == EINTR && length > _block_size) {
 494     // Calling fallocate(2) with a large length can take a long time to
 495     // complete. When running profilers, such as VTune, this syscall will
 496     // be constantly interrupted by signals. Expanding the file in smaller
 497     // steps avoids this problem.
 498     return split_and_fallocate(punch_hole, offset, length);
 499   }
 500 
 501   return err;
 502 }
 503 
 504 bool ZBackingFile::commit_inner(size_t offset, size_t length) {
 505   log_trace(gc, heap)("Committing memory: " SIZE_FORMAT "M-" SIZE_FORMAT "M (" SIZE_FORMAT "M)",
 506                       offset / M, (offset + length) / M, length / M);
 507 
 508 retry:
 509   const ZErrno err = fallocate(false /* punch_hole */, offset, length);
 510   if (err) {
 511     if (err == ENOSPC && !is_init_completed() && is_hugetlbfs() && z_fallocate_hugetlbfs_attempts-- > 0) {
 512       // If we fail to allocate during initialization, due to lack of space on
 513       // the hugetlbfs filesystem, then we wait and retry a few times before
 514       // giving up. Otherwise there is a risk that running JVMs back-to-back
 515       // will fail, since there is a delay between process termination and the
 516       // huge pages owned by that process being returned to the huge page pool
 517       // and made available for new allocations.
 518       log_debug(gc, init)("Failed to commit memory (%s), retrying", err.to_string());
 519 
 520       // Wait and retry in one second, in the hope that huge pages will be
 521       // available by then.
 522       sleep(1);
 523       goto retry;
 524     }
 525 
 526     // Failed
 527     log_error(gc)("Failed to commit memory (%s)", err.to_string());
 528     return false;
 529   }
 530 
 531   // Success
 532   return true;
 533 }
 534 
 535 size_t ZBackingFile::commit(size_t offset, size_t length) {
 536   // Try to commit the whole region
 537   if (commit_inner(offset, length)) {
 538     // Success
 539     return length;
 540   }
 541 
 542   // Failed, try to commit as much as possible
 543   size_t start = offset;
 544   size_t end = offset + length;
 545 
 546   for (;;) {
 547     length = align_down((end - start) / 2, ZGranuleSize);
 548     if (length < ZGranuleSize) {
 549       // Done, don't commit more
 550       return start - offset;
 551     }
 552 
 553     if (commit_inner(start, length)) {
 554       // Success, try commit more
 555       start += length;
 556     } else {
 557       // Failed, try commit less
 558       end -= length;
 559     }
 560   }
 561 }
 562 
 563 size_t ZBackingFile::uncommit(size_t offset, size_t length) {
 564   log_trace(gc, heap)("Uncommitting memory: " SIZE_FORMAT "M-" SIZE_FORMAT "M (" SIZE_FORMAT "M)",
 565                       offset / M, (offset + length) / M, length / M);
 566 
 567   const ZErrno err = fallocate(true /* punch_hole */, offset, length);
 568   if (err) {
 569     log_error(gc)("Failed to uncommit memory (%s)", err.to_string());
 570     return 0;
 571   }
 572 
 573   return length;
 574 }