1 /* 2 * Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 */ 23 24 #include "precompiled.hpp" 25 #include "gc/z/zArray.inline.hpp" 26 #include "gc/z/zBackingFile_linux_x86.hpp" 27 #include "gc/z/zBackingPath_linux_x86.hpp" 28 #include "gc/z/zErrno.hpp" 29 #include "gc/z/zGlobals.hpp" 30 #include "gc/z/zLargePages.inline.hpp" 31 #include "logging/log.hpp" 32 #include "runtime/init.hpp" 33 #include "runtime/os.hpp" 34 #include "utilities/align.hpp" 35 #include "utilities/debug.hpp" 36 37 #include <fcntl.h> 38 #include <sys/mman.h> 39 #include <sys/stat.h> 40 #include <sys/statfs.h> 41 #include <sys/syscall.h> 42 #include <sys/types.h> 43 #include <unistd.h> 44 45 // 46 // Support for building on older Linux systems 47 // 48 49 // System calls 50 #ifndef SYS_fallocate 51 #define SYS_fallocate 285 52 #endif 53 #ifndef SYS_memfd_create 54 #define SYS_memfd_create 319 55 #endif 56 57 // memfd_create(2) flags 58 #ifndef MFD_CLOEXEC 59 #define MFD_CLOEXEC 0x0001U 60 #endif 61 #ifndef MFD_HUGETLB 62 #define MFD_HUGETLB 0x0004U 63 #endif 64 65 // open(2) flags 66 #ifndef O_CLOEXEC 67 #define O_CLOEXEC 02000000 68 #endif 69 #ifndef O_TMPFILE 70 #define O_TMPFILE (020000000 | O_DIRECTORY) 71 #endif 72 73 // fallocate(2) flags 74 #ifndef FALLOC_FL_KEEP_SIZE 75 #define FALLOC_FL_KEEP_SIZE 0x01 76 #endif 77 #ifndef FALLOC_FL_PUNCH_HOLE 78 #define FALLOC_FL_PUNCH_HOLE 0x02 79 #endif 80 81 // Filesystem types, see statfs(2) 82 #ifndef TMPFS_MAGIC 83 #define TMPFS_MAGIC 0x01021994 84 #endif 85 #ifndef HUGETLBFS_MAGIC 86 #define HUGETLBFS_MAGIC 0x958458f6 87 #endif 88 89 // Filesystem names 90 #define ZFILESYSTEM_TMPFS "tmpfs" 91 #define ZFILESYSTEM_HUGETLBFS "hugetlbfs" 92 93 // Sysfs file for transparent huge page on tmpfs 94 #define ZFILENAME_SHMEM_ENABLED "/sys/kernel/mm/transparent_hugepage/shmem_enabled" 95 96 // Java heap filename 97 #define ZFILENAME_HEAP "java_heap" 98 99 // Preferred tmpfs mount points, ordered by priority 100 static const char* z_preferred_tmpfs_mountpoints[] = { 101 "/dev/shm", 102 "/run/shm", 103 NULL 104 }; 105 106 // Preferred hugetlbfs mount points, ordered by priority 107 static const char* z_preferred_hugetlbfs_mountpoints[] = { 108 "/dev/hugepages", 109 "/hugepages", 110 NULL 111 }; 112 113 static int z_fallocate_hugetlbfs_attempts = 3; 114 static bool z_fallocate_supported = true; 115 116 static int z_fallocate(int fd, int mode, size_t offset, size_t length) { 117 return syscall(SYS_fallocate, fd, mode, offset, length); 118 } 119 120 static int z_memfd_create(const char *name, unsigned int flags) { 121 return syscall(SYS_memfd_create, name, flags); 122 } 123 124 ZBackingFile::ZBackingFile() : 125 _fd(-1), 126 _size(0), 127 _filesystem(0), 128 _block_size(0), 129 _available(0), 130 _initialized(false) { 131 132 // Create backing file 133 _fd = create_fd(ZFILENAME_HEAP); 134 if (_fd == -1) { 135 return; 136 } 137 138 // Get filesystem statistics 139 struct statfs buf; 140 if (fstatfs(_fd, &buf) == -1) { 141 ZErrno err; 142 log_error(gc)("Failed to determine filesystem type for backing file (%s)", err.to_string()); 143 return; 144 } 145 146 _filesystem = buf.f_type; 147 _block_size = buf.f_bsize; 148 _available = buf.f_bavail * _block_size; 149 150 // Make sure we're on a supported filesystem 151 if (!is_tmpfs() && !is_hugetlbfs()) { 152 log_error(gc)("Backing file must be located on a %s or a %s filesystem", 153 ZFILESYSTEM_TMPFS, ZFILESYSTEM_HUGETLBFS); 154 return; 155 } 156 157 // Make sure the filesystem type matches requested large page type 158 if (ZLargePages::is_transparent() && !is_tmpfs()) { 159 log_error(gc)("-XX:+UseTransparentHugePages can only be enable when using a %s filesystem", 160 ZFILESYSTEM_TMPFS); 161 return; 162 } 163 164 if (ZLargePages::is_transparent() && !tmpfs_supports_transparent_huge_pages()) { 165 log_error(gc)("-XX:+UseTransparentHugePages on a %s filesystem not supported by kernel", 166 ZFILESYSTEM_TMPFS); 167 return; 168 } 169 170 if (ZLargePages::is_explicit() && !is_hugetlbfs()) { 171 log_error(gc)("-XX:+UseLargePages (without -XX:+UseTransparentHugePages) can only be enabled " 172 "when using a %s filesystem", ZFILESYSTEM_HUGETLBFS); 173 return; 174 } 175 176 if (!ZLargePages::is_explicit() && is_hugetlbfs()) { 177 log_error(gc)("-XX:+UseLargePages must be enabled when using a %s filesystem", 178 ZFILESYSTEM_HUGETLBFS); 179 return; 180 } 181 182 const size_t expected_block_size = is_tmpfs() ? os::vm_page_size() : os::large_page_size(); 183 if (expected_block_size != _block_size) { 184 log_error(gc)("%s filesystem has unexpected block size " SIZE_FORMAT " (expected " SIZE_FORMAT ")", 185 is_tmpfs() ? ZFILESYSTEM_TMPFS : ZFILESYSTEM_HUGETLBFS, _block_size, expected_block_size); 186 return; 187 } 188 189 // Successfully initialized 190 _initialized = true; 191 } 192 193 int ZBackingFile::create_mem_fd(const char* name) const { 194 // Create file name 195 char filename[PATH_MAX]; 196 snprintf(filename, sizeof(filename), "%s%s", name, ZLargePages::is_explicit() ? ".hugetlb" : ""); 197 198 // Create file 199 const int extra_flags = ZLargePages::is_explicit() ? MFD_HUGETLB : 0; 200 const int fd = z_memfd_create(filename, MFD_CLOEXEC | extra_flags); 201 if (fd == -1) { 202 ZErrno err; 203 log_debug(gc, init)("Failed to create memfd file (%s)", 204 ((ZLargePages::is_explicit() && err == EINVAL) ? "Hugepages not supported" : err.to_string())); 205 return -1; 206 } 207 208 log_info(gc, init)("Heap backed by file: /memfd:%s", filename); 209 210 return fd; 211 } 212 213 int ZBackingFile::create_file_fd(const char* name) const { 214 const char* const filesystem = ZLargePages::is_explicit() 215 ? ZFILESYSTEM_HUGETLBFS 216 : ZFILESYSTEM_TMPFS; 217 const char** const preferred_mountpoints = ZLargePages::is_explicit() 218 ? z_preferred_hugetlbfs_mountpoints 219 : z_preferred_tmpfs_mountpoints; 220 221 // Find mountpoint 222 ZBackingPath path(filesystem, preferred_mountpoints); 223 if (path.get() == NULL) { 224 log_error(gc)("Use -XX:ZPath to specify the path to a %s filesystem", filesystem); 225 return -1; 226 } 227 228 // Try to create an anonymous file using the O_TMPFILE flag. Note that this 229 // flag requires kernel >= 3.11. If this fails we fall back to open/unlink. 230 const int fd_anon = os::open(path.get(), O_TMPFILE|O_EXCL|O_RDWR|O_CLOEXEC, S_IRUSR|S_IWUSR); 231 if (fd_anon == -1) { 232 ZErrno err; 233 log_debug(gc, init)("Failed to create anonymous file in %s (%s)", path.get(), 234 (err == EINVAL ? "Not supported" : err.to_string())); 235 } else { 236 // Get inode number for anonymous file 237 struct stat stat_buf; 238 if (fstat(fd_anon, &stat_buf) == -1) { 239 ZErrno err; 240 log_error(gc)("Failed to determine inode number for anonymous file (%s)", err.to_string()); 241 return -1; 242 } 243 244 log_info(gc, init)("Heap backed by file: %s/#" UINT64_FORMAT, path.get(), (uint64_t)stat_buf.st_ino); 245 246 return fd_anon; 247 } 248 249 log_debug(gc, init)("Falling back to open/unlink"); 250 251 // Create file name 252 char filename[PATH_MAX]; 253 snprintf(filename, sizeof(filename), "%s/%s.%d", path.get(), name, os::current_process_id()); 254 255 // Create file 256 const int fd = os::open(filename, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC, S_IRUSR|S_IWUSR); 257 if (fd == -1) { 258 ZErrno err; 259 log_error(gc)("Failed to create file %s (%s)", filename, err.to_string()); 260 return -1; 261 } 262 263 // Unlink file 264 if (unlink(filename) == -1) { 265 ZErrno err; 266 log_error(gc)("Failed to unlink file %s (%s)", filename, err.to_string()); 267 return -1; 268 } 269 270 log_info(gc, init)("Heap backed by file: %s", filename); 271 272 return fd; 273 } 274 275 int ZBackingFile::create_fd(const char* name) const { 276 if (ZPath == NULL) { 277 // If the path is not explicitly specified, then we first try to create a memfd file 278 // instead of looking for a tmpfd/hugetlbfs mount point. Note that memfd_create() might 279 // not be supported at all (requires kernel >= 3.17), or it might not support large 280 // pages (requires kernel >= 4.14). If memfd_create() fails, then we try to create a 281 // file on an accessible tmpfs or hugetlbfs mount point. 282 const int fd = create_mem_fd(name); 283 if (fd != -1) { 284 return fd; 285 } 286 287 log_debug(gc, init)("Falling back to searching for an accessible mount point"); 288 } 289 290 return create_file_fd(name); 291 } 292 293 bool ZBackingFile::is_initialized() const { 294 return _initialized; 295 } 296 297 int ZBackingFile::fd() const { 298 return _fd; 299 } 300 301 size_t ZBackingFile::size() const { 302 return _size; 303 } 304 305 size_t ZBackingFile::available() const { 306 return _available; 307 } 308 309 bool ZBackingFile::is_tmpfs() const { 310 return _filesystem == TMPFS_MAGIC; 311 } 312 313 bool ZBackingFile::is_hugetlbfs() const { 314 return _filesystem == HUGETLBFS_MAGIC; 315 } 316 317 bool ZBackingFile::tmpfs_supports_transparent_huge_pages() const { 318 // If the shmem_enabled file exists and is readable then we 319 // know the kernel supports transparent huge pages for tmpfs. 320 return access(ZFILENAME_SHMEM_ENABLED, R_OK) == 0; 321 } 322 323 ZErrno ZBackingFile::fallocate_compat_ftruncate(size_t size) const { 324 while (ftruncate(_fd, size) == -1) { 325 if (errno != EINTR) { 326 // Failed 327 return errno; 328 } 329 } 330 331 // Success 332 return 0; 333 } 334 335 ZErrno ZBackingFile::fallocate_compat_mmap(size_t offset, size_t length, bool touch) const { 336 // On hugetlbfs, mapping a file segment will fail immediately, without 337 // the need to touch the mapped pages first, if there aren't enough huge 338 // pages available to back the mapping. 339 void* const addr = mmap(0, length, PROT_READ|PROT_WRITE, MAP_SHARED, _fd, offset); 340 if (addr == MAP_FAILED) { 341 // Failed 342 return errno; 343 } 344 345 // Once mapped, the huge pages are only reserved. We need to touch them 346 // to associate them with the file segment. Note that we can not punch 347 // hole in file segments which only have reserved pages. 348 if (touch) { 349 char* const start = (char*)addr; 350 char* const end = start + length; 351 os::pretouch_memory(start, end, _block_size); 352 } 353 354 // Unmap again. From now on, the huge pages that were mapped are allocated 355 // to this file. There's no risk in getting SIGBUS when touching them. 356 if (munmap(addr, length) == -1) { 357 // Failed 358 return errno; 359 } 360 361 // Success 362 return 0; 363 } 364 365 ZErrno ZBackingFile::fallocate_compat_pwrite(size_t offset, size_t length) const { 366 uint8_t data = 0; 367 368 // Allocate backing memory by writing to each block 369 for (size_t pos = offset; pos < offset + length; pos += _block_size) { 370 if (pwrite(_fd, &data, sizeof(data), pos) == -1) { 371 // Failed 372 return errno; 373 } 374 } 375 376 // Success 377 return 0; 378 } 379 380 ZErrno ZBackingFile::fallocate_fill_hole_compat(size_t offset, size_t length) { 381 // fallocate(2) is only supported by tmpfs since Linux 3.5, and by hugetlbfs 382 // since Linux 4.3. When fallocate(2) is not supported we emulate it using 383 // ftruncate/pwrite (for tmpfs) or ftruncate/mmap/munmap (for hugetlbfs). 384 385 const size_t end = offset + length; 386 if (end > _size) { 387 // Increase file size 388 const ZErrno err = fallocate_compat_ftruncate(end); 389 if (err) { 390 // Failed 391 return err; 392 } 393 } 394 395 // Allocate backing memory 396 const ZErrno err = is_hugetlbfs() ? fallocate_compat_mmap(offset, length, false /* touch */) 397 : fallocate_compat_pwrite(offset, length); 398 if (err) { 399 if (end > _size) { 400 // Restore file size 401 fallocate_compat_ftruncate(_size); 402 } 403 404 // Failed 405 return err; 406 } 407 408 if (end > _size) { 409 // Record new file size 410 _size = end; 411 } 412 413 // Success 414 return 0; 415 } 416 417 ZErrno ZBackingFile::fallocate_fill_hole_syscall(size_t offset, size_t length) { 418 const int mode = 0; // Allocate 419 const int res = z_fallocate(_fd, mode, offset, length); 420 if (res == -1) { 421 // Failed 422 return errno; 423 } 424 425 const size_t end = offset + length; 426 if (end > _size) { 427 // Record new file size 428 _size = end; 429 } 430 431 // Success 432 return 0; 433 } 434 435 ZErrno ZBackingFile::fallocate_fill_hole(size_t offset, size_t length) { 436 // Using compat mode is more efficient when allocating space on hugetlbfs. 437 // Note that allocating huge pages this way will only reserve them, and not 438 // associate them with segments of the file. We must guarantee that we at 439 // some point touch these segments, otherwise we can not punch hole in them. 440 if (z_fallocate_supported && !is_hugetlbfs()) { 441 const ZErrno err = fallocate_fill_hole_syscall(offset, length); 442 if (!err) { 443 // Success 444 return 0; 445 } 446 447 if (err != ENOSYS && err != EOPNOTSUPP) { 448 // Failed 449 return err; 450 } 451 452 // Not supported 453 log_debug(gc)("Falling back to fallocate() compatibility mode"); 454 z_fallocate_supported = false; 455 } 456 457 return fallocate_fill_hole_compat(offset, length); 458 } 459 460 ZErrno ZBackingFile::fallocate_punch_hole(size_t offset, size_t length) { 461 if (is_hugetlbfs()) { 462 // We can only punch hole in pages that have been touched. Non-touched 463 // pages are only reserved, and not associated with any specific file 464 // segment. We don't know which pages have been previously touched, so 465 // we always touch them here to guarantee that we can punch hole. 466 const ZErrno err = fallocate_compat_mmap(offset, length, true /* touch */); 467 if (err) { 468 // Failed 469 return err; 470 } 471 } 472 473 const int mode = FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE; 474 if (z_fallocate(_fd, mode, offset, length) == -1) { 475 // Failed 476 return errno; 477 } 478 479 // Success 480 return 0; 481 } 482 483 ZErrno ZBackingFile::split_and_fallocate(bool punch_hole, size_t offset, size_t length) { 484 // Try first half 485 const size_t offset0 = offset; 486 const size_t length0 = align_up(length / 2, _block_size); 487 const ZErrno err0 = fallocate(punch_hole, offset0, length0); 488 if (err0) { 489 return err0; 490 } 491 492 // Try second half 493 const size_t offset1 = offset0 + length0; 494 const size_t length1 = length - length0; 495 const ZErrno err1 = fallocate(punch_hole, offset1, length1); 496 if (err1) { 497 return err1; 498 } 499 500 // Success 501 return 0; 502 } 503 504 ZErrno ZBackingFile::fallocate(bool punch_hole, size_t offset, size_t length) { 505 assert(is_aligned(offset, _block_size), "Invalid offset"); 506 assert(is_aligned(length, _block_size), "Invalid length"); 507 508 const ZErrno err = punch_hole ? fallocate_punch_hole(offset, length) : fallocate_fill_hole(offset, length); 509 if (err == EINTR && length > _block_size) { 510 // Calling fallocate(2) with a large length can take a long time to 511 // complete. When running profilers, such as VTune, this syscall will 512 // be constantly interrupted by signals. Expanding the file in smaller 513 // steps avoids this problem. 514 return split_and_fallocate(punch_hole, offset, length); 515 } 516 517 return err; 518 } 519 520 bool ZBackingFile::commit_inner(size_t offset, size_t length) { 521 log_trace(gc, heap)("Committing memory: " SIZE_FORMAT "M-" SIZE_FORMAT "M (" SIZE_FORMAT "M)", 522 offset / M, (offset + length) / M, length / M); 523 524 retry: 525 const ZErrno err = fallocate(false /* punch_hole */, offset, length); 526 if (err) { 527 if (err == ENOSPC && !is_init_completed() && is_hugetlbfs() && z_fallocate_hugetlbfs_attempts-- > 0) { 528 // If we fail to allocate during initialization, due to lack of space on 529 // the hugetlbfs filesystem, then we wait and retry a few times before 530 // giving up. Otherwise there is a risk that running JVMs back-to-back 531 // will fail, since there is a delay between process termination and the 532 // huge pages owned by that process being returned to the huge page pool 533 // and made available for new allocations. 534 log_debug(gc, init)("Failed to commit memory (%s), retrying", err.to_string()); 535 536 // Wait and retry in one second, in the hope that huge pages will be 537 // available by then. 538 sleep(1); 539 goto retry; 540 } 541 542 // Failed 543 log_error(gc)("Failed to commit memory (%s)", err.to_string()); 544 return false; 545 } 546 547 // Success 548 return true; 549 } 550 551 size_t ZBackingFile::commit(size_t offset, size_t length) { 552 // Try to commit the whole region 553 if (commit_inner(offset, length)) { 554 // Success 555 return length; 556 } 557 558 // Failed, try to commit as much as possible 559 size_t start = offset; 560 size_t end = offset + length; 561 562 for (;;) { 563 length = align_down((end - start) / 2, ZGranuleSize); 564 if (length < ZGranuleSize) { 565 // Done, don't commit more 566 return start - offset; 567 } 568 569 if (commit_inner(start, length)) { 570 // Success, try commit more 571 start += length; 572 } else { 573 // Failed, try commit less 574 end -= length; 575 } 576 } 577 } 578 579 size_t ZBackingFile::uncommit(size_t offset, size_t length) { 580 log_trace(gc, heap)("Uncommitting memory: " SIZE_FORMAT "M-" SIZE_FORMAT "M (" SIZE_FORMAT "M)", 581 offset / M, (offset + length) / M, length / M); 582 583 const ZErrno err = fallocate(true /* punch_hole */, offset, length); 584 if (err) { 585 log_error(gc)("Failed to uncommit memory (%s)", err.to_string()); 586 return 0; 587 } 588 589 return length; 590 }