/*
 * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

#include "precompiled.hpp"
#include "gc/z/zArray.inline.hpp"
#include "gc/z/zErrno.hpp"
#include "gc/z/zGlobals.hpp"
#include "gc/z/zLargePages.inline.hpp"
#include "gc/z/zMountPoint_linux.hpp"
#include "gc/z/zNUMA.inline.hpp"
#include "gc/z/zPhysicalMemoryBacking_linux.hpp"
#include "gc/z/zSyscall_linux.hpp"
#include "logging/log.hpp"
#include "runtime/init.hpp"
#include "runtime/os.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/debug.hpp"
#include "utilities/growableArray.hpp"

// NOTE(review): the original system include list had its header names stripped
// (empty "#include" directives). Restored to the headers this file's syscalls
// and flags require.
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/statfs.h>
#include <sys/types.h>
#include <unistd.h>

//
// Support for building on older Linux systems
//

// memfd_create(2) flags
#ifndef MFD_CLOEXEC
#define MFD_CLOEXEC 0x0001U
#endif
#ifndef MFD_HUGETLB
#define MFD_HUGETLB 0x0004U
#endif

// open(2) flags
#ifndef O_CLOEXEC
#define O_CLOEXEC 02000000
#endif
#ifndef O_TMPFILE
#define O_TMPFILE (020000000 | O_DIRECTORY)
#endif

// fallocate(2) flags
#ifndef FALLOC_FL_KEEP_SIZE
#define FALLOC_FL_KEEP_SIZE 0x01
#endif
#ifndef FALLOC_FL_PUNCH_HOLE
#define FALLOC_FL_PUNCH_HOLE 0x02
#endif

// Filesystem types, see statfs(2)
#ifndef TMPFS_MAGIC
#define TMPFS_MAGIC 0x01021994
#endif
#ifndef HUGETLBFS_MAGIC
#define HUGETLBFS_MAGIC 0x958458f6
#endif

// Filesystem names
#define ZFILESYSTEM_TMPFS "tmpfs"
#define ZFILESYSTEM_HUGETLBFS "hugetlbfs"

// Proc file entry for max map mount
#define ZFILENAME_PROC_MAX_MAP_COUNT "/proc/sys/vm/max_map_count"

// Sysfs file for transparent huge page on tmpfs
#define ZFILENAME_SHMEM_ENABLED "/sys/kernel/mm/transparent_hugepage/shmem_enabled"

// Java heap filename
#define ZFILENAME_HEAP "java_heap"

// Preferred tmpfs mount points, ordered by priority
static const char* z_preferred_tmpfs_mountpoints[] = {
  "/dev/shm",
  "/run/shm",
  NULL
};

// Preferred hugetlbfs mount points, ordered by priority
static const char* z_preferred_hugetlbfs_mountpoints[] = {
  "/dev/hugepages",
  "/hugepages",
  NULL
};

static int z_fallocate_hugetlbfs_attempts = 3;
static bool z_fallocate_supported = true;

// Creates and verifies the heap backing file. Sets _initialized to true only
// if the file could be created, truncated to max_capacity, and resides on a
// filesystem compatible with the configured large page mode.
ZPhysicalMemoryBacking::ZPhysicalMemoryBacking(size_t max_capacity) :
    _fd(-1),
    _filesystem(0),
    _block_size(0),
    _available(0),
    _initialized(false) {

  // Create backing file
  _fd = create_fd(ZFILENAME_HEAP);
  if (_fd == -1) {
    return;
  }

  // Truncate backing file
  if (ftruncate(_fd, max_capacity) == -1) {
    ZErrno err;
    log_error(gc)("Failed to truncate backing file (%s)", err.to_string());
    return;
  }

  // Get filesystem statistics
  struct statfs buf;
  if (fstatfs(_fd, &buf) == -1) {
    ZErrno err;
    log_error(gc)("Failed to determine filesystem type for backing file (%s)", err.to_string());
    return;
  }

  _filesystem = buf.f_type;
  _block_size = buf.f_bsize;
  _available = buf.f_bavail * _block_size;

  log_info(gc, init)("Heap Backing Filesystem: %s (0x" UINT64_FORMAT_X ")",
                     is_tmpfs() ? ZFILESYSTEM_TMPFS : is_hugetlbfs() ? ZFILESYSTEM_HUGETLBFS : "other", _filesystem);

  // Make sure the filesystem type matches requested large page type
  if (ZLargePages::is_transparent() && !is_tmpfs()) {
    log_error(gc)("-XX:+UseTransparentHugePages can only be enabled when using a %s filesystem",
                  ZFILESYSTEM_TMPFS);
    return;
  }

  if (ZLargePages::is_transparent() && !tmpfs_supports_transparent_huge_pages()) {
    log_error(gc)("-XX:+UseTransparentHugePages on a %s filesystem not supported by kernel",
                  ZFILESYSTEM_TMPFS);
    return;
  }

  if (ZLargePages::is_explicit() && !is_hugetlbfs()) {
    log_error(gc)("-XX:+UseLargePages (without -XX:+UseTransparentHugePages) can only be enabled "
                  "when using a %s filesystem", ZFILESYSTEM_HUGETLBFS);
    return;
  }

  if (!ZLargePages::is_explicit() && is_hugetlbfs()) {
    log_error(gc)("-XX:+UseLargePages must be enabled when using a %s filesystem",
                  ZFILESYSTEM_HUGETLBFS);
    return;
  }

  if (ZLargePages::is_explicit() && os::large_page_size() != ZGranuleSize) {
    log_error(gc)("Incompatible large page size configured " SIZE_FORMAT " (expected " SIZE_FORMAT ")",
                  os::large_page_size(), ZGranuleSize);
    return;
  }

  // Make sure the filesystem block size is compatible
  if (ZGranuleSize % _block_size != 0) {
    log_error(gc)("Filesystem backing the heap has incompatible block size (" SIZE_FORMAT ")",
                  _block_size);
    return;
  }

  if (is_hugetlbfs() && _block_size != ZGranuleSize) {
    log_error(gc)("%s filesystem has unexpected block size " SIZE_FORMAT " (expected " SIZE_FORMAT ")",
                  ZFILESYSTEM_HUGETLBFS, _block_size, ZGranuleSize);
    return;
  }

  // Successfully initialized
  _initialized = true;
}

// Creates an anonymous memfd file (optionally hugetlb-backed), or returns -1.
int ZPhysicalMemoryBacking::create_mem_fd(const char* name) const {
  // Create file name
  char filename[PATH_MAX];
  snprintf(filename, sizeof(filename), "%s%s", name, ZLargePages::is_explicit() ? ".hugetlb" : "");

  // Create file
  const int extra_flags = ZLargePages::is_explicit() ? MFD_HUGETLB : 0;
  const int fd = ZSyscall::memfd_create(filename, MFD_CLOEXEC | extra_flags);
  if (fd == -1) {
    ZErrno err;
    log_debug(gc, init)("Failed to create memfd file (%s)",
                        ((ZLargePages::is_explicit() && err == EINVAL) ? "Hugepages not supported" : err.to_string()));
    return -1;
  }

  log_info(gc, init)("Heap Backing File: /memfd:%s", filename);

  return fd;
}

// Creates a backing file on a tmpfs/hugetlbfs mount point, preferring an
// anonymous O_TMPFILE file and falling back to open/unlink. Returns -1 on failure.
int ZPhysicalMemoryBacking::create_file_fd(const char* name) const {
  const char* const filesystem = ZLargePages::is_explicit()
                                 ? ZFILESYSTEM_HUGETLBFS
                                 : ZFILESYSTEM_TMPFS;
  const char** const preferred_mountpoints = ZLargePages::is_explicit()
                                             ? z_preferred_hugetlbfs_mountpoints
                                             : z_preferred_tmpfs_mountpoints;

  // Find mountpoint
  ZMountPoint mountpoint(filesystem, preferred_mountpoints);
  if (mountpoint.get() == NULL) {
    log_error(gc)("Use -XX:AllocateHeapAt to specify the path to a %s filesystem", filesystem);
    return -1;
  }

  // Try to create an anonymous file using the O_TMPFILE flag. Note that this
  // flag requires kernel >= 3.11. If this fails we fall back to open/unlink.
  const int fd_anon = os::open(mountpoint.get(), O_TMPFILE|O_EXCL|O_RDWR|O_CLOEXEC, S_IRUSR|S_IWUSR);
  if (fd_anon == -1) {
    ZErrno err;
    log_debug(gc, init)("Failed to create anonymous file in %s (%s)", mountpoint.get(),
                        (err == EINVAL ? "Not supported" : err.to_string()));
  } else {
    // Get inode number for anonymous file
    struct stat stat_buf;
    if (fstat(fd_anon, &stat_buf) == -1) {
      ZErrno err;
      log_error(gc)("Failed to determine inode number for anonymous file (%s)", err.to_string());
      return -1;
    }

    log_info(gc, init)("Heap Backing File: %s/#" UINT64_FORMAT, mountpoint.get(), (uint64_t)stat_buf.st_ino);

    return fd_anon;
  }

  log_debug(gc, init)("Falling back to open/unlink");

  // Create file name
  char filename[PATH_MAX];
  snprintf(filename, sizeof(filename), "%s/%s.%d", mountpoint.get(), name, os::current_process_id());

  // Create file
  const int fd = os::open(filename, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC, S_IRUSR|S_IWUSR);
  if (fd == -1) {
    ZErrno err;
    log_error(gc)("Failed to create file %s (%s)", filename, err.to_string());
    return -1;
  }

  // Unlink file
  if (unlink(filename) == -1) {
    ZErrno err;
    log_error(gc)("Failed to unlink file %s (%s)", filename, err.to_string());
    return -1;
  }

  log_info(gc, init)("Heap Backing File: %s", filename);

  return fd;
}

int ZPhysicalMemoryBacking::create_fd(const char* name) const {
  if (AllocateHeapAt == NULL) {
    // If the path is not explicitly specified, then we first try to create a memfd file
    // instead of looking for a tmpfd/hugetlbfs mount point. Note that memfd_create() might
    // not be supported at all (requires kernel >= 3.17), or it might not support large
    // pages (requires kernel >= 4.14). If memfd_create() fails, then we try to create a
    // file on an accessible tmpfs or hugetlbfs mount point.
    const int fd = create_mem_fd(name);
    if (fd != -1) {
      return fd;
    }

    log_debug(gc, init)("Falling back to searching for an accessible mount point");
  }

  return create_file_fd(name);
}

bool ZPhysicalMemoryBacking::is_initialized() const {
  return _initialized;
}

void ZPhysicalMemoryBacking::warn_available_space(size_t max_capacity) const {
  // Note that the available space on a tmpfs or a hugetlbfs filesystem
  // will be zero if no size limit was specified when it was mounted.
  if (_available == 0) {
    // No size limit set, skip check
    log_info(gc, init)("Available space on backing filesystem: N/A");
    return;
  }

  log_info(gc, init)("Available space on backing filesystem: " SIZE_FORMAT "M", _available / M);

  // Warn if the filesystem doesn't currently have enough space available to hold
  // the max heap size. The max heap size will be capped if we later hit this limit
  // when trying to expand the heap.
  if (_available < max_capacity) {
    log_warning(gc)("***** WARNING! INCORRECT SYSTEM CONFIGURATION DETECTED! *****");
    log_warning(gc)("Not enough space available on the backing filesystem to hold the current max Java heap");
    log_warning(gc)("size (" SIZE_FORMAT "M). Please adjust the size of the backing filesystem accordingly "
                    "(available", max_capacity / M);
    log_warning(gc)("space is currently " SIZE_FORMAT "M). Continuing execution with the current filesystem "
                    "size could", _available / M);
    log_warning(gc)("lead to a premature OutOfMemoryError being thrown, due to failure to commit memory.");
  }
}

void ZPhysicalMemoryBacking::warn_max_map_count(size_t max_capacity) const {
  const char* const filename = ZFILENAME_PROC_MAX_MAP_COUNT;
  FILE* const file = fopen(filename, "r");
  if (file == NULL) {
    // Failed to open file, skip check
    log_debug(gc, init)("Failed to open %s", filename);
    return;
  }

  size_t actual_max_map_count = 0;
  const int result = fscanf(file, SIZE_FORMAT, &actual_max_map_count);
  fclose(file);
  if (result != 1) {
    // Failed to read file, skip check
    log_debug(gc, init)("Failed to read %s", filename);
    return;
  }

  // The required max map count is impossible to calculate exactly since subsystems
  // other than ZGC are also creating memory mappings, and we have no control over that.
  // However, ZGC tends to create the most mappings and dominate the total count.
  // In the worst cases, ZGC will map each granule three times, i.e. once per heap view.
  // We speculate that we need another 20% to allow for non-ZGC subsystems to map memory.
  const size_t required_max_map_count = (max_capacity / ZGranuleSize) * 3 * 1.2;
  if (actual_max_map_count < required_max_map_count) {
    log_warning(gc)("***** WARNING! INCORRECT SYSTEM CONFIGURATION DETECTED! *****");
    log_warning(gc)("The system limit on number of memory mappings per process might be too low for the given");
    log_warning(gc)("max Java heap size (" SIZE_FORMAT "M). Please adjust %s to allow for at",
                    max_capacity / M, filename);
    log_warning(gc)("least " SIZE_FORMAT " mappings (current limit is " SIZE_FORMAT "). Continuing execution "
                    "with the current", required_max_map_count, actual_max_map_count);
    log_warning(gc)("limit could lead to a premature OutOfMemoryError being thrown, due to failure to map memory.");
  }
}

void ZPhysicalMemoryBacking::warn_commit_limits(size_t max_capacity) const {
  // Warn if available space is too low
  warn_available_space(max_capacity);

  // Warn if max map count is too low
  warn_max_map_count(max_capacity);
}

bool ZPhysicalMemoryBacking::is_tmpfs() const {
  return _filesystem == TMPFS_MAGIC;
}

bool ZPhysicalMemoryBacking::is_hugetlbfs() const {
  return _filesystem == HUGETLBFS_MAGIC;
}

bool ZPhysicalMemoryBacking::tmpfs_supports_transparent_huge_pages() const {
  // If the shmem_enabled file exists and is readable then we
  // know the kernel supports transparent huge pages for tmpfs.
  return access(ZFILENAME_SHMEM_ENABLED, R_OK) == 0;
}

ZErrno ZPhysicalMemoryBacking::fallocate_compat_mmap_hugetlbfs(size_t offset, size_t length, bool touch) const {
  // On hugetlbfs, mapping a file segment will fail immediately, without
  // the need to touch the mapped pages first, if there aren't enough huge
  // pages available to back the mapping.
  void* const addr = mmap(0, length, PROT_READ|PROT_WRITE, MAP_SHARED, _fd, offset);
  if (addr == MAP_FAILED) {
    // Failed
    return errno;
  }

  // Once mapped, the huge pages are only reserved. We need to touch them
  // to associate them with the file segment. Note that we can not punch
  // hole in file segments which only have reserved pages.
  if (touch) {
    char* const start = (char*)addr;
    char* const end = start + length;
    os::pretouch_memory(start, end, _block_size);
  }

  // Unmap again. From now on, the huge pages that were mapped are allocated
  // to this file. There's no risk of getting a SIGBUS when mapping and
  // touching these pages again.
  if (munmap(addr, length) == -1) {
    // Failed
    return errno;
  }

  // Success
  return 0;
}

static bool safe_touch_mapping(void* addr, size_t length, size_t page_size) {
  char* const start = (char*)addr;
  char* const end = start + length;

  // Touching a mapping that can't be backed by memory will generate a
  // SIGBUS. By using SafeFetch32 any SIGBUS will be safely caught and
  // handled. On tmpfs, doing a fetch (rather than a store) is enough
  // to cause backing pages to be allocated (there's no zero-page to
  // worry about).
  for (char *p = start; p < end; p += page_size) {
    if (SafeFetch32((int*)p, -1) == -1) {
      // Failed
      return false;
    }
  }

  // Success
  return true;
}

ZErrno ZPhysicalMemoryBacking::fallocate_compat_mmap_tmpfs(size_t offset, size_t length) const {
  // On tmpfs, we need to touch the mapped pages to figure out
  // if there are enough pages available to back the mapping.
  void* const addr = mmap(0, length, PROT_READ|PROT_WRITE, MAP_SHARED, _fd, offset);
  if (addr == MAP_FAILED) {
    // Failed
    return errno;
  }

  // Advise mapping to use transparent huge pages
  os::realign_memory((char*)addr, length, os::large_page_size());

  // Touch the mapping (safely) to make sure it's backed by memory
  const bool backed = safe_touch_mapping(addr, length, _block_size);

  // Unmap again. If successfully touched, the backing memory will
  // be allocated to this file. There's no risk of getting a SIGBUS
  // when mapping and touching these pages again.
  if (munmap(addr, length) == -1) {
    // Failed
    return errno;
  }

  // Success
  return backed ? 0 : ENOMEM;
}

ZErrno ZPhysicalMemoryBacking::fallocate_compat_pwrite(size_t offset, size_t length) const {
  uint8_t data = 0;

  // Allocate backing memory by writing to each block
  for (size_t pos = offset; pos < offset + length; pos += _block_size) {
    if (pwrite(_fd, &data, sizeof(data), pos) == -1) {
      // Failed
      return errno;
    }
  }

  // Success
  return 0;
}

ZErrno ZPhysicalMemoryBacking::fallocate_fill_hole_compat(size_t offset, size_t length) const {
  // fallocate(2) is only supported by tmpfs since Linux 3.5, and by hugetlbfs
  // since Linux 4.3. When fallocate(2) is not supported we emulate it using
  // mmap/munmap (for hugetlbfs and tmpfs with transparent huge pages) or pwrite
  // (for tmpfs without transparent huge pages and other filesystem types).
  if (ZLargePages::is_explicit()) {
    return fallocate_compat_mmap_hugetlbfs(offset, length, false /* touch */);
  } else if (ZLargePages::is_transparent()) {
    return fallocate_compat_mmap_tmpfs(offset, length);
  } else {
    return fallocate_compat_pwrite(offset, length);
  }
}

ZErrno ZPhysicalMemoryBacking::fallocate_fill_hole_syscall(size_t offset, size_t length) const {
  const int mode = 0; // Allocate
  const int res = ZSyscall::fallocate(_fd, mode, offset, length);
  if (res == -1) {
    // Failed
    return errno;
  }

  // Success
  return 0;
}

ZErrno ZPhysicalMemoryBacking::fallocate_fill_hole(size_t offset, size_t length) const {
  // Using compat mode is more efficient when allocating space on hugetlbfs.
  // Note that allocating huge pages this way will only reserve them, and not
  // associate them with segments of the file. We must guarantee that we at
  // some point touch these segments, otherwise we can not punch hole in them.
  // Also note that we need to use compat mode when using transparent huge pages,
  // since we need to use madvise(2) on the mapping before the page is allocated.
  if (z_fallocate_supported && !ZLargePages::is_enabled()) {
    const ZErrno err = fallocate_fill_hole_syscall(offset, length);
    if (!err) {
      // Success
      return 0;
    }

    if (err != ENOSYS && err != EOPNOTSUPP) {
      // Failed
      return err;
    }

    // Not supported
    log_debug(gc)("Falling back to fallocate() compatibility mode");
    z_fallocate_supported = false;
  }

  return fallocate_fill_hole_compat(offset, length);
}

ZErrno ZPhysicalMemoryBacking::fallocate_punch_hole(size_t offset, size_t length) const {
  if (ZLargePages::is_explicit()) {
    // We can only punch hole in pages that have been touched. Non-touched
    // pages are only reserved, and not associated with any specific file
    // segment. We don't know which pages have been previously touched, so
    // we always touch them here to guarantee that we can punch hole.
    const ZErrno err = fallocate_compat_mmap_hugetlbfs(offset, length, true /* touch */);
    if (err) {
      // Failed
      return err;
    }
  }

  const int mode = FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE;
  if (ZSyscall::fallocate(_fd, mode, offset, length) == -1) {
    // Failed
    return errno;
  }

  // Success
  return 0;
}

ZErrno ZPhysicalMemoryBacking::split_and_fallocate(bool punch_hole, size_t offset, size_t length) const {
  // Try first half
  const size_t offset0 = offset;
  const size_t length0 = align_up(length / 2, _block_size);
  const ZErrno err0 = fallocate(punch_hole, offset0, length0);
  if (err0) {
    return err0;
  }

  // Try second half
  const size_t offset1 = offset0 + length0;
  const size_t length1 = length - length0;
  const ZErrno err1 = fallocate(punch_hole, offset1, length1);
  if (err1) {
    return err1;
  }

  // Success
  return 0;
}

ZErrno ZPhysicalMemoryBacking::fallocate(bool punch_hole, size_t offset, size_t length) const {
  assert(is_aligned(offset, _block_size), "Invalid offset");
  assert(is_aligned(length, _block_size), "Invalid length");

  const ZErrno err = punch_hole ? fallocate_punch_hole(offset, length) : fallocate_fill_hole(offset, length);
  if (err == EINTR && length > _block_size) {
    // Calling fallocate(2) with a large length can take a long time to
    // complete. When running profilers, such as VTune, this syscall will
    // be constantly interrupted by signals. Expanding the file in smaller
    // steps avoids this problem.
    return split_and_fallocate(punch_hole, offset, length);
  }

  return err;
}

bool ZPhysicalMemoryBacking::commit_inner(size_t offset, size_t length) const {
  log_trace(gc, heap)("Committing memory: " SIZE_FORMAT "M-" SIZE_FORMAT "M (" SIZE_FORMAT "M)",
                      offset / M, (offset + length) / M, length / M);

retry:
  const ZErrno err = fallocate(false /* punch_hole */, offset, length);
  if (err) {
    if (err == ENOSPC && !is_init_completed() && ZLargePages::is_explicit() && z_fallocate_hugetlbfs_attempts-- > 0) {
      // If we fail to allocate during initialization, due to lack of space on
      // the hugetlbfs filesystem, then we wait and retry a few times before
      // giving up. Otherwise there is a risk that running JVMs back-to-back
      // will fail, since there is a delay between process termination and the
      // huge pages owned by that process being returned to the huge page pool
      // and made available for new allocations.
      log_debug(gc, init)("Failed to commit memory (%s), retrying", err.to_string());

      // Wait and retry in one second, in the hope that huge pages will be
      // available by then.
      sleep(1);
      goto retry;
    }

    // Failed
    log_error(gc)("Failed to commit memory (%s)", err.to_string());
    return false;
  }

  // Success
  return true;
}

// Maps a granule offset to its preferred NUMA node using the nindex-to-node table.
static int offset_to_node(size_t offset) {
  // NOTE(review): the original declaration lost its template argument to
  // extraction garbling; restored to GrowableArray<int> based on usage.
  const GrowableArray<int>* mapping = os::Linux::numa_nindex_to_node();
  const size_t nindex = (offset >> ZGranuleSizeShift) % mapping->length();
  return mapping->at((int)nindex);
}

size_t ZPhysicalMemoryBacking::commit_numa_interleaved(size_t offset, size_t length) const {
  size_t committed = 0;

  // Commit one granule at a time, so that each granule
  // can be allocated from a different preferred node.
  while (committed < length) {
    const size_t granule_offset = offset + committed;

    // Setup NUMA policy to allocate memory from a preferred node
    os::Linux::numa_set_preferred(offset_to_node(granule_offset));

    if (!commit_inner(granule_offset, ZGranuleSize)) {
      // Failed
      break;
    }

    committed += ZGranuleSize;
  }

  // Restore NUMA policy
  os::Linux::numa_set_preferred(-1);

  return committed;
}

size_t ZPhysicalMemoryBacking::commit_default(size_t offset, size_t length) const {
  // Try to commit the whole region
  if (commit_inner(offset, length)) {
    // Success
    return length;
  }

  // Failed, try to commit as much as possible
  size_t start = offset;
  size_t end = offset + length;

  for (;;) {
    length = align_down((end - start) / 2, ZGranuleSize);
    if (length < ZGranuleSize) {
      // Done, don't commit more
      return start - offset;
    }

    if (commit_inner(start, length)) {
      // Success, try commit more
      start += length;
    } else {
      // Failed, try commit less
      end -= length;
    }
  }
}

size_t ZPhysicalMemoryBacking::commit(size_t offset, size_t length) const {
  if (ZNUMA::is_enabled() && !ZLargePages::is_explicit()) {
    // To get granule-level NUMA interleaving when using non-large pages,
    // we must explicitly interleave the memory at commit/fallocate time.
    return commit_numa_interleaved(offset, length);
  }

  return commit_default(offset, length);
}

size_t ZPhysicalMemoryBacking::uncommit(size_t offset, size_t length) const {
  log_trace(gc, heap)("Uncommitting memory: " SIZE_FORMAT "M-" SIZE_FORMAT "M (" SIZE_FORMAT "M)",
                      offset / M, (offset + length) / M, length / M);

  const ZErrno err = fallocate(true /* punch_hole */, offset, length);
  if (err) {
    log_error(gc)("Failed to uncommit memory (%s)", err.to_string());
    return 0;
  }

  return length;
}

bool ZPhysicalMemoryBacking::map(uintptr_t addr, size_t size, uintptr_t offset) const {
  const void* const res = mmap((void*)addr, size, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED, _fd, offset);
  if (res == MAP_FAILED) {
    ZErrno err;
    log_error(gc)("Failed to map memory (%s)", err.to_string());
    return false;
  }

  return true;
}

void ZPhysicalMemoryBacking::unmap(uintptr_t addr, size_t size) const {
  // Note that we must keep the address space reservation intact and just detach
  // the backing memory. For this reason we map a new anonymous, non-accessible
  // and non-reserved page over the mapping instead of actually unmapping.
  const void* const res = mmap((void*)addr, size, PROT_NONE, MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE, -1, 0);
  if (res == MAP_FAILED) {
    ZErrno err;
    log_error(gc)("Failed to map memory (%s)", err.to_string());
  }
}