< prev index next >
src/hotspot/os_cpu/linux_x86/gc/z/zBackingFile_linux_x86.cpp
Print this page
@@ -24,58 +24,80 @@
#include "precompiled.hpp"
#include "gc/z/zArray.inline.hpp"
#include "gc/z/zBackingFile_linux_x86.hpp"
#include "gc/z/zBackingPath_linux_x86.hpp"
#include "gc/z/zErrno.hpp"
+#include "gc/z/zGlobals.hpp"
#include "gc/z/zLargePages.inline.hpp"
#include "logging/log.hpp"
+#include "runtime/init.hpp"
#include "runtime/os.hpp"
#include "utilities/align.hpp"
#include "utilities/debug.hpp"
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/statfs.h>
+#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
-// Filesystem names
-#define ZFILESYSTEM_TMPFS "tmpfs"
-#define ZFILESYSTEM_HUGETLBFS "hugetlbfs"
-
-// Sysfs file for transparent huge page on tmpfs
-#define ZFILENAME_SHMEM_ENABLED "/sys/kernel/mm/transparent_hugepage/shmem_enabled"
-
-// Java heap filename
-#define ZFILENAME_HEAP "java_heap"
-
+//
// Support for building on older Linux systems
-#ifndef __NR_memfd_create
-#define __NR_memfd_create 319
+//
+
+// System calls
+#ifndef SYS_fallocate
+#define SYS_fallocate 285
#endif
+#ifndef SYS_memfd_create
+#define SYS_memfd_create 319
+#endif
+
+// memfd_create(2) flags
#ifndef MFD_CLOEXEC
#define MFD_CLOEXEC 0x0001U
#endif
#ifndef MFD_HUGETLB
#define MFD_HUGETLB 0x0004U
#endif
+
+// open(2) flags
#ifndef O_CLOEXEC
#define O_CLOEXEC 02000000
#endif
#ifndef O_TMPFILE
#define O_TMPFILE (020000000 | O_DIRECTORY)
#endif
+// fallocate(2) flags
+#ifndef FALLOC_FL_KEEP_SIZE
+#define FALLOC_FL_KEEP_SIZE 0x01
+#endif
+#ifndef FALLOC_FL_PUNCH_HOLE
+#define FALLOC_FL_PUNCH_HOLE 0x02
+#endif
+
// Filesystem types, see statfs(2)
#ifndef TMPFS_MAGIC
#define TMPFS_MAGIC 0x01021994
#endif
#ifndef HUGETLBFS_MAGIC
#define HUGETLBFS_MAGIC 0x958458f6
#endif
+// Filesystem names
+#define ZFILESYSTEM_TMPFS "tmpfs"
+#define ZFILESYSTEM_HUGETLBFS "hugetlbfs"
+
+// Sysfs file for transparent huge page on tmpfs
+#define ZFILENAME_SHMEM_ENABLED "/sys/kernel/mm/transparent_hugepage/shmem_enabled"
+
+// Java heap filename
+#define ZFILENAME_HEAP "java_heap"
+
// Preferred tmpfs mount points, ordered by priority
static const char* z_preferred_tmpfs_mountpoints[] = {
"/dev/shm",
"/run/shm",
NULL
@@ -86,69 +108,83 @@
"/dev/hugepages",
"/hugepages",
NULL
};
-static int z_memfd_create(const char *name, unsigned int flags) {
- return syscall(__NR_memfd_create, name, flags);
+static int z_fallocate_hugetlbfs_attempts = 3;  // Retry budget for ENOSPC on hugetlbfs before init completes; decremented by commit_inner()
+static bool z_fallocate_supported = true;  // Cleared by fallocate_fill_hole() on ENOSYS/EOPNOTSUPP so later fills go straight to compat mode
+
+static int z_fallocate(int fd, int mode, size_t offset, size_t length) {  // Invokes fallocate(2) through syscall(2) using the SYS_fallocate number
+ return syscall(SYS_fallocate, fd, mode, offset, length);  // Callers treat -1 as failure and read errno
}
-bool ZBackingFile::_hugetlbfs_mmap_retry = true;
+static int z_memfd_create(const char *name, unsigned int flags) {  // Invokes memfd_create(2) through syscall(2) using the SYS_memfd_create number
+ return syscall(SYS_memfd_create, name, flags);  // The caller treats -1 as failure and reads errno
+}
ZBackingFile::ZBackingFile() :
_fd(-1),
+ _size(0),
_filesystem(0),
+ _block_size(0),
_available(0),
_initialized(false) {
// Create backing file
_fd = create_fd(ZFILENAME_HEAP);
if (_fd == -1) {
return;
}
// Get filesystem statistics
- struct statfs statfs_buf;
- if (fstatfs(_fd, &statfs_buf) == -1) {
+ struct statfs buf;
+ if (fstatfs(_fd, &buf) == -1) {
ZErrno err;
- log_error(gc, init)("Failed to determine filesystem type for backing file (%s)",
- err.to_string());
+ log_error(gc)("Failed to determine filesystem type for backing file (%s)", err.to_string());
return;
}
- _filesystem = statfs_buf.f_type;
- _available = statfs_buf.f_bavail * statfs_buf.f_bsize;
+ _filesystem = buf.f_type;
+ _block_size = buf.f_bsize;
+ _available = buf.f_bavail * _block_size;
// Make sure we're on a supported filesystem
if (!is_tmpfs() && !is_hugetlbfs()) {
- log_error(gc, init)("Backing file must be located on a %s or a %s filesystem",
- ZFILESYSTEM_TMPFS, ZFILESYSTEM_HUGETLBFS);
+ log_error(gc)("Backing file must be located on a %s or a %s filesystem",
+ ZFILESYSTEM_TMPFS, ZFILESYSTEM_HUGETLBFS);
return;
}
// Make sure the filesystem type matches requested large page type
if (ZLargePages::is_transparent() && !is_tmpfs()) {
- log_error(gc, init)("-XX:+UseTransparentHugePages can only be enable when using a %s filesystem",
- ZFILESYSTEM_TMPFS);
+ log_error(gc)("-XX:+UseTransparentHugePages can only be enable when using a %s filesystem",
+ ZFILESYSTEM_TMPFS);
return;
}
if (ZLargePages::is_transparent() && !tmpfs_supports_transparent_huge_pages()) {
- log_error(gc, init)("-XX:+UseTransparentHugePages on a %s filesystem not supported by kernel",
- ZFILESYSTEM_TMPFS);
+ log_error(gc)("-XX:+UseTransparentHugePages on a %s filesystem not supported by kernel",
+ ZFILESYSTEM_TMPFS);
return;
}
if (ZLargePages::is_explicit() && !is_hugetlbfs()) {
- log_error(gc, init)("-XX:+UseLargePages (without -XX:+UseTransparentHugePages) can only be enabled when using a %s filesystem",
- ZFILESYSTEM_HUGETLBFS);
+ log_error(gc)("-XX:+UseLargePages (without -XX:+UseTransparentHugePages) can only be enabled "
+ "when using a %s filesystem", ZFILESYSTEM_HUGETLBFS);
return;
}
if (!ZLargePages::is_explicit() && is_hugetlbfs()) {
- log_error(gc, init)("-XX:+UseLargePages must be enabled when using a %s filesystem",
- ZFILESYSTEM_HUGETLBFS);
+ log_error(gc)("-XX:+UseLargePages must be enabled when using a %s filesystem",
+ ZFILESYSTEM_HUGETLBFS);
+ return;
+ }
+
+ const size_t expected_block_size = is_tmpfs() ? os::vm_page_size() : os::large_page_size();
+ if (expected_block_size != _block_size) {
+ log_error(gc)("%s filesystem has unexpected block size " SIZE_FORMAT " (expected " SIZE_FORMAT ")",
+ is_tmpfs() ? ZFILESYSTEM_TMPFS : ZFILESYSTEM_HUGETLBFS, _block_size, expected_block_size);
return;
}
// Successfully initialized
_initialized = true;
@@ -163,11 +199,11 @@
const int extra_flags = ZLargePages::is_explicit() ? MFD_HUGETLB : 0;
const int fd = z_memfd_create(filename, MFD_CLOEXEC | extra_flags);
if (fd == -1) {
ZErrno err;
log_debug(gc, init)("Failed to create memfd file (%s)",
- ((UseLargePages && err == EINVAL) ? "Hugepages not supported" : err.to_string()));
+ ((ZLargePages::is_explicit() && err == EINVAL) ? "Hugepages not supported" : err.to_string()));
return -1;
}
log_info(gc, init)("Heap backed by file: /memfd:%s", filename);
@@ -183,11 +219,11 @@
: z_preferred_tmpfs_mountpoints;
// Find mountpoint
ZBackingPath path(filesystem, preferred_mountpoints);
if (path.get() == NULL) {
- log_error(gc, init)("Use -XX:ZPath to specify the path to a %s filesystem", filesystem);
+ log_error(gc)("Use -XX:ZPath to specify the path to a %s filesystem", filesystem);
return -1;
}
// Try to create an anonymous file using the O_TMPFILE flag. Note that this
// flag requires kernel >= 3.11. If this fails we fall back to open/unlink.
@@ -199,11 +235,11 @@
} else {
// Get inode number for anonymous file
struct stat stat_buf;
if (fstat(fd_anon, &stat_buf) == -1) {
ZErrno err;
- log_error(gc, init)("Failed to determine inode number for anonymous file (%s)", err.to_string());
+ log_error(gc)("Failed to determine inode number for anonymous file (%s)", err.to_string());
return -1;
}
log_info(gc, init)("Heap backed by file: %s/#" UINT64_FORMAT, path.get(), (uint64_t)stat_buf.st_ino);
@@ -218,18 +254,18 @@
// Create file
const int fd = os::open(filename, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC, S_IRUSR|S_IWUSR);
if (fd == -1) {
ZErrno err;
- log_error(gc, init)("Failed to create file %s (%s)", filename, err.to_string());
+ log_error(gc)("Failed to create file %s (%s)", filename, err.to_string());
return -1;
}
// Unlink file
if (unlink(filename) == -1) {
ZErrno err;
- log_error(gc, init)("Failed to unlink file %s (%s)", filename, err.to_string());
+ log_error(gc)("Failed to unlink file %s (%s)", filename, err.to_string());
return -1;
}
log_info(gc, init)("Heap backed by file: %s", filename);
@@ -260,10 +296,14 @@
int ZBackingFile::fd() const {  // Accessor for the backing file descriptor
return _fd;  // Initialized to -1, then assigned from create_fd() in the constructor
}
+size_t ZBackingFile::size() const {  // Accessor for the recorded backing file size
+ return _size;  // Starts at 0; raised by the fill-hole paths when an allocation ends past it
+}
+
size_t ZBackingFile::available() const {  // Accessor for the free space sampled by the constructor
return _available;  // f_bavail * f_bsize from the constructor's fstatfs() call
}
bool ZBackingFile::is_tmpfs() const {
@@ -278,149 +318,273 @@
// If the shmem_enabled file exists and is readable then we
// know the kernel supports transparent huge pages for tmpfs.
return access(ZFILENAME_SHMEM_ENABLED, R_OK) == 0;
}
-bool ZBackingFile::try_split_and_expand_tmpfs(size_t offset, size_t length, size_t alignment) const {
- // Try first smaller part.
- const size_t offset0 = offset;
- const size_t length0 = align_up(length / 2, alignment);
- if (!try_expand_tmpfs(offset0, length0, alignment)) {
- return false;
- }
-
- // Try second smaller part.
- const size_t offset1 = offset0 + length0;
- const size_t length1 = length - length0;
- if (!try_expand_tmpfs(offset1, length1, alignment)) {
- return false;
+ZErrno ZBackingFile::fallocate_compat_ftruncate(size_t size) const {  // Sets the backing file's length to size using ftruncate(2)
+ while (ftruncate(_fd, size) == -1) {  // Keep retrying for as long as the call fails with EINTR
+ if (errno != EINTR) {
+ // Failed
+ return errno;  // Any other errno is returned to the caller
+ }
}
- return true;
+ // Success
+ return 0;  // Zero means no error; callers test the result with if (err)
}
-bool ZBackingFile::try_expand_tmpfs(size_t offset, size_t length, size_t alignment) const {
- assert(length > 0, "Invalid length");
- assert(is_aligned(length, alignment), "Invalid length");
-
- ZErrno err = posix_fallocate(_fd, offset, length);
-
- if (err == EINTR && length > alignment) {
- // Calling posix_fallocate() with a large length can take a long
- // time to complete. When running profilers, such as VTune, this
- // syscall will be constantly interrupted by signals. Expanding
- // the file in smaller steps avoids this problem.
- return try_split_and_expand_tmpfs(offset, length, alignment);
+ZErrno ZBackingFile::fallocate_compat_mmap(size_t offset, size_t length, bool touch) const {  // Maps the file range, optionally touches it, then unmaps it; returns 0 or the errno of the failing call
+ // On hugetlbfs, mapping a file segment will fail immediately, without
+ // the need to touch the mapped pages first, if there aren't enough huge
+ // pages available to back the mapping.
+ void* const addr = mmap(0, length, PROT_READ|PROT_WRITE, MAP_SHARED, _fd, offset);
+ if (addr == MAP_FAILED) {
+ // Failed
+ return errno;
}
- if (err) {
- log_error(gc)("Failed to allocate backing file (%s)", err.to_string());
- return false;
+ // Once mapped, the huge pages are only reserved. We need to touch them
+ // to associate them with the file segment. Note that we cannot punch
+ // holes in file segments which only have reserved pages.
+ if (touch) {
+ char* const start = (char*)addr;
+ char* const end = start + length;
+ os::pretouch_memory(start, end, _block_size);  // NOTE(review): presumably touches the range in _block_size steps; confirm against os::pretouch_memory
}
- return true;
+ // Unmap again. From now on, the huge pages that were mapped are allocated
+ // to this file. There's no risk of getting SIGBUS when touching them.
+ if (munmap(addr, length) == -1) {
+ // Failed
+ return errno;
+ }
+
+ // Success
+ return 0;
}
-bool ZBackingFile::try_expand_tmpfs(size_t offset, size_t length) const {
- assert(is_tmpfs(), "Wrong filesystem");
- return try_expand_tmpfs(offset, length, os::vm_page_size());
+ZErrno ZBackingFile::fallocate_compat_pwrite(size_t offset, size_t length) const {  // Emulates allocation by writing one byte per block across [offset, offset + length)
+ uint8_t data = 0;  // The single zero byte written at every position the loop visits
+
+ // Allocate backing memory by writing to each block
+ for (size_t pos = offset; pos < offset + length; pos += _block_size) {
+ if (pwrite(_fd, &data, sizeof(data), pos) == -1) {
+ // Failed
+ return errno;  // Stops at the first failing write; nothing here undoes earlier writes
+ }
+ }
+
+ // Success
+ return 0;
}
-bool ZBackingFile::try_expand_hugetlbfs(size_t offset, size_t length) const {
- assert(is_hugetlbfs(), "Wrong filesystem");
-
- // Prior to kernel 4.3, hugetlbfs did not support posix_fallocate().
- // Instead of posix_fallocate() we can use a well-known workaround,
- // which involves truncating the file to requested size and then try
- // to map it to verify that there are enough huge pages available to
- // back it.
- while (ftruncate(_fd, offset + length) == -1) {
- ZErrno err;
- if (err != EINTR) {
- log_error(gc)("Failed to truncate backing file (%s)", err.to_string());
- return false;
+ZErrno ZBackingFile::fallocate_fill_hole_compat(size_t offset, size_t length) {  // Allocation path that avoids fallocate(2); returns 0 or the first error encountered
+ // fallocate(2) is only supported by tmpfs since Linux 3.5, and by hugetlbfs
+ // since Linux 4.3. When fallocate(2) is not supported we emulate it using
+ // ftruncate/pwrite (for tmpfs) or ftruncate/mmap/munmap (for hugetlbfs).
+
+ const size_t end = offset + length;
+ if (end > _size) {  // The range ends past the recorded size, so grow the file first
+ // Increase file size
+ const ZErrno err = fallocate_compat_ftruncate(end);
+ if (err) {
+ // Failed
+ return err;
}
}
- // If we fail mapping during initialization, i.e. when we are pre-mapping
- // the heap, then we wait and retry a few times before giving up. Otherwise
- // there is a risk that running JVMs back-to-back will fail, since there
- // is a delay between process termination and the huge pages owned by that
- // process being returned to the huge page pool and made available for new
- // allocations.
- void* addr = MAP_FAILED;
- const int max_attempts = 5;
- for (int attempt = 1; attempt <= max_attempts; attempt++) {
- addr = mmap(0, length, PROT_READ|PROT_WRITE, MAP_SHARED, _fd, offset);
- if (addr != MAP_FAILED || !_hugetlbfs_mmap_retry) {
- // Mapping was successful or mmap retry is disabled
- break;
+ // Allocate backing memory
+ const ZErrno err = is_hugetlbfs() ? fallocate_compat_mmap(offset, length, false /* touch */)
+ : fallocate_compat_pwrite(offset, length);
+ if (err) {
+ if (end > _size) {  // _size is still the old value here, so this is true exactly when the file was grown above
+ // Restore file size
+ fallocate_compat_ftruncate(_size);  // Best effort: the result of this call is ignored
}
- ZErrno err;
- log_debug(gc)("Failed to map backing file (%s), attempt %d of %d",
- err.to_string(), attempt, max_attempts);
+ // Failed
+ return err;  // Reports the allocation error, not any error from restoring the size
+ }
- // Wait and retry in one second, in the hope that
- // huge pages will be available by then.
- sleep(1);
+ if (end > _size) {
+ // Record new file size
+ _size = end;
}
- // Disable mmap retry from now on
- if (_hugetlbfs_mmap_retry) {
- _hugetlbfs_mmap_retry = false;
+ // Success
+ return 0;
+}
+
+ZErrno ZBackingFile::fallocate_fill_hole_syscall(size_t offset, size_t length) {  // Allocates the range with a single fallocate(2) call; returns 0 or errno
+ const int mode = 0; // Allocate (no FALLOC_FL_* flags set)
+ const int res = z_fallocate(_fd, mode, offset, length);
+ if (res == -1) {
+ // Failed
+ return errno;  // _size is left untouched on failure
}
- if (addr == MAP_FAILED) {
- // Not enough huge pages left
- ZErrno err;
- log_error(gc)("Failed to map backing file (%s)", err.to_string());
- return false;
+ const size_t end = offset + length;
+ if (end > _size) {  // Only ever raises the recorded size, never lowers it
+ // Record new file size
+ _size = end;
}
- // Successful mapping, unmap again. From now on the pages we mapped
- // will be reserved for this file.
- if (munmap(addr, length) == -1) {
- ZErrno err;
- log_error(gc)("Failed to unmap backing file (%s)", err.to_string());
- return false;
+ // Success
+ return 0;
+}
+
+ZErrno ZBackingFile::fallocate_fill_hole(size_t offset, size_t length) {  // Allocates the range, choosing between the fallocate(2) syscall and compat mode
+ // Using compat mode is more efficient when allocating space on hugetlbfs.
+ // Note that allocating huge pages this way will only reserve them, and not
+ // associate them with segments of the file. We must guarantee that we at
+ // some point touch these segments, otherwise we cannot punch holes in them.
+ if (z_fallocate_supported && !is_hugetlbfs()) {  // The syscall path is skipped on hugetlbfs and once the syscall is known to be unsupported
+ const ZErrno err = fallocate_fill_hole_syscall(offset, length);
+ if (!err) {
+ // Success
+ return 0;
+ }
+
+ if (err != ENOSYS && err != EOPNOTSUPP) {  // Only these two errors trigger the compat fallback
+ // Failed
+ return err;
+ }
+
+ // Not supported
+ log_debug(gc)("Falling back to fallocate() compatibility mode");
+ z_fallocate_supported = false;  // File-level flag, so the fallback stays in effect for later calls
}
- return true;
+ return fallocate_fill_hole_compat(offset, length);
}
-bool ZBackingFile::try_expand_tmpfs_or_hugetlbfs(size_t offset, size_t length, size_t alignment) const {
- assert(is_aligned(offset, alignment), "Invalid offset");
- assert(is_aligned(length, alignment), "Invalid length");
+ZErrno ZBackingFile::fallocate_punch_hole(size_t offset, size_t length) {  // Punches a hole over the range with fallocate(2); returns 0 or the error
+ if (is_hugetlbfs()) {
+ // We can only punch holes in pages that have been touched. Non-touched
+ // pages are only reserved, and not associated with any specific file
+ // segment. We don't know which pages have been previously touched, so
+ // we always touch them here to guarantee that we can punch holes.
+ const ZErrno err = fallocate_compat_mmap(offset, length, true /* touch */);
+ if (err) {
+ // Failed
+ return err;
+ }
+ }
- log_debug(gc)("Expanding heap from " SIZE_FORMAT "M to " SIZE_FORMAT "M", offset / M, (offset + length) / M);
+ const int mode = FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE;  // _size is not modified anywhere in this function
+ if (z_fallocate(_fd, mode, offset, length) == -1) {  // No compat fallback here: any error, including ENOSYS, is returned
+ // Failed
+ return errno;
+ }
- return is_hugetlbfs() ? try_expand_hugetlbfs(offset, length) : try_expand_tmpfs(offset, length);
+ // Success
+ return 0;
}
-size_t ZBackingFile::try_expand(size_t offset, size_t length, size_t alignment) const {
- size_t start = offset;
- size_t end = offset + length;
+ZErrno ZBackingFile::split_and_fallocate(bool punch_hole, size_t offset, size_t length) {  // Runs fallocate() on two consecutive sub-ranges that together cover the range
+ // Try first half
+ const size_t offset0 = offset;
+ const size_t length0 = align_up(length / 2, _block_size);  // Half of the length, rounded up to the filesystem block size
+ const ZErrno err0 = fallocate(punch_hole, offset0, length0);  // Mutually recursive: fallocate() may call back here on EINTR
+ if (err0) {
+ return err0;  // The second half is not attempted after a failure
+ }
- // Try to expand
- if (try_expand_tmpfs_or_hugetlbfs(start, length, alignment)) {
+ // Try second half
+ const size_t offset1 = offset0 + length0;
+ const size_t length1 = length - length0;  // The remainder, so both parts add up to length
+ const ZErrno err1 = fallocate(punch_hole, offset1, length1);
+ if (err1) {
+ return err1;
+ }
+
+ // Success
+ return 0;
+}
+
+ZErrno ZBackingFile::fallocate(bool punch_hole, size_t offset, size_t length) {  // Common entry point: punches or fills a hole and splits the request on EINTR
+ assert(is_aligned(offset, _block_size), "Invalid offset");
+ assert(is_aligned(length, _block_size), "Invalid length");
+
+ const ZErrno err = punch_hole ? fallocate_punch_hole(offset, length) : fallocate_fill_hole(offset, length);
+ if (err == EINTR && length > _block_size) {  // A request of one block or less is not split; its EINTR is returned as-is
+ // Calling fallocate(2) with a large length can take a long time to
+ // complete. When running profilers, such as VTune, this syscall will
+ // be constantly interrupted by signals. Expanding the file in smaller
+ // steps avoids this problem.
+ return split_and_fallocate(punch_hole, offset, length);
+ }
+
+ return err;  // Zero on success, otherwise the error from the selected operation
+}
+
+bool ZBackingFile::commit_inner(size_t offset, size_t length) {  // Commits the whole range or reports failure; returns true on success
+ log_trace(gc, heap)("Committing memory: " SIZE_FORMAT "M-" SIZE_FORMAT "M (" SIZE_FORMAT "M)",
+ offset / M, (offset + length) / M, length / M);
+
+retry:
+ const ZErrno err = fallocate(false /* punch_hole */, offset, length);
+ if (err) {
+ if (err == ENOSPC && !is_init_completed() && is_hugetlbfs() && z_fallocate_hugetlbfs_attempts-- > 0) {  // The counter is a file-level static, decremented only when the earlier conditions hold
+ // If we fail to allocate during initialization, due to lack of space on
+ // the hugetlbfs filesystem, then we wait and retry a few times before
+ // giving up. Otherwise there is a risk that running JVMs back-to-back
+ // will fail, since there is a delay between process termination and the
+ // huge pages owned by that process being returned to the huge page pool
+ // and made available for new allocations.
+ log_debug(gc, init)("Failed to commit memory (%s), retrying", err.to_string());
+
+ // Wait and retry in one second, in the hope that huge pages will be
+ // available by then.
+ sleep(1);
+ goto retry;  // Repeats the same fallocate() request with unchanged offset and length
+ }
+
+ // Failed
+ log_error(gc)("Failed to commit memory (%s)", err.to_string());
+ return false;
+ }
+
+ // Success
+ return true;
+}
+
+size_t ZBackingFile::commit(size_t offset, size_t length) {  // Commits as much of the range as possible; returns the number of bytes committed from offset
+ // Try to commit the whole region
+ if (commit_inner(offset, length)) {
// Success
- return end;
+ return length;
}
- // Failed, try to expand as much as possible
+ // Failed, try to commit as much as possible
+ size_t start = offset;  // Everything in [offset, start) has been committed so far
+ size_t end = offset + length;  // Upper limit of the range still worth trying
+
for (;;) {
- length = align_down((end - start) / 2, alignment);
- if (length < alignment) {
- // Done, don't expand more
- return start;
+ length = align_down((end - start) / 2, ZGranuleSize);  // Next attempt is half of the remaining window, in whole granules
+ if (length < ZGranuleSize) {
+ // Done, don't commit more
+ return start - offset;  // May be 0 if no attempt succeeded
}
- if (try_expand_tmpfs_or_hugetlbfs(start, length, alignment)) {
- // Success, try expand more
+ if (commit_inner(start, length)) {
+ // Success, try commit more
start += length;
} else {
- // Failed, try expand less
+ // Failed, try commit less
end -= length;
}
}
}
+
+size_t ZBackingFile::uncommit(size_t offset, size_t length) {  // Punches a hole over the range; returns length on success and 0 on failure
+ log_trace(gc, heap)("Uncommitting memory: " SIZE_FORMAT "M-" SIZE_FORMAT "M (" SIZE_FORMAT "M)",
+ offset / M, (offset + length) / M, length / M);
+
+ const ZErrno err = fallocate(true /* punch_hole */, offset, length);
+ if (err) {
+ log_error(gc)("Failed to uncommit memory (%s)", err.to_string());
+ return 0;  // All-or-nothing result: no partial amount is reported
+ }
+
+ return length;
+}
< prev index next >