--- old/src/hotspot/os/linux/osContainer_linux.cpp 2019-10-18 18:16:08.831123889 +0200 +++ new/src/hotspot/os/linux/osContainer_linux.cpp 2019-10-18 18:16:08.704123560 +0200 @@ -25,267 +25,16 @@ #include #include #include -#include "utilities/globalDefinitions.hpp" -#include "memory/allocation.hpp" #include "runtime/globals.hpp" #include "runtime/os.hpp" #include "logging/log.hpp" #include "osContainer_linux.hpp" +#include "cgroupSubsystem_linux.hpp" -/* - * PER_CPU_SHARES has been set to 1024 because CPU shares' quota - * is commonly used in cloud frameworks like Kubernetes[1], - * AWS[2] and Mesos[3] in a similar way. They spawn containers with - * --cpu-shares option values scaled by PER_CPU_SHARES. Thus, we do - * the inverse for determining the number of possible available - * CPUs to the JVM inside a container. See JDK-8216366. - * - * [1] https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu - * In particular: - * When using Docker: - * The spec.containers[].resources.requests.cpu is converted to its core value, which is potentially - * fractional, and multiplied by 1024. The greater of this number or 2 is used as the value of the - * --cpu-shares flag in the docker run command. - * [2] https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_ContainerDefinition.html - * [3] https://github.com/apache/mesos/blob/3478e344fb77d931f6122980c6e94cd3913c441d/src/docker/docker.cpp#L648 - * https://github.com/apache/mesos/blob/3478e344fb77d931f6122980c6e94cd3913c441d/src/slave/containerizer/mesos/isolators/cgroups/constants.hpp#L30 - */ -#define PER_CPU_SHARES 1024 bool OSContainer::_is_initialized = false; bool OSContainer::_is_containerized = false; -julong _unlimited_memory; - -class CgroupSubsystem: CHeapObj { - friend class OSContainer; - - private: - /* mountinfo contents */ - char *_root; - char *_mount_point; - - /* Constructed subsystem directory */ - char *_path; - - public: - CgroupSubsystem(char *root, char *mountpoint) { - _root = os::strdup(root); - _mount_point = os::strdup(mountpoint); - _path = NULL; - } - - /* - * Set directory to subsystem specific files based - * on the contents of the mountinfo and cgroup files. - */ - void set_subsystem_path(char *cgroup_path) { - char buf[MAXPATHLEN+1]; - if (_root != NULL && cgroup_path != NULL) { - if (strcmp(_root, "/") == 0) { - int buflen; - strncpy(buf, _mount_point, MAXPATHLEN); - buf[MAXPATHLEN-1] = '\0'; - if (strcmp(cgroup_path,"/") != 0) { - buflen = strlen(buf); - if ((buflen + strlen(cgroup_path)) > (MAXPATHLEN-1)) { - return; - } - strncat(buf, cgroup_path, MAXPATHLEN-buflen); - buf[MAXPATHLEN-1] = '\0'; - } - _path = os::strdup(buf); - } else { - if (strcmp(_root, cgroup_path) == 0) { - strncpy(buf, _mount_point, MAXPATHLEN); - buf[MAXPATHLEN-1] = '\0'; - _path = os::strdup(buf); - } else { - char *p = strstr(cgroup_path, _root); - if (p != NULL && p == _root) { - if (strlen(cgroup_path) > strlen(_root)) { - int buflen; - strncpy(buf, _mount_point, MAXPATHLEN); - buf[MAXPATHLEN-1] = '\0'; - buflen = strlen(buf); - if ((buflen + strlen(cgroup_path) - strlen(_root)) > (MAXPATHLEN-1)) { - return; - } - strncat(buf, cgroup_path + strlen(_root), MAXPATHLEN-buflen); - buf[MAXPATHLEN-1] = '\0'; - _path = os::strdup(buf); - } - } - } - } - } - } - - char *subsystem_path() { return _path; } -}; - -class CgroupMemorySubsystem: CgroupSubsystem { - friend class OSContainer; - - private: - /* Some container runtimes set limits via cgroup - * hierarchy. If set to true consider also memory.stat - * file if everything else seems unlimited */ - bool _uses_mem_hierarchy; - volatile jlong _memory_limit_in_bytes; - volatile jlong _next_check_counter; - - public: - CgroupMemorySubsystem(char *root, char *mountpoint) : CgroupSubsystem::CgroupSubsystem(root, mountpoint) { - _uses_mem_hierarchy = false; - _memory_limit_in_bytes = -1; - _next_check_counter = min_jlong; - - } - - bool is_hierarchical() { return _uses_mem_hierarchy; } - void set_hierarchical(bool value) { _uses_mem_hierarchy = value; } - - bool should_check_memory_limit() { - return os::elapsed_counter() > _next_check_counter; - } - jlong memory_limit_in_bytes() { return _memory_limit_in_bytes; } - void set_memory_limit_in_bytes(jlong value) { - _memory_limit_in_bytes = value; - // max memory limit is unlikely to change, but we want to remain - // responsive to configuration changes. A very short (20ms) grace time - // between re-read avoids excessive overhead during startup without - // significantly reducing the VMs ability to promptly react to reduced - // memory availability - _next_check_counter = os::elapsed_counter() + (NANOSECS_PER_SEC/50); - } - -}; - -CgroupMemorySubsystem* memory = NULL; -CgroupSubsystem* cpuset = NULL; -CgroupSubsystem* cpu = NULL; -CgroupSubsystem* cpuacct = NULL; - -typedef char * cptr; - -PRAGMA_DIAG_PUSH -PRAGMA_FORMAT_NONLITERAL_IGNORED -template int subsystem_file_line_contents(CgroupSubsystem* c, - const char *filename, - const char *matchline, - const char *scan_fmt, - T returnval) { - FILE *fp = NULL; - char *p; - char file[MAXPATHLEN+1]; - char buf[MAXPATHLEN+1]; - char discard[MAXPATHLEN+1]; - bool found_match = false; - - if (c == NULL) { - log_debug(os, container)("subsystem_file_line_contents: CgroupSubsytem* is NULL"); - return OSCONTAINER_ERROR; - } - if (c->subsystem_path() == NULL) { - log_debug(os, container)("subsystem_file_line_contents: subsystem path is NULL"); - return OSCONTAINER_ERROR; - } - - strncpy(file, c->subsystem_path(), MAXPATHLEN); - file[MAXPATHLEN-1] = '\0'; - int filelen = strlen(file); - if ((filelen + strlen(filename)) > (MAXPATHLEN-1)) { - log_debug(os, container)("File path too long %s, %s", file, filename); - return OSCONTAINER_ERROR; - } - strncat(file, filename, MAXPATHLEN-filelen); - log_trace(os, container)("Path to %s is %s", filename, file); - fp = fopen(file, "r"); - if (fp != NULL) { - int err = 0; - while ((p = fgets(buf, MAXPATHLEN, fp)) != NULL) { - found_match = false; - if (matchline == NULL) { - // single-line file case - int matched = sscanf(p, scan_fmt, returnval); - found_match = (matched == 1); - } else { - // multi-line file case - if (strstr(p, matchline) != NULL) { - // discard matchline string prefix - int matched = sscanf(p, scan_fmt, discard, returnval); - found_match = (matched == 2); - } else { - continue; // substring not found - } - } - if (found_match) { - fclose(fp); - return 0; - } else { - err = 1; - log_debug(os, container)("Type %s not found in file %s", scan_fmt, file); - } - } - if (err == 0) { - log_debug(os, container)("Empty file %s", file); - } - } else { - log_debug(os, container)("Open of file %s failed, %s", file, os::strerror(errno)); - } - if (fp != NULL) - fclose(fp); - return OSCONTAINER_ERROR; -} -PRAGMA_DIAG_POP - -#define GET_CONTAINER_INFO(return_type, subsystem, filename, \ - logstring, scan_fmt, variable) \ - return_type variable; \ -{ \ - int err; \ - err = subsystem_file_line_contents(subsystem, \ - filename, \ - NULL, \ - scan_fmt, \ - &variable); \ - if (err != 0) \ - return (return_type) OSCONTAINER_ERROR; \ - \ - log_trace(os, container)(logstring, variable); \ -} - -#define GET_CONTAINER_INFO_CPTR(return_type, subsystem, filename, \ - logstring, scan_fmt, variable, bufsize) \ - char variable[bufsize]; \ -{ \ - int err; \ - err = subsystem_file_line_contents(subsystem, \ - filename, \ - NULL, \ - scan_fmt, \ - variable); \ - if (err != 0) \ - return (return_type) NULL; \ - \ - log_trace(os, container)(logstring, variable); \ -} - -#define GET_CONTAINER_INFO_LINE(return_type, subsystem, filename, \ - matchline, logstring, scan_fmt, variable) \ - return_type variable; \ -{ \ - int err; \ - err = subsystem_file_line_contents(subsystem, \ - filename, \ - matchline, \ - scan_fmt, \ - &variable); \ - if (err != 0) \ - return (return_type) OSCONTAINER_ERROR; \ - \ - log_trace(os, container)(logstring, variable); \ -} +CgroupSubsystem* cgroup_subsystem; /* init * @@ -293,12 +42,6 @@ * we are running under cgroup control. */ void OSContainer::init() { - FILE *mntinfo = NULL; - FILE *cgroup = NULL; - char buf[MAXPATHLEN+1]; - char tmproot[MAXPATHLEN+1]; - char tmpmount[MAXPATHLEN+1]; - char *p; jlong mem_limit; assert(!_is_initialized, "Initializing OSContainer more than once"); @@ -306,139 +49,19 @@ _is_initialized = true; _is_containerized = false; - _unlimited_memory = (LONG_MAX / os::vm_page_size()) * os::vm_page_size(); - log_trace(os, container)("OSContainer::init: Initializing Container Support"); if (!UseContainerSupport) { log_trace(os, container)("Container Support not enabled"); return; } - /* - * Find the cgroup mount point for memory and cpuset - * by reading /proc/self/mountinfo - * - * Example for docker: - * 219 214 0:29 /docker/7208cebd00fa5f2e342b1094f7bed87fa25661471a4637118e65f1c995be8a34 /sys/fs/cgroup/memory ro,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory - * - * Example for host: - * 34 28 0:29 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,memory - */ - mntinfo = fopen("/proc/self/mountinfo", "r"); - if (mntinfo == NULL) { - log_debug(os, container)("Can't open /proc/self/mountinfo, %s", - os::strerror(errno)); - return; - } - - while ((p = fgets(buf, MAXPATHLEN, mntinfo)) != NULL) { - char tmpcgroups[MAXPATHLEN+1]; - char *cptr = tmpcgroups; - char *token; - - // mountinfo format is documented at https://www.kernel.org/doc/Documentation/filesystems/proc.txt - if (sscanf(p, "%*d %*d %*d:%*d %s %s %*[^-]- cgroup %*s %s", tmproot, tmpmount, tmpcgroups) != 3) { - continue; - } - while ((token = strsep(&cptr, ",")) != NULL) { - if (strcmp(token, "memory") == 0) { - memory = new CgroupMemorySubsystem(tmproot, tmpmount); - } else if (strcmp(token, "cpuset") == 0) { - cpuset = new CgroupSubsystem(tmproot, tmpmount); - } else if (strcmp(token, "cpu") == 0) { - cpu = new CgroupSubsystem(tmproot, tmpmount); - } else if (strcmp(token, "cpuacct") == 0) { - cpuacct= new CgroupSubsystem(tmproot, tmpmount); - } - } - } - - fclose(mntinfo); - - if (memory == NULL) { - log_debug(os, container)("Required cgroup memory subsystem not found"); - return; - } - if (cpuset == NULL) { - log_debug(os, container)("Required cgroup cpuset subsystem not found"); - return; - } - if (cpu == NULL) { - log_debug(os, container)("Required cgroup cpu subsystem not found"); - return; + cgroup_subsystem = CgroupSubsystemFactory::create(); + if (cgroup_subsystem == NULL) { + return; // Required subsystem files not found or other error } - if (cpuacct == NULL) { - log_debug(os, container)("Required cgroup cpuacct subsystem not found"); - return; - } - - /* - * Read /proc/self/cgroup and map host mount point to - * local one via /proc/self/mountinfo content above - * - * Docker example: - * 5:memory:/docker/6558aed8fc662b194323ceab5b964f69cf36b3e8af877a14b80256e93aecb044 - * - * Host example: - * 5:memory:/user.slice - * - * Construct a path to the process specific memory and cpuset - * cgroup directory. - * - * For a container running under Docker from memory example above - * the paths would be: - * - * /sys/fs/cgroup/memory - * - * For a Host from memory example above the path would be: - * - * /sys/fs/cgroup/memory/user.slice - * - */ - cgroup = fopen("/proc/self/cgroup", "r"); - if (cgroup == NULL) { - log_debug(os, container)("Can't open /proc/self/cgroup, %s", - os::strerror(errno)); - return; - } - - while ((p = fgets(buf, MAXPATHLEN, cgroup)) != NULL) { - char *controllers; - char *token; - char *base; - - /* Skip cgroup number */ - strsep(&p, ":"); - /* Get controllers and base */ - controllers = strsep(&p, ":"); - base = strsep(&p, "\n"); - - if (controllers == NULL) { - continue; - } - - while ((token = strsep(&controllers, ",")) != NULL) { - if (strcmp(token, "memory") == 0) { - memory->set_subsystem_path(base); - jlong hierarchy = uses_mem_hierarchy(); - if (hierarchy > 0) { - memory->set_hierarchical(true); - } - } else if (strcmp(token, "cpuset") == 0) { - cpuset->set_subsystem_path(base); - } else if (strcmp(token, "cpu") == 0) { - cpu->set_subsystem_path(base); - } else if (strcmp(token, "cpuacct") == 0) { - cpuacct->set_subsystem_path(base); - } - } - } - - fclose(cgroup); - // We need to update the amount of physical memory now that - // command line arguments have been processed. - if ((mem_limit = memory_limit_in_bytes()) > 0) { + // cgroup subsystem files have been processed. + if ((mem_limit = cgroup_subsystem->memory_limit_in_bytes()) > 0) { os::Linux::set_physical_memory(mem_limit); log_info(os, container)("Memory Limit is: " JLONG_FORMAT, mem_limit); } @@ -448,259 +71,62 @@ } const char * OSContainer::container_type() { - if (is_containerized()) { - return "cgroupv1"; - } else { - return NULL; - } + assert(cgroup_subsystem != NULL, "cgroup subsystem not available"); + return cgroup_subsystem->container_type(); } -/* uses_mem_hierarchy - * - * Return whether or not hierarchical cgroup accounting is being - * done. - * - * return: - * A number > 0 if true, or - * OSCONTAINER_ERROR for not supported - */ -jlong OSContainer::uses_mem_hierarchy() { - GET_CONTAINER_INFO(jlong, memory, "/memory.use_hierarchy", - "Use Hierarchy is: " JLONG_FORMAT, JLONG_FORMAT, use_hierarchy); - return use_hierarchy; -} - - -/* memory_limit_in_bytes - * - * Return the limit of available memory for this process. - * - * return: - * memory limit in bytes or - * -1 for unlimited - * OSCONTAINER_ERROR for not supported - */ jlong OSContainer::memory_limit_in_bytes() { - if (!memory->should_check_memory_limit()) { - return memory->memory_limit_in_bytes(); - } - jlong memory_limit = read_memory_limit_in_bytes(); - // Update CgroupMemorySubsystem to avoid re-reading container settings too often - memory->set_memory_limit_in_bytes(memory_limit); - return memory_limit; -} - -jlong OSContainer::read_memory_limit_in_bytes() { - GET_CONTAINER_INFO(julong, memory, "/memory.limit_in_bytes", - "Memory Limit is: " JULONG_FORMAT, JULONG_FORMAT, memlimit); - - if (memlimit >= _unlimited_memory) { - log_trace(os, container)("Non-Hierarchical Memory Limit is: Unlimited"); - if (memory->is_hierarchical()) { - const char* matchline = "hierarchical_memory_limit"; - const char* format = "%s " JULONG_FORMAT; - GET_CONTAINER_INFO_LINE(julong, memory, "/memory.stat", matchline, - "Hierarchical Memory Limit is: " JULONG_FORMAT, format, hier_memlimit) - if (hier_memlimit >= _unlimited_memory) { - log_trace(os, container)("Hierarchical Memory Limit is: Unlimited"); - } else { - return (jlong)hier_memlimit; - } - } - return (jlong)-1; - } - else { - return (jlong)memlimit; - } + assert(cgroup_subsystem != NULL, "cgroup subsystem not available"); + return cgroup_subsystem->memory_limit_in_bytes(); } jlong OSContainer::memory_and_swap_limit_in_bytes() { - GET_CONTAINER_INFO(julong, memory, "/memory.memsw.limit_in_bytes", - "Memory and Swap Limit is: " JULONG_FORMAT, JULONG_FORMAT, memswlimit); - if (memswlimit >= _unlimited_memory) { - log_trace(os, container)("Non-Hierarchical Memory and Swap Limit is: Unlimited"); - if (memory->is_hierarchical()) { - const char* matchline = "hierarchical_memsw_limit"; - const char* format = "%s " JULONG_FORMAT; - GET_CONTAINER_INFO_LINE(julong, memory, "/memory.stat", matchline, - "Hierarchical Memory and Swap Limit is : " JULONG_FORMAT, format, hier_memlimit) - if (hier_memlimit >= _unlimited_memory) { - log_trace(os, container)("Hierarchical Memory and Swap Limit is: Unlimited"); - } else { - return (jlong)hier_memlimit; - } - } - return (jlong)-1; - } else { - return (jlong)memswlimit; - } + assert(cgroup_subsystem != NULL, "cgroup subsystem not available"); + return cgroup_subsystem->memory_and_swap_limit_in_bytes(); } jlong OSContainer::memory_soft_limit_in_bytes() { - GET_CONTAINER_INFO(julong, memory, "/memory.soft_limit_in_bytes", - "Memory Soft Limit is: " JULONG_FORMAT, JULONG_FORMAT, memsoftlimit); - if (memsoftlimit >= _unlimited_memory) { - log_trace(os, container)("Memory Soft Limit is: Unlimited"); - return (jlong)-1; - } else { - return (jlong)memsoftlimit; - } + assert(cgroup_subsystem != NULL, "cgroup subsystem not available"); + return cgroup_subsystem->memory_soft_limit_in_bytes(); } -/* memory_usage_in_bytes - * - * Return the amount of used memory for this process. - * - * return: - * memory usage in bytes or - * -1 for unlimited - * OSCONTAINER_ERROR for not supported - */ jlong OSContainer::memory_usage_in_bytes() { - GET_CONTAINER_INFO(jlong, memory, "/memory.usage_in_bytes", - "Memory Usage is: " JLONG_FORMAT, JLONG_FORMAT, memusage); - return memusage; + assert(cgroup_subsystem != NULL, "cgroup subsystem not available"); + return cgroup_subsystem->memory_usage_in_bytes(); } -/* memory_max_usage_in_bytes - * - * Return the maximum amount of used memory for this process. - * - * return: - * max memory usage in bytes or - * OSCONTAINER_ERROR for not supported - */ jlong OSContainer::memory_max_usage_in_bytes() { - GET_CONTAINER_INFO(jlong, memory, "/memory.max_usage_in_bytes", - "Maximum Memory Usage is: " JLONG_FORMAT, JLONG_FORMAT, memmaxusage); - return memmaxusage; -} - -/* active_processor_count - * - * Calculate an appropriate number of active processors for the - * VM to use based on these three inputs. - * - * cpu affinity - * cgroup cpu quota & cpu period - * cgroup cpu shares - * - * Algorithm: - * - * Determine the number of available CPUs from sched_getaffinity - * - * If user specified a quota (quota != -1), calculate the number of - * required CPUs by dividing quota by period. - * - * If shares are in effect (shares != -1), calculate the number - * of CPUs required for the shares by dividing the share value - * by PER_CPU_SHARES. - * - * All results of division are rounded up to the next whole number. - * - * If neither shares or quotas have been specified, return the - * number of active processors in the system. - * - * If both shares and quotas have been specified, the results are - * based on the flag PreferContainerQuotaForCPUCount. If true, - * return the quota value. If false return the smallest value - * between shares or quotas. - * - * If shares and/or quotas have been specified, the resulting number - * returned will never exceed the number of active processors. - * - * return: - * number of CPUs - */ -int OSContainer::active_processor_count() { - int quota_count = 0, share_count = 0; - int cpu_count, limit_count; - int result; - - cpu_count = limit_count = os::Linux::active_processor_count(); - int quota = cpu_quota(); - int period = cpu_period(); - int share = cpu_shares(); - - if (quota > -1 && period > 0) { - quota_count = ceilf((float)quota / (float)period); - log_trace(os, container)("CPU Quota count based on quota/period: %d", quota_count); - } - if (share > -1) { - share_count = ceilf((float)share / (float)PER_CPU_SHARES); - log_trace(os, container)("CPU Share count based on shares: %d", share_count); - } - - // If both shares and quotas are setup results depend - // on flag PreferContainerQuotaForCPUCount. - // If true, limit CPU count to quota - // If false, use minimum of shares and quotas - if (quota_count !=0 && share_count != 0) { - if (PreferContainerQuotaForCPUCount) { - limit_count = quota_count; - } else { - limit_count = MIN2(quota_count, share_count); - } - } else if (quota_count != 0) { - limit_count = quota_count; - } else if (share_count != 0) { - limit_count = share_count; - } - - result = MIN2(cpu_count, limit_count); - log_trace(os, container)("OSContainer::active_processor_count: %d", result); - return result; + assert(cgroup_subsystem != NULL, "cgroup subsystem not available"); + return cgroup_subsystem->memory_max_usage_in_bytes(); } char * OSContainer::cpu_cpuset_cpus() { - GET_CONTAINER_INFO_CPTR(cptr, cpuset, "/cpuset.cpus", - "cpuset.cpus is: %s", "%1023s", cpus, 1024); - return os::strdup(cpus); + assert(cgroup_subsystem != NULL, "cgroup subsystem not available"); + return cgroup_subsystem->cpu_cpuset_cpus(); } char * OSContainer::cpu_cpuset_memory_nodes() { - GET_CONTAINER_INFO_CPTR(cptr, cpuset, "/cpuset.mems", - "cpuset.mems is: %s", "%1023s", mems, 1024); - return os::strdup(mems); + assert(cgroup_subsystem != NULL, "cgroup subsystem not available"); + return cgroup_subsystem->cpu_cpuset_memory_nodes(); +} + +int OSContainer::active_processor_count() { + assert(cgroup_subsystem != NULL, "cgroup subsystem not available"); + int physical_proc = os::Linux::active_processor_count(); + return cgroup_subsystem->active_processor_count(physical_proc); } -/* cpu_quota - * - * Return the number of milliseconds per period - * process is guaranteed to run. - * - * return: - * quota time in milliseconds - * -1 for no quota - * OSCONTAINER_ERROR for not supported - */ int OSContainer::cpu_quota() { - GET_CONTAINER_INFO(int, cpu, "/cpu.cfs_quota_us", - "CPU Quota is: %d", "%d", quota); - return quota; + assert(cgroup_subsystem != NULL, "cgroup subsystem not available"); + return cgroup_subsystem->cpu_quota(); } int OSContainer::cpu_period() { - GET_CONTAINER_INFO(int, cpu, "/cpu.cfs_period_us", - "CPU Period is: %d", "%d", period); - return period; + assert(cgroup_subsystem != NULL, "cgroup subsystem not available"); + return cgroup_subsystem->cpu_period(); } -/* cpu_shares - * - * Return the amount of cpu shares available to the process - * - * return: - * Share number (typically a number relative to 1024) - * (2048 typically expresses 2 CPUs worth of processing) - * -1 for no share setup - * OSCONTAINER_ERROR for not supported - */ int OSContainer::cpu_shares() { - GET_CONTAINER_INFO(int, cpu, "/cpu.shares", - "CPU Shares is: %d", "%d", shares); - // Convert 1024 to no shares setup - if (shares == 1024) return -1; - - return shares; + assert(cgroup_subsystem != NULL, "cgroup subsystem not available"); + return cgroup_subsystem->cpu_shares(); }