--- old/src/hotspot/os/linux/cgroupSubsystem_linux.cpp 2019-09-19 13:59:29.500450193 +0200 +++ new/src/hotspot/os/linux/cgroupSubsystem_linux.cpp 2019-09-19 13:59:29.364449929 +0200 @@ -228,11 +228,176 @@ CgroupV1Controller* cpu = NULL; CgroupV1Controller* cpuacct = NULL; FILE *mntinfo = NULL; + FILE *cgroups = NULL; FILE *cgroup = NULL; char buf[MAXPATHLEN+1]; char tmproot[MAXPATHLEN+1]; char tmpmount[MAXPATHLEN+1]; char *p; + bool is_cgroupsV2; + // true iff all controllers, memory, cpu, cpuset, cpuacct are enabled + // at the kernel level. + bool all_controllers_enabled; + + CgroupInfo cg_infos[CG_INFO_LENGTH]; + int cpuset_idx = 0; + int cpu_idx = 1; + int cpuacct_idx = 2; + int memory_idx = 3; + + /* + * Read /proc/cgroups so as to be able to distinguish cgroups v2 vs cgroups v1. + * + * For cgroups v1 unified hierarchy, cpu, cpuacct, cpuset, memory controllers + * must have non-zero for the hierarchy ID field. + */ + cgroups = fopen("/proc/cgroups", "r"); + if (cgroups == NULL) { + log_debug(os, container)("Can't open /proc/cgroups, %s", + os::strerror(errno)); + return NULL; + } + + while ((p = fgets(buf, MAXPATHLEN, cgroups)) != NULL) { + char name[MAXPATHLEN+1]; + int hierarchy_id; + int enabled; + + // Format of /proc/cgroups documented via man 7 cgroups + if (sscanf(p, "%s %d %*d %d", name, &hierarchy_id, &enabled) != 3) { + continue; + } + if (strcmp(name, "memory") == 0) { + cg_infos[memory_idx]._name = os::strdup(name); + cg_infos[memory_idx]._hierarchy_id = hierarchy_id; + cg_infos[memory_idx]._enabled = (enabled == 1); + } else if (strcmp(name, "cpuset") == 0) { + cg_infos[cpuset_idx]._name = os::strdup(name); + cg_infos[cpuset_idx]._hierarchy_id = hierarchy_id; + cg_infos[cpuset_idx]._enabled = (enabled == 1); + } else if (strcmp(name, "cpu") == 0) { + cg_infos[cpu_idx]._name = os::strdup(name); + cg_infos[cpu_idx]._hierarchy_id = hierarchy_id; + cg_infos[cpu_idx]._enabled = (enabled == 1); + } else if (strcmp(name, "cpuacct") == 0) { + cg_infos[cpuacct_idx]._name = os::strdup(name); + cg_infos[cpuacct_idx]._hierarchy_id = hierarchy_id; + cg_infos[cpuacct_idx]._enabled = (enabled == 1); + } + } + fclose(cgroups); + + is_cgroupsV2 = true; + all_controllers_enabled = true; + for (int i = 0; i < CG_INFO_LENGTH; i++) { + is_cgroupsV2 = is_cgroupsV2 && cg_infos[i]._hierarchy_id == 0; + all_controllers_enabled = all_controllers_enabled && cg_infos[i]._enabled; + } + + if (!all_controllers_enabled) { + // one or more controllers enabled, disable container support + log_debug(os, container)("One or more required controllers not enabled at kernel level."); + return NULL; + } + + /* + * Read /proc/self/cgroup and determine: + * - the cgroup path for cgroups v2 or + * - on a cgroups v1 system, collect info for mapping + * the host mount point to the local one via /proc/self/mountinfo below. + */ + cgroup = fopen("/proc/self/cgroup", "r"); + if (cgroup == NULL) { + log_debug(os, container)("Can't open /proc/self/cgroup, %s", + os::strerror(errno)); + return NULL; + } + + while ((p = fgets(buf, MAXPATHLEN, cgroup)) != NULL) { + char *controllers; + char *token; + char *hierarchy_id_str; + int hierarchy_id; + char *cgroup_path; + + hierarchy_id_str = strsep(&p, ":"); + hierarchy_id = atoi(hierarchy_id_str); + /* Get controllers and base */ + controllers = strsep(&p, ":"); + cgroup_path = strsep(&p, "\n"); + + if (controllers == NULL) { + continue; + } + + while (!is_cgroupsV2 && (token = strsep(&controllers, ",")) != NULL) { + if (strcmp(token, "memory") == 0) { + assert(hierarchy_id == cg_infos[memory_idx]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch"); + cg_infos[memory_idx]._cgroup_path = os::strdup(cgroup_path); + } else if (strcmp(token, "cpuset") == 0) { + assert(hierarchy_id == cg_infos[cpuset_idx]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch"); + cg_infos[cpuset_idx]._cgroup_path = os::strdup(cgroup_path); + } else if (strcmp(token, "cpu") == 0) { + assert(hierarchy_id == cg_infos[cpu_idx]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch"); + cg_infos[cpu_idx]._cgroup_path = os::strdup(cgroup_path); + } else if (strcmp(token, "cpuacct") == 0) { + assert(hierarchy_id == cg_infos[cpuacct_idx]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch"); + cg_infos[cpuacct_idx]._cgroup_path = os::strdup(cgroup_path); + } + } + if (is_cgroupsV2) { + for (int i = 0; i < CG_INFO_LENGTH; i++) { + cg_infos[i]._cgroup_path = os::strdup(cgroup_path); + } + } + } + fclose(cgroup); + + if (is_cgroupsV2) { + // Find the cgroup2 mount point by reading /proc/self/mountinfo + mntinfo = fopen("/proc/self/mountinfo", "r"); + if (mntinfo == NULL) { + log_debug(os, container)("Can't open /proc/self/mountinfo, %s", + os::strerror(errno)); + return NULL; + } + + char cgroupv2_mount[MAXPATHLEN+1]; + char fstype[MAXPATHLEN+1]; + bool mount_point_found = false; + while ((p = fgets(buf, MAXPATHLEN, mntinfo)) != NULL) { + char *tmp_mount_point = cgroupv2_mount; + char *tmp_fs_type = fstype; + + // mountinfo format is documented at https://www.kernel.org/doc/Documentation/filesystems/proc.txt + if (sscanf(p, "%*d %*d %*d:%*d %*s %s %*[^-]- %s cgroup2 %*s", tmp_mount_point, tmp_fs_type) == 2) { + // we likely have an early match return, be sure we have cgroup2 as fstype + if (strcmp("cgroup2", tmp_fs_type) == 0) { + mount_point_found = true; + break; + } + } + } + fclose(mntinfo); + if (!mount_point_found) { + log_trace(os, container)("Mount point for cgroupv2 not found in /proc/self/mountinfo"); + return NULL; + } + // Cgroups v2 case, we have all the info we need. + // Construct the subsystem, free resources and return + // Note: any index in cg_infos will do as the path is the same for + // all controllers. + CgroupController* unified = new CgroupV2Controller(cgroupv2_mount, cg_infos[memory_idx]._cgroup_path); + for (int i = 0; i < CG_INFO_LENGTH; i++) { + os::free(cg_infos[i]._name); + os::free(cg_infos[i]._cgroup_path); + } + log_debug(os, container)("Detected cgroups v2 unified hierarchy"); + return new CgroupV2Subsystem(unified); + } + + // What follows is cgroups v1 + log_debug(os, container)("Detected cgroups hybrid or legacy hierarchy, using cgroups v1 controllers"); /* * Find the cgroup mount point for memory and cpuset @@ -276,24 +441,25 @@ fclose(mntinfo); if (memory == NULL) { - log_debug(os, container)("Required cgroup memory subsystem not found"); + log_debug(os, container)("Required cgroup v1 memory subsystem not found"); return NULL; } if (cpuset == NULL) { - log_debug(os, container)("Required cgroup cpuset subsystem not found"); + log_debug(os, container)("Required cgroup v1 cpuset subsystem not found"); return NULL; } if (cpu == NULL) { - log_debug(os, container)("Required cgroup cpu subsystem not found"); + log_debug(os, container)("Required cgroup v1 cpu subsystem not found"); return NULL; } if (cpuacct == NULL) { - log_debug(os, container)("Required cgroup cpuacct subsystem not found"); + log_debug(os, container)("Required cgroup v1 cpuacct subsystem not found"); return NULL; } /* - * Read /proc/self/cgroup and map host mount point to + * Use info gathered previously from /proc/self/cgroup + * and map host mount point to * local one via /proc/self/mountinfo content above * * Docker example: @@ -315,42 +481,18 @@ * /sys/fs/cgroup/memory/user.slice * */ - cgroup = fopen("/proc/self/cgroup", "r"); - if (cgroup == NULL) { - log_debug(os, container)("Can't open /proc/self/cgroup, %s", - os::strerror(errno)); - return NULL; - } - - while ((p = fgets(buf, MAXPATHLEN, cgroup)) != NULL) { - char *controllers; - char *token; - char *base; - - /* Skip cgroup number */ - strsep(&p, ":"); - /* Get controllers and base */ - controllers = strsep(&p, ":"); - base = strsep(&p, "\n"); - - if (controllers == NULL) { - continue; - } - - while ((token = strsep(&controllers, ",")) != NULL) { - if (strcmp(token, "memory") == 0) { - memory->set_subsystem_path(base); - } else if (strcmp(token, "cpuset") == 0) { - cpuset->set_subsystem_path(base); - } else if (strcmp(token, "cpu") == 0) { - cpu->set_subsystem_path(base); - } else if (strcmp(token, "cpuacct") == 0) { - cpuacct->set_subsystem_path(base); - } + for (int i = 0; i < CG_INFO_LENGTH; i++) { + CgroupInfo info = cg_infos[i]; + if (strcmp(info._name, "memory") == 0) { + memory->set_subsystem_path(info._cgroup_path); + } else if (strcmp(info._name, "cpuset") == 0) { + cpuset->set_subsystem_path(info._cgroup_path); + } else if (strcmp(info._name, "cpu") == 0) { + cpu->set_subsystem_path(info._cgroup_path); + } else if (strcmp(info._name, "cpuacct") == 0) { + cpuacct->set_subsystem_path(info._cgroup_path); } } - - fclose(cgroup); return new CgroupV1Subsystem(cpuset, cpu, cpuacct, memory); } @@ -690,3 +832,196 @@ } st->cr(); } + +/* cpu_shares + * + * Return the amount of cpu shares available to the process + * + * return: + * Share number (typically a number relative to 1024) + * (2048 typically expresses 2 CPUs worth of processing) + * -1 for no share setup + * OSCONTAINER_ERROR for not supported + */ +int CgroupV2Subsystem::cpu_shares() { + GET_CONTAINER_INFO(int, _unified, "/cpu.weight", + "CPU Shares is: %d", "%d", shares); + // Convert default value of 100 to no shares setup + if (shares == 100) return -1; + + // CPU shares (OCI) value needs to get translated into + // a proper Cgroups v2 value. See: + // https://github.com/containers/crun/blob/master/crun.1.md#cpu-controller + // + // Use the inverse of (x == OCI value, y == cgroupsv2 value): + // ((262142 * y - 1)/9999) + 2 = x + // + int x = 262142 * shares - 1; + double frac = x/9999.0; + x = ((int)frac) + 2; + log_trace(os, container)("Scaled CPU Shares value is: %d", x); + // Since the scaled value is not precise, return the closest + // multiple of PER_CPU_SHARES for a more conservative mapping + if ( x <= PER_CPU_SHARES ) { + // will always map to 1 CPU + return x; + } + int f = x/PER_CPU_SHARES; + int lower_multiple = f * PER_CPU_SHARES; + int upper_multiple = (f + 1) * PER_CPU_SHARES; + int distance_lower = MAX2(lower_multiple, x) - MIN2(lower_multiple, x); + int distance_upper = MAX2(upper_multiple, x) - MIN2(upper_multiple, x); + x = distance_lower <= distance_upper ? lower_multiple : upper_multiple; + log_trace(os, container)("Closest multiple of %d of the CPU Shares value is: %d", PER_CPU_SHARES, x); + return x; +} + +/* cpu_quota + * + * Return the number of milliseconds per period + * process is guaranteed to run. + * + * return: + * quota time in milliseconds + * -1 for no quota + * OSCONTAINER_ERROR for not supported + */ +int CgroupV2Subsystem::cpu_quota() { + char * cpu_quota_str = cpu_quota_val(); + return (int)limit_from_str(cpu_quota_str); +} + +char * CgroupV2Subsystem::cpu_cpuset_cpus() { + GET_CONTAINER_INFO_CPTR(cptr, _unified, "/cpuset.cpus", + "cpuset.cpus is: %s", "%1023s", cpus, 1024); + if (cpus == NULL) { + return NULL; + } + return os::strdup(cpus); +} + +char* CgroupV2Subsystem::cpu_quota_val() { + GET_CONTAINER_INFO_CPTR(cptr, _unified, "/cpu.max", + "CPU Quota is: %s", "%s %*d", quota, 1024); + if (quota == NULL) { + return NULL; + } + return os::strdup(quota); +} + +char * CgroupV2Subsystem::cpu_cpuset_memory_nodes() { + GET_CONTAINER_INFO_CPTR(cptr, _unified, "/cpuset.mems", + "cpuset.mems is: %s", "%1023s", mems, 1024); + if (mems == NULL) { + return NULL; + } + return os::strdup(mems); +} + +int CgroupV2Subsystem::cpu_period() { + GET_CONTAINER_INFO(int, _unified, "/cpu.max", + "CPU Period is: %d", "%*s %d", period); + return period; +} + +/* memory_usage_in_bytes + * + * Return the amount of used memory used by this cgroup and decendents + * + * return: + * memory usage in bytes or + * -1 for unlimited + * OSCONTAINER_ERROR for not supported + */ +jlong CgroupV2Subsystem::memory_usage_in_bytes() { + GET_CONTAINER_INFO(jlong, _unified, "/memory.current", + "Memory Usage is: " JLONG_FORMAT, JLONG_FORMAT, memusage); + return memusage; +} + +jlong CgroupV2Subsystem::memory_soft_limit_in_bytes() { + char* mem_soft_limit_str = mem_soft_limit_val(); + return limit_from_str(mem_soft_limit_str); +} + +jlong CgroupV2Subsystem::memory_max_usage_in_bytes() { + return OSCONTAINER_ERROR; // Not supported for Cgroups V2. +} + +char* CgroupV2Subsystem::mem_soft_limit_val() { + GET_CONTAINER_INFO_CPTR(cptr, _unified, "/memory.high", + "Memory Soft Limit is: %s", "%s", mem_soft_limit_str, 1024); + if (mem_soft_limit_str == NULL) { + return NULL; + } + return os::strdup(mem_soft_limit_str); +} + +jlong CgroupV2Subsystem::memory_and_swap_limit_in_bytes() { + char* mem_swp_limit_str = mem_swp_limit_val(); + return limit_from_str(mem_swp_limit_str); +} + +char* CgroupV2Subsystem::mem_swp_limit_val() { + GET_CONTAINER_INFO_CPTR(cptr, _unified, "/memory.swap.max", + "Memory and Swap Limit is: %s", "%s", mem_swp_limit_str, 1024); + if (mem_swp_limit_str == NULL) { + return NULL; + } + return os::strdup(mem_swp_limit_str); +} + +/* memory_limit_in_bytes + * + * Return the limit of available memory for this process. + * + * return: + * memory limit in bytes or + * -1 for unlimited, OSCONTAINER_ERROR for an error + */ +jlong CgroupV2Subsystem::memory_limit_in_bytes() { + char * mem_limit_str = mem_limit_val(); + return limit_from_str(mem_limit_str); +} + +jlong CgroupV2Subsystem::limit_from_str(char* limit_str) { + if (limit_str == NULL) { + return OSCONTAINER_ERROR; + } + // Unlimited memory in Cgroups V2 is the literal string 'max' + if (strcmp("max", limit_str) == 0) { + os::free(limit_str); + return (jlong)-1; + } + julong limit; + if (sscanf(limit_str, JULONG_FORMAT, &limit) != 1) { + os::free(limit_str); + return OSCONTAINER_ERROR; + } + os::free(limit_str); + return (jlong)limit; +} + +char* CgroupV2Subsystem::mem_limit_val() { + GET_CONTAINER_INFO_CPTR(cptr, _unified, "/memory.max", + "Memory Limit is: %s", "%s", mem_limit_str, 1024); + if (mem_limit_str == NULL) { + return NULL; + } + return os::strdup(mem_limit_str); +} + +char* CgroupV2Controller::construct_path(char* mount_path, char *cgroup_path) { + char buf[MAXPATHLEN+1]; + int buflen; + strncpy(buf, mount_path, MAXPATHLEN); + buf[MAXPATHLEN] = '\0'; + buflen = strlen(buf); + if ((buflen + strlen(cgroup_path)) > MAXPATHLEN) { + return NULL; + } + strncat(buf, cgroup_path, MAXPATHLEN-buflen); + buf[MAXPATHLEN] = '\0'; + return os::strdup(buf); +} +