1 /*
   2  * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include <string.h>
  26 #include <math.h>
  27 #include <errno.h>
  28 #include "cgroupSubsystem_linux.hpp"
  29 #include "cgroupV1Subsystem_linux.hpp"
  30 #include "cgroupV2Subsystem_linux.hpp"
  31 #include "logging/log.hpp"
  32 #include "memory/allocation.hpp"
  33 #include "runtime/globals.hpp"
  34 #include "runtime/os.hpp"
  35 #include "utilities/globalDefinitions.hpp"
  36 
  37 CgroupSubsystem* CgroupSubsystemFactory::create() {
  38   CgroupV1MemoryController* memory = NULL;
  39   CgroupV1Controller* cpuset = NULL;
  40   CgroupV1Controller* cpu = NULL;
  41   CgroupV1Controller* cpuacct = NULL;
  42   FILE *mntinfo = NULL;
  43   FILE *cgroups = NULL;
  44   FILE *cgroup = NULL;
  45   char buf[MAXPATHLEN+1];
  46   char tmproot[MAXPATHLEN+1];
  47   char tmpmount[MAXPATHLEN+1];
  48   char *p;
  49   bool is_cgroupsV2;
  50   // true iff all controllers, memory, cpu, cpuset, cpuacct are enabled
  51   // at the kernel level.
  52   bool all_controllers_enabled;
  53 
  54   CgroupInfo cg_infos[CG_INFO_LENGTH];
  55   int cpuset_idx  = 0;
  56   int cpu_idx     = 1;
  57   int cpuacct_idx = 2;
  58   int memory_idx  = 3;
  59 
  60   /*
  61    * Read /proc/cgroups so as to be able to distinguish cgroups v2 vs cgroups v1.
  62    *
  63    * For cgroups v1 unified hierarchy, cpu, cpuacct, cpuset, memory controllers
  64    * must have non-zero for the hierarchy ID field.
  65    */
  66   cgroups = fopen("/proc/cgroups", "r");
  67   if (cgroups == NULL) {
  68       log_debug(os, container)("Can't open /proc/cgroups, %s",
  69                                os::strerror(errno));
  70       return NULL;
  71   }
  72 
  73   while ((p = fgets(buf, MAXPATHLEN, cgroups)) != NULL) {
  74     char name[MAXPATHLEN+1];
  75     int  hierarchy_id;
  76     int  enabled;
  77 
  78     // Format of /proc/cgroups documented via man 7 cgroups
  79     if (sscanf(p, "%s %d %*d %d", name, &hierarchy_id, &enabled) != 3) {
  80       continue;
  81     }
  82     if (strcmp(name, "memory") == 0) {
  83       cg_infos[memory_idx]._name = os::strdup(name);
  84       cg_infos[memory_idx]._hierarchy_id = hierarchy_id;
  85       cg_infos[memory_idx]._enabled = (enabled == 1);
  86     } else if (strcmp(name, "cpuset") == 0) {
  87       cg_infos[cpuset_idx]._name = os::strdup(name);
  88       cg_infos[cpuset_idx]._hierarchy_id = hierarchy_id;
  89       cg_infos[cpuset_idx]._enabled = (enabled == 1);
  90     } else if (strcmp(name, "cpu") == 0) {
  91       cg_infos[cpu_idx]._name = os::strdup(name);
  92       cg_infos[cpu_idx]._hierarchy_id = hierarchy_id;
  93       cg_infos[cpu_idx]._enabled = (enabled == 1);
  94     } else if (strcmp(name, "cpuacct") == 0) {
  95       cg_infos[cpuacct_idx]._name = os::strdup(name);
  96       cg_infos[cpuacct_idx]._hierarchy_id = hierarchy_id;
  97       cg_infos[cpuacct_idx]._enabled = (enabled == 1);
  98     }
  99   }
 100   fclose(cgroups);
 101 
 102   is_cgroupsV2 = true;
 103   all_controllers_enabled = true;
 104   for (int i = 0; i < CG_INFO_LENGTH; i++) {
 105     is_cgroupsV2 = is_cgroupsV2 && cg_infos[i]._hierarchy_id == 0;
 106     all_controllers_enabled = all_controllers_enabled && cg_infos[i]._enabled;
 107   }
 108 
 109   if (!all_controllers_enabled) {
 110     // one or more controllers enabled, disable container support
 111     log_debug(os, container)("One or more required controllers not enabled at kernel level.");
 112     return NULL;
 113   }
 114 
 115   /*
 116    * Read /proc/self/cgroup and determine:
 117    *  - the cgroup path for cgroups v2 or
 118    *  - on a cgroups v1 system, collect info for mapping
 119    *    the host mount point to the local one via /proc/self/mountinfo below.
 120    */
 121   cgroup = fopen("/proc/self/cgroup", "r");
 122   if (cgroup == NULL) {
 123     log_debug(os, container)("Can't open /proc/self/cgroup, %s",
 124                              os::strerror(errno));
 125     return NULL;
 126   }
 127 
 128   while ((p = fgets(buf, MAXPATHLEN, cgroup)) != NULL) {
 129     char *controllers;
 130     char *token;
 131     char *hierarchy_id_str;
 132     int  hierarchy_id;
 133     char *cgroup_path;
 134 
 135     hierarchy_id_str = strsep(&p, ":");
 136     hierarchy_id = atoi(hierarchy_id_str);
 137     /* Get controllers and base */
 138     controllers = strsep(&p, ":");
 139     cgroup_path = strsep(&p, "\n");
 140 
 141     if (controllers == NULL) {
 142       continue;
 143     }
 144 
 145     while (!is_cgroupsV2 && (token = strsep(&controllers, ",")) != NULL) {
 146       if (strcmp(token, "memory") == 0) {
 147         assert(hierarchy_id == cg_infos[memory_idx]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch");
 148         cg_infos[memory_idx]._cgroup_path = os::strdup(cgroup_path);
 149       } else if (strcmp(token, "cpuset") == 0) {
 150         assert(hierarchy_id == cg_infos[cpuset_idx]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch");
 151         cg_infos[cpuset_idx]._cgroup_path = os::strdup(cgroup_path);
 152       } else if (strcmp(token, "cpu") == 0) {
 153         assert(hierarchy_id == cg_infos[cpu_idx]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch");
 154         cg_infos[cpu_idx]._cgroup_path = os::strdup(cgroup_path);
 155       } else if (strcmp(token, "cpuacct") == 0) {
 156         assert(hierarchy_id == cg_infos[cpuacct_idx]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch");
 157         cg_infos[cpuacct_idx]._cgroup_path = os::strdup(cgroup_path);
 158       }
 159     }
 160     if (is_cgroupsV2) {
 161       for (int i = 0; i < CG_INFO_LENGTH; i++) {
 162         cg_infos[i]._cgroup_path = os::strdup(cgroup_path);
 163       }
 164     }
 165   }
 166   fclose(cgroup);
 167 
 168   if (is_cgroupsV2) {
 169     // Find the cgroup2 mount point by reading /proc/self/mountinfo
 170     mntinfo = fopen("/proc/self/mountinfo", "r");
 171     if (mntinfo == NULL) {
 172         log_debug(os, container)("Can't open /proc/self/mountinfo, %s",
 173                                  os::strerror(errno));
 174         return NULL;
 175     }
 176 
 177     char cgroupv2_mount[MAXPATHLEN+1];
 178     char fstype[MAXPATHLEN+1];
 179     bool mount_point_found = false;
 180     while ((p = fgets(buf, MAXPATHLEN, mntinfo)) != NULL) {
 181       char *tmp_mount_point = cgroupv2_mount;
 182       char *tmp_fs_type = fstype;
 183 
 184       // mountinfo format is documented at https://www.kernel.org/doc/Documentation/filesystems/proc.txt
 185       if (sscanf(p, "%*d %*d %*d:%*d %*s %s %*[^-]- %s cgroup2 %*s", tmp_mount_point, tmp_fs_type) == 2) {
 186         // we likely have an early match return, be sure we have cgroup2 as fstype
 187         if (strcmp("cgroup2", tmp_fs_type) == 0) {
 188           mount_point_found = true;
 189           break;
 190         }
 191       }
 192     }
 193     fclose(mntinfo);
 194     if (!mount_point_found) {
 195       log_trace(os, container)("Mount point for cgroupv2 not found in /proc/self/mountinfo");
 196       return NULL;
 197     }
 198     // Cgroups v2 case, we have all the info we need.
 199     // Construct the subsystem, free resources and return
 200     // Note: any index in cg_infos will do as the path is the same for
 201     //       all controllers.
 202     CgroupController* unified = new CgroupV2Controller(cgroupv2_mount, cg_infos[memory_idx]._cgroup_path);
 203     for (int i = 0; i < CG_INFO_LENGTH; i++) {
 204       os::free(cg_infos[i]._name);
 205       os::free(cg_infos[i]._cgroup_path);
 206     }
 207     log_debug(os, container)("Detected cgroups v2 unified hierarchy");
 208     return new CgroupV2Subsystem(unified);
 209   }
 210 
 211   // What follows is cgroups v1
 212   log_debug(os, container)("Detected cgroups hybrid or legacy hierarchy, using cgroups v1 controllers");
 213 
 214   /*
 215    * Find the cgroup mount point for memory and cpuset
 216    * by reading /proc/self/mountinfo
 217    *
 218    * Example for docker:
 219    * 219 214 0:29 /docker/7208cebd00fa5f2e342b1094f7bed87fa25661471a4637118e65f1c995be8a34 /sys/fs/cgroup/memory ro,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory
 220    *
 221    * Example for host:
 222    * 34 28 0:29 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,memory
 223    */
 224   mntinfo = fopen("/proc/self/mountinfo", "r");
 225   if (mntinfo == NULL) {
 226       log_debug(os, container)("Can't open /proc/self/mountinfo, %s",
 227                                os::strerror(errno));
 228       return NULL;
 229   }
 230 
 231   while ((p = fgets(buf, MAXPATHLEN, mntinfo)) != NULL) {
 232     char tmpcgroups[MAXPATHLEN+1];
 233     char *cptr = tmpcgroups;
 234     char *token;
 235 
 236     // mountinfo format is documented at https://www.kernel.org/doc/Documentation/filesystems/proc.txt
 237     if (sscanf(p, "%*d %*d %*d:%*d %s %s %*[^-]- cgroup %*s %s", tmproot, tmpmount, tmpcgroups) != 3) {
 238       continue;
 239     }
 240     while ((token = strsep(&cptr, ",")) != NULL) {
 241       if (strcmp(token, "memory") == 0) {
 242         memory = new CgroupV1MemoryController(tmproot, tmpmount);
 243       } else if (strcmp(token, "cpuset") == 0) {
 244         cpuset = new CgroupV1Controller(tmproot, tmpmount);
 245       } else if (strcmp(token, "cpu") == 0) {
 246         cpu = new CgroupV1Controller(tmproot, tmpmount);
 247       } else if (strcmp(token, "cpuacct") == 0) {
 248         cpuacct= new CgroupV1Controller(tmproot, tmpmount);
 249       }
 250     }
 251   }
 252 
 253   fclose(mntinfo);
 254 
 255   if (memory == NULL) {
 256     log_debug(os, container)("Required cgroup v1 memory subsystem not found");
 257     return NULL;
 258   }
 259   if (cpuset == NULL) {
 260     log_debug(os, container)("Required cgroup v1 cpuset subsystem not found");
 261     return NULL;
 262   }
 263   if (cpu == NULL) {
 264     log_debug(os, container)("Required cgroup v1 cpu subsystem not found");
 265     return NULL;
 266   }
 267   if (cpuacct == NULL) {
 268     log_debug(os, container)("Required cgroup v1 cpuacct subsystem not found");
 269     return NULL;
 270   }
 271 
 272   /*
 273    * Use info gathered previously from /proc/self/cgroup
 274    * and map host mount point to
 275    * local one via /proc/self/mountinfo content above
 276    *
 277    * Docker example:
 278    * 5:memory:/docker/6558aed8fc662b194323ceab5b964f69cf36b3e8af877a14b80256e93aecb044
 279    *
 280    * Host example:
 281    * 5:memory:/user.slice
 282    *
 283    * Construct a path to the process specific memory and cpuset
 284    * cgroup directory.
 285    *
 286    * For a container running under Docker from memory example above
 287    * the paths would be:
 288    *
 289    * /sys/fs/cgroup/memory
 290    *
 291    * For a Host from memory example above the path would be:
 292    *
 293    * /sys/fs/cgroup/memory/user.slice
 294    *
 295    */
 296   for (int i = 0; i < CG_INFO_LENGTH; i++) {
 297     CgroupInfo info = cg_infos[i];
 298     if (strcmp(info._name, "memory") == 0) {
 299       memory->set_subsystem_path(info._cgroup_path);
 300     } else if (strcmp(info._name, "cpuset") == 0) {
 301       cpuset->set_subsystem_path(info._cgroup_path);
 302     } else if (strcmp(info._name, "cpu") == 0) {
 303       cpu->set_subsystem_path(info._cgroup_path);
 304     } else if (strcmp(info._name, "cpuacct") == 0) {
 305       cpuacct->set_subsystem_path(info._cgroup_path);
 306     }
 307   }
 308   return new CgroupV1Subsystem(cpuset, cpu, cpuacct, memory);
 309 }
 310 
 311 /* active_processor_count
 312  *
 313  * Calculate an appropriate number of active processors for the
 314  * VM to use based on these three inputs.
 315  *
 316  * cpu affinity
 317  * cgroup cpu quota & cpu period
 318  * cgroup cpu shares
 319  *
 320  * Algorithm:
 321  *
 322  * Determine the number of available CPUs from sched_getaffinity
 323  *
 324  * If user specified a quota (quota != -1), calculate the number of
 325  * required CPUs by dividing quota by period.
 326  *
 327  * If shares are in effect (shares != -1), calculate the number
 328  * of CPUs required for the shares by dividing the share value
 329  * by PER_CPU_SHARES.
 330  *
 331  * All results of division are rounded up to the next whole number.
 332  *
 333  * If neither shares or quotas have been specified, return the
 334  * number of active processors in the system.
 335  *
 336  * If both shares and quotas have been specified, the results are
 337  * based on the flag PreferContainerQuotaForCPUCount.  If true,
 338  * return the quota value.  If false return the smallest value
 339  * between shares or quotas.
 340  *
 341  * If shares and/or quotas have been specified, the resulting number
 342  * returned will never exceed the number of active processors.
 343  *
 344  * return:
 345  *    number of CPUs
 346  */
 347 int CgroupSubsystem::active_processor_count(int physical_proc_count) {
 348   int quota_count = 0, share_count = 0;
 349   int cpu_count, limit_count;
 350   int result;
 351 
 352   cpu_count = limit_count = physical_proc_count;
 353   int quota  = cpu_quota();
 354   int period = cpu_period();
 355   int share  = cpu_shares();
 356 
 357   if (quota > -1 && period > 0) {
 358     quota_count = ceilf((float)quota / (float)period);
 359     log_trace(os, container)("CPU Quota count based on quota/period: %d", quota_count);
 360   }
 361   if (share > -1) {
 362     share_count = ceilf((float)share / (float)PER_CPU_SHARES);
 363     log_trace(os, container)("CPU Share count based on shares: %d", share_count);
 364   }
 365 
 366   // If both shares and quotas are setup results depend
 367   // on flag PreferContainerQuotaForCPUCount.
 368   // If true, limit CPU count to quota
 369   // If false, use minimum of shares and quotas
 370   if (quota_count !=0 && share_count != 0) {
 371     if (PreferContainerQuotaForCPUCount) {
 372       limit_count = quota_count;
 373     } else {
 374       limit_count = MIN2(quota_count, share_count);
 375     }
 376   } else if (quota_count != 0) {
 377     limit_count = quota_count;
 378   } else if (share_count != 0) {
 379     limit_count = share_count;
 380   }
 381 
 382   result = MIN2(cpu_count, limit_count);
 383   log_trace(os, container)("OSContainer::active_processor_count: %d", result);
 384   return result;
 385 }