1 /*
   2  * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include <string.h>
  26 #include <math.h>
  27 #include <errno.h>
  28 #include "cgroupSubsystem_linux.hpp"
  29 #include "logging/log.hpp"
  30 #include "memory/allocation.hpp"
  31 #include "runtime/globals.hpp"
  32 #include "runtime/os.hpp"
  33 #include "utilities/globalDefinitions.hpp"
  34 
  35 typedef char * cptr;
  36 
  37 PRAGMA_DIAG_PUSH
  38 PRAGMA_FORMAT_NONLITERAL_IGNORED
  39 template <typename T> int subsystem_file_line_contents(CgroupController* c,
  40                                               const char *filename,
  41                                               const char *matchline,
  42                                               const char *scan_fmt,
  43                                               T returnval) {
  44   FILE *fp = NULL;
  45   char *p;
  46   char file[MAXPATHLEN+1];
  47   char buf[MAXPATHLEN+1];
  48   char discard[MAXPATHLEN+1];
  49   bool found_match = false;
  50 
  51   if (c == NULL) {
  52     log_debug(os, container)("subsystem_file_line_contents: CgroupV1Controller* is NULL");
  53     return OSCONTAINER_ERROR;
  54   }
  55   if (c->subsystem_path() == NULL) {
  56     log_debug(os, container)("subsystem_file_line_contents: subsystem path is NULL");
  57     return OSCONTAINER_ERROR;
  58   }
  59 
  60   strncpy(file, c->subsystem_path(), MAXPATHLEN);
  61   file[MAXPATHLEN-1] = '\0';
  62   int filelen = strlen(file);
  63   if ((filelen + strlen(filename)) > (MAXPATHLEN-1)) {
  64     log_debug(os, container)("File path too long %s, %s", file, filename);
  65     return OSCONTAINER_ERROR;
  66   }
  67   strncat(file, filename, MAXPATHLEN-filelen);
  68   log_trace(os, container)("Path to %s is %s", filename, file);
  69   fp = fopen(file, "r");
  70   if (fp != NULL) {
  71     int err = 0;
  72     while ((p = fgets(buf, MAXPATHLEN, fp)) != NULL) {
  73       found_match = false;
  74       if (matchline == NULL) {
  75         // single-line file case
  76         int matched = sscanf(p, scan_fmt, returnval);
  77         found_match = (matched == 1);
  78       } else {
  79         // multi-line file case
  80         if (strstr(p, matchline) != NULL) {
  81           // discard matchline string prefix
  82           int matched = sscanf(p, scan_fmt, discard, returnval);
  83           found_match = (matched == 2);
  84         } else {
  85           continue; // substring not found
  86         }
  87       }
  88       if (found_match) {
  89         fclose(fp);
  90         return 0;
  91       } else {
  92         err = 1;
  93         log_debug(os, container)("Type %s not found in file %s", scan_fmt, file);
  94       }
  95     }
  96     if (err == 0) {
  97       log_debug(os, container)("Empty file %s", file);
  98     }
  99   } else {
 100     log_debug(os, container)("Open of file %s failed, %s", file, os::strerror(errno));
 101   }
 102   if (fp != NULL)
 103     fclose(fp);
 104   return OSCONTAINER_ERROR;
 105 }
 106 PRAGMA_DIAG_POP
 107 
 108 #define GET_CONTAINER_INFO(return_type, subsystem, filename,              \
 109                            logstring, scan_fmt, variable)                 \
 110   return_type variable;                                                   \
 111 {                                                                         \
 112   int err;                                                                \
 113   err = subsystem_file_line_contents(subsystem,                           \
 114                                      filename,                            \
 115                                      NULL,                                \
 116                                      scan_fmt,                            \
 117                                      &variable);                          \
 118   if (err != 0)                                                           \
 119     return (return_type) OSCONTAINER_ERROR;                               \
 120                                                                           \
 121   log_trace(os, container)(logstring, variable);                          \
 122 }
 123 
 124 #define GET_CONTAINER_INFO_CPTR(return_type, subsystem, filename,         \
 125                                logstring, scan_fmt, variable, bufsize)    \
 126   char variable[bufsize];                                                 \
 127 {                                                                         \
 128   int err;                                                                \
 129   err = subsystem_file_line_contents(subsystem,                           \
 130                                      filename,                            \
 131                                      NULL,                                \
 132                                      scan_fmt,                            \
 133                                      variable);                           \
 134   if (err != 0)                                                           \
 135     return (return_type) NULL;                                            \
 136                                                                           \
 137   log_trace(os, container)(logstring, variable);                          \
 138 }
 139 
 140 #define GET_CONTAINER_INFO_LINE(return_type, controller, filename,        \
 141                            matchline, logstring, scan_fmt, variable)      \
 142   return_type variable;                                                   \
 143 {                                                                         \
 144   int err;                                                                \
 145   err = subsystem_file_line_contents(controller,                          \
 146                                 filename,                                 \
 147                                 matchline,                                \
 148                                 scan_fmt,                                 \
 149                                 &variable);                               \
 150   if (err != 0)                                                           \
 151     return (return_type) OSCONTAINER_ERROR;                               \
 152                                                                           \
 153   log_trace(os, container)(logstring, variable);                          \
 154 }
 155 
 156 /*
 157  * Set directory to subsystem specific files based
 158  * on the contents of the mountinfo and cgroup files.
 159  */
 160 void CgroupV1Controller::set_subsystem_path(char *cgroup_path) {
 161   char buf[MAXPATHLEN+1];
 162   if (_root != NULL && cgroup_path != NULL) {
 163     if (strcmp(_root, "/") == 0) {
 164       int buflen;
 165       strncpy(buf, _mount_point, MAXPATHLEN);
 166       buf[MAXPATHLEN-1] = '\0';
 167       if (strcmp(cgroup_path,"/") != 0) {
 168         buflen = strlen(buf);
 169         if ((buflen + strlen(cgroup_path)) > (MAXPATHLEN-1)) {
 170           return;
 171         }
 172         strncat(buf, cgroup_path, MAXPATHLEN-buflen);
 173         buf[MAXPATHLEN-1] = '\0';
 174       }
 175       _path = os::strdup(buf);
 176     } else {
 177       if (strcmp(_root, cgroup_path) == 0) {
 178         strncpy(buf, _mount_point, MAXPATHLEN);
 179         buf[MAXPATHLEN-1] = '\0';
 180         _path = os::strdup(buf);
 181       } else {
 182         char *p = strstr(cgroup_path, _root);
 183         if (p != NULL && p == _root) {
 184           if (strlen(cgroup_path) > strlen(_root)) {
 185             int buflen;
 186             strncpy(buf, _mount_point, MAXPATHLEN);
 187             buf[MAXPATHLEN-1] = '\0';
 188             buflen = strlen(buf);
 189             if ((buflen + strlen(cgroup_path) - strlen(_root)) > (MAXPATHLEN-1)) {
 190               return;
 191             }
 192             strncat(buf, cgroup_path + strlen(_root), MAXPATHLEN-buflen);
 193             buf[MAXPATHLEN-1] = '\0';
 194             _path = os::strdup(buf);
 195           }
 196         }
 197       }
 198     }
 199   }
 200 }
 201 
 202 /* uses_mem_hierarchy
 203  *
 204  * Return whether or not hierarchical cgroup accounting is being
 205  * done.
 206  *
 207  * return:
 208  *    A number > 0 if true, or
 209  *    OSCONTAINER_ERROR for not supported
 210  */
 211 jlong CgroupV1MemoryController::uses_mem_hierarchy() {
 212   GET_CONTAINER_INFO(jlong, this, "/memory.use_hierarchy",
 213                     "Use Hierarchy is: " JLONG_FORMAT, JLONG_FORMAT, use_hierarchy);
 214   return use_hierarchy;
 215 }
 216 
 217 void CgroupV1MemoryController::set_subsystem_path(char *cgroup_path) {
 218   CgroupV1Controller::set_subsystem_path(cgroup_path);
 219   jlong hierarchy = uses_mem_hierarchy();
 220   if (hierarchy > 0) {
 221     set_hierarchical(true);
 222   }
 223 }
 224 
 225 CgroupSubsystem* CgroupSubsystemFactory::create() {
 226   CgroupV1MemoryController* memory = NULL;
 227   CgroupV1Controller* cpuset = NULL;
 228   CgroupV1Controller* cpu = NULL;
 229   CgroupV1Controller* cpuacct = NULL;
 230   FILE *mntinfo = NULL;
 231   FILE *cgroups = NULL;
 232   FILE *cgroup = NULL;
 233   char buf[MAXPATHLEN+1];
 234   char tmproot[MAXPATHLEN+1];
 235   char tmpmount[MAXPATHLEN+1];
 236   char *p;
 237   bool is_cgroupsV2;
 238   // true iff all controllers, memory, cpu, cpuset, cpuacct are enabled
 239   // at the kernel level.
 240   bool all_controllers_enabled;
 241 
 242   CgroupInfo cg_infos[CG_INFO_LENGTH];
 243   int cpuset_idx  = 0;
 244   int cpu_idx     = 1;
 245   int cpuacct_idx = 2;
 246   int memory_idx  = 3;
 247 
 248   /*
 249    * Read /proc/cgroups so as to be able to distinguish cgroups v2 vs cgroups v1.
 250    *
 251    * For cgroups v1 unified hierarchy, cpu, cpuacct, cpuset, memory controllers
 252    * must have non-zero for the hierarchy ID field.
 253    */
 254   cgroups = fopen("/proc/cgroups", "r");
 255   if (cgroups == NULL) {
 256       log_debug(os, container)("Can't open /proc/cgroups, %s",
 257                                os::strerror(errno));
 258       return NULL;
 259   }
 260 
 261   while ((p = fgets(buf, MAXPATHLEN, cgroups)) != NULL) {
 262     char name[MAXPATHLEN+1];
 263     int  hierarchy_id;
 264     int  enabled;
 265 
 266     // Format of /proc/cgroups documented via man 7 cgroups
 267     if (sscanf(p, "%s %d %*d %d", name, &hierarchy_id, &enabled) != 3) {
 268       continue;
 269     }
 270     if (strcmp(name, "memory") == 0) {
 271       cg_infos[memory_idx]._name = os::strdup(name);
 272       cg_infos[memory_idx]._hierarchy_id = hierarchy_id;
 273       cg_infos[memory_idx]._enabled = (enabled == 1);
 274     } else if (strcmp(name, "cpuset") == 0) {
 275       cg_infos[cpuset_idx]._name = os::strdup(name);
 276       cg_infos[cpuset_idx]._hierarchy_id = hierarchy_id;
 277       cg_infos[cpuset_idx]._enabled = (enabled == 1);
 278     } else if (strcmp(name, "cpu") == 0) {
 279       cg_infos[cpu_idx]._name = os::strdup(name);
 280       cg_infos[cpu_idx]._hierarchy_id = hierarchy_id;
 281       cg_infos[cpu_idx]._enabled = (enabled == 1);
 282     } else if (strcmp(name, "cpuacct") == 0) {
 283       cg_infos[cpuacct_idx]._name = os::strdup(name);
 284       cg_infos[cpuacct_idx]._hierarchy_id = hierarchy_id;
 285       cg_infos[cpuacct_idx]._enabled = (enabled == 1);
 286     }
 287   }
 288   fclose(cgroups);
 289 
 290   is_cgroupsV2 = true;
 291   all_controllers_enabled = true;
 292   for (int i = 0; i < CG_INFO_LENGTH; i++) {
 293     is_cgroupsV2 = is_cgroupsV2 && cg_infos[i]._hierarchy_id == 0;
 294     all_controllers_enabled = all_controllers_enabled && cg_infos[i]._enabled;
 295   }
 296 
 297   if (!all_controllers_enabled) {
 298     // one or more controllers enabled, disable container support
 299     log_debug(os, container)("One or more required controllers not enabled at kernel level.");
 300     return NULL;
 301   }
 302 
 303   /*
 304    * Read /proc/self/cgroup and determine:
 305    *  - the cgroup path for cgroups v2 or
 306    *  - on a cgroups v1 system, collect info for mapping
 307    *    the host mount point to the local one via /proc/self/mountinfo below.
 308    */
 309   cgroup = fopen("/proc/self/cgroup", "r");
 310   if (cgroup == NULL) {
 311     log_debug(os, container)("Can't open /proc/self/cgroup, %s",
 312                              os::strerror(errno));
 313     return NULL;
 314   }
 315 
 316   while ((p = fgets(buf, MAXPATHLEN, cgroup)) != NULL) {
 317     char *controllers;
 318     char *token;
 319     char *hierarchy_id_str;
 320     int  hierarchy_id;
 321     char *cgroup_path;
 322 
 323     hierarchy_id_str = strsep(&p, ":");
 324     hierarchy_id = atoi(hierarchy_id_str);
 325     /* Get controllers and base */
 326     controllers = strsep(&p, ":");
 327     cgroup_path = strsep(&p, "\n");
 328 
 329     if (controllers == NULL) {
 330       continue;
 331     }
 332 
 333     while (!is_cgroupsV2 && (token = strsep(&controllers, ",")) != NULL) {
 334       if (strcmp(token, "memory") == 0) {
 335         assert(hierarchy_id == cg_infos[memory_idx]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch");
 336         cg_infos[memory_idx]._cgroup_path = os::strdup(cgroup_path);
 337       } else if (strcmp(token, "cpuset") == 0) {
 338         assert(hierarchy_id == cg_infos[cpuset_idx]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch");
 339         cg_infos[cpuset_idx]._cgroup_path = os::strdup(cgroup_path);
 340       } else if (strcmp(token, "cpu") == 0) {
 341         assert(hierarchy_id == cg_infos[cpu_idx]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch");
 342         cg_infos[cpu_idx]._cgroup_path = os::strdup(cgroup_path);
 343       } else if (strcmp(token, "cpuacct") == 0) {
 344         assert(hierarchy_id == cg_infos[cpuacct_idx]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch");
 345         cg_infos[cpuacct_idx]._cgroup_path = os::strdup(cgroup_path);
 346       }
 347     }
 348     if (is_cgroupsV2) {
 349       for (int i = 0; i < CG_INFO_LENGTH; i++) {
 350         cg_infos[i]._cgroup_path = os::strdup(cgroup_path);
 351       }
 352     }
 353   }
 354   fclose(cgroup);
 355 
 356   if (is_cgroupsV2) {
 357     // Find the cgroup2 mount point by reading /proc/self/mountinfo
 358     mntinfo = fopen("/proc/self/mountinfo", "r");
 359     if (mntinfo == NULL) {
 360         log_debug(os, container)("Can't open /proc/self/mountinfo, %s",
 361                                  os::strerror(errno));
 362         return NULL;
 363     }
 364 
 365     char cgroupv2_mount[MAXPATHLEN+1];
 366     char fstype[MAXPATHLEN+1];
 367     bool mount_point_found = false;
 368     while ((p = fgets(buf, MAXPATHLEN, mntinfo)) != NULL) {
 369       char *tmp_mount_point = cgroupv2_mount;
 370       char *tmp_fs_type = fstype;
 371 
 372       // mountinfo format is documented at https://www.kernel.org/doc/Documentation/filesystems/proc.txt
 373       if (sscanf(p, "%*d %*d %*d:%*d %*s %s %*[^-]- %s cgroup2 %*s", tmp_mount_point, tmp_fs_type) == 2) {
 374         // we likely have an early match return, be sure we have cgroup2 as fstype
 375         if (strcmp("cgroup2", tmp_fs_type) == 0) {
 376           mount_point_found = true;
 377           break;
 378         }
 379       }
 380     }
 381     fclose(mntinfo);
 382     if (!mount_point_found) {
 383       log_trace(os, container)("Mount point for cgroupv2 not found in /proc/self/mountinfo");
 384       return NULL;
 385     }
 386     // Cgroups v2 case, we have all the info we need.
 387     // Construct the subsystem, free resources and return
 388     // Note: any index in cg_infos will do as the path is the same for
 389     //       all controllers.
 390     CgroupController* unified = new CgroupV2Controller(cgroupv2_mount, cg_infos[memory_idx]._cgroup_path);
 391     for (int i = 0; i < CG_INFO_LENGTH; i++) {
 392       os::free(cg_infos[i]._name);
 393       os::free(cg_infos[i]._cgroup_path);
 394     }
 395     log_debug(os, container)("Detected cgroups v2 unified hierarchy");
 396     return new CgroupV2Subsystem(unified);
 397   }
 398 
 399   // What follows is cgroups v1
 400   log_debug(os, container)("Detected cgroups hybrid or legacy hierarchy, using cgroups v1 controllers");
 401 
 402   /*
 403    * Find the cgroup mount point for memory and cpuset
 404    * by reading /proc/self/mountinfo
 405    *
 406    * Example for docker:
 407    * 219 214 0:29 /docker/7208cebd00fa5f2e342b1094f7bed87fa25661471a4637118e65f1c995be8a34 /sys/fs/cgroup/memory ro,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory
 408    *
 409    * Example for host:
 410    * 34 28 0:29 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,memory
 411    */
 412   mntinfo = fopen("/proc/self/mountinfo", "r");
 413   if (mntinfo == NULL) {
 414       log_debug(os, container)("Can't open /proc/self/mountinfo, %s",
 415                                os::strerror(errno));
 416       return NULL;
 417   }
 418 
 419   while ((p = fgets(buf, MAXPATHLEN, mntinfo)) != NULL) {
 420     char tmpcgroups[MAXPATHLEN+1];
 421     char *cptr = tmpcgroups;
 422     char *token;
 423 
 424     // mountinfo format is documented at https://www.kernel.org/doc/Documentation/filesystems/proc.txt
 425     if (sscanf(p, "%*d %*d %*d:%*d %s %s %*[^-]- cgroup %*s %s", tmproot, tmpmount, tmpcgroups) != 3) {
 426       continue;
 427     }
 428     while ((token = strsep(&cptr, ",")) != NULL) {
 429       if (strcmp(token, "memory") == 0) {
 430         memory = new CgroupV1MemoryController(tmproot, tmpmount);
 431       } else if (strcmp(token, "cpuset") == 0) {
 432         cpuset = new CgroupV1Controller(tmproot, tmpmount);
 433       } else if (strcmp(token, "cpu") == 0) {
 434         cpu = new CgroupV1Controller(tmproot, tmpmount);
 435       } else if (strcmp(token, "cpuacct") == 0) {
 436         cpuacct= new CgroupV1Controller(tmproot, tmpmount);
 437       }
 438     }
 439   }
 440 
 441   fclose(mntinfo);
 442 
 443   if (memory == NULL) {
 444     log_debug(os, container)("Required cgroup v1 memory subsystem not found");
 445     return NULL;
 446   }
 447   if (cpuset == NULL) {
 448     log_debug(os, container)("Required cgroup v1 cpuset subsystem not found");
 449     return NULL;
 450   }
 451   if (cpu == NULL) {
 452     log_debug(os, container)("Required cgroup v1 cpu subsystem not found");
 453     return NULL;
 454   }
 455   if (cpuacct == NULL) {
 456     log_debug(os, container)("Required cgroup v1 cpuacct subsystem not found");
 457     return NULL;
 458   }
 459 
 460   /*
 461    * Use info gathered previously from /proc/self/cgroup
 462    * and map host mount point to
 463    * local one via /proc/self/mountinfo content above
 464    *
 465    * Docker example:
 466    * 5:memory:/docker/6558aed8fc662b194323ceab5b964f69cf36b3e8af877a14b80256e93aecb044
 467    *
 468    * Host example:
 469    * 5:memory:/user.slice
 470    *
 471    * Construct a path to the process specific memory and cpuset
 472    * cgroup directory.
 473    *
 474    * For a container running under Docker from memory example above
 475    * the paths would be:
 476    *
 477    * /sys/fs/cgroup/memory
 478    *
 479    * For a Host from memory example above the path would be:
 480    *
 481    * /sys/fs/cgroup/memory/user.slice
 482    *
 483    */
 484   for (int i = 0; i < CG_INFO_LENGTH; i++) {
 485     CgroupInfo info = cg_infos[i];
 486     if (strcmp(info._name, "memory") == 0) {
 487       memory->set_subsystem_path(info._cgroup_path);
 488     } else if (strcmp(info._name, "cpuset") == 0) {
 489       cpuset->set_subsystem_path(info._cgroup_path);
 490     } else if (strcmp(info._name, "cpu") == 0) {
 491       cpu->set_subsystem_path(info._cgroup_path);
 492     } else if (strcmp(info._name, "cpuacct") == 0) {
 493       cpuacct->set_subsystem_path(info._cgroup_path);
 494     }
 495   }
 496   return new CgroupV1Subsystem(cpuset, cpu, cpuacct, memory);
 497 }
 498 
 499 /* available_memory
 500  *
 501  * Return the available memory for this process.
 502  *
 503  * return:
 504  *    available memory in bytes or
 505  *    -1 for unlimited
 506  *    OSCONTAINER_ERROR for not supported
 507  */
 508 jlong CgroupSubsystem::available_memory() {
 509   jlong mem_limit, mem_usage, avail_mem;
 510   if ((mem_limit = memory_limit_in_bytes()) < 1) {
 511     log_debug(os, container)("container memory limit %s: " JLONG_FORMAT ", using host value",
 512                            mem_limit == OSCONTAINER_ERROR ? "failed" : "unlimited", mem_limit);
 513     return mem_limit; // error case
 514   }
 515   if ((mem_usage = memory_usage_in_bytes()) < 1) {
 516     log_debug(os, container)("container memory usage failed: " JLONG_FORMAT ", using host value", mem_usage);
 517     return mem_usage; // error case
 518   }
 519   avail_mem = mem_limit > mem_usage ? mem_limit - mem_usage : 0;
 520   log_trace(os)("available container memory: " JLONG_FORMAT, avail_mem);
 521   return avail_mem;
 522 }
 523 
 524 /* memory_limit_in_bytes
 525  *
 526  * Return the limit of available memory for this process.
 527  *
 528  * return:
 529  *    memory limit in bytes or
 530  *    -1 for unlimited
 531  *    OSCONTAINER_ERROR for not supported
 532  */
 533 jlong CgroupV1Subsystem::memory_limit_in_bytes() {
 534   GET_CONTAINER_INFO(julong, _memory, "/memory.limit_in_bytes",
 535                      "Memory Limit is: " JULONG_FORMAT, JULONG_FORMAT, memlimit);
 536 
 537   if (memlimit >= _unlimited_memory) {
 538     log_trace(os, container)("Non-Hierarchical Memory Limit is: Unlimited");
 539     if (_memory->is_hierarchical()) {
 540       const char* matchline = "hierarchical_memory_limit";
 541       const char* format = "%s " JULONG_FORMAT;
 542       GET_CONTAINER_INFO_LINE(julong, _memory, "/memory.stat", matchline,
 543                              "Hierarchical Memory Limit is: " JULONG_FORMAT, format, hier_memlimit)
 544       if (hier_memlimit >= _unlimited_memory) {
 545         log_trace(os, container)("Hierarchical Memory Limit is: Unlimited");
 546       } else {
 547         return (jlong)hier_memlimit;
 548       }
 549     }
 550     return (jlong)-1;
 551   }
 552   else {
 553     return (jlong)memlimit;
 554   }
 555 }
 556 
 557 jlong CgroupV1Subsystem::memory_and_swap_limit_in_bytes() {
 558   GET_CONTAINER_INFO(julong, _memory, "/memory.memsw.limit_in_bytes",
 559                      "Memory and Swap Limit is: " JULONG_FORMAT, JULONG_FORMAT, memswlimit);
 560   if (memswlimit >= _unlimited_memory) {
 561     log_trace(os, container)("Non-Hierarchical Memory and Swap Limit is: Unlimited");
 562     if (_memory->is_hierarchical()) {
 563       const char* matchline = "hierarchical_memsw_limit";
 564       const char* format = "%s " JULONG_FORMAT;
 565       GET_CONTAINER_INFO_LINE(julong, _memory, "/memory.stat", matchline,
 566                              "Hierarchical Memory and Swap Limit is : " JULONG_FORMAT, format, hier_memlimit)
 567       if (hier_memlimit >= _unlimited_memory) {
 568         log_trace(os, container)("Hierarchical Memory and Swap Limit is: Unlimited");
 569       } else {
 570         return (jlong)hier_memlimit;
 571       }
 572     }
 573     return (jlong)-1;
 574   } else {
 575     return (jlong)memswlimit;
 576   }
 577 }
 578 
 579 jlong CgroupV1Subsystem::memory_soft_limit_in_bytes() {
 580   GET_CONTAINER_INFO(julong, _memory, "/memory.soft_limit_in_bytes",
 581                      "Memory Soft Limit is: " JULONG_FORMAT, JULONG_FORMAT, memsoftlimit);
 582   if (memsoftlimit >= _unlimited_memory) {
 583     log_trace(os, container)("Memory Soft Limit is: Unlimited");
 584     return (jlong)-1;
 585   } else {
 586     return (jlong)memsoftlimit;
 587   }
 588 }
 589 
 590 /* memory_usage_in_bytes
 591  *
 592  * Return the amount of used memory for this process.
 593  *
 594  * return:
 595  *    memory usage in bytes or
 596  *    -1 for unlimited
 597  *    OSCONTAINER_ERROR for not supported
 598  */
 599 jlong CgroupV1Subsystem::memory_usage_in_bytes() {
 600   GET_CONTAINER_INFO(jlong, _memory, "/memory.usage_in_bytes",
 601                      "Memory Usage is: " JLONG_FORMAT, JLONG_FORMAT, memusage);
 602   return memusage;
 603 }
 604 
 605 /* memory_max_usage_in_bytes
 606  *
 607  * Return the maximum amount of used memory for this process.
 608  *
 609  * return:
 610  *    max memory usage in bytes or
 611  *    OSCONTAINER_ERROR for not supported
 612  */
 613 jlong CgroupV1Subsystem::memory_max_usage_in_bytes() {
 614   GET_CONTAINER_INFO(jlong, _memory, "/memory.max_usage_in_bytes",
 615                      "Maximum Memory Usage is: " JLONG_FORMAT, JLONG_FORMAT, memmaxusage);
 616   return memmaxusage;
 617 }
 618 
 619 char * CgroupV1Subsystem::cpu_cpuset_cpus() {
 620   GET_CONTAINER_INFO_CPTR(cptr, _cpuset, "/cpuset.cpus",
 621                      "cpuset.cpus is: %s", "%1023s", cpus, 1024);
 622   return os::strdup(cpus);
 623 }
 624 
 625 char * CgroupV1Subsystem::cpu_cpuset_memory_nodes() {
 626   GET_CONTAINER_INFO_CPTR(cptr, _cpuset, "/cpuset.mems",
 627                      "cpuset.mems is: %s", "%1023s", mems, 1024);
 628   return os::strdup(mems);
 629 }
 630 
 631 /* cpu_quota
 632  *
 633  * Return the number of milliseconds per period
 634  * process is guaranteed to run.
 635  *
 636  * return:
 637  *    quota time in milliseconds
 638  *    -1 for no quota
 639  *    OSCONTAINER_ERROR for not supported
 640  */
 641 int CgroupV1Subsystem::cpu_quota() {
 642   GET_CONTAINER_INFO(int, _cpu, "/cpu.cfs_quota_us",
 643                      "CPU Quota is: %d", "%d", quota);
 644   return quota;
 645 }
 646 
 647 int CgroupV1Subsystem::cpu_period() {
 648   GET_CONTAINER_INFO(int, _cpu, "/cpu.cfs_period_us",
 649                      "CPU Period is: %d", "%d", period);
 650   return period;
 651 }
 652 
 653 /* cpu_shares
 654  *
 655  * Return the amount of cpu shares available to the process
 656  *
 657  * return:
 658  *    Share number (typically a number relative to 1024)
 659  *                 (2048 typically expresses 2 CPUs worth of processing)
 660  *    -1 for no share setup
 661  *    OSCONTAINER_ERROR for not supported
 662  */
 663 int CgroupV1Subsystem::cpu_shares() {
 664   GET_CONTAINER_INFO(int, _cpu, "/cpu.shares",
 665                      "CPU Shares is: %d", "%d", shares);
 666   // Convert 1024 to no shares setup
 667   if (shares == 1024) return -1;
 668 
 669   return shares;
 670 }
 671 
 672 /* active_processor_count
 673  *
 674  * Calculate an appropriate number of active processors for the
 675  * VM to use based on these three inputs.
 676  *
 677  * cpu affinity
 678  * cgroup cpu quota & cpu period
 679  * cgroup cpu shares
 680  *
 681  * Algorithm:
 682  *
 683  * Determine the number of available CPUs from sched_getaffinity
 684  *
 685  * If user specified a quota (quota != -1), calculate the number of
 686  * required CPUs by dividing quota by period.
 687  *
 688  * If shares are in effect (shares != -1), calculate the number
 689  * of CPUs required for the shares by dividing the share value
 690  * by PER_CPU_SHARES.
 691  *
 692  * All results of division are rounded up to the next whole number.
 693  *
 694  * If neither shares or quotas have been specified, return the
 695  * number of active processors in the system.
 696  *
 697  * If both shares and quotas have been specified, the results are
 698  * based on the flag PreferContainerQuotaForCPUCount.  If true,
 699  * return the quota value.  If false return the smallest value
 700  * between shares or quotas.
 701  *
 702  * If shares and/or quotas have been specified, the resulting number
 703  * returned will never exceed the number of active processors.
 704  *
 705  * return:
 706  *    number of CPUs
 707  */
 708 int CgroupSubsystem::active_processor_count(int physical_proc_count) {
 709   int quota_count = 0, share_count = 0;
 710   int cpu_count, limit_count;
 711   int result;
 712 
 713   cpu_count = limit_count = physical_proc_count;
 714   int quota  = cpu_quota();
 715   int period = cpu_period();
 716   int share  = cpu_shares();
 717 
 718   if (quota > -1 && period > 0) {
 719     quota_count = ceilf((float)quota / (float)period);
 720     log_trace(os, container)("CPU Quota count based on quota/period: %d", quota_count);
 721   }
 722   if (share > -1) {
 723     share_count = ceilf((float)share / (float)PER_CPU_SHARES);
 724     log_trace(os, container)("CPU Share count based on shares: %d", share_count);
 725   }
 726 
 727   // If both shares and quotas are setup results depend
 728   // on flag PreferContainerQuotaForCPUCount.
 729   // If true, limit CPU count to quota
 730   // If false, use minimum of shares and quotas
 731   if (quota_count !=0 && share_count != 0) {
 732     if (PreferContainerQuotaForCPUCount) {
 733       limit_count = quota_count;
 734     } else {
 735       limit_count = MIN2(quota_count, share_count);
 736     }
 737   } else if (quota_count != 0) {
 738     limit_count = quota_count;
 739   } else if (share_count != 0) {
 740     limit_count = share_count;
 741   }
 742 
 743   result = MIN2(cpu_count, limit_count);
 744   log_trace(os, container)("OSContainer::active_processor_count: %d", result);
 745   return result;
 746 }
 747 
 748 void CgroupSubsystem::print_container_info(outputStream* st, int physical_proc_count) {
 749   st->print("container (cgroup) information:\n");
 750 
 751   const char *p_ct = container_type();
 752   st->print("container_type: %s\n", p_ct != NULL ? p_ct : "not supported");
 753 
 754   char *p = cpu_cpuset_cpus();
 755   st->print("cpu_cpuset_cpus: %s\n", p != NULL ? p : "not supported");
 756   os::free(p);
 757 
 758   p = cpu_cpuset_memory_nodes();
 759   st->print("cpu_memory_nodes: %s\n", p != NULL ? p : "not supported");
 760   os::free(p);
 761 
 762   int i = active_processor_count(physical_proc_count);
 763   st->print("active_processor_count: ");
 764   if (i > 0) {
 765     st->print("%d\n", i);
 766   } else {
 767     st->print("not supported\n");
 768   }
 769 
 770   i = cpu_quota();
 771   st->print("cpu_quota: ");
 772   if (i > 0) {
 773     st->print("%d\n", i);
 774   } else {
 775     st->print("%s\n", i == OSCONTAINER_ERROR ? "not supported" : "no quota");
 776   }
 777 
 778   i = cpu_period();
 779   st->print("cpu_period: ");
 780   if (i > 0) {
 781     st->print("%d\n", i);
 782   } else {
 783     st->print("%s\n", i == OSCONTAINER_ERROR ? "not supported" : "no period");
 784   }
 785 
 786   i = cpu_shares();
 787   st->print("cpu_shares: ");
 788   if (i > 0) {
 789     st->print("%d\n", i);
 790   } else {
 791     st->print("%s\n", i == OSCONTAINER_ERROR ? "not supported" : "no shares");
 792   }
 793 
 794   jlong j = memory_limit_in_bytes();
 795   st->print("memory_limit_in_bytes: ");
 796   if (j > 0) {
 797     st->print(JLONG_FORMAT "\n", j);
 798   } else {
 799     st->print("%s\n", j == OSCONTAINER_ERROR ? "not supported" : "unlimited");
 800   }
 801 
 802   j = memory_and_swap_limit_in_bytes();
 803   st->print("memory_and_swap_limit_in_bytes: ");
 804   if (j > 0) {
 805     st->print(JLONG_FORMAT "\n", j);
 806   } else {
 807     st->print("%s\n", j == OSCONTAINER_ERROR ? "not supported" : "unlimited");
 808   }
 809 
 810   j = memory_soft_limit_in_bytes();
 811   st->print("memory_soft_limit_in_bytes: ");
 812   if (j > 0) {
 813     st->print(JLONG_FORMAT "\n", j);
 814   } else {
 815     st->print("%s\n", j == OSCONTAINER_ERROR ? "not supported" : "unlimited");
 816   }
 817 
 818   j = memory_usage_in_bytes();
 819   st->print("memory_usage_in_bytes: ");
 820   if (j > 0) {
 821     st->print(JLONG_FORMAT "\n", j);
 822   } else {
 823     st->print("%s\n", j == OSCONTAINER_ERROR ? "not supported" : "unlimited");
 824   }
 825 
 826   j = memory_max_usage_in_bytes();
 827   st->print("memory_max_usage_in_bytes: ");
 828   if (j > 0) {
 829     st->print(JLONG_FORMAT "\n", j);
 830   } else {
 831     st->print("%s\n", j == OSCONTAINER_ERROR ? "not supported" : "unlimited");
 832   }
 833   st->cr();
 834 }
 835 
 836 /* cpu_shares
 837  *
 838  * Return the amount of cpu shares available to the process
 839  *
 840  * return:
 841  *    Share number (typically a number relative to 1024)
 842  *                 (2048 typically expresses 2 CPUs worth of processing)
 843  *    -1 for no share setup
 844  *    OSCONTAINER_ERROR for not supported
 845  */
 846 int CgroupV2Subsystem::cpu_shares() {
 847   GET_CONTAINER_INFO(int, _unified, "/cpu.weight",
 848                      "CPU Shares is: %d", "%d", shares);
 849   // Convert default value of 100 to no shares setup
 850   if (shares == 100) return -1;
 851 
 852   // CPU shares (OCI) value needs to get translated into
 853   // a proper Cgroups v2 value. See:
 854   // https://github.com/containers/crun/blob/master/crun.1.md#cpu-controller
 855   //
 856   // Use the inverse of (x == OCI value, y == cgroupsv2 value):
 857   // ((262142 * y - 1)/9999) + 2 = x
 858   //
 859   int x = 262142 * shares - 1;
 860   double frac = x/9999.0;
 861   x = ((int)frac) + 2;
 862   log_trace(os, container)("Scaled CPU Shares value is: %d", x);
 863   // Since the scaled value is not precise, return the closest
 864   // multiple of PER_CPU_SHARES for a more conservative mapping
 865   if ( x <= PER_CPU_SHARES ) {
 866      // will always map to 1 CPU
 867      return x;
 868   }
 869   int f = x/PER_CPU_SHARES;
 870   int lower_multiple = f * PER_CPU_SHARES;
 871   int upper_multiple = (f + 1) * PER_CPU_SHARES;
 872   int distance_lower = MAX2(lower_multiple, x) - MIN2(lower_multiple, x);
 873   int distance_upper = MAX2(upper_multiple, x) - MIN2(upper_multiple, x);
 874   x = distance_lower <= distance_upper ? lower_multiple : upper_multiple;
 875   log_trace(os, container)("Closest multiple of %d of the CPU Shares value is: %d", PER_CPU_SHARES, x);
 876   return x;
 877 }
 878 
/* cpu_quota
 *
 * Return the number of microseconds per period
 * process is guaranteed to run.
 *
 * return:
 *    quota time in microseconds
 *    -1 for no quota
 *    OSCONTAINER_ERROR for not supported
 */
 889 int CgroupV2Subsystem::cpu_quota() {
 890   char * cpu_quota_str = cpu_quota_val();
 891   return (int)limit_from_str(cpu_quota_str);
 892 }
 893 
 894 char * CgroupV2Subsystem::cpu_cpuset_cpus() {
 895   GET_CONTAINER_INFO_CPTR(cptr, _unified, "/cpuset.cpus",
 896                      "cpuset.cpus is: %s", "%1023s", cpus, 1024);
 897   if (cpus == NULL) {
 898     return NULL;
 899   }
 900   return os::strdup(cpus);
 901 }
 902 
 903 char* CgroupV2Subsystem::cpu_quota_val() {
 904   GET_CONTAINER_INFO_CPTR(cptr, _unified, "/cpu.max",
 905                      "CPU Quota is: %s", "%s %*d", quota, 1024);
 906   if (quota == NULL) {
 907     return NULL;
 908   }
 909   return os::strdup(quota);
 910 }
 911 
 912 char * CgroupV2Subsystem::cpu_cpuset_memory_nodes() {
 913   GET_CONTAINER_INFO_CPTR(cptr, _unified, "/cpuset.mems",
 914                      "cpuset.mems is: %s", "%1023s", mems, 1024);
 915   if (mems == NULL) {
 916     return NULL;
 917   }
 918   return os::strdup(mems);
 919 }
 920 
 921 int CgroupV2Subsystem::cpu_period() {
 922   GET_CONTAINER_INFO(int, _unified, "/cpu.max",
 923                      "CPU Period is: %d", "%*s %d", period);
 924   return period;
 925 }
 926 
/* memory_usage_in_bytes
 *
 * Return the amount of memory used by this cgroup and its descendants
 *
 * return:
 *    memory usage in bytes or
 *    -1 for unlimited
 *    OSCONTAINER_ERROR for not supported
 */
 936 jlong CgroupV2Subsystem::memory_usage_in_bytes() {
 937   GET_CONTAINER_INFO(jlong, _unified, "/memory.current",
 938                      "Memory Usage is: " JLONG_FORMAT, JLONG_FORMAT, memusage);
 939   return memusage;
 940 }
 941 
 942 jlong CgroupV2Subsystem::memory_soft_limit_in_bytes() {
 943   char* mem_soft_limit_str = mem_soft_limit_val();
 944   return limit_from_str(mem_soft_limit_str);
 945 }
 946 
 947 jlong CgroupV2Subsystem::memory_max_usage_in_bytes() {
 948   return OSCONTAINER_ERROR; // Not supported for Cgroups V2.
 949 }
 950 
 951 char* CgroupV2Subsystem::mem_soft_limit_val() {
 952   GET_CONTAINER_INFO_CPTR(cptr, _unified, "/memory.high",
 953                          "Memory Soft Limit is: %s", "%s", mem_soft_limit_str, 1024);
 954   if (mem_soft_limit_str == NULL) {
 955     return NULL;
 956   }
 957   return os::strdup(mem_soft_limit_str);
 958 }
 959 
 960 jlong CgroupV2Subsystem::memory_and_swap_limit_in_bytes() {
 961   char* mem_swp_limit_str = mem_swp_limit_val();
 962   return limit_from_str(mem_swp_limit_str);
 963 }
 964 
 965 char* CgroupV2Subsystem::mem_swp_limit_val() {
 966   GET_CONTAINER_INFO_CPTR(cptr, _unified, "/memory.swap.max",
 967                          "Memory and Swap Limit is: %s", "%s", mem_swp_limit_str, 1024);
 968   if (mem_swp_limit_str == NULL) {
 969     return NULL;
 970   }
 971   return os::strdup(mem_swp_limit_str);
 972 }
 973 
 974 /* memory_limit_in_bytes
 975  *
 976  * Return the limit of available memory for this process.
 977  *
 978  * return:
 979  *    memory limit in bytes or
 980  *    -1 for unlimited, OSCONTAINER_ERROR for an error
 981  */
 982 jlong CgroupV2Subsystem::memory_limit_in_bytes() {
 983   char * mem_limit_str = mem_limit_val();
 984   return limit_from_str(mem_limit_str);
 985 }
 986 
 987 jlong CgroupV2Subsystem::limit_from_str(char* limit_str) {
 988   if (limit_str == NULL) {
 989     return OSCONTAINER_ERROR;
 990   }
 991   // Unlimited memory in Cgroups V2 is the literal string 'max'
 992   if (strcmp("max", limit_str) == 0) {
 993     os::free(limit_str);
 994     return (jlong)-1;
 995   }
 996   julong limit;
 997   if (sscanf(limit_str, JULONG_FORMAT, &limit) != 1) {
 998     os::free(limit_str);
 999     return OSCONTAINER_ERROR;
1000   }
1001   os::free(limit_str);
1002   return (jlong)limit;
1003 }
1004 
1005 char* CgroupV2Subsystem::mem_limit_val() {
1006   GET_CONTAINER_INFO_CPTR(cptr, _unified, "/memory.max",
1007                          "Memory Limit is: %s", "%s", mem_limit_str, 1024);
1008   if (mem_limit_str == NULL) {
1009     return NULL;
1010   }
1011   return os::strdup(mem_limit_str);
1012 }
1013 
1014 char* CgroupV2Controller::construct_path(char* mount_path, char *cgroup_path) {
1015   char buf[MAXPATHLEN+1];
1016   int buflen;
1017   strncpy(buf, mount_path, MAXPATHLEN);
1018   buf[MAXPATHLEN] = '\0';
1019   buflen = strlen(buf);
1020   if ((buflen + strlen(cgroup_path)) > MAXPATHLEN) {
1021     return NULL;
1022   }
1023   strncat(buf, cgroup_path, MAXPATHLEN-buflen);
1024   buf[MAXPATHLEN] = '\0';
1025   return os::strdup(buf);
1026 }
1027