1 /*
   2  * Copyright (c) 2017, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include <string.h>
  26 #include <math.h>
  27 #include <errno.h>
  28 #include "utilities/globalDefinitions.hpp"
  29 #include "memory/allocation.hpp"
  30 #include "runtime/os.hpp"
  31 #include "logging/log.hpp"
  32 #include "osContainer_linux.hpp"
  33 
  34 /*
  35  * PER_CPU_SHARES has been set to 1024 because CPU shares' quota
  36  * is commonly used in cloud frameworks like Kubernetes[1],
  37  * AWS[2] and Mesos[3] in a similar way. They spawn containers with
  38  * --cpu-shares option values scaled by PER_CPU_SHARES. Thus, we do
  39  * the inverse for determining the number of possible available
  40  * CPUs to the JVM inside a container. See JDK-8216366.
  41  *
  42  * [1] https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu
  43  *     In particular:
  44  *        When using Docker:
  45  *          The spec.containers[].resources.requests.cpu is converted to its core value, which is potentially
  46  *          fractional, and multiplied by 1024. The greater of this number or 2 is used as the value of the
  47  *          --cpu-shares flag in the docker run command.
  48  * [2] https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_ContainerDefinition.html
  49  * [3] https://github.com/apache/mesos/blob/3478e344fb77d931f6122980c6e94cd3913c441d/src/docker/docker.cpp#L648
  50  *     https://github.com/apache/mesos/blob/3478e344fb77d931f6122980c6e94cd3913c441d/src/slave/containerizer/mesos/isolators/cgroups/constants.hpp#L30
  51  */
#define PER_CPU_SHARES 1024

// Set to true at the start of OSContainer::init(), whether or not a
// container is subsequently detected.
bool  OSContainer::_is_initialized   = false;
// Set to true at the very end of OSContainer::init(); every early return
// from init() leaves it false.
bool  OSContainer::_is_containerized = false;
// Memory limits read from the cgroup files that are >= this value are
// reported as "unlimited". Assigned in OSContainer::init() as LONG_MAX
// rounded down to a multiple of os::vm_page_size().
julong _unlimited_memory;
  57 
  58 class CgroupSubsystem: CHeapObj<mtInternal> {
  59  friend class OSContainer;
  60 
  61  private:
  62     /* mountinfo contents */
  63     char *_root;
  64     char *_mount_point;
  65 
  66     /* Constructed subsystem directory */
  67     char *_path;
  68 
  69  public:
  70     CgroupSubsystem(char *root, char *mountpoint) {
  71       _root = os::strdup(root);
  72       _mount_point = os::strdup(mountpoint);
  73       _path = NULL;
  74     }
  75 
  76     /*
  77      * Set directory to subsystem specific files based
  78      * on the contents of the mountinfo and cgroup files.
  79      */
  80     void set_subsystem_path(char *cgroup_path) {
  81       char buf[MAXPATHLEN+1];
  82       if (_root != NULL && cgroup_path != NULL) {
  83         if (strcmp(_root, "/") == 0) {
  84           int buflen;
  85           strncpy(buf, _mount_point, MAXPATHLEN);
  86           buf[MAXPATHLEN-1] = '\0';
  87           if (strcmp(cgroup_path,"/") != 0) {
  88             buflen = strlen(buf);
  89             if ((buflen + strlen(cgroup_path)) > (MAXPATHLEN-1)) {
  90               return;
  91             }
  92             strncat(buf, cgroup_path, MAXPATHLEN-buflen);
  93             buf[MAXPATHLEN-1] = '\0';
  94           }
  95           _path = os::strdup(buf);
  96         } else {
  97           if (strcmp(_root, cgroup_path) == 0) {
  98             strncpy(buf, _mount_point, MAXPATHLEN);
  99             buf[MAXPATHLEN-1] = '\0';
 100             _path = os::strdup(buf);
 101           } else {
 102             char *p = strstr(cgroup_path, _root);
 103             if (p != NULL && p == _root) {
 104               if (strlen(cgroup_path) > strlen(_root)) {
 105                 int buflen;
 106                 strncpy(buf, _mount_point, MAXPATHLEN);
 107                 buf[MAXPATHLEN-1] = '\0';
 108                 buflen = strlen(buf);
 109                 if ((buflen + strlen(cgroup_path) - strlen(_root)) > (MAXPATHLEN-1)) {
 110                   return;
 111                 }
 112                 strncat(buf, cgroup_path + strlen(_root), MAXPATHLEN-buflen);
 113                 buf[MAXPATHLEN-1] = '\0';
 114                 _path = os::strdup(buf);
 115               }
 116             }
 117           }
 118         }
 119       }
 120     }
 121 
 122     char *subsystem_path() { return _path; }
 123 };
 124 
// One descriptor per cgroup controller used by OSContainer. Each pointer is
// assigned in OSContainer::init() when the controller name is found in a
// /proc/self/mountinfo entry and is NULL until then.
CgroupSubsystem* memory = NULL;
CgroupSubsystem* cpuset = NULL;
CgroupSubsystem* cpu = NULL;
CgroupSubsystem* cpuacct = NULL;

// Type name handed to GET_CONTAINER_INFO_CPTR as its return_type argument;
// the macro casts NULL to it on failure.
typedef char * cptr;
 131 
 132 PRAGMA_DIAG_PUSH
 133 PRAGMA_FORMAT_NONLITERAL_IGNORED
 134 template <typename T> int subsystem_file_contents(CgroupSubsystem* c,
 135                                               const char *filename,
 136                                               const char *scan_fmt,
 137                                               T returnval) {
 138   FILE *fp = NULL;
 139   char *p;
 140   char file[MAXPATHLEN+1];
 141   char buf[MAXPATHLEN+1];
 142 
 143   if (c == NULL) {
 144     log_debug(os, container)("subsystem_file_contents: CgroupSubsytem* is NULL");
 145     return OSCONTAINER_ERROR;
 146   }
 147   if (c->subsystem_path() == NULL) {
 148     log_debug(os, container)("subsystem_file_contents: subsystem path is NULL");
 149     return OSCONTAINER_ERROR;
 150   }
 151 
 152   strncpy(file, c->subsystem_path(), MAXPATHLEN);
 153   file[MAXPATHLEN-1] = '\0';
 154   int filelen = strlen(file);
 155   if ((filelen + strlen(filename)) > (MAXPATHLEN-1)) {
 156     log_debug(os, container)("File path too long %s, %s", file, filename);
 157     return OSCONTAINER_ERROR;
 158   }
 159   strncat(file, filename, MAXPATHLEN-filelen);
 160   log_trace(os, container)("Path to %s is %s", filename, file);
 161   fp = fopen(file, "r");
 162   if (fp != NULL) {
 163     p = fgets(buf, MAXPATHLEN, fp);
 164     if (p != NULL) {
 165       int matched = sscanf(p, scan_fmt, returnval);
 166       if (matched == 1) {
 167         fclose(fp);
 168         return 0;
 169       } else {
 170         log_debug(os, container)("Type %s not found in file %s", scan_fmt, file);
 171       }
 172     } else {
 173       log_debug(os, container)("Empty file %s", file);
 174     }
 175   } else {
 176     log_debug(os, container)("Open of file %s failed, %s", file, os::strerror(errno));
 177   }
 178   if (fp != NULL)
 179     fclose(fp);
 180   return OSCONTAINER_ERROR;
 181 }
 182 PRAGMA_DIAG_POP
 183 
 184 #define GET_CONTAINER_INFO(return_type, subsystem, filename,              \
 185                            logstring, scan_fmt, variable)                 \
 186   return_type variable;                                                   \
 187 {                                                                         \
 188   int err;                                                                \
 189   err = subsystem_file_contents(subsystem,                                \
 190                                 filename,                                 \
 191                                 scan_fmt,                                 \
 192                                 &variable);                               \
 193   if (err != 0)                                                           \
 194     return (return_type) OSCONTAINER_ERROR;                               \
 195                                                                           \
 196   log_trace(os, container)(logstring, variable);                          \
 197 }
 198 
 199 #define GET_CONTAINER_INFO_CPTR(return_type, subsystem, filename,         \
 200                                logstring, scan_fmt, variable, bufsize)    \
 201   char variable[bufsize];                                                 \
 202 {                                                                         \
 203   int err;                                                                \
 204   err = subsystem_file_contents(subsystem,                                \
 205                                 filename,                                 \
 206                                 scan_fmt,                                 \
 207                                 variable);                                \
 208   if (err != 0)                                                           \
 209     return (return_type) NULL;                                            \
 210                                                                           \
 211   log_trace(os, container)(logstring, variable);                          \
 212 }
 213 
 214 /* init
 215  *
 216  * Initialize the container support and determine if
 217  * we are running under cgroup control.
 218  */
 219 void OSContainer::init() {
 220   FILE *mntinfo = NULL;
 221   FILE *cgroup = NULL;
 222   char buf[MAXPATHLEN+1];
 223   char tmproot[MAXPATHLEN+1];
 224   char tmpmount[MAXPATHLEN+1];
 225   char *p;
 226   jlong mem_limit;
 227 
 228   assert(!_is_initialized, "Initializing OSContainer more than once");
 229 
 230   _is_initialized = true;
 231   _is_containerized = false;
 232 
 233   _unlimited_memory = (LONG_MAX / os::vm_page_size()) * os::vm_page_size();
 234 
 235   log_trace(os, container)("OSContainer::init: Initializing Container Support");
 236   if (!UseContainerSupport) {
 237     log_trace(os, container)("Container Support not enabled");
 238     return;
 239   }
 240 
 241   /*
 242    * Find the cgroup mount point for memory and cpuset
 243    * by reading /proc/self/mountinfo
 244    *
 245    * Example for docker:
 246    * 219 214 0:29 /docker/7208cebd00fa5f2e342b1094f7bed87fa25661471a4637118e65f1c995be8a34 /sys/fs/cgroup/memory ro,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory
 247    *
 248    * Example for host:
 249    * 34 28 0:29 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,memory
 250    */
 251   mntinfo = fopen("/proc/self/mountinfo", "r");
 252   if (mntinfo == NULL) {
 253       log_debug(os, container)("Can't open /proc/self/mountinfo, %s",
 254                                os::strerror(errno));
 255       return;
 256   }
 257 
 258   while ((p = fgets(buf, MAXPATHLEN, mntinfo)) != NULL) {
 259     char tmpcgroups[MAXPATHLEN+1];
 260     char *cptr = tmpcgroups;
 261     char *token;
 262 
 263     // mountinfo format is documented at https://www.kernel.org/doc/Documentation/filesystems/proc.txt
 264     if (sscanf(p, "%*d %*d %*d:%*d %s %s %*[^-]- cgroup %*s %s", tmproot, tmpmount, tmpcgroups) != 3) {
 265       continue;
 266     }
 267     while ((token = strsep(&cptr, ",")) != NULL) {
 268       if (strcmp(token, "memory") == 0) {
 269         memory = new CgroupSubsystem(tmproot, tmpmount);
 270       } else if (strcmp(token, "cpuset") == 0) {
 271         cpuset = new CgroupSubsystem(tmproot, tmpmount);
 272       } else if (strcmp(token, "cpu") == 0) {
 273         cpu = new CgroupSubsystem(tmproot, tmpmount);
 274       } else if (strcmp(token, "cpuacct") == 0) {
 275         cpuacct= new CgroupSubsystem(tmproot, tmpmount);
 276       }
 277     }
 278   }
 279 
 280   fclose(mntinfo);
 281 
 282   if (memory == NULL) {
 283     log_debug(os, container)("Required cgroup memory subsystem not found");
 284     return;
 285   }
 286   if (cpuset == NULL) {
 287     log_debug(os, container)("Required cgroup cpuset subsystem not found");
 288     return;
 289   }
 290   if (cpu == NULL) {
 291     log_debug(os, container)("Required cgroup cpu subsystem not found");
 292     return;
 293   }
 294   if (cpuacct == NULL) {
 295     log_debug(os, container)("Required cgroup cpuacct subsystem not found");
 296     return;
 297   }
 298 
 299   /*
 300    * Read /proc/self/cgroup and map host mount point to
 301    * local one via /proc/self/mountinfo content above
 302    *
 303    * Docker example:
 304    * 5:memory:/docker/6558aed8fc662b194323ceab5b964f69cf36b3e8af877a14b80256e93aecb044
 305    *
 306    * Host example:
 307    * 5:memory:/user.slice
 308    *
 309    * Construct a path to the process specific memory and cpuset
 310    * cgroup directory.
 311    *
 312    * For a container running under Docker from memory example above
 313    * the paths would be:
 314    *
 315    * /sys/fs/cgroup/memory
 316    *
 317    * For a Host from memory example above the path would be:
 318    *
 319    * /sys/fs/cgroup/memory/user.slice
 320    *
 321    */
 322   cgroup = fopen("/proc/self/cgroup", "r");
 323   if (cgroup == NULL) {
 324     log_debug(os, container)("Can't open /proc/self/cgroup, %s",
 325                              os::strerror(errno));
 326     return;
 327   }
 328 
 329   while ((p = fgets(buf, MAXPATHLEN, cgroup)) != NULL) {
 330     char *controllers;
 331     char *token;
 332     char *base;
 333 
 334     /* Skip cgroup number */
 335     strsep(&p, ":");
 336     /* Get controllers and base */
 337     controllers = strsep(&p, ":");
 338     base = strsep(&p, "\n");
 339 
 340     if (controllers == NULL) {
 341       continue;
 342     }
 343 
 344     while ((token = strsep(&controllers, ",")) != NULL) {
 345       if (strcmp(token, "memory") == 0) {
 346         memory->set_subsystem_path(base);
 347       } else if (strcmp(token, "cpuset") == 0) {
 348         cpuset->set_subsystem_path(base);
 349       } else if (strcmp(token, "cpu") == 0) {
 350         cpu->set_subsystem_path(base);
 351       } else if (strcmp(token, "cpuacct") == 0) {
 352         cpuacct->set_subsystem_path(base);
 353       }
 354     }
 355   }
 356 
 357   fclose(cgroup);
 358 
 359   // We need to update the amount of physical memory now that
 360   // command line arguments have been processed.
 361   if ((mem_limit = memory_limit_in_bytes()) > 0) {
 362     os::Linux::set_physical_memory(mem_limit);
 363   }
 364 
 365   _is_containerized = true;
 366 
 367 }
 368 
 369 const char * OSContainer::container_type() {
 370   if (is_containerized()) {
 371     return "cgroupv1";
 372   } else {
 373     return NULL;
 374   }
 375 }
 376 
 377 
 378 /* memory_limit_in_bytes
 379  *
 380  * Return the limit of available memory for this process.
 381  *
 382  * return:
 383  *    memory limit in bytes or
 384  *    -1 for unlimited
 385  *    OSCONTAINER_ERROR for not supported
 386  */
 387 jlong OSContainer::memory_limit_in_bytes() {
 388   GET_CONTAINER_INFO(julong, memory, "/memory.limit_in_bytes",
 389                      "Memory Limit is: " JULONG_FORMAT, JULONG_FORMAT, memlimit);
 390 
 391   if (memlimit >= _unlimited_memory) {
 392     log_trace(os, container)("Memory Limit is: Unlimited");
 393     return (jlong)-1;
 394   }
 395   else {
 396     return (jlong)memlimit;
 397   }
 398 }
 399 
 400 jlong OSContainer::memory_and_swap_limit_in_bytes() {
 401   GET_CONTAINER_INFO(julong, memory, "/memory.memsw.limit_in_bytes",
 402                      "Memory and Swap Limit is: " JULONG_FORMAT, JULONG_FORMAT, memswlimit);
 403   if (memswlimit >= _unlimited_memory) {
 404     log_trace(os, container)("Memory and Swap Limit is: Unlimited");
 405     return (jlong)-1;
 406   } else {
 407     return (jlong)memswlimit;
 408   }
 409 }
 410 
 411 jlong OSContainer::memory_soft_limit_in_bytes() {
 412   GET_CONTAINER_INFO(julong, memory, "/memory.soft_limit_in_bytes",
 413                      "Memory Soft Limit is: " JULONG_FORMAT, JULONG_FORMAT, memsoftlimit);
 414   if (memsoftlimit >= _unlimited_memory) {
 415     log_trace(os, container)("Memory Soft Limit is: Unlimited");
 416     return (jlong)-1;
 417   } else {
 418     return (jlong)memsoftlimit;
 419   }
 420 }
 421 
 422 /* memory_usage_in_bytes
 423  *
 424  * Return the amount of used memory for this process.
 425  *
 426  * return:
 427  *    memory usage in bytes or
 428  *    -1 for unlimited
 429  *    OSCONTAINER_ERROR for not supported
 430  */
 431 jlong OSContainer::memory_usage_in_bytes() {
 432   GET_CONTAINER_INFO(jlong, memory, "/memory.usage_in_bytes",
 433                      "Memory Usage is: " JLONG_FORMAT, JLONG_FORMAT, memusage);
 434   return memusage;
 435 }
 436 
 437 /* memory_max_usage_in_bytes
 438  *
 439  * Return the maximum amount of used memory for this process.
 440  *
 441  * return:
 442  *    max memory usage in bytes or
 443  *    OSCONTAINER_ERROR for not supported
 444  */
 445 jlong OSContainer::memory_max_usage_in_bytes() {
 446   GET_CONTAINER_INFO(jlong, memory, "/memory.max_usage_in_bytes",
 447                      "Maximum Memory Usage is: " JLONG_FORMAT, JLONG_FORMAT, memmaxusage);
 448   return memmaxusage;
 449 }
 450 
 451 /* active_processor_count
 452  *
 453  * Calculate an appropriate number of active processors for the
 454  * VM to use based on these three inputs.
 455  *
 456  * cpu affinity
 457  * cgroup cpu quota & cpu period
 458  * cgroup cpu shares
 459  *
 460  * Algorithm:
 461  *
 462  * Determine the number of available CPUs from sched_getaffinity
 463  *
 464  * If user specified a quota (quota != -1), calculate the number of
 465  * required CPUs by dividing quota by period.
 466  *
 467  * If shares are in effect (shares != -1), calculate the number
 468  * of CPUs required for the shares by dividing the share value
 469  * by PER_CPU_SHARES.
 470  *
 471  * All results of division are rounded up to the next whole number.
 472  *
 473  * If neither shares or quotas have been specified, return the
 474  * number of active processors in the system.
 475  *
 476  * If both shares and quotas have been specified, the results are
 477  * based on the flag PreferContainerQuotaForCPUCount.  If true,
 478  * return the quota value.  If false return the smallest value
 479  * between shares or quotas.
 480  *
 481  * If shares and/or quotas have been specified, the resulting number
 482  * returned will never exceed the number of active processors.
 483  *
 484  * return:
 485  *    number of CPUs
 486  */
 487 int OSContainer::active_processor_count() {
 488   int quota_count = 0, share_count = 0;
 489   int cpu_count, limit_count;
 490   int result;
 491 
 492   cpu_count = limit_count = os::Linux::active_processor_count();
 493   int quota  = cpu_quota();
 494   int period = cpu_period();
 495   int share  = cpu_shares();
 496 
 497   if (quota > -1 && period > 0) {
 498     quota_count = ceilf((float)quota / (float)period);
 499     log_trace(os, container)("CPU Quota count based on quota/period: %d", quota_count);
 500   }
 501   if (share > -1) {
 502     share_count = ceilf((float)share / (float)PER_CPU_SHARES);
 503     log_trace(os, container)("CPU Share count based on shares: %d", share_count);
 504   }
 505 
 506   // If both shares and quotas are setup results depend
 507   // on flag PreferContainerQuotaForCPUCount.
 508   // If true, limit CPU count to quota
 509   // If false, use minimum of shares and quotas
 510   if (quota_count !=0 && share_count != 0) {
 511     if (PreferContainerQuotaForCPUCount) {
 512       limit_count = quota_count;
 513     } else {
 514       limit_count = MIN2(quota_count, share_count);
 515     }
 516   } else if (quota_count != 0) {
 517     limit_count = quota_count;
 518   } else if (share_count != 0) {
 519     limit_count = share_count;
 520   }
 521 
 522   result = MIN2(cpu_count, limit_count);
 523   log_trace(os, container)("OSContainer::active_processor_count: %d", result);
 524   return result;
 525 }
 526 
 527 char * OSContainer::cpu_cpuset_cpus() {
 528   GET_CONTAINER_INFO_CPTR(cptr, cpuset, "/cpuset.cpus",
 529                      "cpuset.cpus is: %s", "%1023s", cpus, 1024);
 530   return os::strdup(cpus);
 531 }
 532 
 533 char * OSContainer::cpu_cpuset_memory_nodes() {
 534   GET_CONTAINER_INFO_CPTR(cptr, cpuset, "/cpuset.mems",
 535                      "cpuset.mems is: %s", "%1023s", mems, 1024);
 536   return os::strdup(mems);
 537 }
 538 
 539 /* cpu_quota
 540  *
 541  * Return the number of milliseconds per period
 542  * process is guaranteed to run.
 543  *
 544  * return:
 545  *    quota time in milliseconds
 546  *    -1 for no quota
 547  *    OSCONTAINER_ERROR for not supported
 548  */
 549 int OSContainer::cpu_quota() {
 550   GET_CONTAINER_INFO(int, cpu, "/cpu.cfs_quota_us",
 551                      "CPU Quota is: %d", "%d", quota);
 552   return quota;
 553 }
 554 
 555 int OSContainer::cpu_period() {
 556   GET_CONTAINER_INFO(int, cpu, "/cpu.cfs_period_us",
 557                      "CPU Period is: %d", "%d", period);
 558   return period;
 559 }
 560 
 561 /* cpu_shares
 562  *
 563  * Return the amount of cpu shares available to the process
 564  *
 565  * return:
 566  *    Share number (typically a number relative to 1024)
 567  *                 (2048 typically expresses 2 CPUs worth of processing)
 568  *    -1 for no share setup
 569  *    OSCONTAINER_ERROR for not supported
 570  */
 571 int OSContainer::cpu_shares() {
 572   GET_CONTAINER_INFO(int, cpu, "/cpu.shares",
 573                      "CPU Shares is: %d", "%d", shares);
 574   // Convert 1024 to no shares setup
 575   if (shares == 1024) return -1;
 576 
 577   return shares;
 578 }
 579