1 /*
   2  * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include <string.h>
  26 #include <math.h>
  27 #include <errno.h>
  28 #include "utilities/globalDefinitions.hpp"
  29 #include "memory/allocation.hpp"
  30 #include "runtime/os.hpp"
  31 #include "logging/log.hpp"
  32 #include "osContainer_linux.hpp"
  33 
/*
 * Threshold at or above which a cgroup memory limit is reported as
 * "unlimited" by the memory limit accessors in this file.
 *
 * Warning: Some linux distros use 0x7FFFFFFFFFFFF000 (4k pages),
 * others use 0x7FFFFFFFFFFF0000 (64k pages),
 * and others use 0x7FFFFFFFFFFFFFFF for unlimited.
 *
 * The smallest of these values is used here so that a ">=" comparison
 * recognizes all three.
 */
#define UNLIMITED_MEM CONST64(0x7FFFFFFFFFFF0000)

/* Number of cpu shares that active_processor_count() counts as one CPU. */
#define PER_CPU_SHARES 1024

/*
 * State flags maintained by OSContainer::init(): whether init() has
 * run, and whether it completed its cgroup detection successfully.
 */
bool  OSContainer::_is_initialized   = false;
bool  OSContainer::_is_containerized = false;
  45 
  46 class CgroupSubsystem: CHeapObj<mtInternal> {
  47  friend class OSContainer;
  48 
  49  private:
  50     /* mountinfo contents */
  51     char *_root;
  52     char *_mount_point;
  53 
  54     /* Constructed subsystem directory */
  55     char *_path;
  56 
  57  public:
  58     CgroupSubsystem(char *root, char *mountpoint) {
  59       _root = os::strdup(root);
  60       _mount_point = os::strdup(mountpoint);
  61       _path = NULL;
  62     }
  63 
  64     /*
  65      * Set directory to subsystem specific files based
  66      * on the contents of the mountinfo and cgroup files.
  67      */
  68     void set_subsystem_path(char *cgroup_path) {
  69       char buf[MAXPATHLEN+1];
  70       if (_root != NULL && cgroup_path != NULL) {
  71         if (strcmp(_root, "/") == 0) {
  72           int buflen;
  73           strncpy(buf, _mount_point, MAXPATHLEN);
  74           buf[MAXPATHLEN-1] = '\0';
  75           if (strcmp(cgroup_path,"/") != 0) {
  76             buflen = strlen(buf);
  77             if ((buflen + strlen(cgroup_path)) > (MAXPATHLEN-1)) {
  78               return;
  79             }
  80             strncat(buf, cgroup_path, MAXPATHLEN-buflen);
  81             buf[MAXPATHLEN-1] = '\0';
  82           }
  83           _path = os::strdup(buf);
  84         } else {
  85           if (strcmp(_root, cgroup_path) == 0) {
  86             strncpy(buf, _mount_point, MAXPATHLEN);
  87             buf[MAXPATHLEN-1] = '\0';
  88             _path = os::strdup(buf);
  89           } else {
  90             char *p = strstr(_root, cgroup_path);
  91             if (p != NULL && p == _root) {
  92               if (strlen(cgroup_path) > strlen(_root)) {
  93                 int buflen;
  94                 strncpy(buf, _mount_point, MAXPATHLEN);
  95                 buf[MAXPATHLEN-1] = '\0';
  96                 buflen = strlen(buf);
  97                 if ((buflen + strlen(cgroup_path)) > (MAXPATHLEN-1)) {
  98                   return;
  99                 }
 100                 strncat(buf, cgroup_path + strlen(_root), MAXPATHLEN-buflen);
 101                 buf[MAXPATHLEN-1] = '\0';
 102                 _path = os::strdup(buf);
 103               }
 104             }
 105           }
 106         }
 107       }
 108     }
 109 
 110     char *subsystem_path() { return _path; }
 111 };
 112 
/*
 * The cgroup subsystems consulted by OSContainer. Each pointer is NULL
 * until OSContainer::init() finds the matching cgroup mount in
 * /proc/self/mountinfo.
 */
CgroupSubsystem* memory = NULL;
CgroupSubsystem* cpuset = NULL;
CgroupSubsystem* cpu = NULL;
CgroupSubsystem* cpuacct = NULL;

/* Single-token name for char*, used as the return_type macro argument
 * of GET_CONTAINER_INFO_CPTR. */
typedef char * cptr;
 119 
 120 PRAGMA_DIAG_PUSH
 121 PRAGMA_FORMAT_NONLITERAL_IGNORED
 122 template <typename T> int subsystem_file_contents(CgroupSubsystem* c,
 123                                               const char *filename,
 124                                               const char *scan_fmt,
 125                                               T returnval) {
 126   FILE *fp = NULL;
 127   char *p;
 128   char file[MAXPATHLEN+1];
 129   char buf[MAXPATHLEN+1];
 130 
 131   if (c != NULL && c->subsystem_path() != NULL) {
 132     strncpy(file, c->subsystem_path(), MAXPATHLEN);
 133     file[MAXPATHLEN-1] = '\0';
 134     int filelen = strlen(file);
 135     if ((filelen + strlen(filename)) > (MAXPATHLEN-1)) {
 136        log_debug(os, container)("File path too long %s, %s", file, filename);
 137        return OSCONTAINER_ERROR;
 138     }
 139     strncat(file, filename, MAXPATHLEN-filelen);
 140     log_trace(os, container)("Path to %s is %s", filename, file);
 141     fp = fopen(file, "r");
 142     if (fp != NULL) {
 143       p = fgets(buf, MAXPATHLEN, fp);
 144       if (p != NULL) {
 145         int matched = sscanf(p, scan_fmt, returnval);
 146         if (matched == 1) {
 147           fclose(fp);
 148           return 0;
 149         } else {
 150           log_debug(os, container)("Type %s not found in file %s",
 151                                      scan_fmt , file);
 152         }
 153       } else {
 154         log_debug(os, container)("Empty file %s", file);
 155       }
 156     } else {
 157       log_debug(os, container)("Open of file %s failed, %s", file,
 158                                os::strerror(errno));
 159     }
 160   }
 161   if (fp != NULL)
 162     fclose(fp);
 163   return OSCONTAINER_ERROR;
 164 }
 165 PRAGMA_DIAG_POP
 166 
 167 #define GET_CONTAINER_INFO(return_type, subsystem, filename,              \
 168                            logstring, scan_fmt, variable)                 \
 169   return_type variable;                                                   \
 170 {                                                                         \
 171   int err;                                                                \
 172   err = subsystem_file_contents(subsystem,                                \
 173                                 filename,                                 \
 174                                 scan_fmt,                                 \
 175                                 &variable);                               \
 176   if (err != 0)                                                           \
 177     return (return_type) OSCONTAINER_ERROR;                               \
 178                                                                           \
 179   log_trace(os, container)(logstring, variable);                          \
 180 }
 181 
 182 #define GET_CONTAINER_INFO_CPTR(return_type, subsystem, filename,         \
 183                                logstring, scan_fmt, variable, bufsize)    \
 184   char variable[bufsize];                                                 \
 185 {                                                                         \
 186   int err;                                                                \
 187   err = subsystem_file_contents(subsystem,                                \
 188                                 filename,                                 \
 189                                 scan_fmt,                                 \
 190                                 variable);                                \
 191   if (err != 0)                                                           \
 192     return (return_type) NULL;                                            \
 193                                                                           \
 194   log_trace(os, container)(logstring, variable);                          \
 195 }
 196 
 197 /* init
 198  *
 199  * Initialize the container support and determine if
 200  * we are running under cgroup control.
 201  */
 202 void OSContainer::init() {
 203   int mountid;
 204   int parentid;
 205   int major;
 206   int minor;
 207   FILE *mntinfo = NULL;
 208   FILE *cgroup = NULL;
 209   char buf[MAXPATHLEN+1];
 210   char tmproot[MAXPATHLEN+1];
 211   char tmpmount[MAXPATHLEN+1];
 212   char tmpbase[MAXPATHLEN+1];
 213   char *p;
 214   jlong mem_limit;
 215 
 216   assert(!_is_initialized, "Initializing OSContainer more than once");
 217 
 218   _is_initialized = true;
 219   _is_containerized = false;
 220 
 221   log_trace(os, container)("OSContainer::init: Initializing Container Support");
 222   if (!UseContainerSupport) {
 223     log_trace(os, container)("Container Support not enabled");
 224     return;
 225   }
 226 
 227   /*
 228    * Find the cgroup mount point for memory and cpuset
 229    * by reading /proc/self/mountinfo
 230    *
 231    * Example for docker:
 232    * 219 214 0:29 /docker/7208cebd00fa5f2e342b1094f7bed87fa25661471a4637118e65f1c995be8a34 /sys/fs/cgroup/memory ro,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory
 233    *
 234    * Example for host:
 235    * 34 28 0:29 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,memory
 236    */
 237   mntinfo = fopen("/proc/self/mountinfo", "r");
 238   if (mntinfo == NULL) {
 239       log_debug(os, container)("Can't open /proc/self/mountinfo, %s",
 240                                os::strerror(errno));
 241       return;
 242   }
 243 
 244   while ( (p = fgets(buf, MAXPATHLEN, mntinfo)) != NULL) {
 245     // Look for the filesystem type and see if it's cgroup
 246     char fstype[MAXPATHLEN+1];
 247     fstype[0] = '\0';
 248     char *s =  strstr(p, " - ");
 249     if (s != NULL &&
 250         sscanf(s, " - %s", fstype) == 1 &&
 251         strcmp(fstype, "cgroup") == 0) {
 252 
 253       if (strstr(p, "memory") != NULL) {
 254         int matched = sscanf(p, "%d %d %d:%d %s %s",
 255                              &mountid,
 256                              &parentid,
 257                              &major,
 258                              &minor,
 259                              tmproot,
 260                              tmpmount);
 261         if (matched == 6) {
 262           memory = new CgroupSubsystem(tmproot, tmpmount);
 263         }
 264         else
 265           log_debug(os, container)("Incompatible str containing cgroup and memory: %s", p);
 266       } else if (strstr(p, "cpuset") != NULL) {
 267         int matched = sscanf(p, "%d %d %d:%d %s %s",
 268                              &mountid,
 269                              &parentid,
 270                              &major,
 271                              &minor,
 272                              tmproot,
 273                              tmpmount);
 274         if (matched == 6) {
 275           cpuset = new CgroupSubsystem(tmproot, tmpmount);
 276         }
 277         else {
 278           log_debug(os, container)("Incompatible str containing cgroup and cpuset: %s", p);
 279         }
 280       } else if (strstr(p, "cpu,cpuacct") != NULL) {
 281         int matched = sscanf(p, "%d %d %d:%d %s %s",
 282                              &mountid,
 283                              &parentid,
 284                              &major,
 285                              &minor,
 286                              tmproot,
 287                              tmpmount);
 288         if (matched == 6) {
 289           cpu = new CgroupSubsystem(tmproot, tmpmount);
 290           cpuacct = new CgroupSubsystem(tmproot, tmpmount);
 291         }
 292         else {
 293           log_debug(os, container)("Incompatible str containing cgroup and cpu,cpuacct: %s", p);
 294         }
 295       } else if (strstr(p, "cpuacct") != NULL) {
 296         int matched = sscanf(p, "%d %d %d:%d %s %s",
 297                              &mountid,
 298                              &parentid,
 299                              &major,
 300                              &minor,
 301                              tmproot,
 302                              tmpmount);
 303         if (matched == 6) {
 304           cpuacct = new CgroupSubsystem(tmproot, tmpmount);
 305         }
 306         else {
 307           log_debug(os, container)("Incompatible str containing cgroup and cpuacct: %s", p);
 308         }
 309       } else if (strstr(p, "cpu") != NULL) {
 310         int matched = sscanf(p, "%d %d %d:%d %s %s",
 311                              &mountid,
 312                              &parentid,
 313                              &major,
 314                              &minor,
 315                              tmproot,
 316                              tmpmount);
 317         if (matched == 6) {
 318           cpu = new CgroupSubsystem(tmproot, tmpmount);
 319         }
 320         else {
 321           log_debug(os, container)("Incompatible str containing cgroup and cpu: %s", p);
 322         }
 323       }
 324     }
 325   }
 326 
 327   fclose(mntinfo);
 328 
 329   if (memory == NULL || cpuset == NULL || cpu == NULL || cpuacct == NULL) {
 330     log_debug(os, container)("Required cgroup subsystems not found");
 331     return;
 332   }
 333 
 334   /*
 335    * Read /proc/self/cgroup and map host mount point to
 336    * local one via /proc/self/mountinfo content above
 337    *
 338    * Docker example:
 339    * 5:memory:/docker/6558aed8fc662b194323ceab5b964f69cf36b3e8af877a14b80256e93aecb044
 340    *
 341    * Host example:
 342    * 5:memory:/user.slice
 343    *
 344    * Construct a path to the process specific memory and cpuset
 345    * cgroup directory.
 346    *
 347    * For a container running under Docker from memory example above
 348    * the paths would be:
 349    *
 350    * /sys/fs/cgroup/memory
 351    *
 352    * For a Host from memory example above the path would be:
 353    *
 354    * /sys/fs/cgroup/memory/user.slice
 355    *
 356    */
 357   cgroup = fopen("/proc/self/cgroup", "r");
 358   if (cgroup == NULL) {
 359     log_debug(os, container)("Can't open /proc/self/cgroup, %s",
 360                              os::strerror(errno));
 361     return;
 362   }
 363 
 364   while ( (p = fgets(buf, MAXPATHLEN, cgroup)) != NULL) {
 365     int cgno;
 366     int matched;
 367     char *controller;
 368     char *base;
 369 
 370     /* Skip cgroup number */
 371     strsep(&p, ":");
 372     /* Get controller and base */
 373     controller = strsep(&p, ":");
 374     base = strsep(&p, "\n");
 375 
 376     if (controller != NULL) {
 377       if (strstr(controller, "memory") != NULL) {
 378         memory->set_subsystem_path(base);
 379       } else if (strstr(controller, "cpuset") != NULL) {
 380         cpuset->set_subsystem_path(base);
 381       } else if (strstr(controller, "cpu,cpuacct") != NULL) {
 382         cpu->set_subsystem_path(base);
 383         cpuacct->set_subsystem_path(base);
 384       } else if (strstr(controller, "cpuacct") != NULL) {
 385         cpuacct->set_subsystem_path(base);
 386       } else if (strstr(controller, "cpu") != NULL) {
 387         cpu->set_subsystem_path(base);
 388       }
 389     }
 390   }
 391 
 392   fclose(cgroup);
 393 
 394   // We need to update the amount of physical memory now that
 395   // command line arguments have been processed.
 396   if ((mem_limit = memory_limit_in_bytes()) > 0) {
 397     os::Linux::set_physical_memory(mem_limit);
 398   }
 399 
 400   _is_containerized = true;
 401 
 402 }
 403 
 404 char * OSContainer::container_type() {
 405   if (is_containerized()) {
 406     return (char *)"cgroupv1";
 407   } else {
 408     return NULL;
 409   }
 410 }
 411 
 412 
 413 /* memory_limit_in_bytes
 414  *
 415  * Return the limit of available memory for this process.
 416  *
 417  * return:
 418  *    memory limit in bytes or
 419  *    -1 for unlimited
 420  *    OSCONTAINER_ERROR for not supported
 421  */
 422 jlong OSContainer::memory_limit_in_bytes() {
 423   GET_CONTAINER_INFO(jlong, memory, "/memory.limit_in_bytes",
 424                      "Memory Limit is: " JLONG_FORMAT, JLONG_FORMAT, memlimit);
 425 
 426   if (memlimit >= UNLIMITED_MEM) {
 427     log_trace(os, container)("Memory Limit is: Unlimited");
 428     return (jlong)-1;
 429   }
 430   else {
 431     return memlimit;
 432   }
 433 }
 434 
 435 jlong OSContainer::memory_and_swap_limit_in_bytes() {
 436   GET_CONTAINER_INFO(jlong, memory, "/memory.memsw.limit_in_bytes",
 437                      "Memory and Swap Limit is: " JLONG_FORMAT, JLONG_FORMAT, memswlimit);
 438   if (memswlimit >= UNLIMITED_MEM) {
 439     log_trace(os, container)("Memory and Swap Limit is: Unlimited");
 440     return (jlong)-1;
 441   } else {
 442     return memswlimit;
 443   }
 444 }
 445 
 446 jlong OSContainer::memory_soft_limit_in_bytes() {
 447   GET_CONTAINER_INFO(jlong, memory, "/memory.soft_limit_in_bytes",
 448                      "Memory Soft Limit is: " JLONG_FORMAT, JLONG_FORMAT, memsoftlimit);
 449   if (memsoftlimit >= UNLIMITED_MEM) {
 450     log_trace(os, container)("Memory Soft Limit is: Unlimited");
 451     return (jlong)-1;
 452   } else {
 453     return memsoftlimit;
 454   }
 455 }
 456 
 457 /* memory_usage_in_bytes
 458  *
 459  * Return the amount of used memory for this process.
 460  *
 461  * return:
 462  *    memory usage in bytes or
 463  *    -1 for unlimited
 464  *    OSCONTAINER_ERROR for not supported
 465  */
 466 jlong OSContainer::memory_usage_in_bytes() {
 467   GET_CONTAINER_INFO(jlong, memory, "/memory.usage_in_bytes",
 468                      "Memory Usage is: " JLONG_FORMAT, JLONG_FORMAT, memusage);
 469   return memusage;
 470 }
 471 
 472 /* memory_max_usage_in_bytes
 473  *
 474  * Return the maximum amount of used memory for this process.
 475  *
 476  * return:
 477  *    max memory usage in bytes or
 478  *    OSCONTAINER_ERROR for not supported
 479  */
 480 jlong OSContainer::memory_max_usage_in_bytes() {
 481   GET_CONTAINER_INFO(jlong, memory, "/memory.max_usage_in_bytes",
 482                      "Maximum Memory Usage is: " JLONG_FORMAT, JLONG_FORMAT, memmaxusage);
 483   return memmaxusage;
 484 }
 485 
 486 /* active_processor_count
 487  *
 488  * Calculate an appropriate number of active processors for the
 489  * VM to use based on these three cgroup options.
 490  *
 491  * cpu affinity
 492  * cpu quota & cpu period
 493  * cpu shares
 494  *
 495  * Algorithm:
 496  *
 497  * Determine the number of available CPUs from sched_getaffinity
 498  *
 499  * If user specified a quota (quota != -1), calculate the number of
 500  * required CPUs by dividing quota by period.
 501  *
 502  * If shares are in effect (shares != -1), calculate the number
 503  * of cpus required for the shares by dividing the share value
 504  * by PER_CPU_SHARES.
 505  *
 506  * All results of division are rounded up to the next whole number.
 507  *
 508  * Return the smaller number from the three different settings.
 509  *
 510  * return:
 511  *    number of cpus
 512  *    OSCONTAINER_ERROR if failure occured during extract of cpuset info
 513  */
 514 int OSContainer::active_processor_count() {
 515   int cpu_count, share_count, quota_count;
 516   int share, quota, period;
 517   int result;
 518 
 519   cpu_count = os::Linux::active_processor_count();
 520 
 521   share = cpu_shares();
 522   if (share > -1) {
 523     share_count = ceilf((float)share / (float)PER_CPU_SHARES);
 524     log_trace(os, container)("cpu_share count: %d", share_count);
 525   } else {
 526     share_count = cpu_count;
 527   }
 528 
 529   quota = cpu_quota();
 530   period = cpu_period();
 531   if (quota > -1 && period > 0) {
 532     quota_count = ceilf((float)quota / (float)period);
 533     log_trace(os, container)("quota_count: %d", quota_count);
 534   } else {
 535     quota_count = cpu_count;
 536   }
 537 
 538   result = MIN2(cpu_count, MIN2(share_count, quota_count));
 539   log_trace(os, container)("OSContainer::active_processor_count: %d", result);
 540   return result;
 541 }
 542 
 543 char * OSContainer::cpu_cpuset_cpus() {
 544   GET_CONTAINER_INFO_CPTR(cptr, cpuset, "/cpuset.cpus",
 545                      "cpuset.cpus is: %s", "%1023s", cpus, 1024);
 546   return os::strdup(cpus);
 547 }
 548 
 549 char * OSContainer::cpu_cpuset_memory_nodes() {
 550   GET_CONTAINER_INFO_CPTR(cptr, cpuset, "/cpuset.mems",
 551                      "cpuset.mems is: %s", "%1023s", mems, 1024);
 552   return os::strdup(mems);
 553 }
 554 
 555 /* cpu_quota
 556  *
 557  * Return the number of milliseconds per period
 558  * process is guaranteed to run.
 559  *
 560  * return:
 561  *    quota time in milliseconds
 562  *    -1 for no quota
 563  *    OSCONTAINER_ERROR for not supported
 564  */
 565 int OSContainer::cpu_quota() {
 566   GET_CONTAINER_INFO(int, cpu, "/cpu.cfs_quota_us",
 567                      "CPU Quota is: %d", "%d", quota);
 568   return quota;
 569 }
 570 
 571 int OSContainer::cpu_period() {
 572   GET_CONTAINER_INFO(int, cpu, "/cpu.cfs_period_us",
 573                      "CPU Period is: %d", "%d", period);
 574   return period;
 575 }
 576 
 577 /* cpu_shares
 578  *
 579  * Return the amount of cpu shares available to the process
 580  *
 581  * return:
 582  *    Share number (typically a number relative to 1024)
 583  *                 (2048 typically expresses 2 CPUs worth of processing)
 584  *    -1 for no share setup
 585  *    OSCONTAINER_ERROR for not supported
 586  */
 587 int OSContainer::cpu_shares() {
 588   GET_CONTAINER_INFO(int, cpu, "/cpu.shares",
 589                      "CPU Shares is: %d", "%d", shares);
 590   // Convert 1024 to no shares setup
 591   if (shares == 1024) return -1;
 592 
 593   return shares;
 594 }
 595