1 /*
   2  * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include <string.h>
  26 #include <math.h>
  27 #include "utilities/globalDefinitions.hpp"
  28 #include "memory/allocation.hpp"
  29 #include "runtime/os.hpp"
  30 #include "logging/log.hpp"
  31 #include "osContainer_linux.hpp"
  32 
/* 
 * Warning: Some linux distros use 0x7FFFFFFFFFFFF000 
 * and others use 0x7FFFFFFFFFFFFFFF for unlimited.
 *
 * The memory limit readers in this file treat any value that is
 * greater than or equal to UNLIMITED_MEM as "no limit", so the
 * smaller of the two sentinels is used here to cover both.
 */
#define UNLIMITED_MEM CONST64(0x7FFFFFFFFFFFF000)

/*
 * Number of cpu.shares units that active_processor_count()
 * counts as one whole CPU.
 */
#define PER_CPU_SHARES 1024

/* Set to true on entry to OSContainer::init(). */
bool  OSContainer::_is_initialized   = false;
/*
 * Set to true at the end of OSContainer::init(), only when the
 * required cgroup subsystems were found.
 */
bool  OSContainer::_is_containerized = false;
  43 
  44 class CgroupSubsystem: CHeapObj<mtInternal> {
  45  friend class OSContainer;
  46 
  47  private:
  48     /* mountinfo contents */
  49     char *_root;
  50     char *_mount_point;
  51 
  52     /* Constructed subsystem directory */
  53     char *_path;
  54 
  55  public:
  56     CgroupSubsystem(char *root, char *mountpoint) {
  57       _root = os::strdup(root);
  58       _mount_point = os::strdup(mountpoint);
  59       _path = NULL;
  60     }
  61 
  62     /* 
  63      * Set directory to subsystem specific files based 
  64      * on the contents of the mountinfo and cgroup files.
  65      */
  66     void set_subsystem_path(char *cgroup_path) {
  67       char buf[MAXPATHLEN+1];
  68       if (_root != NULL && cgroup_path != NULL) {
  69         if (strcmp(_root, "/") == 0) {
  70           strncpy(buf, _mount_point, MAXPATHLEN);
  71           buf[MAXPATHLEN-1] = '\0';
  72           if (strcmp(cgroup_path,"/") != 0) {
  73             int buflen = strlen(buf);
  74             if ((buflen + strlen(cgroup_path)) > (MAXPATHLEN-1)) {
  75               return;
  76             }
  77             strncat(buf, cgroup_path, MAXPATHLEN-buflen);
  78             buf[MAXPATHLEN-1] = '\0';
  79           }
  80           _path = os::strdup(buf);
  81         } else {
  82           if (strcmp(_root, cgroup_path) == 0) {
  83             strncpy(buf, _mount_point, MAXPATHLEN);
  84             buf[MAXPATHLEN-1] = '\0';
  85             _path = os::strdup(buf);
  86           } else {
  87             char *p = strstr(_root, cgroup_path);
  88             if (p != NULL && p == _root) {
  89               if (strlen(cgroup_path) > strlen(_root)) {
  90                 strncpy(buf, _mount_point, MAXPATHLEN);
  91                 buf[MAXPATHLEN-1] = '\0';
  92                 int buflen = strlen(buf);
  93                 if ((buflen + strlen(cgroup_path)) > (MAXPATHLEN-1)) {
  94                   return;
  95                 }
  96                 strncat(buf, cgroup_path + strlen(_root), MAXPATHLEN-buflen);
  97                 buf[MAXPATHLEN-1] = '\0';
  98                 _path = os::strdup(buf);
  99               }
 100             }
 101           }
 102         }
 103       }
 104     }
 105 
 106     char *subsystem_path() { return _path; }
 107 };
 108 
// CgroupSubsystem *cgroupv2;

// One entry per cgroup controller used in this file.  Each stays NULL
// until OSContainer::init() finds a matching cgroup line in
// /proc/self/mountinfo and creates the object.
CgroupSubsystem* memory = NULL;
CgroupSubsystem* cpuset = NULL;
CgroupSubsystem* cpu = NULL;
CgroupSubsystem* cpuacct = NULL;

// Single-token alias for char *.  GEN_CONTAINER_GET_INFO pastes its
// return_type argument into a function name, which needs one token.
typedef char * cptr;
 116 
 117 #define GEN_CONTAINER_GET_INFO(return_type, scan_fmt, isstr)              \
 118 int subsystem_file_contents_##return_type(CgroupSubsystem* c,             \
 119                                               const char *filename,       \
 120                                               return_type *returnval) {   \
 121   FILE *fp = NULL;                                                        \
 122   char *p;                                                                \
 123   char buf[MAXPATHLEN+1];                                                 \
 124                                                                           \
 125   if (c != NULL && c->subsystem_path() != NULL) {                         \
 126     strncpy(buf, c->subsystem_path(), MAXPATHLEN);                        \
 127     buf[MAXPATHLEN-1] = '\0';                                             \
 128     int buflen = strlen(buf);                                             \
 129     if ((buflen + strlen(filename)) > (MAXPATHLEN-1)) {                   \
 130        return OSCONTAINER_ERROR;                                          \
 131     }                                                                     \
 132     strncat(buf, filename, MAXPATHLEN-buflen);                            \
 133     log_trace(os, container)("Path to %s is %s", filename, buf);          \
 134     fp = fopen(buf, "r");                                                 \
 135     if (fp != NULL) {                                                     \
 136       p = fgets(buf, MAXPATHLEN, fp);                                     \
 137       if (p != NULL) {                                                    \
 138         if (isstr) {                                                      \
 139           *(char **)returnval = os::strdup(p);                            \
 140           fclose(fp);                                                     \
 141           return 0;                                                       \
 142         } else {                                                          \
 143           return_type value;                                              \
 144           int matched = sscanf(p, scan_fmt, &value);                      \
 145           if (matched == 1) {                                             \
 146             *returnval = value;                                           \
 147             fclose(fp);                                                   \
 148             return 0;                                                     \
 149           } else {                                                        \
 150             log_debug(os, container)("Type %s not found in file %s",      \
 151                                      scan_fmt , buf);                     \
 152           }                                                               \
 153         }                                                                 \
 154       } else {                                                            \
 155         log_debug(os, container)("Empty file %s", buf);                   \
 156       }                                                                   \
 157     } else {                                                              \
 158       log_debug(os, container)("file not found %s", buf);                 \
 159     }                                                                     \
 160   }                                                                       \
 161   if (fp != NULL)                                                         \
 162     fclose(fp);                                                           \
 163   return OSCONTAINER_ERROR;                                               \
 164 }
 165 
 166 
// Readers generated from GEN_CONTAINER_GET_INFO:
//   subsystem_file_contents_int   - parses the first line with "%d"
//   subsystem_file_contents_jlong - parses the first line with JLONG_FORMAT
//   subsystem_file_contents_cptr  - returns a copy of the first line; its
//                                   "%p" format sits in the branch that is
//                                   skipped when isstr is true
GEN_CONTAINER_GET_INFO(int, "%d", false)
GEN_CONTAINER_GET_INFO(jlong, JLONG_FORMAT, false)
GEN_CONTAINER_GET_INFO(cptr, "%p", true)
 170  
 171 #define GET_CONTAINER_INFO(return_type, isstring, subsystem,              \
 172                            filename, logstring, variable)                 \
 173   return_type variable;                                                   \
 174 {                                                                         \
 175   int err;                                                                \
 176   err = subsystem_file_contents_##return_type(subsystem,                  \
 177                                     filename,                             \
 178                                     &variable);                           \
 179   if (err != 0) {                                                         \
 180     log_debug(os, container)("Error reading %s", filename);               \
 181     return isstring ? (return_type) NULL :                                \
 182                       (return_type) OSCONTAINER_ERROR;                    \
 183   }                                                                       \
 184   log_trace(os, container)(logstring, variable);                          \
 185 }
 186 
 187 /* init
 188  *
 189  * Initialize the container support and determine if
 190  * we are running under cgroup control.
 191  */
 192 void OSContainer::init() {
 193   int mountid;
 194   int parentid;
 195   int major;
 196   int minor;
 197   FILE *mntinfo = NULL;
 198   FILE *cgroup = NULL;
 199   char buf[MAXPATHLEN+1];
 200   char tmproot[MAXPATHLEN+1];
 201   char tmpmount[MAXPATHLEN+1];
 202   char tmpbase[MAXPATHLEN+1];
 203   char *p;
 204   jlong mem_limit;
 205 
 206   assert(!_is_initialized, "Initializing OSContainer more than once");
 207 
 208   _is_initialized = true;
 209   _is_containerized = false;
 210 
 211   log_trace(os, container)("OSContainer::init: Initializing Container Support");
 212   if (!UseContainerSupport) {
 213     log_trace(os, container)("Container Support not enabled");
 214     return;
 215   }
 216 
 217   /* 
 218    * Find the cgroup mount point for memory and cpuset 
 219    * by reading /proc/self/mountinfo
 220    *
 221    * Example for docker:
 222    * 219 214 0:29 /docker/7208cebd00fa5f2e342b1094f7bed87fa25661471a4637118e65f1c995be8a34 /sys/fs/cgroup/memory ro,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory
 223    *
 224    * Example for host:
 225    * 34 28 0:29 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,memory
 226    */
 227   mntinfo = fopen("/proc/self/mountinfo", "r");
 228   if (mntinfo == NULL) {
 229       log_debug(os, container)("Can't locate /proc/self/mountinfo");
 230       return;
 231   }
 232 
 233   while ( (p = fgets(buf, MAXPATHLEN, mntinfo)) != NULL) {
 234     // Look for the filesystem type and see if it's cgroup
 235     char fstype[MAXPATHLEN+1];
 236     fstype[0] = '\0';
 237     char *s =  strstr(p, " - ");
 238     if (s != NULL && 
 239         sscanf(s, " - %s", fstype) == 1 &&
 240         strcmp(fstype, "cgroup") == 0) {
 241 
 242       if (strstr(p, "memory") != NULL) {
 243         int matched = sscanf(p, "%d %d %d:%d %s %s", 
 244                              &mountid,
 245                              &parentid,
 246                              &major,
 247                              &minor,
 248                              tmproot,
 249                              tmpmount);
 250         if (matched == 6) {
 251           memory = new CgroupSubsystem(tmproot, tmpmount);
 252         }
 253         else
 254           log_debug(os, container)("Incompatible str containing cgroup and memory: %s", p);
 255       } else if (strstr(p, "cpuset") != NULL) {
 256         int matched = sscanf(p, "%d %d %d:%d %s %s", 
 257                              &mountid,
 258                              &parentid,
 259                              &major,
 260                              &minor,
 261                              tmproot,
 262                              tmpmount);
 263         if (matched == 6) {
 264           cpuset = new CgroupSubsystem(tmproot, tmpmount);
 265         }
 266         else {
 267           log_debug(os, container)("Incompatible str containing cgroup and cpuset: %s", p);
 268         }
 269       } else if (strstr(p, "cpu,cpuacct") != NULL) {
 270         int matched = sscanf(p, "%d %d %d:%d %s %s", 
 271                              &mountid,
 272                              &parentid,
 273                              &major,
 274                              &minor,
 275                              tmproot,
 276                              tmpmount);
 277         if (matched == 6) {
 278           cpu = new CgroupSubsystem(tmproot, tmpmount);
 279           cpuacct = new CgroupSubsystem(tmproot, tmpmount);
 280         }
 281         else {
 282           log_debug(os, container)("Incompatible str containing cgroup and cpu,cpuacct: %s", p);
 283         }
 284       } else if (strstr(p, "cpuacct") != NULL) {
 285         int matched = sscanf(p, "%d %d %d:%d %s %s", 
 286                              &mountid,
 287                              &parentid,
 288                              &major,
 289                              &minor,
 290                              tmproot,
 291                              tmpmount);
 292         if (matched == 6) {
 293           cpuacct = new CgroupSubsystem(tmproot, tmpmount);
 294         }
 295         else {
 296           log_debug(os, container)("Incompatible str containing cgroup and cpuacct: %s", p);
 297         }
 298       } else if (strstr(p, "cpu") != NULL) {
 299         int matched = sscanf(p, "%d %d %d:%d %s %s", 
 300                              &mountid,
 301                              &parentid,
 302                              &major,
 303                              &minor,
 304                              tmproot,
 305                              tmpmount);
 306         if (matched == 6) {
 307           cpu = new CgroupSubsystem(tmproot, tmpmount);
 308         }
 309         else {
 310           log_debug(os, container)("Incompatible str containing cgroup and cpu: %s", p);
 311         }
 312       }
 313     }
 314   }
 315 
 316   if (mntinfo != NULL) fclose(mntinfo);
 317 
 318   /* 
 319    * Read /proc/self/cgroup and map host mount point to 
 320    * local one via /proc/self/mountinfo content above
 321    *
 322    * Docker example:
 323    * 5:memory:/docker/6558aed8fc662b194323ceab5b964f69cf36b3e8af877a14b80256e93aecb044
 324    *
 325    * Host example:
 326    * 5:memory:/user.slice
 327    *
 328    * Construct a path to the process specific memory and cpuset 
 329    * cgroup directory.
 330    *
 331    * For a container running under Docker from memory example above 
 332    * the paths would be:
 333    *
 334    * /sys/fs/cgroup/memory
 335    *
 336    * For a Host from memory example above the path would be:
 337    *
 338    * /sys/fs/cgroup/memory/user.slice
 339    * 
 340    */
 341   cgroup = fopen("/proc/self/cgroup", "r");
 342   if (cgroup == NULL) {
 343     log_debug(os, container)("Can't locate /proc/self/cgroup");
 344     return;
 345   }
 346 
 347   while ( (p = fgets(buf, MAXPATHLEN, cgroup)) != NULL) {
 348     int cgno;
 349     int matched;
 350     char *controller;
 351     char *base;
 352 
 353     /* Skip cgroup number */
 354     strsep(&p, ":");
 355     /* Get controller and base */
 356     controller = strsep(&p, ":");
 357     base = strsep(&p, "\n");
 358 
 359     if (controller != NULL) {
 360       if (strstr(controller, "memory") != NULL) {
 361         memory->set_subsystem_path(base);
 362       } else if (strstr(controller, "cpuset") != NULL) {
 363         cpuset->set_subsystem_path(base);
 364       } else if (strstr(controller, "cpu,cpuacct") != NULL) {
 365         cpu->set_subsystem_path(base);
 366         cpuacct->set_subsystem_path(base);
 367       } else if (strstr(controller, "cpuacct") != NULL) {
 368         cpuacct->set_subsystem_path(base);
 369       } else if (strstr(controller, "cpu") != NULL) {
 370         cpu->set_subsystem_path(base);
 371       }
 372     }
 373   }
 374 
 375   if (cgroup != NULL) fclose(cgroup);
 376 
 377   if (memory == NULL || cpuset == NULL || cpu == NULL) {
 378     log_debug(os, container)("Required cgroup subsystems not found");
 379     return;
 380   }
 381 
 382   // We need to update the amount of physical memory now that
 383   // command line arguments have been processed.
 384   if ((mem_limit = memory_limit_in_bytes()) > 0) {
 385     os::Linux::set_physical_memory(mem_limit);
 386   }
 387 
 388   _is_containerized = true;
 389 }
 390 
 391 char * OSContainer::container_type() {
 392   if (is_containerized()) {
 393     return (char *)"cgroupv1";
 394   } else {
 395     return NULL;
 396   }
 397 }
 398 
 399 
 400 /* memory_limit_in_bytes
 401  *
 402  * Return the limit of available memory for this process.
 403  *
 404  * return:
 405  *    memory limit in bytes or 
 406  *    -1 for unlimited
 407  *    OSCONTAINER_ERROR for not supported
 408  */
 409 jlong OSContainer::memory_limit_in_bytes() {
 410   GET_CONTAINER_INFO(jlong, false, memory,  (char *)"/memory.limit_in_bytes",
 411                      "Memory Limit is: " JLONG_FORMAT, memlimit);
 412 
 413   if (memlimit >= UNLIMITED_MEM) {
 414     log_trace(os, container)("Memory Limit is: Unlimited");
 415     return (jlong)-1;
 416   }
 417   else {
 418     return memlimit;
 419   }
 420 }
 421 
 422 jlong OSContainer::memory_and_swap_limit_in_bytes() {
 423   GET_CONTAINER_INFO(jlong, false, memory,  (char *)"/memory.memsw.limit_in_bytes",
 424                      "Memory and Swap Limit is: " JLONG_FORMAT, memswlimit);
 425   if (memswlimit >= UNLIMITED_MEM) {
 426     log_trace(os, container)("Memory and Swap Limit is: Unlimited");
 427     return (jlong)-1;
 428   } else {
 429     return memswlimit;
 430   }
 431 }
 432 
 433 jlong OSContainer::memory_soft_limit_in_bytes() {
 434   GET_CONTAINER_INFO(jlong, false, memory,  (char *)"/memory.soft_limit_in_bytes",
 435                      "Memory Soft Limit is: " JLONG_FORMAT, memsoftlimit);
 436   if (memsoftlimit >= UNLIMITED_MEM) {
 437     log_trace(os, container)("Memory Soft Limit is: Unlimited");
 438     return (jlong)-1;
 439   } else {
 440     return memsoftlimit;
 441   }
 442 }
 443 
 444 /* memory_usage_in_bytes
 445  *
 446  * Return the amount of used memory for this process.
 447  *
 448  * return:
 449  *    memory usage in bytes or 
 450  *    -1 for unlimited
 451  *    OSCONTAINER_ERROR for not supported
 452  */
 453 jlong OSContainer::memory_usage_in_bytes() {
 454   GET_CONTAINER_INFO(jlong, false, memory,  (char *)"/memory.usage_in_bytes",
 455                      "Memory Usage is: " JLONG_FORMAT, memusage);
 456   return memusage;
 457 }
 458 
 459 /* memory_max_usage_in_bytes
 460  *
 461  * Return the maximum amount of used memory for this process.
 462  *
 463  * return:
 464  *    max memory usage in bytes or 
 465  *    OSCONTAINER_ERROR for not supported
 466  */
 467 jlong OSContainer::memory_max_usage_in_bytes() {
 468   GET_CONTAINER_INFO(jlong, false, memory,  (char *)"/memory.max_usage_in_bytes",
 469                      "Maximu, Memory Usage is: " JLONG_FORMAT, memmaxusage);
 470   return memmaxusage;
 471 }
 472 
 473 /* active_processor_count
 474  *
 475  * Calculate an appropriate number of active processors for the
 476  * VM to use based on these three cgroup options.
 477  *
 478  * cpu affinity
 479  * cpu quota & cpu period
 480  * cpu shares
 481  *
 482  * Algorithm:
 483  *
 484  * Determine the number of available CPUs from sched_getaffinity
 485  *
 486  * If user specified a quota (quota != -1), calculate the number of 
 487  * required CPUs by dividing quota by period.  
 488  *
 489  * If shares are in effect (shares != -1), calculate the number
 490  * of cpus required for the shares by dividing the share value 
 491  * by PER_CPU_SHARES.  
 492  *
 493  * All results of division are rounded up to the next whole number.
 494  *
 495  * Return the smaller number from the three different settings.
 496  *
 497  * return:
 498  *    number of cpus
 499  *    OSCONTAINER_ERROR if failure occured during extract of cpuset info
 500  */
 501 int OSContainer::active_processor_count() {
 502   int cpu_count, share_count, quota_count;
 503   int share, quota, period;
 504   int result;
 505 
 506   cpu_count = os::Linux::active_processor_count();
 507 
 508   share = cpu_shares();
 509   if (share > -1) {
 510     share_count = ceilf((float)share / (float)PER_CPU_SHARES);
 511     log_trace(os, container)("cpu_share count: %d", share_count);
 512   } else {
 513     share_count = cpu_count;
 514   } 
 515   
 516   quota = cpu_quota();
 517   period = cpu_period();
 518   if (quota > -1 && period > 0) {
 519     quota_count = ceilf((float)quota / (float)period);
 520     log_trace(os, container)("quota_count: %d", quota_count);
 521   } else {
 522     quota_count = cpu_count;
 523   }
 524 
 525   result = MIN2(cpu_count, MIN2(share_count, quota_count)); 
 526   log_trace(os, container)("OSContainer::active_processor_count: %d", result);
 527   return result;
 528 }
 529 
 530 char * OSContainer::cpu_cpuset_cpus() {
 531   GET_CONTAINER_INFO(cptr, true, cpuset, (char *)"/cpuset.cpus", 
 532                      "cpuset.cpus is: %s", cpus);
 533   return cpus;
 534 }
 535 
 536 char * OSContainer::cpu_cpuset_memory_nodes() {
 537   GET_CONTAINER_INFO(cptr, true, cpuset, (char *)"/cpuset.mems", 
 538                      "cpuset.mems is: %s", mems);
 539   return mems;
 540 }
 541 
 542 /* cpu_quota
 543  *
 544  * Return the number of milliseconds per period
 545  * process is guaranteed to run.
 546  *
 547  * return:
 548  *    quota time in milliseconds
 549  *    -1 for no quota
 550  *    OSCONTAINER_ERROR for not supported
 551  */
 552 int OSContainer::cpu_quota() {
 553   GET_CONTAINER_INFO(int, false, cpu, (char *)"/cpu.cfs_quota_us",
 554                      "CPU Quota is: %d", quota);
 555   return quota;
 556 }
 557 
 558 int OSContainer::cpu_period() {
 559   GET_CONTAINER_INFO(int, false, cpu, (char *)"/cpu.cfs_period_us",
 560                      "CPU Period is: %d", period);
 561   return period;
 562 }
 563 
 564 /* cpu_shares
 565  *
 566  * Return the amount of cpu shares available to the process
 567  *
 568  * return:
 569  *    Share number (typically a number relative to 1024)
 570  *                 (2048 typically expresses 2 CPUs worth of processing)
 571  *    -1 for no share setup
 572  *    OSCONTAINER_ERROR for not supported
 573  */
 574 int OSContainer::cpu_shares() {
 575   GET_CONTAINER_INFO(int, false, cpu, (char *)"/cpu.shares", 
 576                      "CPU Shares is: %d", shares);
 577   // Convert 1024 to no shares setup
 578   if (shares == 1024) return -1;
 579 
 580   return shares;
 581 }
 582