1 /*
   2  * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include <string.h>
  26 #include <math.h>
  27 #include "utilities/globalDefinitions.hpp"
  28 #include "memory/allocation.hpp"
  29 #include "runtime/os.hpp"
  30 #include "logging/log.hpp"
  31 #include "osContainer_linux.hpp"
  32 
/*
 * Limits read from the memory cgroup files that are greater than or equal
 * to UNLIMITED_MEM are reported by this file as "unlimited" (-1).
 *
 * Warning: Some linux distros use 0x7FFFFFFFFFFFF000
 * and others use 0x7FFFFFFFFFFFFFFF for unlimited.  The smaller of the two
 * values is used here so that a >= comparison accepts both.
 */
#define UNLIMITED_MEM CONST64(0x7FFFFFFFFFFFF000)

/* Number of cpu shares that active_processor_count() counts as one CPU. */
#define PER_CPU_SHARES 1024

/*
 * Both flags start out false.  OSContainer::init() sets _is_initialized as
 * soon as it runs and sets _is_containerized only once the required cgroup
 * subsystems have been located.
 */
bool  OSContainer::_is_initialized   = false;
bool  OSContainer::_is_containerized = false;
  43 
  44 class CgroupSubsystem: CHeapObj<mtInternal> {
  45  friend class OSContainer;
  46 
  47  private:
  48     /* mountinfo contents */
  49     char *_root;
  50     char *_mount_point;
  51 
  52     /* Constructed subsystem directory */
  53     char *_path;
  54 
  55  public:
  56     CgroupSubsystem(char *root, char *mountpoint) {
  57       _root = os::strdup(root);
  58       _mount_point = os::strdup(mountpoint);
  59       _path = NULL;
  60     }
  61 
  62     /* 
  63      * Set directory to subsystem specific files based 
  64      * on the contents of the mountinfo and cgroup files.
  65      */
  66     void set_subsystem_path(char *cgroup_path) {
  67       char buf[MAXPATHLEN+1];
  68       if (_root != NULL && cgroup_path != NULL) {
  69         if (strcmp(_root, "/") == 0) {
  70           strncpy(buf, _mount_point, MAXPATHLEN);
  71           buf[MAXPATHLEN-1] = '\0';
  72           strncat(buf, cgroup_path, MAXPATHLEN-strlen(buf));
  73           buf[MAXPATHLEN-1] = '\0';
  74           _path = os::strdup(buf);
  75         } else {
  76           if (strcmp(_root, cgroup_path) == 0) {
  77             strncpy(buf, _mount_point, MAXPATHLEN);
  78             buf[MAXPATHLEN-1] = '\0';
  79             _path = os::strdup(buf);
  80           } else {
  81             char *p = strstr(_root, cgroup_path);
  82             if (p != NULL && p == _root) {
  83               if (strlen(cgroup_path) > strlen(_root)) {
  84                 strncpy(buf, _mount_point, MAXPATHLEN);
  85                 buf[MAXPATHLEN-1] = '\0';
  86                 strncat(buf, cgroup_path + strlen(_root), MAXPATHLEN-strlen(buf));
  87                 buf[MAXPATHLEN-1] = '\0';
  88                 _path = os::strdup(buf);
  89               }
  90             }
  91           }
  92         }
  93       }
  94     }
  95 
  96     char *subsystem_path() { return _path; }
  97 };
  98 
// CgroupSubsystem *cgroupv2;
/*
 * cgroup controllers used by this file.  Each pointer stays NULL until
 * OSContainer::init() finds a matching cgroup mount in
 * /proc/self/mountinfo.
 */
CgroupSubsystem* memory = NULL;
CgroupSubsystem* cpuset = NULL;
CgroupSubsystem* cpu = NULL;
CgroupSubsystem* cpuacct = NULL;

/*
 * Single-token alias for "char *" so that the type can be pasted into a
 * function name by GEN_CONTAINER_GET_INFO / GET_CONTAINER_INFO
 * (subsystem_file_contents_cptr).
 */
typedef char * cptr;
 106 
 107 #define GEN_CONTAINER_GET_INFO(return_type, scan_fmt, isstr)              \
 108 int subsystem_file_contents_##return_type(CgroupSubsystem* c,             \
 109                                               char *filename,             \
 110                                               return_type *returnval) {   \
 111   FILE *fp = NULL;                                                        \
 112   char *p;                                                                \
 113   char buf[MAXPATHLEN+1];                                                 \
 114                                                                           \
 115   if (c != NULL && c->subsystem_path() != NULL) {                         \
 116     strncpy(buf, c->subsystem_path(), MAXPATHLEN);                        \
 117     buf[MAXPATHLEN-1] = '\0';                                             \
 118     strncat(buf, filename, MAXPATHLEN-strlen(buf));                       \
 119     log_trace(os, container)("Path to %s is %s\n", filename, buf);        \
 120     fp = fopen(buf, "r");                                                 \
 121     if (fp != NULL) {                                                     \
 122       p = fgets(buf, MAXPATHLEN, fp);                                     \
 123       if (p != NULL) {                                                    \
 124         if (isstr) {                                                      \
 125           *(char **)returnval = os::strdup(p);                            \
 126           fclose(fp);                                                     \
 127           return 0;                                                       \
 128         } else {                                                          \
 129           return_type value;                                              \
 130           int matched = sscanf(p, scan_fmt, &value);                      \
 131           if (matched == 1) {                                             \
 132             *returnval = value;                                           \
 133             fclose(fp);                                                   \
 134             return 0;                                                     \
 135           } else {                                                        \
 136             log_debug(os, container)("Type %s not found in file %s\n",    \
 137                                      scan_fmt , buf);                     \
 138           }                                                               \
 139         }                                                                 \
 140       } else {                                                            \
 141         log_debug(os, container)("Empty file %s\n", buf);                 \
 142       }                                                                   \
 143     } else {                                                              \
 144       log_debug(os, container)("file not found %s\n", buf);               \
 145     }                                                                     \
 146   }                                                                       \
 147   if (fp != NULL)                                                         \
 148     fclose(fp);                                                           \
 149   return OSCONTAINER_ERROR;                                               \
 150 }
 151 
 152 
 153 GEN_CONTAINER_GET_INFO(int, "%d", false)
 154 GEN_CONTAINER_GET_INFO(jlong, JLONG_FORMAT, false)
 155 GEN_CONTAINER_GET_INFO(cptr, "%p", true)
 156  
 157 #define GET_CONTAINER_INFO(return_type, isstring, subsystem,              \
 158                            filename, logstring, variable)                 \
 159   return_type variable;                                                   \
 160 {                                                                         \
 161   int err;                                                                \
 162   err = subsystem_file_contents_##return_type(subsystem,                  \
 163                                     filename,                             \
 164                                     &variable);                           \
 165   if (err != 0) {                                                         \
 166     log_debug(os, container)("Error reading %s", filename);               \
 167     return isstring ? (return_type) NULL :                                \
 168                       (return_type) OSCONTAINER_ERROR;                    \
 169   }                                                                       \
 170   log_trace(os, container)(logstring, variable);                          \
 171 }
 172 
 173 /* init
 174  *
 175  * Initialize the container support and determine if
 176  * we are running under cgroup control.
 177  */
 178 void OSContainer::init() {
 179   int mountid;
 180   int parentid;
 181   int major;
 182   int minor;
 183   FILE *mntinfo = NULL;
 184   FILE *cgroup = NULL;
 185   char buf[MAXPATHLEN+1];
 186   char tmproot[MAXPATHLEN+1];
 187   char tmpmount[MAXPATHLEN+1];
 188   char tmpbase[MAXPATHLEN+1];
 189   char *p;
 190   jlong mem_limit;
 191 
 192   assert(!_is_initialized, "Initializing OSContainer more than once");
 193 
 194   _is_initialized = true;
 195   _is_containerized = false;
 196 
 197   log_trace(os, container)("OSContainer::init: Initializing Container Support");
 198   if (!UseContainerSupport) {
 199     log_trace(os, container)("Container Support not enabled");
 200     return;
 201   }
 202 
 203   /* 
 204    * Find the cgroup mount point for memory and cpuset 
 205    * by reading /proc/self/mountinfo
 206    *
 207    * Example for docker:
 208    * 219 214 0:29 /docker/7208cebd00fa5f2e342b1094f7bed87fa25661471a4637118e65f1c995be8a34 /sys/fs/cgroup/memory ro,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory
 209    *
 210    * Example for host:
 211    * 34 28 0:29 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,memory
 212    */
 213   mntinfo = fopen("/proc/self/mountinfo", "r");
 214   if (mntinfo == NULL) {
 215       log_debug(os, container)("Can't locate /proc/self/mountinfo\n");
 216       return;
 217   }
 218 
 219   while ( (p = fgets(buf, MAXPATHLEN, mntinfo)) != NULL) {
 220     // Look for the filesystem type and see if it's cgroup
 221     char fstype[MAXPATHLEN+1];
 222     fstype[0] = '\0';
 223     char *s =  strstr(p, " - ");
 224     if (s != NULL && 
 225         sscanf(s, " - %s", fstype) == 1 &&
 226         strcmp(fstype, "cgroup") == 0) {
 227 
 228       if (strstr(p, "memory") != NULL) {
 229         int matched = sscanf(p, "%d %d %d:%d %s %s", 
 230                              &mountid,
 231                              &parentid,
 232                              &major,
 233                              &minor,
 234                              tmproot,
 235                              tmpmount);
 236         if (matched == 6) {
 237           memory = new CgroupSubsystem(tmproot, tmpmount);
 238         }
 239         else
 240           log_debug(os, container)("Incompatible str containing cgroup and memory: %s\n", p);
 241       } else if (strstr(p, "cpuset") != NULL) {
 242         int matched = sscanf(p, "%d %d %d:%d %s %s", 
 243                              &mountid,
 244                              &parentid,
 245                              &major,
 246                              &minor,
 247                              tmproot,
 248                              tmpmount);
 249         if (matched == 6) {
 250           cpuset = new CgroupSubsystem(tmproot, tmpmount);
 251         }
 252         else {
 253           log_debug(os, container)("Incompatible str containing cgroup and cpuset: %s\n", p);
 254         }
 255       } else if (strstr(p, "cpu,cpuacct") != NULL) {
 256         int matched = sscanf(p, "%d %d %d:%d %s %s", 
 257                              &mountid,
 258                              &parentid,
 259                              &major,
 260                              &minor,
 261                              tmproot,
 262                              tmpmount);
 263         if (matched == 6) {
 264           cpu = new CgroupSubsystem(tmproot, tmpmount);
 265           cpuacct = new CgroupSubsystem(tmproot, tmpmount);
 266         }
 267         else {
 268           log_debug(os, container)("Incompatible str containing cgroup and cpu,cpuacct: %s\n", p);
 269         }
 270       } else if (strstr(p, "cpuacct") != NULL) {
 271         int matched = sscanf(p, "%d %d %d:%d %s %s", 
 272                              &mountid,
 273                              &parentid,
 274                              &major,
 275                              &minor,
 276                              tmproot,
 277                              tmpmount);
 278         if (matched == 6) {
 279           cpuacct = new CgroupSubsystem(tmproot, tmpmount);
 280         }
 281         else {
 282           log_debug(os, container)("Incompatible str containing cgroup and cpuacct: %s\n", p);
 283         }
 284       } else if (strstr(p, "cpu") != NULL) {
 285         int matched = sscanf(p, "%d %d %d:%d %s %s", 
 286                              &mountid,
 287                              &parentid,
 288                              &major,
 289                              &minor,
 290                              tmproot,
 291                              tmpmount);
 292         if (matched == 6) {
 293           cpu = new CgroupSubsystem(tmproot, tmpmount);
 294         }
 295         else {
 296           log_debug(os, container)("Incompatible str containing cgroup and cpu: %s\n", p);
 297         }
 298       }
 299     }
 300   }
 301 
 302   if (mntinfo != NULL) fclose(mntinfo);
 303 
 304   /* 
 305    * Read /proc/self/cgroup and map host mount point to 
 306    * local one via /proc/self/mountinfo content above
 307    *
 308    * Docker example:
 309    * 5:memory:/docker/6558aed8fc662b194323ceab5b964f69cf36b3e8af877a14b80256e93aecb044
 310    *
 311    * Host example:
 312    * 5:memory:/user.slice
 313    *
 314    * Construct a path to the process specific memory and cpuset 
 315    * cgroup directory.
 316    *
 317    * For a container running under Docker from memory example above 
 318    * the paths would be:
 319    *
 320    * /sys/fs/cgroup/memory
 321    *
 322    * For a Host from memory example above the path would be:
 323    *
 324    * /sys/fs/cgroup/memory/user.slice
 325    * 
 326    */
 327   cgroup = fopen("/proc/self/cgroup", "r");
 328   if (cgroup == NULL) {
 329     log_debug(os, container)("Can't locate /proc/self/cgroup\n");
 330     return;
 331   }
 332 
 333   while ( (p = fgets(buf, MAXPATHLEN, cgroup)) != NULL) {
 334     int cgno;
 335     int matched;
 336     char *controller;
 337     char *base;
 338 
 339     /* Skip cgroup number */
 340     strsep(&p, ":");
 341     /* Get controller and base */
 342     controller = strsep(&p, ":");
 343     base = strsep(&p, "\n");
 344 
 345     if (controller != NULL) {
 346       if (strstr(controller, "memory") != NULL) {
 347         memory->set_subsystem_path(base);
 348       } else if (strstr(controller, "cpuset") != NULL) {
 349         cpuset->set_subsystem_path(base);
 350       } else if (strstr(controller, "cpu,cpuacct") != NULL) {
 351         cpu->set_subsystem_path(base);
 352         cpuacct->set_subsystem_path(base);
 353       } else if (strstr(controller, "cpuacct") != NULL) {
 354         cpuacct->set_subsystem_path(base);
 355       } else if (strstr(controller, "cpu") != NULL) {
 356         cpu->set_subsystem_path(base);
 357       }
 358     }
 359   }
 360 
 361   if (cgroup != NULL) fclose(cgroup);
 362 
 363   if (memory == NULL || cpuset == NULL || cpu == NULL) {
 364     log_debug(os, container)("Required cgroup subsystems not found");
 365     return;
 366   }
 367 
 368   // We need to update the amount of physical memory now that
 369   // command line arguments have been processed.
 370   if ((mem_limit = memory_limit_in_bytes()) > 0) {
 371     os::Linux::set_physical_memory(mem_limit);
 372   }
 373 
 374   _is_containerized = true;
 375 }
 376 
 377 char * OSContainer::container_type() {
 378   if (is_containerized()) {
 379     return (char *)"cgroupv1";
 380   } else {
 381     return NULL;
 382   }
 383 }
 384 
 385 
 386 /* memory_limit_in_bytes
 387  *
 388  * Return the limit of available memory for this process.
 389  *
 390  * return:
 391  *    memory limit in bytes or 
 392  *    -1 for unlimited
 393  *    OSCONTAINER_ERROR for not supported
 394  */
 395 jlong OSContainer::memory_limit_in_bytes() {
 396   GET_CONTAINER_INFO(jlong, false, memory,  (char *)"/memory.limit_in_bytes",
 397                      "Memory Limit is: " JLONG_FORMAT "\n", memlimit);
 398 
 399   if (memlimit >= UNLIMITED_MEM) {
 400     log_trace(os, container)("Memory Limit is: Unlimited\n");
 401     return (jlong)-1;
 402   }
 403   else {
 404     return memlimit;
 405   }
 406 }
 407 
 408 jlong OSContainer::memory_and_swap_limit_in_bytes() {
 409   GET_CONTAINER_INFO(jlong, false, memory,  (char *)"/memory.memsw.limit_in_bytes",
 410                      "Memory and Swap Limit is: " JLONG_FORMAT "\n", memswlimit);
 411   if (memswlimit >= UNLIMITED_MEM) {
 412     log_trace(os, container)("Memory and Swap Limit is: Unlimited\n");
 413     return (jlong)-1;
 414   } else {
 415     return memswlimit;
 416   }
 417 }
 418 
 419 jlong OSContainer::memory_soft_limit_in_bytes() {
 420   GET_CONTAINER_INFO(jlong, false, memory,  (char *)"/memory.soft_limit_in_bytes",
 421                      "Memory Soft Limit is: " JLONG_FORMAT "\n", memsoftlimit);
 422   if (memsoftlimit >= UNLIMITED_MEM) {
 423     log_trace(os, container)("Memory Soft Limit is: Unlimited\n");
 424     return (jlong)-1;
 425   } else {
 426     return memsoftlimit;
 427   }
 428 }
 429 
 430 /* memory_usage_in_bytes
 431  *
 432  * Return the amount of used memory for this process.
 433  *
 434  * return:
 435  *    memory usage in bytes or 
 436  *    -1 for unlimited
 437  *    OSCONTAINER_ERROR for not supported
 438  */
 439 jlong OSContainer::memory_usage_in_bytes() {
 440   GET_CONTAINER_INFO(jlong, false, memory,  (char *)"/memory.usage_in_bytes",
 441                      "Memory Usage is: " JLONG_FORMAT "\n", memusage);
 442   return memusage;
 443 }
 444 
 445 /* memory_max_usage_in_bytes
 446  *
 447  * Return the maximum amount of used memory for this process.
 448  *
 449  * return:
 450  *    max memory usage in bytes or 
 451  *    OSCONTAINER_ERROR for not supported
 452  */
 453 jlong OSContainer::memory_max_usage_in_bytes() {
 454   GET_CONTAINER_INFO(jlong, false, memory,  (char *)"/memory.max_usage_in_bytes",
 455                      "Maximu, Memory Usage is: " JLONG_FORMAT "\n", memmaxusage);
 456   return memmaxusage;
 457 }
 458 
 459 /* active_processor_count
 460  *
 461  * Calculate an appropriate number of active processors for the
 462  * VM to use based on these three cgroup options.
 463  *
 464  * cpu affinity
 465  * cpu quota & cpu period
 466  * cpu shares
 467  *
 468  * Algorithm:
 469  *
 470  * Determine the number of available CPUs from sched_getaffinity
 471  *
 472  * If user specified a quota (quota != -1), calculate the number of 
 473  * required CPUs by dividing quota by period.  
 474  *
 475  * If shares are in effect (shares != -1), calculate the number
 476  * of cpus required for the shares by dividing the share value 
 477  * by PER_CPU_SHARES.  
 478  *
 479  * All results of division are rounded up to the next whole number.
 480  *
 481  * Return the smaller number from the three different settings.
 482  *
 483  * return:
 484  *    number of cpus
 485  *    OSCONTAINER_ERROR if failure occured during extract of cpuset info
 486  */
 487 int OSContainer::active_processor_count() {
 488   int cpu_count, share_count, quota_count;
 489   int share, quota, period;
 490   int result;
 491 
 492   cpu_count = os::Linux::active_processor_count();
 493 
 494   share = cpu_shares();
 495   if (share > -1) {
 496     share_count = ceilf((float)share / (float)PER_CPU_SHARES);
 497     log_trace(os, container)("cpu_share count: %d", share_count);
 498   } else {
 499     share_count = cpu_count;
 500   } 
 501   
 502   quota = cpu_quota();
 503   period = cpu_period();
 504   if (quota > -1 && period > 0) {
 505     quota_count = ceilf((float)quota / (float)period);
 506     log_trace(os, container)("quota_count: %d", quota_count);
 507   } else {
 508     quota_count = cpu_count;
 509   }
 510 
 511   result = MIN2(cpu_count, MIN2(share_count, quota_count)); 
 512   log_trace(os, container)("OSContainer::active_processor_count: %d", result);
 513   return result;
 514 }
 515 
 516 char * OSContainer::cpu_cpuset_cpus() {
 517   GET_CONTAINER_INFO(cptr, true, cpuset, (char *)"/cpuset.cpus", 
 518                      "cpuset.cpus is: %s\n", cpus);
 519   return cpus;
 520 }
 521 
 522 char * OSContainer::cpu_cpuset_memory_nodes() {
 523   GET_CONTAINER_INFO(cptr, true, cpuset, (char *)"/cpuset.mems", 
 524                      "cpuset.mems is: %s\n", mems);
 525   return mems;
 526 }
 527 
 528 /* cpu_quota
 529  *
 530  * Return the number of milliseconds per period
 531  * process is guaranteed to run.
 532  *
 533  * return:
 534  *    quota time in milliseconds
 535  *    -1 for no quota
 536  *    OSCONTAINER_ERROR for not supported
 537  */
 538 int OSContainer::cpu_quota() {
 539   GET_CONTAINER_INFO(int, false, cpu, (char *)"/cpu.cfs_quota_us",
 540                      "CPU Quota is: %d\n", quota);
 541   return quota;
 542 }
 543 
 544 int OSContainer::cpu_period() {
 545   GET_CONTAINER_INFO(int, false, cpu, (char *)"/cpu.cfs_period_us",
 546                      "CPU Period is: %d\n", period);
 547   return period;
 548 }
 549 
 550 /* cpu_shares
 551  *
 552  * Return the amount of cpu shares available to the process
 553  *
 554  * return:
 555  *    Share number (typically a number relative to 1024)
 556  *                 (2048 typically expresses 2 CPUs worth of processing)
 557  *    -1 for no share setup
 558  *    OSCONTAINER_ERROR for not supported
 559  */
 560 int OSContainer::cpu_shares() {
 561   GET_CONTAINER_INFO(int, false, cpu, (char *)"/cpu.shares", 
 562                      "CPU Shares is: %d\n", shares);
 563   // Convert 1024 to no shares setup
 564   if (shares == 1024) return -1;
 565 
 566   return shares;
 567 }
 568