1 /* 2 * Copyright (c) 2017, 2019, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include <string.h> 26 #include <math.h> 27 #include <errno.h> 28 #include "utilities/globalDefinitions.hpp" 29 #include "memory/allocation.hpp" 30 #include "runtime/os.hpp" 31 #include "logging/log.hpp" 32 #include "osContainer_linux.hpp" 33 34 /* 35 * PER_CPU_SHARES has been set to 1024 because CPU shares' quota 36 * is commonly used in cloud frameworks like Kubernetes[1], 37 * AWS[2] and Mesos[3] in a similar way. They spawn containers with 38 * --cpu-shares option values scaled by PER_CPU_SHARES. Thus, we do 39 * the inverse for determining the number of possible available 40 * CPUs to the JVM inside a container. See JDK-8216366. 41 * 42 * [1] https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu 43 * In particular: 44 * When using Docker: 45 * The spec.containers[].resources.requests.cpu is converted to its core value, which is potentially 46 * fractional, and multiplied by 1024. The greater of this number or 2 is used as the value of the 47 * --cpu-shares flag in the docker run command. 48 * [2] https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_ContainerDefinition.html 49 * [3] https://github.com/apache/mesos/blob/3478e344fb77d931f6122980c6e94cd3913c441d/src/docker/docker.cpp#L648 50 * https://github.com/apache/mesos/blob/3478e344fb77d931f6122980c6e94cd3913c441d/src/slave/containerizer/mesos/isolators/cgroups/constants.hpp#L30 51 */ 52 #define PER_CPU_SHARES 1024 53 54 bool OSContainer::_is_initialized = false; 55 bool OSContainer::_is_containerized = false; 56 julong _unlimited_memory; 57 58 class CgroupSubsystem: CHeapObj<mtInternal> { 59 friend class OSContainer; 60 61 private: 62 /* mountinfo contents */ 63 char *_root; 64 char *_mount_point; 65 66 /* Constructed subsystem directory */ 67 char *_path; 68 69 public: 70 CgroupSubsystem(char *root, char *mountpoint) { 71 _root = os::strdup(root); 72 _mount_point = os::strdup(mountpoint); 73 _path = NULL; 74 } 75 76 /* 77 * Set directory to subsystem specific files based 78 * on the contents of the mountinfo and cgroup files. 79 */ 80 void set_subsystem_path(char *cgroup_path) { 81 char buf[MAXPATHLEN+1]; 82 if (_root != NULL && cgroup_path != NULL) { 83 if (strcmp(_root, "/") == 0) { 84 int buflen; 85 strncpy(buf, _mount_point, MAXPATHLEN); 86 buf[MAXPATHLEN-1] = '\0'; 87 if (strcmp(cgroup_path,"/") != 0) { 88 buflen = strlen(buf); 89 if ((buflen + strlen(cgroup_path)) > (MAXPATHLEN-1)) { 90 return; 91 } 92 strncat(buf, cgroup_path, MAXPATHLEN-buflen); 93 buf[MAXPATHLEN-1] = '\0'; 94 } 95 _path = os::strdup(buf); 96 } else { 97 if (strcmp(_root, cgroup_path) == 0) { 98 strncpy(buf, _mount_point, MAXPATHLEN); 99 buf[MAXPATHLEN-1] = '\0'; 100 _path = os::strdup(buf); 101 } else { 102 char *p = strstr(cgroup_path, _root); 103 if (p != NULL && p == _root) { 104 if (strlen(cgroup_path) > strlen(_root)) { 105 int buflen; 106 strncpy(buf, _mount_point, MAXPATHLEN); 107 buf[MAXPATHLEN-1] = '\0'; 108 buflen = strlen(buf); 109 if ((buflen + strlen(cgroup_path) - strlen(_root)) > (MAXPATHLEN-1)) { 110 return; 111 } 112 strncat(buf, cgroup_path + strlen(_root), MAXPATHLEN-buflen); 113 buf[MAXPATHLEN-1] = '\0'; 114 _path = os::strdup(buf); 115 } 116 } 117 } 118 } 119 } 120 } 121 122 char *subsystem_path() { return _path; } 123 }; 124 125 CgroupSubsystem* memory = NULL; 126 CgroupSubsystem* cpuset = NULL; 127 CgroupSubsystem* cpu = NULL; 128 CgroupSubsystem* cpuacct = NULL; 129 130 typedef char * cptr; 131 132 PRAGMA_DIAG_PUSH 133 PRAGMA_FORMAT_NONLITERAL_IGNORED 134 template <typename T> int subsystem_file_contents(CgroupSubsystem* c, 135 const char *filename, 136 const char *scan_fmt, 137 T returnval) { 138 FILE *fp = NULL; 139 char *p; 140 char file[MAXPATHLEN+1]; 141 char buf[MAXPATHLEN+1]; 142 143 if (c == NULL) { 144 log_debug(os, container)("subsystem_file_contents: CgroupSubsytem* is NULL"); 145 return OSCONTAINER_ERROR; 146 } 147 if (c->subsystem_path() == NULL) { 148 log_debug(os, container)("subsystem_file_contents: subsystem path is NULL"); 149 return OSCONTAINER_ERROR; 150 } 151 152 strncpy(file, c->subsystem_path(), MAXPATHLEN); 153 file[MAXPATHLEN-1] = '\0'; 154 int filelen = strlen(file); 155 if ((filelen + strlen(filename)) > (MAXPATHLEN-1)) { 156 log_debug(os, container)("File path too long %s, %s", file, filename); 157 return OSCONTAINER_ERROR; 158 } 159 strncat(file, filename, MAXPATHLEN-filelen); 160 log_trace(os, container)("Path to %s is %s", filename, file); 161 fp = fopen(file, "r"); 162 if (fp != NULL) { 163 p = fgets(buf, MAXPATHLEN, fp); 164 if (p != NULL) { 165 int matched = sscanf(p, scan_fmt, returnval); 166 if (matched == 1) { 167 fclose(fp); 168 return 0; 169 } else { 170 log_debug(os, container)("Type %s not found in file %s", scan_fmt, file); 171 } 172 } else { 173 log_debug(os, container)("Empty file %s", file); 174 } 175 } else { 176 log_debug(os, container)("Open of file %s failed, %s", file, os::strerror(errno)); 177 } 178 if (fp != NULL) 179 fclose(fp); 180 return OSCONTAINER_ERROR; 181 } 182 PRAGMA_DIAG_POP 183 184 #define GET_CONTAINER_INFO(return_type, subsystem, filename, \ 185 logstring, scan_fmt, variable) \ 186 return_type variable; \ 187 { \ 188 int err; \ 189 err = subsystem_file_contents(subsystem, \ 190 filename, \ 191 scan_fmt, \ 192 &variable); \ 193 if (err != 0) \ 194 return (return_type) OSCONTAINER_ERROR; \ 195 \ 196 log_trace(os, container)(logstring, variable); \ 197 } 198 199 #define GET_CONTAINER_INFO_CPTR(return_type, subsystem, filename, \ 200 logstring, scan_fmt, variable, bufsize) \ 201 char variable[bufsize]; \ 202 { \ 203 int err; \ 204 err = subsystem_file_contents(subsystem, \ 205 filename, \ 206 scan_fmt, \ 207 variable); \ 208 if (err != 0) \ 209 return (return_type) NULL; \ 210 \ 211 log_trace(os, container)(logstring, variable); \ 212 } 213 214 /* init 215 * 216 * Initialize the container support and determine if 217 * we are running under cgroup control. 218 */ 219 void OSContainer::init() { 220 FILE *mntinfo = NULL; 221 FILE *cgroup = NULL; 222 char buf[MAXPATHLEN+1]; 223 char tmproot[MAXPATHLEN+1]; 224 char tmpmount[MAXPATHLEN+1]; 225 char *p; 226 jlong mem_limit; 227 228 assert(!_is_initialized, "Initializing OSContainer more than once"); 229 230 _is_initialized = true; 231 _is_containerized = false; 232 233 _unlimited_memory = (LONG_MAX / os::vm_page_size()) * os::vm_page_size(); 234 235 log_trace(os, container)("OSContainer::init: Initializing Container Support"); 236 if (!UseContainerSupport) { 237 log_trace(os, container)("Container Support not enabled"); 238 return; 239 } 240 241 /* 242 * Find the cgroup mount point for memory and cpuset 243 * by reading /proc/self/mountinfo 244 * 245 * Example for docker: 246 * 219 214 0:29 /docker/7208cebd00fa5f2e342b1094f7bed87fa25661471a4637118e65f1c995be8a34 /sys/fs/cgroup/memory ro,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory 247 * 248 * Example for host: 249 * 34 28 0:29 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,memory 250 */ 251 mntinfo = fopen("/proc/self/mountinfo", "r"); 252 if (mntinfo == NULL) { 253 log_debug(os, container)("Can't open /proc/self/mountinfo, %s", 254 os::strerror(errno)); 255 return; 256 } 257 258 while ((p = fgets(buf, MAXPATHLEN, mntinfo)) != NULL) { 259 char tmpcgroups[MAXPATHLEN+1]; 260 char *cptr = tmpcgroups; 261 char *token; 262 263 // mountinfo format is documented at https://www.kernel.org/doc/Documentation/filesystems/proc.txt 264 if (sscanf(p, "%*d %*d %*d:%*d %s %s %*[^-]- cgroup %*s %s", tmproot, tmpmount, tmpcgroups) != 3) { 265 continue; 266 } 267 while ((token = strsep(&cptr, ",")) != NULL) { 268 if (strcmp(token, "memory") == 0) { 269 memory = new CgroupSubsystem(tmproot, tmpmount); 270 } else if (strcmp(token, "cpuset") == 0) { 271 cpuset = new CgroupSubsystem(tmproot, tmpmount); 272 } else if (strcmp(token, "cpu") == 0) { 273 cpu = new CgroupSubsystem(tmproot, tmpmount); 274 } else if (strcmp(token, "cpuacct") == 0) { 275 cpuacct= new CgroupSubsystem(tmproot, tmpmount); 276 } 277 } 278 } 279 280 fclose(mntinfo); 281 282 if (memory == NULL) { 283 log_debug(os, container)("Required cgroup memory subsystem not found"); 284 return; 285 } 286 if (cpuset == NULL) { 287 log_debug(os, container)("Required cgroup cpuset subsystem not found"); 288 return; 289 } 290 if (cpu == NULL) { 291 log_debug(os, container)("Required cgroup cpu subsystem not found"); 292 return; 293 } 294 if (cpuacct == NULL) { 295 log_debug(os, container)("Required cgroup cpuacct subsystem not found"); 296 return; 297 } 298 299 /* 300 * Read /proc/self/cgroup and map host mount point to 301 * local one via /proc/self/mountinfo content above 302 * 303 * Docker example: 304 * 5:memory:/docker/6558aed8fc662b194323ceab5b964f69cf36b3e8af877a14b80256e93aecb044 305 * 306 * Host example: 307 * 5:memory:/user.slice 308 * 309 * Construct a path to the process specific memory and cpuset 310 * cgroup directory. 311 * 312 * For a container running under Docker from memory example above 313 * the paths would be: 314 * 315 * /sys/fs/cgroup/memory 316 * 317 * For a Host from memory example above the path would be: 318 * 319 * /sys/fs/cgroup/memory/user.slice 320 * 321 */ 322 cgroup = fopen("/proc/self/cgroup", "r"); 323 if (cgroup == NULL) { 324 log_debug(os, container)("Can't open /proc/self/cgroup, %s", 325 os::strerror(errno)); 326 return; 327 } 328 329 while ((p = fgets(buf, MAXPATHLEN, cgroup)) != NULL) { 330 char *controllers; 331 char *token; 332 char *base; 333 334 /* Skip cgroup number */ 335 strsep(&p, ":"); 336 /* Get controllers and base */ 337 controllers = strsep(&p, ":"); 338 base = strsep(&p, "\n"); 339 340 if (controllers == NULL) { 341 continue; 342 } 343 344 while ((token = strsep(&controllers, ",")) != NULL) { 345 if (strcmp(token, "memory") == 0) { 346 memory->set_subsystem_path(base); 347 } else if (strcmp(token, "cpuset") == 0) { 348 cpuset->set_subsystem_path(base); 349 } else if (strcmp(token, "cpu") == 0) { 350 cpu->set_subsystem_path(base); 351 } else if (strcmp(token, "cpuacct") == 0) { 352 cpuacct->set_subsystem_path(base); 353 } 354 } 355 } 356 357 fclose(cgroup); 358 359 // We need to update the amount of physical memory now that 360 // command line arguments have been processed. 361 if ((mem_limit = memory_limit_in_bytes()) > 0) { 362 os::Linux::set_physical_memory(mem_limit); 363 } 364 365 _is_containerized = true; 366 367 } 368 369 const char * OSContainer::container_type() { 370 if (is_containerized()) { 371 return "cgroupv1"; 372 } else { 373 return NULL; 374 } 375 } 376 377 378 /* memory_limit_in_bytes 379 * 380 * Return the limit of available memory for this process. 381 * 382 * return: 383 * memory limit in bytes or 384 * -1 for unlimited 385 * OSCONTAINER_ERROR for not supported 386 */ 387 jlong OSContainer::memory_limit_in_bytes() { 388 GET_CONTAINER_INFO(julong, memory, "/memory.limit_in_bytes", 389 "Memory Limit is: " JULONG_FORMAT, JULONG_FORMAT, memlimit); 390 391 if (memlimit >= _unlimited_memory) { 392 log_trace(os, container)("Memory Limit is: Unlimited"); 393 return (jlong)-1; 394 } 395 else { 396 return (jlong)memlimit; 397 } 398 } 399 400 jlong OSContainer::memory_and_swap_limit_in_bytes() { 401 GET_CONTAINER_INFO(julong, memory, "/memory.memsw.limit_in_bytes", 402 "Memory and Swap Limit is: " JULONG_FORMAT, JULONG_FORMAT, memswlimit); 403 if (memswlimit >= _unlimited_memory) { 404 log_trace(os, container)("Memory and Swap Limit is: Unlimited"); 405 return (jlong)-1; 406 } else { 407 return (jlong)memswlimit; 408 } 409 } 410 411 jlong OSContainer::memory_soft_limit_in_bytes() { 412 GET_CONTAINER_INFO(julong, memory, "/memory.soft_limit_in_bytes", 413 "Memory Soft Limit is: " JULONG_FORMAT, JULONG_FORMAT, memsoftlimit); 414 if (memsoftlimit >= _unlimited_memory) { 415 log_trace(os, container)("Memory Soft Limit is: Unlimited"); 416 return (jlong)-1; 417 } else { 418 return (jlong)memsoftlimit; 419 } 420 } 421 422 /* memory_usage_in_bytes 423 * 424 * Return the amount of used memory for this process. 425 * 426 * return: 427 * memory usage in bytes or 428 * -1 for unlimited 429 * OSCONTAINER_ERROR for not supported 430 */ 431 jlong OSContainer::memory_usage_in_bytes() { 432 GET_CONTAINER_INFO(jlong, memory, "/memory.usage_in_bytes", 433 "Memory Usage is: " JLONG_FORMAT, JLONG_FORMAT, memusage); 434 return memusage; 435 } 436 437 /* memory_max_usage_in_bytes 438 * 439 * Return the maximum amount of used memory for this process. 440 * 441 * return: 442 * max memory usage in bytes or 443 * OSCONTAINER_ERROR for not supported 444 */ 445 jlong OSContainer::memory_max_usage_in_bytes() { 446 GET_CONTAINER_INFO(jlong, memory, "/memory.max_usage_in_bytes", 447 "Maximum Memory Usage is: " JLONG_FORMAT, JLONG_FORMAT, memmaxusage); 448 return memmaxusage; 449 } 450 451 /* active_processor_count 452 * 453 * Calculate an appropriate number of active processors for the 454 * VM to use based on these three inputs. 455 * 456 * cpu affinity 457 * cgroup cpu quota & cpu period 458 * cgroup cpu shares 459 * 460 * Algorithm: 461 * 462 * Determine the number of available CPUs from sched_getaffinity 463 * 464 * If user specified a quota (quota != -1), calculate the number of 465 * required CPUs by dividing quota by period. 466 * 467 * If shares are in effect (shares != -1), calculate the number 468 * of CPUs required for the shares by dividing the share value 469 * by PER_CPU_SHARES. 470 * 471 * All results of division are rounded up to the next whole number. 472 * 473 * If neither shares or quotas have been specified, return the 474 * number of active processors in the system. 475 * 476 * If both shares and quotas have been specified, the results are 477 * based on the flag PreferContainerQuotaForCPUCount. If true, 478 * return the quota value. If false return the smallest value 479 * between shares or quotas. 480 * 481 * If shares and/or quotas have been specified, the resulting number 482 * returned will never exceed the number of active processors. 483 * 484 * return: 485 * number of CPUs 486 */ 487 int OSContainer::active_processor_count() { 488 int quota_count = 0, share_count = 0; 489 int cpu_count, limit_count; 490 int result; 491 492 cpu_count = limit_count = os::Linux::active_processor_count(); 493 int quota = cpu_quota(); 494 int period = cpu_period(); 495 int share = cpu_shares(); 496 497 if (quota > -1 && period > 0) { 498 quota_count = ceilf((float)quota / (float)period); 499 log_trace(os, container)("CPU Quota count based on quota/period: %d", quota_count); 500 } 501 if (share > -1) { 502 share_count = ceilf((float)share / (float)PER_CPU_SHARES); 503 log_trace(os, container)("CPU Share count based on shares: %d", share_count); 504 } 505 506 // If both shares and quotas are setup results depend 507 // on flag PreferContainerQuotaForCPUCount. 508 // If true, limit CPU count to quota 509 // If false, use minimum of shares and quotas 510 if (quota_count !=0 && share_count != 0) { 511 if (PreferContainerQuotaForCPUCount) { 512 limit_count = quota_count; 513 } else { 514 limit_count = MIN2(quota_count, share_count); 515 } 516 } else if (quota_count != 0) { 517 limit_count = quota_count; 518 } else if (share_count != 0) { 519 limit_count = share_count; 520 } 521 522 result = MIN2(cpu_count, limit_count); 523 log_trace(os, container)("OSContainer::active_processor_count: %d", result); 524 return result; 525 } 526 527 char * OSContainer::cpu_cpuset_cpus() { 528 GET_CONTAINER_INFO_CPTR(cptr, cpuset, "/cpuset.cpus", 529 "cpuset.cpus is: %s", "%1023s", cpus, 1024); 530 return os::strdup(cpus); 531 } 532 533 char * OSContainer::cpu_cpuset_memory_nodes() { 534 GET_CONTAINER_INFO_CPTR(cptr, cpuset, "/cpuset.mems", 535 "cpuset.mems is: %s", "%1023s", mems, 1024); 536 return os::strdup(mems); 537 } 538 539 /* cpu_quota 540 * 541 * Return the number of milliseconds per period 542 * process is guaranteed to run. 543 * 544 * return: 545 * quota time in milliseconds 546 * -1 for no quota 547 * OSCONTAINER_ERROR for not supported 548 */ 549 int OSContainer::cpu_quota() { 550 GET_CONTAINER_INFO(int, cpu, "/cpu.cfs_quota_us", 551 "CPU Quota is: %d", "%d", quota); 552 return quota; 553 } 554 555 int OSContainer::cpu_period() { 556 GET_CONTAINER_INFO(int, cpu, "/cpu.cfs_period_us", 557 "CPU Period is: %d", "%d", period); 558 return period; 559 } 560 561 /* cpu_shares 562 * 563 * Return the amount of cpu shares available to the process 564 * 565 * return: 566 * Share number (typically a number relative to 1024) 567 * (2048 typically expresses 2 CPUs worth of processing) 568 * -1 for no share setup 569 * OSCONTAINER_ERROR for not supported 570 */ 571 int OSContainer::cpu_shares() { 572 GET_CONTAINER_INFO(int, cpu, "/cpu.shares", 573 "CPU Shares is: %d", "%d", shares); 574 // Convert 1024 to no shares setup 575 if (shares == 1024) return -1; 576 577 return shares; 578 } 579