1 /* 2 * Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2016, SAP SE and/or its affiliates. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. Oracle designates this 9 * particular file as subject to the "Classpath" exception as provided 10 * by Oracle in the LICENSE file that accompanied this code. 11 * 12 * This code is distributed in the hope that it will be useful, but WITHOUT 13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 * version 2 for more details (a copy is included in the LICENSE file that 16 * accompanied this code). 17 * 18 * You should have received a copy of the GNU General Public License version 19 * 2 along with this work; if not, write to the Free Software Foundation, 20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 21 * 22 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 23 * or visit www.oracle.com if you need additional information or have any 24 * questions. 25 */ 26 27 /* 28 * This file contains implementations of NET_... functions. The NET_.. functions are 29 * wrappers for common file- and socket functions plus provisions for non-blocking IO. 30 * 31 * (basically, the layers remember all file descriptors waiting for a particular fd; 32 * all threads waiting on a certain fd can be woken up by sending them a signal; this 33 * is done e.g. when the fd is closed.) 34 * 35 * This was originally copied from the linux_close.c implementation. 36 * 37 * Side Note: This coding needs initialization. Under Linux this is done 38 * automatically via __attribute((constructor)), on AIX this is done manually 39 * (see aix_close_init). 
40 * 41 */ 42 43 /* 44 AIX needs a workaround for I/O cancellation, see: 45 http://publib.boulder.ibm.com/infocenter/pseries/v5r3/index.jsp?topic=/com.ibm.aix.basetechref/doc/basetrf1/close.htm 46 ... 47 The close subroutine is blocked until all subroutines which use the file 48 descriptor return to usr space. For example, when a thread is calling close 49 and another thread is calling select with the same file descriptor, the 50 close subroutine does not return until the select call returns. 51 ... 52 */ 53 54 #include <assert.h> 55 #include <limits.h> 56 #include <stdio.h> 57 #include <stdlib.h> 58 #include <signal.h> 59 #include <pthread.h> 60 #include <sys/types.h> 61 #include <sys/socket.h> 62 #include <sys/time.h> 63 #include <sys/resource.h> 64 #include <sys/uio.h> 65 #include <unistd.h> 66 #include <errno.h> 67 #include <sys/poll.h> 68 69 /* 70 * Stack allocated by thread when doing blocking operation 71 */ 72 typedef struct threadEntry { 73 pthread_t thr; /* this thread */ 74 struct threadEntry *next; /* next thread */ 75 int intr; /* interrupted */ 76 } threadEntry_t; 77 78 /* 79 * Heap allocated during initialized - one entry per fd 80 */ 81 typedef struct { 82 pthread_mutex_t lock; /* fd lock */ 83 threadEntry_t *threads; /* threads blocked on fd */ 84 } fdEntry_t; 85 86 /* 87 * Signal to unblock thread 88 */ 89 static int sigWakeup = (SIGRTMAX - 1); 90 91 /* 92 * fdTable holds one entry per file descriptor, up to a certain 93 * maximum. 94 * Theoretically, the number of possible file descriptors can get 95 * large, though usually it does not. Entries for small value file 96 * descriptors are kept in a simple table, which covers most scenarios. 97 * Entries for large value file descriptors are kept in an overflow 98 * table, which is organized as a sparse two dimensional array whose 99 * slabs are allocated on demand. This covers all corner cases while 100 * keeping memory consumption reasonable. 
101 */ 102 103 /* Base table for low value file descriptors */ 104 static fdEntry_t* fdTable = NULL; 105 /* Maximum size of base table (in number of entries). */ 106 static const int fdTableMaxSize = 0x1000; /* 4K */ 107 /* Actual size of base table (in number of entries) */ 108 static int fdTableLen = 0; 109 /* Max. theoretical number of file descriptors on system. */ 110 static int fdLimit = 0; 111 112 /* Overflow table, should base table not be large enough. Organized as 113 * an array of n slabs, each holding 64k entries. 114 */ 115 static fdEntry_t** fdOverflowTable = NULL; 116 /* Number of slabs in the overflow table */ 117 static int fdOverflowTableLen = 0; 118 /* Number of entries in one slab */ 119 static const int fdOverflowTableSlabSize = 0x10000; /* 64k */ 120 pthread_mutex_t fdOverflowTableLock = PTHREAD_MUTEX_INITIALIZER; 121 122 /* 123 * Null signal handler 124 */ 125 static void sig_wakeup(int sig) { 126 } 127 128 /* 129 * Initialization routine (executed when library is loaded) 130 * Allocate fd tables and sets up signal handler. 131 * 132 * On AIX we don't have __attribute((constructor)) so we need to initialize 133 * manually (from JNI_OnLoad() in 'src/share/native/java/net/net_util.c') 134 */ 135 void aix_close_init() { 136 struct rlimit nbr_files; 137 sigset_t sigset; 138 struct sigaction sa; 139 int i = 0; 140 141 /* Determine the maximum number of possible file descriptors. */ 142 if (-1 == getrlimit(RLIMIT_NOFILE, &nbr_files)) { 143 fprintf(stderr, "library initialization failed - " 144 "unable to get max # of allocated fds\n"); 145 abort(); 146 } 147 if (nbr_files.rlim_max != RLIM_INFINITY) { 148 fdLimit = nbr_files.rlim_max; 149 } else { 150 /* We just do not know. */ 151 fdLimit = INT_MAX; 152 } 153 154 /* Allocate table for low value file descriptors. */ 155 fdTableLen = fdLimit < fdTableMaxSize ? 
fdLimit : fdTableMaxSize; 156 fdTable = (fdEntry_t*) calloc(fdTableLen, sizeof(fdEntry_t)); 157 if (fdTable == NULL) { 158 fprintf(stderr, "library initialization failed - " 159 "unable to allocate file descriptor table - out of memory"); 160 abort(); 161 } else { 162 for (i = 0; i < fdTableLen; i ++) { 163 pthread_mutex_init(&fdTable[i].lock, NULL); 164 } 165 } 166 167 /* Allocate overflow table, if needed */ 168 if (fdLimit > fdTableMaxSize) { 169 fdOverflowTableLen = ((fdLimit - fdTableMaxSize) / fdOverflowTableSlabSize) + 1; 170 fdOverflowTable = (fdEntry_t**) calloc(fdOverflowTableLen, sizeof(fdEntry_t*)); 171 if (fdOverflowTable == NULL) { 172 fprintf(stderr, "library initialization failed - " 173 "unable to allocate file descriptor overflow table - out of memory"); 174 abort(); 175 } 176 } 177 178 /* 179 * Setup the signal handler 180 */ 181 sa.sa_handler = sig_wakeup; 182 sa.sa_flags = 0; 183 sigemptyset(&sa.sa_mask); 184 sigaction(sigWakeup, &sa, NULL); 185 186 sigemptyset(&sigset); 187 sigaddset(&sigset, sigWakeup); 188 sigprocmask(SIG_UNBLOCK, &sigset, NULL); 189 } 190 191 /* 192 * Return the fd table for this fd. 193 */ 194 static inline fdEntry_t *getFdEntry(int fd) 195 { 196 fdEntry_t* result = NULL; 197 198 if (fd < 0) { 199 return NULL; 200 } 201 202 /* This should not happen. If it does, our assumption about 203 * max. fd value was wrong. */ 204 assert(fd < fdLimit); 205 206 if (fd < fdTableMaxSize) { 207 /* fd is in base table. */ 208 assert(fd < fdTableLen); 209 result = &fdTable[fd]; 210 } else { 211 /* fd is in overflow table. 
*/ 212 const int indexInOverflowTable = fd - fdTableMaxSize; 213 const int rootindex = indexInOverflowTable / fdOverflowTableSlabSize; 214 const int slabindex = indexInOverflowTable % fdOverflowTableSlabSize; 215 fdEntry_t* slab = NULL; 216 assert(rootindex < fdOverflowTableLen); 217 assert(slabindex < fdOverflowTableSlabSize); 218 pthread_mutex_lock(&fdOverflowTableLock); 219 /* Allocate new slab in overflow table if needed */ 220 if (fdOverflowTable[rootindex] == NULL) { 221 fdEntry_t* const newSlab = 222 (fdEntry_t*)calloc(fdOverflowTableSlabSize, sizeof(fdEntry_t)); 223 if (newSlab == NULL) { 224 fprintf(stderr, "Unable to allocate file descriptor overflow" 225 " table slab - out of memory"); 226 pthread_mutex_unlock(&fdOverflowTableLock); 227 abort(); 228 } else { 229 int i; 230 for (i = 0; i < fdOverflowTableSlabSize; i ++) { 231 pthread_mutex_init(&newSlab[i].lock, NULL); 232 } 233 fdOverflowTable[rootindex] = newSlab; 234 } 235 } 236 pthread_mutex_unlock(&fdOverflowTableLock); 237 slab = fdOverflowTable[rootindex]; 238 result = &slab[slabindex]; 239 } 240 241 return result; 242 243 } 244 245 246 /* 247 * Start a blocking operation :- 248 * Insert thread onto thread list for the fd. 
 */
static inline void startOp(fdEntry_t *fdEntry, threadEntry_t *self)
{
    /* Record this thread and push it onto the fd's waiter list so that
     * closefd() can find it and deliver the wakeup signal. */
    self->thr = pthread_self();
    self->intr = 0;

    pthread_mutex_lock(&(fdEntry->lock));
    {
        self->next = fdEntry->threads;
        fdEntry->threads = self;
    }
    pthread_mutex_unlock(&(fdEntry->lock));
}

/*
 * End a blocking operation :-
 *   Remove thread from thread list for the fd
 *   If fd has been interrupted then set errno to EBADF
 */
static inline void endOp
    (fdEntry_t *fdEntry, threadEntry_t *self)
{
    /* Preserve the errno of the just-completed I/O call; it is replaced
     * with EBADF only if closefd() flagged this thread as interrupted. */
    int orig_errno = errno;
    pthread_mutex_lock(&(fdEntry->lock));
    {
        threadEntry_t *curr, *prev=NULL;
        curr = fdEntry->threads;
        while (curr != NULL) {
            if (curr == self) {
                if (curr->intr) {
                    orig_errno = EBADF;
                }
                /* Unlink self from the singly linked waiter list. */
                if (prev == NULL) {
                    fdEntry->threads = curr->next;
                } else {
                    prev->next = curr->next;
                }
                break;
            }
            prev = curr;
            curr = curr->next;
        }
    }
    pthread_mutex_unlock(&(fdEntry->lock));
    errno = orig_errno;
}

/*
 * Close or dup2 a file descriptor ensuring that all threads blocked on
 * the file descriptor are notified via a wakeup signal.
 *
 *      fd1 < 0    => close(fd2)
 *      fd1 >= 0   => dup2(fd1, fd2)
 *
 * Returns -1 with errno set if operation fails.
 */
static int closefd(int fd1, int fd2) {
    int rv, orig_errno;
    fdEntry_t *fdEntry = getFdEntry(fd2);
    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    /*
     * Lock the fd to hold-off additional I/O on this fd.
     */
    pthread_mutex_lock(&(fdEntry->lock));

    {
        /* On fast machines we see that we enter dup2 before the
         * accepting thread had a chance to get and process the signal.
         * So in case we woke a thread up, give it some time to cope.
         * Also see https://bugs.openjdk.java.net/browse/JDK-8006395 */
        int num_woken = 0;

        /*
         * Send a wakeup signal to all threads blocked on this
         * file descriptor.
         */
        threadEntry_t *curr = fdEntry->threads;
        while (curr != NULL) {
            curr->intr = 1;
            pthread_kill( curr->thr, sigWakeup );
            num_woken ++;
            curr = curr->next;
        }

        /* Grace period: 50us per woken thread (see note above). */
        if (num_woken > 0) {
            usleep(num_woken * 50);
        }

        /*
         * And close/dup the file descriptor
         * (restart if interrupted by signal)
         */
        do {
            if (fd1 < 0) {
                rv = close(fd2);
            } else {
                rv = dup2(fd1, fd2);
            }
        } while (rv == -1 && errno == EINTR);
    }

    /*
     * Unlock without destroying errno
     */
    orig_errno = errno;
    pthread_mutex_unlock(&(fdEntry->lock));
    errno = orig_errno;

    return rv;
}

/*
 * Wrapper for dup2 - same semantics as dup2 system call except
 * that any threads blocked in an I/O system call on fd2 will be
 * preempted and return -1/EBADF;
 */
int NET_Dup2(int fd, int fd2) {
    if (fd < 0) {
        errno = EBADF;
        return -1;
    }
    return closefd(fd, fd2);
}

/*
 * Wrapper for close - same semantics as close system call
 * except that any threads blocked in an I/O on fd will be
 * preempted and the I/O system call will return -1/EBADF.
 */
int NET_SocketClose(int fd) {
    return closefd(-1, fd);
}

/************** Basic I/O operations here ***************/

/*
 * Macro to perform a blocking IO operation.
Restarts 390 * automatically if interrupted by signal (other than 391 * our wakeup signal) 392 */ 393 #define BLOCKING_IO_RETURN_INT(FD, FUNC) { \ 394 int ret; \ 395 threadEntry_t self; \ 396 fdEntry_t *fdEntry = getFdEntry(FD); \ 397 if (fdEntry == NULL) { \ 398 errno = EBADF; \ 399 return -1; \ 400 } \ 401 do { \ 402 startOp(fdEntry, &self); \ 403 ret = FUNC; \ 404 endOp(fdEntry, &self); \ 405 } while (ret == -1 && errno == EINTR); \ 406 return ret; \ 407 } 408 409 int NET_Read(int s, void* buf, size_t len) { 410 BLOCKING_IO_RETURN_INT( s, recv(s, buf, len, 0) ); 411 } 412 413 int NET_ReadV(int s, const struct iovec * vector, int count) { 414 BLOCKING_IO_RETURN_INT( s, readv(s, vector, count) ); 415 } 416 417 int NET_RecvFrom(int s, void *buf, int len, unsigned int flags, 418 struct sockaddr *from, int *fromlen) { 419 socklen_t socklen = *fromlen; 420 BLOCKING_IO_RETURN_INT( s, recvfrom(s, buf, len, flags, from, &socklen) ); 421 *fromlen = socklen; 422 } 423 424 int NET_Send(int s, void *msg, int len, unsigned int flags) { 425 BLOCKING_IO_RETURN_INT( s, send(s, msg, len, flags) ); 426 } 427 428 int NET_WriteV(int s, const struct iovec * vector, int count) { 429 BLOCKING_IO_RETURN_INT( s, writev(s, vector, count) ); 430 } 431 432 int NET_SendTo(int s, const void *msg, int len, unsigned int 433 flags, const struct sockaddr *to, int tolen) { 434 BLOCKING_IO_RETURN_INT( s, sendto(s, msg, len, flags, to, tolen) ); 435 } 436 437 int NET_Accept(int s, struct sockaddr *addr, int *addrlen) { 438 socklen_t socklen = *addrlen; 439 BLOCKING_IO_RETURN_INT( s, accept(s, addr, &socklen) ); 440 *addrlen = socklen; 441 } 442 443 int NET_Connect(int s, struct sockaddr *addr, int addrlen) { 444 int crc = -1, prc = -1; 445 threadEntry_t self; 446 fdEntry_t* fdEntry = getFdEntry(s); 447 448 if (fdEntry == NULL) { 449 errno = EBADF; 450 return -1; 451 } 452 453 /* On AIX, when the system call connect() is interrupted, the connection 454 * is not aborted and it will be established 
asynchronously by the kernel. 455 * Hence, no need to restart connect() when EINTR is received 456 */ 457 startOp(fdEntry, &self); 458 crc = connect(s, addr, addrlen); 459 endOp(fdEntry, &self); 460 461 if (crc == -1 && errno == EINTR) { 462 struct pollfd s_pollfd; 463 int sockopt_arg = 0; 464 socklen_t len; 465 466 s_pollfd.fd = s; 467 s_pollfd.events = POLLOUT | POLLERR; 468 469 /* poll the file descriptor */ 470 do { 471 startOp(fdEntry, &self); 472 prc = poll(&s_pollfd, 1, -1); 473 endOp(fdEntry, &self); 474 } while (prc == -1 && errno == EINTR); 475 476 if (prc < 0) 477 return prc; 478 479 len = sizeof(sockopt_arg); 480 481 /* Check whether the connection has been established */ 482 if (getsockopt(s, SOL_SOCKET, SO_ERROR, &sockopt_arg, &len) == -1) 483 return -1; 484 485 if (sockopt_arg != 0 ) { 486 errno = sockopt_arg; 487 return -1; 488 } 489 } else { 490 return crc; 491 } 492 493 /* At this point, fd is connected. Set successful return code */ 494 return 0; 495 } 496 497 int NET_Poll(struct pollfd *ufds, unsigned int nfds, int timeout) { 498 BLOCKING_IO_RETURN_INT( ufds[0].fd, poll(ufds, nfds, timeout) ); 499 } 500 501 /* 502 * Wrapper for poll(s, timeout). 503 * Auto restarts with adjusted timeout if interrupted by 504 * signal other than our wakeup signal. 505 */ 506 int NET_Timeout(int s, long timeout) { 507 long prevtime = 0, newtime; 508 struct timeval t; 509 fdEntry_t *fdEntry = getFdEntry(s); 510 511 /* 512 * Check that fd hasn't been closed. 513 */ 514 if (fdEntry == NULL) { 515 errno = EBADF; 516 return -1; 517 } 518 519 /* 520 * Pick up current time as may need to adjust timeout 521 */ 522 if (timeout > 0) { 523 gettimeofday(&t, NULL); 524 prevtime = t.tv_sec * 1000 + t.tv_usec / 1000; 525 } 526 527 for(;;) { 528 struct pollfd pfd; 529 int rv; 530 threadEntry_t self; 531 532 /* 533 * Poll the fd. If interrupted by our wakeup signal 534 * errno will be set to EBADF. 
535 */ 536 pfd.fd = s; 537 pfd.events = POLLIN | POLLERR; 538 539 startOp(fdEntry, &self); 540 rv = poll(&pfd, 1, timeout); 541 endOp(fdEntry, &self); 542 543 /* 544 * If interrupted then adjust timeout. If timeout 545 * has expired return 0 (indicating timeout expired). 546 */ 547 if (rv < 0 && errno == EINTR) { 548 if (timeout > 0) { 549 gettimeofday(&t, NULL); 550 newtime = t.tv_sec * 1000 + t.tv_usec / 1000; 551 timeout -= newtime - prevtime; 552 if (timeout <= 0) { 553 return 0; 554 } 555 prevtime = newtime; 556 } 557 } else { 558 return rv; 559 } 560 561 } 562 }