/*
 * Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016, SAP SE and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 * This file contains implementations of the NET_... functions. The NET_...
 * functions are wrappers for common file and socket functions plus
 * provisions for non-blocking IO.
 *
 * (Basically, the implementation keeps, for every file descriptor, a list
 * of the threads currently blocked on it; all threads waiting on a certain
 * fd can then be woken up by sending them a signal; this is done e.g. when
 * the fd is closed.)
 *
 * This was originally copied from the linux_close.c implementation.
 *
 * Side note: this code needs initialization. Under Linux this is done
 * automatically via __attribute((constructor)); on AIX this is done
 * manually (see aix_close_init).
 */

/*
   AIX needs a workaround for I/O cancellation, see:
   http://publib.boulder.ibm.com/infocenter/pseries/v5r3/index.jsp?topic=/com.ibm.aix.basetechref/doc/basetrf1/close.htm
   ...
   The close subroutine is blocked until all subroutines which use the file
   descriptor return to usr space. For example, when a thread is calling close
   and another thread is calling select with the same file descriptor, the
   close subroutine does not return until the select call returns.
   ...
 */

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/uio.h>
#include <unistd.h>
#include <errno.h>
#include <sys/poll.h>
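/*
 * Usage sketch for the wrappers defined below (illustrative only, not
 * compiled by default): one thread blocks in NET_Read() while a second
 * thread closes the fd with NET_SocketClose(). The closer signals the
 * blocked thread, which then returns -1 with errno set to EBADF instead
 * of staying blocked in the kernel. All example_* names are hypothetical
 * and exist only for this sketch.
 */
#ifdef AIX_CLOSE_USAGE_EXAMPLE
extern int NET_Read(int s, void* buf, size_t len);
extern int NET_SocketClose(int fd);

static int example_fd; /* a connected socket, set up elsewhere */

static void* example_reader(void* arg) {
    char buf[64];
    /* Blocks in recv() until data arrives - or until another thread
     * closes example_fd and this thread is woken by sigWakeup. */
    if (NET_Read(example_fd, buf, sizeof(buf)) == -1 && errno == EBADF) {
        /* fd was closed underneath us - the expected cancellation path */
    }
    return NULL;
}

static void example_cancel(void) {
    pthread_t t;
    pthread_create(&t, NULL, example_reader, NULL);
    /* ... later, from another thread: unblock the reader */
    NET_SocketClose(example_fd);
    pthread_join(t, NULL);
}
#endif /* AIX_CLOSE_USAGE_EXAMPLE */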
/*
 * Stack allocated by thread when doing blocking operation
 */
typedef struct threadEntry {
    pthread_t thr;                      /* this thread */
    struct threadEntry *next;           /* next thread */
    int intr;                           /* interrupted */
} threadEntry_t;

/*
 * Heap allocated during initialization - one entry per fd
 */
typedef struct {
    pthread_mutex_t lock;               /* fd lock */
    threadEntry_t *threads;             /* threads blocked on fd */
} fdEntry_t;

/*
 * Signal to unblock thread
 */
static int sigWakeup = (SIGRTMAX - 1);

/*
 * fdTable holds one entry per file descriptor, up to a certain
 * maximum.
 * Theoretically, the number of possible file descriptors can get
 * large, though usually it does not. Entries for small value file
 * descriptors are kept in a simple table, which covers most scenarios.
 * Entries for large value file descriptors are kept in an overflow
 * table, which is organized as a sparse two dimensional array whose
 * slabs are allocated on demand. This covers all corner cases while
 * keeping memory consumption reasonable.
 */

/* Base table for low value file descriptors */
static fdEntry_t* fdTable = NULL;
/* Maximum size of base table (in number of entries). */
static const int fdTableMaxSize = 0x1000; /* 4K */
/* Actual size of base table (in number of entries) */
static int fdTableLen = 0;
/* Max. theoretical number of file descriptors on system. */
static int fdLimit = 0;

/* Overflow table, should the base table not be large enough. Organized as
 * an array of n slabs, each holding 64k entries.
 */
static fdEntry_t** fdOverflowTable = NULL;
/* Number of slabs in the overflow table */
static int fdOverflowTableLen = 0;
/* Number of entries in one slab */
static const int fdOverflowTableSlabSize = 0x10000; /* 64k */
pthread_mutex_t fdOverflowTableLock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Null signal handler
 */
static void sig_wakeup(int sig) {
}

/*
 * Initialization routine (executed when library is loaded):
 * allocates the fd tables and sets up the signal handler.
 *
 * On AIX we don't have __attribute((constructor)), so we need to initialize
 * manually (from JNI_OnLoad() in 'src/share/native/java/net/net_util.c')
 */
void aix_close_init() {
    struct rlimit nbr_files;
    sigset_t sigset;
    struct sigaction sa;
    int i = 0;

    /* Determine the maximum number of possible file descriptors. */
    if (-1 == getrlimit(RLIMIT_NOFILE, &nbr_files)) {
        fprintf(stderr, "library initialization failed - "
                "unable to get max # of allocated fds\n");
        abort();
    }
    if (nbr_files.rlim_max != RLIM_INFINITY) {
        /* Guard against rlim_t values beyond the range of int. */
        fdLimit = nbr_files.rlim_max > INT_MAX ? INT_MAX : (int)nbr_files.rlim_max;
    } else {
        /* We just do not know. */
        fdLimit = INT_MAX;
    }

    /* Allocate table for low value file descriptors. */
    fdTableLen = fdLimit < fdTableMaxSize ? fdLimit : fdTableMaxSize;
    fdTable = (fdEntry_t*) calloc(fdTableLen, sizeof(fdEntry_t));
    if (fdTable == NULL) {
        fprintf(stderr, "library initialization failed - "
                "unable to allocate file descriptor table - out of memory\n");
        abort();
    } else {
        for (i = 0; i < fdTableLen; i ++) {
            pthread_mutex_init(&fdTable[i].lock, NULL);
        }
    }

    /* Allocate overflow table, if needed */
    if (fdLimit > fdTableMaxSize) {
        fdOverflowTableLen = ((fdLimit - fdTableMaxSize) / fdOverflowTableSlabSize) + 1;
        fdOverflowTable = (fdEntry_t**) calloc(fdOverflowTableLen, sizeof(fdEntry_t*));
        if (fdOverflowTable == NULL) {
            fprintf(stderr, "library initialization failed - "
                    "unable to allocate file descriptor overflow table - out of memory\n");
            abort();
        }
    }

    /*
     * Setup the signal handler
     */
    sa.sa_handler = sig_wakeup;
    sa.sa_flags = 0;
    sigemptyset(&sa.sa_mask);
    sigaction(sigWakeup, &sa, NULL);

    sigemptyset(&sigset);
    sigaddset(&sigset, sigWakeup);
    sigprocmask(SIG_UNBLOCK, &sigset, NULL);
}
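/*
 * Shape of the manual hookup described above (illustrative only, not
 * compiled by default): the real call site lives in
 * src/share/native/java/net/net_util.c; the JNI version returned here
 * is just a plausible placeholder for this sketch.
 */
#ifdef AIX_CLOSE_INIT_EXAMPLE
#include <jni.h>

JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM *vm, void *reserved) {
    aix_close_init();   /* must run before any NET_* wrapper is used */
    return JNI_VERSION_1_2;
}
#endif /* AIX_CLOSE_INIT_EXAMPLE */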
/*
 * Return the fd table entry for this fd.
 */
static inline fdEntry_t *getFdEntry(int fd)
{
    fdEntry_t* result = NULL;

    if (fd < 0) {
        return NULL;
    }

    /* This should not happen. If it does, our assumption about the
     * max. fd value was wrong. */
    assert(fd < fdLimit);

    if (fd < fdTableMaxSize) {
        /* fd is in base table. */
        assert(fd < fdTableLen);
        result = &fdTable[fd];
    } else {
        /* fd is in overflow table. */
        const int indexInOverflowTable = fd - fdTableMaxSize;
        const int rootindex = indexInOverflowTable / fdOverflowTableSlabSize;
        const int slabindex = indexInOverflowTable % fdOverflowTableSlabSize;
        fdEntry_t* slab = NULL;
        assert(rootindex < fdOverflowTableLen);
        assert(slabindex < fdOverflowTableSlabSize);
        pthread_mutex_lock(&fdOverflowTableLock);
        /* Allocate new slab in overflow table if needed */
        if (fdOverflowTable[rootindex] == NULL) {
            fdEntry_t* const newSlab =
                (fdEntry_t*)calloc(fdOverflowTableSlabSize, sizeof(fdEntry_t));
            if (newSlab == NULL) {
                fprintf(stderr, "Unable to allocate file descriptor overflow"
                        " table slab - out of memory\n");
                pthread_mutex_unlock(&fdOverflowTableLock);
                abort();
            } else {
                int i;
                for (i = 0; i < fdOverflowTableSlabSize; i ++) {
                    pthread_mutex_init(&newSlab[i].lock, NULL);
                }
                fdOverflowTable[rootindex] = newSlab;
            }
        }
        pthread_mutex_unlock(&fdOverflowTableLock);
        slab = fdOverflowTable[rootindex];
        result = &slab[slabindex];
    }

    return result;
}
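/*
 * Worked example for the index arithmetic above, given the default
 * fdTableMaxSize (4096) and fdOverflowTableSlabSize (65536): fd 70000
 * maps to indexInOverflowTable = 65904, i.e. entry 368 of slab 1. A
 * tiny self-check of that arithmetic (illustrative only, not compiled
 * by default):
 */
#ifdef AIX_CLOSE_INDEX_EXAMPLE
static void example_index_check(void) {
    const int fd = 70000;
    const int indexInOverflowTable = fd - fdTableMaxSize;          /* 65904 */
    assert(indexInOverflowTable / fdOverflowTableSlabSize == 1);   /* slab 1 */
    assert(indexInOverflowTable % fdOverflowTableSlabSize == 368); /* entry 368 */
}
#endif /* AIX_CLOSE_INDEX_EXAMPLE */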
/*
 * Start a blocking operation :-
 *    Insert thread onto thread list for the fd.
 */
static inline void startOp(fdEntry_t *fdEntry, threadEntry_t *self)
{
    self->thr = pthread_self();
    self->intr = 0;

    pthread_mutex_lock(&(fdEntry->lock));
    {
        self->next = fdEntry->threads;
        fdEntry->threads = self;
    }
    pthread_mutex_unlock(&(fdEntry->lock));
}

/*
 * End a blocking operation :-
 *     Remove thread from thread list for the fd
 *     If fd has been interrupted then set errno to EBADF
 */
static inline void endOp(fdEntry_t *fdEntry, threadEntry_t *self)
{
    int orig_errno = errno;
    pthread_mutex_lock(&(fdEntry->lock));
    {
        threadEntry_t *curr, *prev = NULL;
        curr = fdEntry->threads;
        while (curr != NULL) {
            if (curr == self) {
                if (curr->intr) {
                    orig_errno = EBADF;
                }
                if (prev == NULL) {
                    fdEntry->threads = curr->next;
                } else {
                    prev->next = curr->next;
                }
                break;
            }
            prev = curr;
            curr = curr->next;
        }
    }
    pthread_mutex_unlock(&(fdEntry->lock));
    errno = orig_errno;
}

/*
 * Close or dup2 a file descriptor, ensuring that all threads blocked on
 * the file descriptor are notified via a wakeup signal.
 *
 *      fd1 < 0    => close(fd2)
 *      fd1 >= 0   => dup2(fd1, fd2)
 *
 * Returns -1 with errno set if operation fails.
 */
static int closefd(int fd1, int fd2) {
    int rv, orig_errno;
    fdEntry_t *fdEntry = getFdEntry(fd2);
    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    /*
     * Lock the fd to hold off additional I/O on this fd.
     */
    pthread_mutex_lock(&(fdEntry->lock));

    {
        /* On fast machines we see that we enter dup2 before the
         * accepting thread had a chance to get and process the signal.
         * So in case we woke a thread up, give it some time to cope.
         * Also see https://bugs.openjdk.java.net/browse/JDK-8006395 */
        int num_woken = 0;

        /*
         * Send a wakeup signal to all threads blocked on this
         * file descriptor.
         */
        threadEntry_t *curr = fdEntry->threads;
        while (curr != NULL) {
            curr->intr = 1;
            pthread_kill(curr->thr, sigWakeup);
            num_woken ++;
            curr = curr->next;
        }

        if (num_woken > 0) {
            usleep(num_woken * 50);
        }

        /*
         * And close/dup the file descriptor
         * (restart if interrupted by signal)
         */
        do {
            if (fd1 < 0) {
                rv = close(fd2);
            } else {
                rv = dup2(fd1, fd2);
            }
        } while (rv == -1 && errno == EINTR);
    }

    /*
     * Unlock without destroying errno
     */
    orig_errno = errno;
    pthread_mutex_unlock(&(fdEntry->lock));
    errno = orig_errno;

    return rv;
}

/*
 * Wrapper for dup2 - same semantics as the dup2 system call except
 * that any threads blocked in an I/O system call on fd2 will be
 * preempted and return -1/EBADF.
 */
int NET_Dup2(int fd, int fd2) {
    if (fd < 0) {
        errno = EBADF;
        return -1;
    }
    return closefd(fd, fd2);
}

/*
 * Wrapper for close - same semantics as the close system call
 * except that any threads blocked in an I/O on fd will be
 * preempted and the I/O system call will return -1/EBADF.
 */
int NET_SocketClose(int fd) {
    return closefd(-1, fd);
}

/************** Basic I/O operations here ***************/

/*
 * Macro to perform a blocking IO operation. Restarts
 * automatically if interrupted by a signal (other than
 * our wakeup signal). Note that the macro returns from the
 * enclosing function, so no statement placed after an
 * invocation is ever reached.
 */
#define BLOCKING_IO_RETURN_INT(FD, FUNC) {      \
    int ret;                                    \
    threadEntry_t self;                         \
    fdEntry_t *fdEntry = getFdEntry(FD);        \
    if (fdEntry == NULL) {                      \
        errno = EBADF;                          \
        return -1;                              \
    }                                           \
    do {                                        \
        startOp(fdEntry, &self);                \
        ret = FUNC;                             \
        endOp(fdEntry, &self);                  \
    } while (ret == -1 && errno == EINTR);      \
    return ret;                                 \
}
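/*
 * For reference, the shape of the macro expansion, shown for NET_Read
 * below (illustrative only, not compiled by default; the macro is the
 * authoritative version). The trailing `return ret;` is why NET_RecvFrom
 * and NET_Accept further down cannot use the macro: they must copy a
 * socklen_t back to the caller after the call completes.
 */
#ifdef AIX_CLOSE_EXPANSION_EXAMPLE
int example_NET_Read(int s, void* buf, size_t len) {
    int ret;
    threadEntry_t self;
    fdEntry_t *fdEntry = getFdEntry(s);
    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }
    do {
        startOp(fdEntry, &self);
        ret = recv(s, buf, len, 0);
        endOp(fdEntry, &self);
    } while (ret == -1 && errno == EINTR);
    return ret;
}
#endif /* AIX_CLOSE_EXPANSION_EXAMPLE */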
int NET_Read(int s, void* buf, size_t len) {
    BLOCKING_IO_RETURN_INT( s, recv(s, buf, len, 0) );
}

int NET_NonBlockingRead(int s, void* buf, size_t len) {
    BLOCKING_IO_RETURN_INT( s, recv(s, buf, len, MSG_DONTWAIT) );
}

int NET_ReadV(int s, const struct iovec * vector, int count) {
    BLOCKING_IO_RETURN_INT( s, readv(s, vector, count) );
}

/* Written out by hand rather than via BLOCKING_IO_RETURN_INT: the macro
 * returns directly from the enclosing function, which would make the
 * copy-back of socklen to *fromlen unreachable. */
int NET_RecvFrom(int s, void *buf, int len, unsigned int flags,
       struct sockaddr *from, int *fromlen) {
    int ret;
    threadEntry_t self;
    socklen_t socklen = *fromlen;
    fdEntry_t *fdEntry = getFdEntry(s);
    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }
    do {
        startOp(fdEntry, &self);
        ret = recvfrom(s, buf, len, flags, from, &socklen);
        endOp(fdEntry, &self);
    } while (ret == -1 && errno == EINTR);
    *fromlen = socklen;
    return ret;
}

int NET_Send(int s, void *msg, int len, unsigned int flags) {
    BLOCKING_IO_RETURN_INT( s, send(s, msg, len, flags) );
}

int NET_WriteV(int s, const struct iovec * vector, int count) {
    BLOCKING_IO_RETURN_INT( s, writev(s, vector, count) );
}

int NET_SendTo(int s, const void *msg, int len, unsigned int flags,
       const struct sockaddr *to, int tolen) {
    BLOCKING_IO_RETURN_INT( s, sendto(s, msg, len, flags, to, tolen) );
}

/* Written out by hand for the same reason as NET_RecvFrom above. */
int NET_Accept(int s, struct sockaddr *addr, int *addrlen) {
    int ret;
    threadEntry_t self;
    socklen_t socklen = *addrlen;
    fdEntry_t *fdEntry = getFdEntry(s);
    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }
    do {
        startOp(fdEntry, &self);
        ret = accept(s, addr, &socklen);
        endOp(fdEntry, &self);
    } while (ret == -1 && errno == EINTR);
    *addrlen = socklen;
    return ret;
}

int NET_Connect(int s, struct sockaddr *addr, int addrlen) {
    int crc = -1, prc = -1;
    threadEntry_t self;
    fdEntry_t* fdEntry = getFdEntry(s);

    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    /* On AIX, when the system call connect() is interrupted, the connection
     * is not aborted and it will be established asynchronously by the kernel.
     * Hence, there is no need to restart connect() when EINTR is received.
     */
    startOp(fdEntry, &self);
    crc = connect(s, addr, addrlen);
    endOp(fdEntry, &self);

    if (crc == -1 && errno == EINTR) {
        struct pollfd s_pollfd;
        int sockopt_arg = 0;
        socklen_t len;

        s_pollfd.fd = s;
        s_pollfd.events = POLLOUT | POLLERR;

        /* poll the file descriptor */
        do {
            startOp(fdEntry, &self);
            prc = poll(&s_pollfd, 1, -1);
            endOp(fdEntry, &self);
        } while (prc == -1 && errno == EINTR);

        if (prc < 0)
            return prc;

        len = sizeof(sockopt_arg);

        /* Check whether the connection has been established */
        if (getsockopt(s, SOL_SOCKET, SO_ERROR, &sockopt_arg, &len) == -1)
            return -1;

        if (sockopt_arg != 0) {
            errno = sockopt_arg;
            return -1;
        }
    } else {
        return crc;
    }

    /* At this point, fd is connected. Set successful return code */
    return 0;
}

int NET_Poll(struct pollfd *ufds, unsigned int nfds, int timeout) {
    BLOCKING_IO_RETURN_INT( ufds[0].fd, poll(ufds, nfds, timeout) );
}

/*
 * Wrapper for poll(s, timeout).
 * Auto restarts with adjusted timeout if interrupted by
 * a signal other than our wakeup signal.
 */
int NET_Timeout(int s, long timeout) {
    long prevtime = 0, newtime;
    struct timeval t;
    fdEntry_t *fdEntry = getFdEntry(s);

    /*
     * Check that fd hasn't been closed.
     */
    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    /*
     * Pick up current time as we may need to adjust the timeout.
     */
    if (timeout > 0) {
        gettimeofday(&t, NULL);
        prevtime = t.tv_sec * 1000 + t.tv_usec / 1000;
    }

    for (;;) {
        struct pollfd pfd;
        int rv;
        threadEntry_t self;

        /*
         * Poll the fd. If interrupted by our wakeup signal
         * errno will be set to EBADF.
         */
        pfd.fd = s;
        pfd.events = POLLIN | POLLERR;

        startOp(fdEntry, &self);
        rv = poll(&pfd, 1, timeout);
        endOp(fdEntry, &self);

        /*
         * If interrupted then adjust timeout. If the timeout
         * has expired, return 0 (indicating that the timeout expired).
         */
        if (rv < 0 && errno == EINTR) {
            if (timeout > 0) {
                gettimeofday(&t, NULL);
                newtime = t.tv_sec * 1000 + t.tv_usec / 1000;
                timeout -= newtime - prevtime;
                if (timeout <= 0) {
                    return 0;
                }
                prevtime = newtime;
            }
        } else {
            return rv;
        }
    }
}
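/*
 * Usage sketch for NET_Timeout() (illustrative only, not compiled by
 * default): wait up to 5 seconds for data before reading, the usual way
 * an SO_TIMEOUT-style read timeout is built on top of poll(). The EAGAIN
 * mapping on timeout is a hypothetical choice for this sketch.
 */
#ifdef AIX_CLOSE_TIMEOUT_EXAMPLE
static int example_timed_read(int s, void* buf, size_t len) {
    int rv = NET_Timeout(s, 5000);
    if (rv == 0) {
        errno = EAGAIN;   /* timeout expired before the fd became readable */
        return -1;
    }
    if (rv == -1) {
        return -1;        /* error, or fd closed concurrently (-1/EBADF) */
    }
    return NET_Read(s, buf, len);
}
#endif /* AIX_CLOSE_TIMEOUT_EXAMPLE */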