1 /*
   2  * Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2016, SAP SE and/or its affiliates. All rights reserved.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.  Oracle designates this
   9  * particular file as subject to the "Classpath" exception as provided
  10  * by Oracle in the LICENSE file that accompanied this code.
  11  *
  12  * This code is distributed in the hope that it will be useful, but WITHOUT
  13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15  * version 2 for more details (a copy is included in the LICENSE file that
  16  * accompanied this code).
  17  *
  18  * You should have received a copy of the GNU General Public License version
  19  * 2 along with this work; if not, write to the Free Software Foundation,
  20  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  21  *
  22  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  23  * or visit www.oracle.com if you need additional information or have any
  24  * questions.
  25  */
  27 /*
  28  * This file contains implementations of NET_... functions. The NET_.. functions are
  29  * wrappers for common file- and socket functions plus provisions for non-blocking IO.
  30  *
 * (basically, this layer remembers all threads waiting on a particular fd;
 *  all threads waiting on a certain fd can be woken up by sending them a signal; this
 *  is done e.g. when the fd is closed.)
  34  *
  35  * This was originally copied from the linux_close.c implementation.
  36  *
  37  * Side Note: This coding needs initialization. Under Linux this is done
  38  * automatically via __attribute((constructor)), on AIX this is done manually
  39  * (see aix_close_init).
  40  *
  41  */
  43 /*
  44    AIX needs a workaround for I/O cancellation, see:
  45    http://publib.boulder.ibm.com/infocenter/pseries/v5r3/index.jsp?topic=/com.ibm.aix.basetechref/doc/basetrf1/close.htm
  46    ...
  47    The close subroutine is blocked until all subroutines which use the file
  48    descriptor return to usr space. For example, when a thread is calling close
  49    and another thread is calling select with the same file descriptor, the
  50    close subroutine does not return until the select call returns.
  51    ...
  52 */
  54 #include <assert.h>
  55 #include <limits.h>
  56 #include <stdio.h>
  57 #include <stdlib.h>
  58 #include <signal.h>
  59 #include <pthread.h>
  60 #include <sys/types.h>
  61 #include <sys/socket.h>
  62 #include <sys/time.h>
  63 #include <sys/resource.h>
  64 #include <sys/uio.h>
  65 #include <unistd.h>
  66 #include <errno.h>
  67 #include <sys/poll.h>
/*
 * Per-thread record describing one blocking operation in progress.
 * Callers in this file declare it as a local (stack) variable and link it
 * into the list of the fd they block on (startOp) and unlink it again when
 * the call returns (endOp).
 */
typedef struct threadEntry {
    pthread_t thr;                      /* this thread (filled in by startOp) */
    struct threadEntry *next;           /* next thread blocked on the same fd */
    int intr;                           /* interrupted: set to 1 by closefd() */
} threadEntry_t;
/*
 * Heap allocated (calloc) during initialization for the base table, or on
 * demand for overflow slabs - one entry per fd.
 */
typedef struct {
    pthread_mutex_t lock;               /* fd lock: guards 'threads', held by closefd() across close/dup2 */
    threadEntry_t *threads;             /* head of list of threads blocked on fd */
} fdEntry_t;
/*
 * Signal used to unblock a thread: closefd() sends it with pthread_kill()
 * to every thread registered on the fd, and aix_close_init() installs the
 * (empty) handler for it.
 */
static int sigWakeup = (SIGRTMAX - 1);
/*
 * fdTable holds one entry per file descriptor, up to a certain
 * maximum.
 * Theoretically, the number of possible file descriptors can get
 * large, though usually it does not. Entries for small value file
 * descriptors are kept in a simple table, which covers most scenarios.
 * Entries for large value file descriptors are kept in an overflow
 * table, which is organized as a sparse two dimensional array whose
 * slabs are allocated on demand. This covers all corner cases while
 * keeping memory consumption reasonable.
 */
/* Base table for low value file descriptors (allocated in aix_close_init) */
static fdEntry_t* fdTable = NULL;
/* Maximum size of base table (in number of entries). */
static const int fdTableMaxSize = 0x1000; /* 4K */
/* Actual size of base table (in number of entries):
 * the smaller of fdLimit and fdTableMaxSize. */
static int fdTableLen = 0;
/* Max. theoretical number of file descriptors on system: taken from the
 * hard RLIMIT_NOFILE limit, or INT_MAX if that limit is RLIM_INFINITY. */
static int fdLimit = 0;
/* Overflow table, should base table not be large enough. Organized as
 *   an array of n slabs, each holding 64k entries. Slab pointers start
 *   out NULL and are filled in by getFdEntry() on first use.
 */
static fdEntry_t** fdOverflowTable = NULL;
/* Number of slabs in the overflow table */
static int fdOverflowTableLen = 0;
/* Number of entries in one slab */
static const int fdOverflowTableSlabSize = 0x10000; /* 64k */
/* Held by getFdEntry() while it checks for and allocates an overflow slab */
pthread_mutex_t fdOverflowTableLock = PTHREAD_MUTEX_INITIALIZER;
 122 /*
 123  * Null signal handler
 124  */
 125 static void sig_wakeup(int sig) {
 126 }
 128 /*
 129  * Initialization routine (executed when library is loaded)
 130  * Allocate fd tables and sets up signal handler.
 131  *
 132  * On AIX we don't have __attribute((constructor)) so we need to initialize
 133  * manually (from JNI_OnLoad() in 'src/share/native/java/net/net_util.c')
 134  */
 135 void aix_close_init() {
 136     struct rlimit nbr_files;
 137     sigset_t sigset;
 138     struct sigaction sa;
 139     int i = 0;
 141     /* Determine the maximum number of possible file descriptors. */
 142     if (-1 == getrlimit(RLIMIT_NOFILE, &nbr_files)) {
 143         fprintf(stderr, "library initialization failed - "
 144                 "unable to get max # of allocated fds\n");
 145         abort();
 146     }
 147     if (nbr_files.rlim_max != RLIM_INFINITY) {
 148         fdLimit = nbr_files.rlim_max;
 149     } else {
 150         /* We just do not know. */
 151         fdLimit = INT_MAX;
 152     }
 154     /* Allocate table for low value file descriptors. */
 155     fdTableLen = fdLimit < fdTableMaxSize ? fdLimit : fdTableMaxSize;
 156     fdTable = (fdEntry_t*) calloc(fdTableLen, sizeof(fdEntry_t));
 157     if (fdTable == NULL) {
 158         fprintf(stderr, "library initialization failed - "
 159                 "unable to allocate file descriptor table - out of memory");
 160         abort();
 161     } else {
 162         for (i = 0; i < fdTableLen; i ++) {
 163             pthread_mutex_init(&fdTable[i].lock, NULL);
 164         }
 165     }
 167     /* Allocate overflow table, if needed */
 168     if (fdLimit > fdTableMaxSize) {
 169         fdOverflowTableLen = ((fdLimit - fdTableMaxSize) / fdOverflowTableSlabSize) + 1;
 170         fdOverflowTable = (fdEntry_t**) calloc(fdOverflowTableLen, sizeof(fdEntry_t*));
 171         if (fdOverflowTable == NULL) {
 172             fprintf(stderr, "library initialization failed - "
 173                     "unable to allocate file descriptor overflow table - out of memory");
 174             abort();
 175         }
 176     }
 178     /*
 179      * Setup the signal handler
 180      */
 181     sa.sa_handler = sig_wakeup;
 182     sa.sa_flags   = 0;
 183     sigemptyset(&sa.sa_mask);
 184     sigaction(sigWakeup, &sa, NULL);
 186     sigemptyset(&sigset);
 187     sigaddset(&sigset, sigWakeup);
 188     sigprocmask(SIG_UNBLOCK, &sigset, NULL);
 189 }
 191 /*
 192  * Return the fd table for this fd.
 193  */
 194 static inline fdEntry_t *getFdEntry(int fd)
 195 {
 196     fdEntry_t* result = NULL;
 198     if (fd < 0) {
 199         return NULL;
 200     }
 202     /* This should not happen. If it does, our assumption about
 203      * max. fd value was wrong. */
 204     assert(fd < fdLimit);
 206     if (fd < fdTableMaxSize) {
 207         /* fd is in base table. */
 208         assert(fd < fdTableLen);
 209         result = &fdTable[fd];
 210     } else {
 211         /* fd is in overflow table. */
 212         const int indexInOverflowTable = fd - fdTableMaxSize;
 213         const int rootindex = indexInOverflowTable / fdOverflowTableSlabSize;
 214         const int slabindex = indexInOverflowTable % fdOverflowTableSlabSize;
 215         fdEntry_t* slab = NULL;
 216         assert(rootindex < fdOverflowTableLen);
 217         assert(slabindex < fdOverflowTableSlabSize);
 218         pthread_mutex_lock(&fdOverflowTableLock);
 219         /* Allocate new slab in overflow table if needed */
 220         if (fdOverflowTable[rootindex] == NULL) {
 221             fdEntry_t* const newSlab =
 222                 (fdEntry_t*)calloc(fdOverflowTableSlabSize, sizeof(fdEntry_t));
 223             if (newSlab == NULL) {
 224                 fprintf(stderr, "Unable to allocate file descriptor overflow"
 225                         " table slab - out of memory");
 226                 pthread_mutex_unlock(&fdOverflowTableLock);
 227                 abort();
 228             } else {
 229                 int i;
 230                 for (i = 0; i < fdOverflowTableSlabSize; i ++) {
 231                     pthread_mutex_init(&newSlab[i].lock, NULL);
 232                 }
 233                 fdOverflowTable[rootindex] = newSlab;
 234             }
 235         }
 236         pthread_mutex_unlock(&fdOverflowTableLock);
 237         slab = fdOverflowTable[rootindex];
 238         result = &slab[slabindex];
 239     }
 241     return result;
 243 }
 246 /*
 247  * Start a blocking operation :-
 248  *    Insert thread onto thread list for the fd.
 249  */
 250 static inline void startOp(fdEntry_t *fdEntry, threadEntry_t *self)
 251 {
 252     self->thr = pthread_self();
 253     self->intr = 0;
 255     pthread_mutex_lock(&(fdEntry->lock));
 256     {
 257         self->next = fdEntry->threads;
 258         fdEntry->threads = self;
 259     }
 260     pthread_mutex_unlock(&(fdEntry->lock));
 261 }
 263 /*
 264  * End a blocking operation :-
 265  *     Remove thread from thread list for the fd
 266  *     If fd has been interrupted then set errno to EBADF
 267  */
 268 static inline void endOp
 269     (fdEntry_t *fdEntry, threadEntry_t *self)
 270 {
 271     int orig_errno = errno;
 272     pthread_mutex_lock(&(fdEntry->lock));
 273     {
 274         threadEntry_t *curr, *prev=NULL;
 275         curr = fdEntry->threads;
 276         while (curr != NULL) {
 277             if (curr == self) {
 278                 if (curr->intr) {
 279                     orig_errno = EBADF;
 280                 }
 281                 if (prev == NULL) {
 282                     fdEntry->threads = curr->next;
 283                 } else {
 284                     prev->next = curr->next;
 285                 }
 286                 break;
 287             }
 288             prev = curr;
 289             curr = curr->next;
 290         }
 291     }
 292     pthread_mutex_unlock(&(fdEntry->lock));
 293     errno = orig_errno;
 294 }
 296 /*
 297  * Close or dup2 a file descriptor ensuring that all threads blocked on
 298  * the file descriptor are notified via a wakeup signal.
 299  *
 300  *      fd1 < 0    => close(fd2)
 301  *      fd1 >= 0   => dup2(fd1, fd2)
 302  *
 303  * Returns -1 with errno set if operation fails.
 304  */
 305 static int closefd(int fd1, int fd2) {
 306     int rv, orig_errno;
 307     fdEntry_t *fdEntry = getFdEntry(fd2);
 308     if (fdEntry == NULL) {
 309         errno = EBADF;
 310         return -1;
 311     }
 313     /*
 314      * Lock the fd to hold-off additional I/O on this fd.
 315      */
 316     pthread_mutex_lock(&(fdEntry->lock));
 318     {
 319         /* On fast machines we see that we enter dup2 before the
 320          * accepting thread had a chance to get and process the signal.
 321          * So in case we woke a thread up, give it some time to cope.
 322          * Also see https://bugs.openjdk.java.net/browse/JDK-8006395 */
 323         int num_woken = 0;
 325         /*
 326          * Send a wakeup signal to all threads blocked on this
 327          * file descriptor.
 328          */
 329         threadEntry_t *curr = fdEntry->threads;
 330         while (curr != NULL) {
 331             curr->intr = 1;
 332             pthread_kill( curr->thr, sigWakeup );
 333             num_woken ++;
 334             curr = curr->next;
 335         }
 337         if (num_woken > 0) {
 338           usleep(num_woken * 50);
 339         }
 341         /*
 342          * And close/dup the file descriptor
 343          * (restart if interrupted by signal)
 344          */
 345         do {
 346             if (fd1 < 0) {
 347                 rv = close(fd2);
 348             } else {
 349                 rv = dup2(fd1, fd2);
 350             }
 351         } while (rv == -1 && errno == EINTR);
 352     }
 354     /*
 355      * Unlock without destroying errno
 356      */
 357     orig_errno = errno;
 358     pthread_mutex_unlock(&(fdEntry->lock));
 359     errno = orig_errno;
 361     return rv;
 362 }
 364 /*
 365  * Wrapper for dup2 - same semantics as dup2 system call except
 366  * that any threads blocked in an I/O system call on fd2 will be
 367  * preempted and return -1/EBADF;
 368  */
 369 int NET_Dup2(int fd, int fd2) {
 370     if (fd < 0) {
 371         errno = EBADF;
 372         return -1;
 373     }
 374     return closefd(fd, fd2);
 375 }
 377 /*
 378  * Wrapper for close - same semantics as close system call
 379  * except that any threads blocked in an I/O on fd will be
 380  * preempted and the I/O system call will return -1/EBADF.
 381  */
 382 int NET_SocketClose(int fd) {
 383     return closefd(-1, fd);
 384 }
 386 /************** Basic I/O operations here ***************/
/*
 * Macro to perform a blocking IO operation on FD by evaluating the
 * expression FUNC between startOp() and endOp(). Restarts
 * automatically if interrupted by signal (other than
 * our wakeup signal, for which endOp() replaces errno with EBADF).
 *
 * The macro expands to a complete block that always ends in 'return':
 * it returns from the ENCLOSING function, either -1 with errno EBADF
 * when there is no fd entry for FD, or the (int) result of FUNC.
 * Statements placed after an invocation of this macro are never executed.
 *
 * The expansion declares the locals 'ret', 'self' and 'fdEntry'; FUNC is
 * evaluated with those names in scope and once per loop iteration.
 */
#define BLOCKING_IO_RETURN_INT(FD, FUNC) {      \
    int ret;                                    \
    threadEntry_t self;                         \
    fdEntry_t *fdEntry = getFdEntry(FD);        \
    if (fdEntry == NULL) {                      \
        errno = EBADF;                          \
        return -1;                              \
    }                                           \
    do {                                        \
        startOp(fdEntry, &self);                \
        ret = FUNC;                             \
        endOp(fdEntry, &self);                  \
    } while (ret == -1 && errno == EINTR);      \
    return ret;                                 \
}
/*
 * Interruptible wrapper for recv(s, buf, len, 0).
 * Returns the recv() result as an int; -1 with errno EBADF if there is no
 * fd entry for s or the fd was closed via closefd() while blocked.
 * Restarted when recv() fails with EINTR.
 */
int NET_Read(int s, void* buf, size_t len) {
    BLOCKING_IO_RETURN_INT( s, recv(s, buf, len, 0) );
}
/*
 * Interruptible wrapper for readv(s, vector, count).
 * Returns the readv() result as an int; -1 with errno EBADF if there is no
 * fd entry for s or the fd was closed via closefd() while blocked.
 * Restarted when readv() fails with EINTR.
 */
int NET_ReadV(int s, const struct iovec * vector, int count) {
    BLOCKING_IO_RETURN_INT( s, readv(s, vector, count) );
}
 417 int NET_RecvFrom(int s, void *buf, int len, unsigned int flags,
 418        struct sockaddr *from, int *fromlen) {
 419     socklen_t socklen = *fromlen;
 420     BLOCKING_IO_RETURN_INT( s, recvfrom(s, buf, len, flags, from, &socklen) );
 421     *fromlen = socklen;
 422 }
/*
 * Interruptible wrapper for send(s, msg, len, flags).
 * Returns the send() result as an int; -1 with errno EBADF if there is no
 * fd entry for s or the fd was closed via closefd() while blocked.
 * Restarted when send() fails with EINTR.
 */
int NET_Send(int s, void *msg, int len, unsigned int flags) {
    BLOCKING_IO_RETURN_INT( s, send(s, msg, len, flags) );
}
/*
 * Interruptible wrapper for writev(s, vector, count).
 * Returns the writev() result as an int; -1 with errno EBADF if there is
 * no fd entry for s or the fd was closed via closefd() while blocked.
 * Restarted when writev() fails with EINTR.
 */
int NET_WriteV(int s, const struct iovec * vector, int count) {
    BLOCKING_IO_RETURN_INT( s, writev(s, vector, count) );
}
/*
 * Interruptible wrapper for sendto(s, msg, len, flags, to, tolen).
 * Returns the sendto() result as an int; -1 with errno EBADF if there is
 * no fd entry for s or the fd was closed via closefd() while blocked.
 * Restarted when sendto() fails with EINTR.
 */
int NET_SendTo(int s, const void *msg, int len,  unsigned  int
       flags, const struct sockaddr *to, int tolen) {
    BLOCKING_IO_RETURN_INT( s, sendto(s, msg, len, flags, to, tolen) );
}
 437 int NET_Accept(int s, struct sockaddr *addr, int *addrlen) {
 438     socklen_t socklen = *addrlen;
 439     BLOCKING_IO_RETURN_INT( s, accept(s, addr, &socklen) );
 440     *addrlen = socklen;
 441 }
 443 int NET_Connect(int s, struct sockaddr *addr, int addrlen) {
 444     int crc = -1, prc = -1;
 445     threadEntry_t self;
 446     fdEntry_t* fdEntry = getFdEntry(s);
 448     if (fdEntry == NULL) {
 449         errno = EBADF;
 450         return -1;
 451     }
 453     /* On AIX, when the system call connect() is interrupted, the connection
 454      * is not aborted and it will be established asynchronously by the kernel.
 455      * Hence, no need to restart connect() when EINTR is received
 456      */
 457     startOp(fdEntry, &self);
 458     crc = connect(s, addr, addrlen);
 459     endOp(fdEntry, &self);
 461     if (crc == -1 && errno == EINTR) {
 462         struct pollfd s_pollfd;
 463         int sockopt_arg = 0;
 464         socklen_t len;
 466         s_pollfd.fd = s;
 467         s_pollfd.events = POLLOUT | POLLERR;
 469         /* poll the file descriptor */
 470         do {
 471             startOp(fdEntry, &self);
 472             prc = poll(&s_pollfd, 1, -1);
 473             endOp(fdEntry, &self);
 474         } while (prc == -1  && errno == EINTR);
 476         if (prc < 0)
 477             return prc;
 479         len = sizeof(sockopt_arg);
 481         /* Check whether the connection has been established */
 482         if (getsockopt(s, SOL_SOCKET, SO_ERROR, &sockopt_arg, &len) == -1)
 483             return -1;
 485         if (sockopt_arg != 0 ) {
 486             errno = sockopt_arg;
 487             return -1;
 488         }
 489     } else {
 490         return crc;
 491     }
 493     /* At this point, fd is connected. Set successful return code */
 494     return 0;
 495 }
/*
 * Interruptible wrapper for poll(ufds, nfds, timeout).
 * The calling thread is registered only on the fd of the FIRST array
 * element (ufds[0].fd); closing one of the other fds via closefd() does
 * not signal this thread.
 * Returns the poll() result; -1 with errno EBADF if there is no fd entry
 * for ufds[0].fd or that fd was closed via closefd() while blocked.
 * Restarted with the same timeout when poll() fails with EINTR.
 * NOTE(review): ufds[0] is read unconditionally, so this presumably
 * expects nfds >= 1 - confirm against callers.
 */
int NET_Poll(struct pollfd *ufds, unsigned int nfds, int timeout) {
    BLOCKING_IO_RETURN_INT( ufds[0].fd, poll(ufds, nfds, timeout) );
}
 501 /*
 502  * Wrapper for poll(s, timeout).
 503  * Auto restarts with adjusted timeout if interrupted by
 504  * signal other than our wakeup signal.
 505  */
 506 int NET_Timeout(int s, long timeout) {
 507     long prevtime = 0, newtime;
 508     struct timeval t;
 509     fdEntry_t *fdEntry = getFdEntry(s);
 511     /*
 512      * Check that fd hasn't been closed.
 513      */
 514     if (fdEntry == NULL) {
 515         errno = EBADF;
 516         return -1;
 517     }
 519     /*
 520      * Pick up current time as may need to adjust timeout
 521      */
 522     if (timeout > 0) {
 523         gettimeofday(&t, NULL);
 524         prevtime = t.tv_sec * 1000  +  t.tv_usec / 1000;
 525     }
 527     for(;;) {
 528         struct pollfd pfd;
 529         int rv;
 530         threadEntry_t self;
 532         /*
 533          * Poll the fd. If interrupted by our wakeup signal
 534          * errno will be set to EBADF.
 535          */
 536         pfd.fd = s;
 537         pfd.events = POLLIN | POLLERR;
 539         startOp(fdEntry, &self);
 540         rv = poll(&pfd, 1, timeout);
 541         endOp(fdEntry, &self);
 543         /*
 544          * If interrupted then adjust timeout. If timeout
 545          * has expired return 0 (indicating timeout expired).
 546          */
 547         if (rv < 0 && errno == EINTR) {
 548             if (timeout > 0) {
 549                 gettimeofday(&t, NULL);
 550                 newtime = t.tv_sec * 1000  +  t.tv_usec / 1000;
 551                 timeout -= newtime - prevtime;
 552                 if (timeout <= 0) {
 553                     return 0;
 554                 }
 555                 prevtime = newtime;
 556             }
 557         } else {
 558             return rv;
 559         }
 561     }
 562 }