/*
 * Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016, SAP SE and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 * This file contains implementations of NET_... functions. The NET_... functions are
 * wrappers for common file- and socket functions plus provisions for non-blocking IO.
 *
 * (basically, the layer remembers all threads blocked on a particular fd;
 *  all threads waiting on a certain fd can be woken up by sending them a signal; this
 *  is done e.g. when the fd is closed.)
 *
 * This was originally copied from the linux_close.c implementation.
 *
 * Side Note: This code needs initialization. Under Linux this is done
 * automatically via __attribute__((constructor)); on AIX it is done manually
 * (see aix_close_init).
 */
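
/*
 * Illustrative sketch (not part of the implementation; the thread roles and
 * the 'sock' fd are hypothetical): how the wakeup mechanism is meant to behave.
 *
 *   // Thread A: parks inside the wrapped recv().
 *   char buf[128];
 *   int n = NET_Read(sock, buf, sizeof(buf));
 *   // If thread B runs NET_SocketClose(sock) meanwhile, thread A is
 *   // signalled, recv() is interrupted, and NET_Read returns -1/EBADF.
 *
 *   // Thread B:
 *   NET_SocketClose(sock);
 */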

/*
   AIX needs a workaround for I/O cancellation, see:
   http://publib.boulder.ibm.com/infocenter/pseries/v5r3/index.jsp?topic=/com.ibm.aix.basetechref/doc/basetrf1/close.htm
   ...
   The close subroutine is blocked until all subroutines which use the file
   descriptor return to usr space. For example, when a thread is calling close
   and another thread is calling select with the same file descriptor, the
   close subroutine does not return until the select call returns.
   ...
*/

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/uio.h>
#include <unistd.h>
#include <errno.h>
#include <sys/poll.h>

/*
 * Stack allocated by thread when doing a blocking operation
 */
typedef struct threadEntry {
    pthread_t thr;                      /* this thread */
    struct threadEntry *next;           /* next thread */
    int intr;                           /* interrupted */
} threadEntry_t;

/*
 * Heap allocated during initialization - one entry per fd
 */
typedef struct {
    pthread_mutex_t lock;               /* fd lock */
    threadEntry_t *threads;             /* threads blocked on fd */
} fdEntry_t;
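
/*
 * Example (hypothetical): with two threads T1 and T2 blocked on the same fd,
 * the fdEntry_t for that fd chains their stack-allocated entries, newest
 * first, since startOp() pushes at the head:
 *
 *   getFdEntry(fd)->threads -> {thr = T2} -> {thr = T1} -> NULL
 */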

/*
 * Signal to unblock thread
 */
static int sigWakeup = (SIGRTMAX - 1);

/*
 * fdTable holds one entry per file descriptor, up to a certain
 * maximum.
 * Theoretically, the number of possible file descriptors can get
 * large, though usually it does not. Entries for small value file
 * descriptors are kept in a simple table, which covers most scenarios.
 * Entries for large value file descriptors are kept in an overflow
 * table, which is organized as a sparse two dimensional array whose
 * slabs are allocated on demand. This covers all corner cases while
 * keeping memory consumption reasonable.
 */
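
/*
 * Worked example (using the default sizes defined below): for fd 70000,
 * the entry lives in the overflow table, at
 *   index  = 70000 - 0x1000  = 65904
 *   slab   = 65904 / 0x10000 = 1
 *   offset = 65904 % 0x10000 = 368
 * i.e. fdOverflowTable[1][368]; fd 100 would simply be fdTable[100].
 */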

/* Base table for low value file descriptors */
static fdEntry_t* fdTable = NULL;
/* Maximum size of base table (in number of entries). */
static const int fdTableMaxSize = 0x1000; /* 4K */
/* Actual size of base table (in number of entries) */
static int fdTableLen = 0;
/* Max. theoretical number of file descriptors on system. */
static int fdLimit = 0;

/* Overflow table, should base table not be large enough. Organized as
 * an array of n slabs, each holding 64k entries.
 */
static fdEntry_t** fdOverflowTable = NULL;
/* Number of slabs in the overflow table */
static int fdOverflowTableLen = 0;
/* Number of entries in one slab */
static const int fdOverflowTableSlabSize = 0x10000; /* 64k */
static pthread_mutex_t fdOverflowTableLock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Null signal handler
 */
static void sig_wakeup(int sig) {
}

/*
 * Initialization routine (executed when library is loaded).
 * Allocates fd tables and sets up the signal handler.
 *
 * On AIX we don't have __attribute__((constructor)) so we need to initialize
 * manually (from JNI_OnLoad() in 'src/share/native/java/net/net_util.c')
 */
void aix_close_init() {
    struct rlimit nbr_files;
    sigset_t sigset;
    struct sigaction sa;
    int i = 0;

    /* Determine the maximum number of possible file descriptors. */
    if (-1 == getrlimit(RLIMIT_NOFILE, &nbr_files)) {
        fprintf(stderr, "library initialization failed - "
                "unable to get max # of allocated fds\n");
        abort();
    }
    if (nbr_files.rlim_max != RLIM_INFINITY) {
        fdLimit = nbr_files.rlim_max;
    } else {
        /* We just do not know. */
        fdLimit = INT_MAX;
    }

    /* Allocate table for low value file descriptors. */
    fdTableLen = fdLimit < fdTableMaxSize ? fdLimit : fdTableMaxSize;
    fdTable = (fdEntry_t*) calloc(fdTableLen, sizeof(fdEntry_t));
    if (fdTable == NULL) {
        fprintf(stderr, "library initialization failed - "
                "unable to allocate file descriptor table - out of memory\n");
        abort();
    } else {
        for (i = 0; i < fdTableLen; i++) {
            pthread_mutex_init(&fdTable[i].lock, NULL);
        }
    }

    /* Allocate overflow table, if needed */
    if (fdLimit > fdTableMaxSize) {
        fdOverflowTableLen = ((fdLimit - fdTableMaxSize) / fdOverflowTableSlabSize) + 1;
        fdOverflowTable = (fdEntry_t**) calloc(fdOverflowTableLen, sizeof(fdEntry_t*));
        if (fdOverflowTable == NULL) {
            fprintf(stderr, "library initialization failed - "
                    "unable to allocate file descriptor overflow table - out of memory\n");
            abort();
        }
    }

    /*
     * Setup the signal handler
     */
    sa.sa_handler = sig_wakeup;
    sa.sa_flags   = 0;
    sigemptyset(&sa.sa_mask);
    sigaction(sigWakeup, &sa, NULL);

    sigemptyset(&sigset);
    sigaddset(&sigset, sigWakeup);
    sigprocmask(SIG_UNBLOCK, &sigset, NULL);
}
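
/*
 * Call-site sketch (the guard variable is hypothetical; the real call lives
 * in JNI_OnLoad() in net_util.c as noted above): the initializer must run
 * exactly once, before the first NET_ call.
 *
 *   static int aix_close_initialized = 0;
 *   if (!aix_close_initialized) {
 *       aix_close_init();
 *       aix_close_initialized = 1;
 *   }
 */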

/*
 * Return the fd table entry for this fd.
 */
static inline fdEntry_t *getFdEntry(int fd)
{
    fdEntry_t* result = NULL;

    if (fd < 0) {
        return NULL;
    }

    /* This should not happen. If it does, our assumption about
     * max. fd value was wrong. */
    assert(fd < fdLimit);

    if (fd < fdTableMaxSize) {
        /* fd is in base table. */
        assert(fd < fdTableLen);
        result = &fdTable[fd];
    } else {
        /* fd is in overflow table. */
        const int indexInOverflowTable = fd - fdTableMaxSize;
        const int rootindex = indexInOverflowTable / fdOverflowTableSlabSize;
        const int slabindex = indexInOverflowTable % fdOverflowTableSlabSize;
        fdEntry_t* slab = NULL;
        assert(rootindex < fdOverflowTableLen);
        assert(slabindex < fdOverflowTableSlabSize);
        pthread_mutex_lock(&fdOverflowTableLock);
        /* Allocate new slab in overflow table if needed */
        if (fdOverflowTable[rootindex] == NULL) {
            fdEntry_t* const newSlab =
                (fdEntry_t*)calloc(fdOverflowTableSlabSize, sizeof(fdEntry_t));
            if (newSlab == NULL) {
                fprintf(stderr, "Unable to allocate file descriptor overflow"
                        " table slab - out of memory\n");
                pthread_mutex_unlock(&fdOverflowTableLock);
                abort();
            } else {
                int i;
                for (i = 0; i < fdOverflowTableSlabSize; i++) {
                    pthread_mutex_init(&newSlab[i].lock, NULL);
                }
                fdOverflowTable[rootindex] = newSlab;
            }
        }
        pthread_mutex_unlock(&fdOverflowTableLock);
        slab = fdOverflowTable[rootindex];
        result = &slab[slabindex];
    }

    return result;
}

/*
 * Start a blocking operation:
 *    Insert thread onto thread list for the fd.
 */
static inline void startOp(fdEntry_t *fdEntry, threadEntry_t *self)
{
    self->thr = pthread_self();
    self->intr = 0;

    pthread_mutex_lock(&(fdEntry->lock));
    {
        self->next = fdEntry->threads;
        fdEntry->threads = self;
    }
    pthread_mutex_unlock(&(fdEntry->lock));
}

/*
 * End a blocking operation:
 *     Remove thread from thread list for the fd.
 *     If fd has been interrupted then set errno to EBADF.
 */
static inline void endOp(fdEntry_t *fdEntry, threadEntry_t *self)
{
    int orig_errno = errno;
    pthread_mutex_lock(&(fdEntry->lock));
    {
        threadEntry_t *curr, *prev = NULL;
        curr = fdEntry->threads;
        while (curr != NULL) {
            if (curr == self) {
                if (curr->intr) {
                    orig_errno = EBADF;
                }
                if (prev == NULL) {
                    fdEntry->threads = curr->next;
                } else {
                    prev->next = curr->next;
                }
                break;
            }
            prev = curr;
            curr = curr->next;
        }
    }
    pthread_mutex_unlock(&(fdEntry->lock));
    errno = orig_errno;
}
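
/*
 * The two helpers above bracket every blocking call; a minimal sketch of the
 * intended pattern (this is what BLOCKING_IO_RETURN_INT below expands to):
 *
 *   threadEntry_t self;
 *   startOp(fdEntry, &self);     // register as a waiter on the fd
 *   ret = recv(s, buf, len, 0);  // may be interrupted by sigWakeup
 *   endOp(fdEntry, &self);       // deregister; interruption maps to EBADF
 */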

/*
 * Close or dup2 a file descriptor ensuring that all threads blocked on
 * the file descriptor are notified via a wakeup signal.
 *
 *      fd1 < 0    => close(fd2)
 *      fd1 >= 0   => dup2(fd1, fd2)
 *
 * Returns -1 with errno set if operation fails.
 */
static int closefd(int fd1, int fd2) {
    int rv, orig_errno;
    fdEntry_t *fdEntry = getFdEntry(fd2);
    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    /*
     * Lock the fd to hold-off additional I/O on this fd.
     */
    pthread_mutex_lock(&(fdEntry->lock));

    {
        /* On fast machines we see that we enter dup2 before the
         * accepting thread had a chance to get and process the signal.
         * So in case we woke a thread up, give it some time to cope.
         * Also see https://bugs.openjdk.java.net/browse/JDK-8006395 */
        int num_woken = 0;

        /*
         * Send a wakeup signal to all threads blocked on this
         * file descriptor.
         */
        threadEntry_t *curr = fdEntry->threads;
        while (curr != NULL) {
            curr->intr = 1;
            pthread_kill(curr->thr, sigWakeup);
            num_woken++;
            curr = curr->next;
        }

        if (num_woken > 0) {
            usleep(num_woken * 50);
        }

        /*
         * And close/dup the file descriptor
         * (restart if interrupted by signal)
         */
        do {
            if (fd1 < 0) {
                rv = close(fd2);
            } else {
                rv = dup2(fd1, fd2);
            }
        } while (rv == -1 && errno == EINTR);
    }

    /*
     * Unlock without destroying errno
     */
    orig_errno = errno;
    pthread_mutex_unlock(&(fdEntry->lock));
    errno = orig_errno;

    return rv;
}

/*
 * Wrapper for dup2 - same semantics as the dup2 system call except
 * that any threads blocked in an I/O system call on fd2 will be
 * preempted and return -1/EBADF.
 */
int NET_Dup2(int fd, int fd2) {
    if (fd < 0) {
        errno = EBADF;
        return -1;
    }
    return closefd(fd, fd2);
}

/*
 * Wrapper for close - same semantics as the close system call
 * except that any threads blocked in an I/O on fd will be
 * preempted and the I/O system call will return -1/EBADF.
 */
int NET_SocketClose(int fd) {
    return closefd(-1, fd);
}

/************** Basic I/O operations here ***************/

/*
 * Macro to perform a blocking IO operation. Restarts
 * automatically if interrupted by signal (other than
 * our wakeup signal)
 */
#define BLOCKING_IO_RETURN_INT(FD, FUNC) {      \
    int ret;                                    \
    threadEntry_t self;                         \
    fdEntry_t *fdEntry = getFdEntry(FD);        \
    if (fdEntry == NULL) {                      \
        errno = EBADF;                          \
        return -1;                              \
    }                                           \
    do {                                        \
        startOp(fdEntry, &self);                \
        ret = FUNC;                             \
        endOp(fdEntry, &self);                  \
    } while (ret == -1 && errno == EINTR);      \
    return ret;                                 \
}
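
/*
 * Note that the macro returns from the enclosing function, so any statement
 * placed after an invocation would be unreachable. For illustration, NET_Read
 * below expands roughly to:
 *
 *   int NET_Read(int s, void* buf, size_t len) {
 *       int ret;
 *       threadEntry_t self;
 *       fdEntry_t *fdEntry = getFdEntry(s);
 *       if (fdEntry == NULL) {
 *           errno = EBADF;
 *           return -1;
 *       }
 *       do {
 *           startOp(fdEntry, &self);
 *           ret = recv(s, buf, len, 0);
 *           endOp(fdEntry, &self);
 *       } while (ret == -1 && errno == EINTR);
 *       return ret;
 *   }
 */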

int NET_Read(int s, void* buf, size_t len) {
    BLOCKING_IO_RETURN_INT( s, recv(s, buf, len, 0) );
}

int NET_NonBlockingRead(int s, void* buf, size_t len) {
    BLOCKING_IO_RETURN_INT( s, recv(s, buf, len, MSG_DONTWAIT) );
}

int NET_ReadV(int s, const struct iovec * vector, int count) {
    BLOCKING_IO_RETURN_INT( s, readv(s, vector, count) );
}

/* Note: fromlen is a socklen_t*; since BLOCKING_IO_RETURN_INT returns from
 * the function, a copy-back after the macro would never execute, so the
 * length pointer is passed straight through to recvfrom. */
int NET_RecvFrom(int s, void *buf, int len, unsigned int flags,
       struct sockaddr *from, socklen_t *fromlen) {
    BLOCKING_IO_RETURN_INT( s, recvfrom(s, buf, len, flags, from, fromlen) );
}

int NET_Send(int s, void *msg, int len, unsigned int flags) {
    BLOCKING_IO_RETURN_INT( s, send(s, msg, len, flags) );
}

int NET_WriteV(int s, const struct iovec * vector, int count) {
    BLOCKING_IO_RETURN_INT( s, writev(s, vector, count) );
}

int NET_SendTo(int s, const void *msg, int len, unsigned int flags,
       const struct sockaddr *to, int tolen) {
    BLOCKING_IO_RETURN_INT( s, sendto(s, msg, len, flags, to, tolen) );
}

/* Same as NET_RecvFrom: addrlen is passed through as a socklen_t*. */
int NET_Accept(int s, struct sockaddr *addr, socklen_t *addrlen) {
    BLOCKING_IO_RETURN_INT( s, accept(s, addr, addrlen) );
}

int NET_Connect(int s, struct sockaddr *addr, int addrlen) {
    int crc = -1, prc = -1;
    threadEntry_t self;
    fdEntry_t* fdEntry = getFdEntry(s);

    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    /* On AIX, when the system call connect() is interrupted, the connection
     * is not aborted and it will be established asynchronously by the kernel.
     * Hence, there is no need to restart connect() when EINTR is received.
     */
    startOp(fdEntry, &self);
    crc = connect(s, addr, addrlen);
    endOp(fdEntry, &self);

    if (crc == -1 && errno == EINTR) {
        struct pollfd s_pollfd;
        int sockopt_arg = 0;
        socklen_t len;

        s_pollfd.fd = s;
        s_pollfd.events = POLLOUT | POLLERR;

        /* poll the file descriptor */
        do {
            startOp(fdEntry, &self);
            prc = poll(&s_pollfd, 1, -1);
            endOp(fdEntry, &self);
        } while (prc == -1 && errno == EINTR);

        if (prc < 0)
            return prc;

        len = sizeof(sockopt_arg);

        /* Check whether the connection has been established */
        if (getsockopt(s, SOL_SOCKET, SO_ERROR, &sockopt_arg, &len) == -1)
            return -1;

        if (sockopt_arg != 0) {
            errno = sockopt_arg;
            return -1;
        }
    } else {
        return crc;
    }

    /* At this point, fd is connected. Set successful return code */
    return 0;
}
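
/*
 * Usage sketch (hypothetical 'sock' and address; extra headers such as
 * <netinet/in.h> and <string.h> assumed): NET_Connect behaves like connect(2)
 * but survives the AIX EINTR quirk described above.
 *
 *   struct sockaddr_in sa;
 *   memset(&sa, 0, sizeof(sa));
 *   sa.sin_family = AF_INET;
 *   sa.sin_port = htons(80);
 *   sa.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
 *   if (NET_Connect(sock, (struct sockaddr*)&sa, sizeof(sa)) == -1) {
 *       perror("NET_Connect");
 *   }
 */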

int NET_Poll(struct pollfd *ufds, unsigned int nfds, int timeout) {
    BLOCKING_IO_RETURN_INT( ufds[0].fd, poll(ufds, nfds, timeout) );
}

/*
 * Wrapper for poll(s, timeout).
 * Auto restarts with adjusted timeout if interrupted by
 * a signal other than our wakeup signal.
 */
int NET_Timeout(int s, long timeout) {
    long prevtime = 0, newtime;
    struct timeval t;
    fdEntry_t *fdEntry = getFdEntry(s);

    /*
     * Check that fd hasn't been closed.
     */
    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    /*
     * Pick up current time as we may need to adjust the timeout
     */
    if (timeout > 0) {
        gettimeofday(&t, NULL);
        prevtime = t.tv_sec * 1000 + t.tv_usec / 1000;
    }

    for (;;) {
        struct pollfd pfd;
        int rv;
        threadEntry_t self;

        /*
         * Poll the fd. If interrupted by our wakeup signal
         * errno will be set to EBADF.
         */
        pfd.fd = s;
        pfd.events = POLLIN | POLLERR;

        startOp(fdEntry, &self);
        rv = poll(&pfd, 1, timeout);
        endOp(fdEntry, &self);

        /*
         * If interrupted then adjust timeout. If timeout
         * has expired return 0 (indicating timeout expired).
         */
        if (rv < 0 && errno == EINTR) {
            if (timeout > 0) {
                gettimeofday(&t, NULL);
                newtime = t.tv_sec * 1000 + t.tv_usec / 1000;
                timeout -= newtime - prevtime;
                if (timeout <= 0) {
                    return 0;
                }
                prevtime = newtime;
            }
        } else {
            return rv;
        }
    }
}
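
/*
 * Usage sketch (hypothetical 'sock'): wait up to two seconds for readability
 * before reading; 0 means the timeout expired, -1 an error (EBADF if the fd
 * was closed concurrently), and a positive value that the fd is ready.
 *
 *   char buf[512];
 *   int rv = NET_Timeout(sock, 2000);
 *   if (rv > 0) {
 *       rv = NET_Read(sock, buf, sizeof(buf));
 *   }
 */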