/*
 * Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016, SAP SE and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 * This file contains implementations of the NET_... functions. These
 * functions are wrappers around common file and socket functions, plus
 * provisions for interrupting blocked I/O:
 *
 * (basically, the layer remembers all threads blocked on a particular fd;
 *  all threads waiting on a certain fd can be woken up by sending them a
 *  signal; this is done e.g. when the fd is closed.)
 *
 * This was originally copied from the linux_close.c implementation.
 *
 * Side note: this code needs initialization. Under Linux this is done
 * automatically via __attribute((constructor)); on AIX it is done manually
 * (see aix_close_init).
 */
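
/*
 * Illustrative flow (derived from the code below): a thread blocked in
 * NET_Read(fd, ...) is registered on fd's thread list; when another
 * thread calls NET_SocketClose(fd), the blocked thread is woken with
 * sigWakeup and its read returns -1 with errno set to EBADF.
 */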

/*
   AIX needs a workaround for I/O cancellation, see:
   http://publib.boulder.ibm.com/infocenter/pseries/v5r3/index.jsp?topic=/com.ibm.aix.basetechref/doc/basetrf1/close.htm
   ...
   The close subroutine is blocked until all subroutines which use the file
   descriptor return to usr space. For example, when a thread is calling close
   and another thread is calling select with the same file descriptor, the
   close subroutine does not return until the select call returns.
   ...
*/

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/uio.h>
#include <unistd.h>
#include <errno.h>
#include <sys/poll.h>

/*
 * Stack allocated by a thread when doing a blocking operation
 */
typedef struct threadEntry {
    pthread_t thr;                      /* this thread */
    struct threadEntry *next;           /* next thread */
    int intr;                           /* interrupted */
} threadEntry_t;

/*
 * Heap allocated during initialization - one entry per fd
 */
typedef struct {
    pthread_mutex_t lock;               /* fd lock */
    threadEntry_t *threads;             /* threads blocked on fd */
} fdEntry_t;

/*
 * Signal used to unblock threads
 */
static int sigWakeup = (SIGRTMAX - 1);

/*
 * fdTable holds one entry per file descriptor, up to a certain
 * maximum.
 * Theoretically, the number of possible file descriptors can get
 * large, though usually it does not. To save memory, we keep file
 * descriptors with large numerical values in an overflow table. That
 * table is organized as a two-dimensional sparse array, allocated
 * on demand.
 */
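/*
 * For example, with fdTableMaxSize == 0x1000 and a slab size of
 * 0x10000, fd 0x12345 lands in the overflow table: its overflow index
 * is 0x12345 - 0x1000 = 0x11345, i.e. slab 0x11345 / 0x10000 = 1,
 * entry 0x11345 % 0x10000 = 0x1345 (see getFdEntry below).
 */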

static fdEntry_t* fdTable;
/* Max. number of file descriptors in fdTable. */
static const int fdTableMaxSize = 0x1000; /* 4K */
/* Max. theoretical number of file descriptors on the system. */
static int fdLimit;
/* Length of fdTable, in number of entries. */
static int fdTableLen;

/* Overflow table: organized as an array of n slabs, each holding
 *   64k entries.
 */
static fdEntry_t** fdOverflowTable;
/* Number of slabs in the overflow table */
static int fdOverflowTableLen;
/* Number of entries in one slab */
static const int fdOverflowTableSlabSize = 0x10000; /* 64k */
static pthread_mutex_t fdOverflowTableLock = PTHREAD_MUTEX_INITIALIZER;

/*
 * No-op signal handler; its only purpose is to make blocking system
 * calls return with EINTR so that blocked threads can be woken up.
 */
static void sig_wakeup(int sig) {
}

/*
 * Initialization routine (executed when the library is loaded).
 * Allocates the fd tables and sets up the signal handler.
 *
 * On AIX we don't have __attribute((constructor)), so we need to initialize
 * manually (from JNI_OnLoad() in 'src/share/native/java/net/net_util.c').
 */
void aix_close_init() {
    struct rlimit nbr_files;
    sigset_t sigset;
    struct sigaction sa;
    int i = 0;

    assert(fdTable == NULL);

    /* Determine the maximum number of possible file descriptors. */
    if (getrlimit(RLIMIT_NOFILE, &nbr_files) < 0) {
        fprintf(stderr, "library initialization failed - "
                "unable to determine the file descriptor limit\n");
        abort();
    }
    if (nbr_files.rlim_max != RLIM_INFINITY) {
        fdLimit = nbr_files.rlim_max;
    } else {
        /* We just do not know. */
        fdLimit = INT_MAX;
    }

    /* Allocate table for low value file descriptors. */
    fdTableLen = fdLimit < fdTableMaxSize ? fdLimit : fdTableMaxSize;
    fdTable = (fdEntry_t*) calloc(fdTableLen, sizeof(fdEntry_t));
    if (fdTable == NULL) {
        fprintf(stderr, "library initialization failed - "
                "unable to allocate file descriptor table - out of memory\n");
        abort();
    } else {
        for (i = 0; i < fdTableLen; i++) {
            pthread_mutex_init(&fdTable[i].lock, NULL);
        }
    }

    /* Allocate overflow table, if needed */
    if (fdLimit > fdTableMaxSize) {
        fdOverflowTableLen = ((fdLimit - fdTableMaxSize) / fdOverflowTableSlabSize) + 1;
        fdOverflowTable = (fdEntry_t**) calloc(fdOverflowTableLen, sizeof(fdEntry_t*));
        if (fdOverflowTable == NULL) {
            fprintf(stderr, "library initialization failed - "
                    "unable to allocate file descriptor overflow table - out of memory\n");
            abort();
        }
    }

    /*
     * Setup the signal handler
     */
    sa.sa_handler = sig_wakeup;
    sa.sa_flags   = 0;
    sigemptyset(&sa.sa_mask);
    sigaction(sigWakeup, &sa, NULL);

    sigemptyset(&sigset);
    sigaddset(&sigset, sigWakeup);
    sigprocmask(SIG_UNBLOCK, &sigset, NULL);
}
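
/*
 * A minimal sketch of the expected call site (the actual call is made
 * from JNI_OnLoad() in 'src/share/native/java/net/net_util.c'):
 *
 *   JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM *vm, void *reserved) {
 *       aix_close_init();
 *       ...
 *   }
 */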

/*
 * Return the fd table entry for the given fd, allocating an overflow
 * slab on demand.
 */
static inline fdEntry_t *getFdEntry(int fd)
{
    fdEntry_t* result = NULL;

    if (fd < 0) {
        return NULL;
    }

    /* This should not happen. If it does, our assumption about
     * the max. fd value was wrong. */
    assert(fd < fdLimit);

    if (fd < fdTableMaxSize) {
        assert(fd < fdTableLen);
        result = fdTable + fd;
    } else {
        const int indexInOverflowTable = fd - fdTableMaxSize;
        const int rootindex = indexInOverflowTable / fdOverflowTableSlabSize;
        const int slabindex = indexInOverflowTable % fdOverflowTableSlabSize;
        assert(rootindex < fdOverflowTableLen);
        assert(slabindex < fdOverflowTableSlabSize);
        pthread_mutex_lock(&fdOverflowTableLock);
        if (fdOverflowTable[rootindex] == NULL) {
            fdEntry_t* const newSlab =
                (fdEntry_t*)calloc(fdOverflowTableSlabSize, sizeof(fdEntry_t));
            if (newSlab == NULL) {
                fprintf(stderr, "Unable to allocate file descriptor table - out of memory\n");
                pthread_mutex_unlock(&fdOverflowTableLock);
                abort();
            } else {
                int i;
                for (i = 0; i < fdOverflowTableSlabSize; i++) {
                    pthread_mutex_init(&newSlab[i].lock, NULL);
                }
                fdOverflowTable[rootindex] = newSlab;
            }
        }
        pthread_mutex_unlock(&fdOverflowTableLock);
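        /* Once allocated, a slab is never freed and the slab pointer
         * never changes, so it is safe to read it again after dropping
         * the overflow table lock. */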
        result = fdOverflowTable[rootindex] + slabindex;
    }
    return result;
}

/*
 * Start a blocking operation :-
 *    Insert thread onto thread list for the fd.
 */
static inline void startOp(fdEntry_t *fdEntry, threadEntry_t *self)
{
    self->thr = pthread_self();
    self->intr = 0;

    pthread_mutex_lock(&(fdEntry->lock));
    {
        self->next = fdEntry->threads;
        fdEntry->threads = self;
    }
    pthread_mutex_unlock(&(fdEntry->lock));
}

/*
 * End a blocking operation :-
 *     Remove thread from thread list for the fd.
 *     If fd has been interrupted then set errno to EBADF.
 */
static inline void endOp(fdEntry_t *fdEntry, threadEntry_t *self)
{
    int orig_errno = errno;
    pthread_mutex_lock(&(fdEntry->lock));
    {
        threadEntry_t *curr, *prev = NULL;
        curr = fdEntry->threads;
        while (curr != NULL) {
            if (curr == self) {
                if (curr->intr) {
                    orig_errno = EBADF;
                }
                if (prev == NULL) {
                    fdEntry->threads = curr->next;
                } else {
                    prev->next = curr->next;
                }
                break;
            }
            prev = curr;
            curr = curr->next;
        }
    }
    pthread_mutex_unlock(&(fdEntry->lock));
    errno = orig_errno;
}

/*
 * Close or dup2 a file descriptor, ensuring that all threads blocked on
 * the file descriptor are notified via a wakeup signal.
 *
 *      fd1 < 0    => close(fd2)
 *      fd1 >= 0   => dup2(fd1, fd2)
 *
 * Returns -1 with errno set if the operation fails.
 */
static int closefd(int fd1, int fd2) {
    int rv, orig_errno;
    fdEntry_t *fdEntry = getFdEntry(fd2);
    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    /*
     * Lock the fd to hold off additional I/O on this fd.
     */
    pthread_mutex_lock(&(fdEntry->lock));

    {
        /* On fast machines we see that we enter dup2 before the
         * accepting thread has had a chance to receive and process the
         * signal. So in case we woke any threads up, give them some time
         * to cope. Also see https://bugs.openjdk.java.net/browse/JDK-8006395 */
        int num_woken = 0;

        /*
         * Send a wakeup signal to all threads blocked on this
         * file descriptor.
         */
        threadEntry_t *curr = fdEntry->threads;
        while (curr != NULL) {
            curr->intr = 1;
            pthread_kill(curr->thr, sigWakeup);
            num_woken++;
            curr = curr->next;
        }

        if (num_woken > 0) {
            usleep(num_woken * 50);
        }

        /*
         * And close/dup the file descriptor
         * (restart if interrupted by signal)
         */
        do {
            if (fd1 < 0) {
                rv = close(fd2);
            } else {
                rv = dup2(fd1, fd2);
            }
        } while (rv == -1 && errno == EINTR);
    }

    /*
     * Unlock without destroying errno
     */
    orig_errno = errno;
    pthread_mutex_unlock(&(fdEntry->lock));
    errno = orig_errno;

    return rv;
}

/*
 * Wrapper for dup2 - same semantics as the dup2 system call except
 * that any threads blocked in an I/O system call on fd2 will be
 * preempted and return -1/EBADF.
 */
int NET_Dup2(int fd, int fd2) {
    if (fd < 0) {
        errno = EBADF;
        return -1;
    }
    return closefd(fd, fd2);
}

/*
 * Wrapper for close - same semantics as the close system call
 * except that any threads blocked in an I/O system call on fd will be
 * preempted and the I/O system call will return -1/EBADF.
 */
int NET_SocketClose(int fd) {
    return closefd(-1, fd);
}

/************** Basic I/O operations here ***************/

/*
 * Macro to perform a blocking I/O operation. Restarts
 * automatically if interrupted by a signal (other than
 * our wakeup signal).
 */
#define BLOCKING_IO_RETURN_INT(FD, FUNC) {      \
    int ret;                                    \
    threadEntry_t self;                         \
    fdEntry_t *fdEntry = getFdEntry(FD);        \
    if (fdEntry == NULL) {                      \
        errno = EBADF;                          \
        return -1;                              \
    }                                           \
    do {                                        \
        startOp(fdEntry, &self);                \
        ret = FUNC;                             \
        endOp(fdEntry, &self);                  \
    } while (ret == -1 && errno == EINTR);      \
    return ret;                                 \
}
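
/*
 * How the wakeup protocol plays out in this macro: startOp() puts the
 * calling thread on the fd's thread list; if closefd() interrupts the
 * blocked call with sigWakeup, the call fails with EINTR, endOp() sees
 * the intr flag and rewrites errno to EBADF, and the loop exits because
 * errno is no longer EINTR.
 */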

int NET_Read(int s, void* buf, size_t len) {
    BLOCKING_IO_RETURN_INT( s, recv(s, buf, len, 0) );
}

int NET_ReadV(int s, const struct iovec * vector, int count) {
    BLOCKING_IO_RETURN_INT( s, readv(s, vector, count) );
}

/* The address length is passed straight through as socklen_t*; any
 * copy-back placed after BLOCKING_IO_RETURN_INT would be unreachable,
 * because the macro returns. */
int NET_RecvFrom(int s, void *buf, int len, unsigned int flags,
       struct sockaddr *from, socklen_t *fromlen) {
    BLOCKING_IO_RETURN_INT( s, recvfrom(s, buf, len, flags, from, fromlen) );
}

int NET_Send(int s, void *msg, int len, unsigned int flags) {
    BLOCKING_IO_RETURN_INT( s, send(s, msg, len, flags) );
}

int NET_WriteV(int s, const struct iovec * vector, int count) {
    BLOCKING_IO_RETURN_INT( s, writev(s, vector, count) );
}

int NET_SendTo(int s, const void *msg, int len, unsigned int flags,
       const struct sockaddr *to, int tolen) {
    BLOCKING_IO_RETURN_INT( s, sendto(s, msg, len, flags, to, tolen) );
}

int NET_Accept(int s, struct sockaddr *addr, socklen_t *addrlen) {
    BLOCKING_IO_RETURN_INT( s, accept(s, addr, addrlen) );
}

int NET_Connect(int s, struct sockaddr *addr, int addrlen) {
    int crc = -1, prc = -1;
    threadEntry_t self;
    fdEntry_t* fdEntry = getFdEntry(s);

    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    /* On AIX, when the system call connect() is interrupted, the connection
     * is not aborted and will be established asynchronously by the kernel.
     * Hence, there is no need to restart connect() when EINTR is received.
     */
    startOp(fdEntry, &self);
    crc = connect(s, addr, addrlen);
    endOp(fdEntry, &self);

    if (crc == -1 && errno == EINTR) {
        struct pollfd s_pollfd;
        int sockopt_arg = 0;
        socklen_t len;

        s_pollfd.fd = s;
        s_pollfd.events = POLLOUT | POLLERR;

        /* Poll the file descriptor until the connection attempt completes. */
        do {
            startOp(fdEntry, &self);
            prc = poll(&s_pollfd, 1, -1);
            endOp(fdEntry, &self);
        } while (prc == -1 && errno == EINTR);

        if (prc < 0)
            return prc;

        len = sizeof(sockopt_arg);

        /* Check whether the connection has been established. */
        if (getsockopt(s, SOL_SOCKET, SO_ERROR, &sockopt_arg, &len) == -1)
            return -1;

        if (sockopt_arg != 0) {
            errno = sockopt_arg;
            return -1;
        }
    } else {
        return crc;
    }

    /* At this point, the fd is connected. Return the success code. */
    return 0;
}

int NET_Poll(struct pollfd *ufds, unsigned int nfds, int timeout) {
    BLOCKING_IO_RETURN_INT( ufds[0].fd, poll(ufds, nfds, timeout) );
}

/*
 * Wrapper for poll(&pfd, 1, timeout) on a single fd.
 * Automatically restarts with an adjusted timeout if interrupted by
 * a signal other than our wakeup signal.
 */
int NET_Timeout(int s, long timeout) {
    long prevtime = 0, newtime;
    struct timeval t;
    fdEntry_t *fdEntry = getFdEntry(s);

    /*
     * Check that fd hasn't been closed.
     */
    if (fdEntry == NULL) {
        errno = EBADF;
        return -1;
    }

    /*
     * Pick up the current time, as we may need to adjust the timeout.
     */
    if (timeout > 0) {
        gettimeofday(&t, NULL);
        prevtime = t.tv_sec * 1000  +  t.tv_usec / 1000;
    }

    for (;;) {
        struct pollfd pfd;
        int rv;
        threadEntry_t self;

        /*
         * Poll the fd. If interrupted by our wakeup signal
         * errno will be set to EBADF.
         */
        pfd.fd = s;
        pfd.events = POLLIN | POLLERR;

        startOp(fdEntry, &self);
        rv = poll(&pfd, 1, timeout);
        endOp(fdEntry, &self);

        /*
         * If interrupted then adjust the timeout. If the timeout
         * has expired, return 0 (indicating that the timeout expired).
         */
        if (rv < 0 && errno == EINTR) {
            if (timeout > 0) {
                gettimeofday(&t, NULL);
                newtime = t.tv_sec * 1000  +  t.tv_usec / 1000;
                timeout -= newtime - prevtime;
                if (timeout <= 0) {
                    return 0;
                }
                prevtime = newtime;
            }
        } else {
            return rv;
        }
    }
}
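
/*
 * Usage note (illustrative): NET_Timeout(s, 5000) blocks for at most
 * five seconds waiting for s to become readable. It returns a positive
 * value if the fd is ready, 0 if the timeout expired, and -1 on error;
 * errno is EBADF if the fd was closed concurrently via NET_SocketClose.
 */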