--- /dev/null	2018-09-16 08:14:05.604314476 -0700
+++ new/src/jdk.net/linux/native/libextnet/rdma_util_md.c	2018-10-05 14:14:58.469757210 -0700
@@ -0,0 +1,484 @@
+/*
+ * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+#include <assert.h>
+#include "java_net_SocketOptions.h"
+#include "jdk_net_RdmaSocketOptions.h"
+#include "jvm.h"
+#include <netinet/tcp.h> // defines TCP_NODELAY
+#include "net_util.h"
+#include "rdma_util_md.h"
+#include "Rsocket.h"
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+
+/*
+ * Performs a blocking rsocket operation on FD, registering the calling
+ * thread against the fd first so that a concurrent close can interrupt
+ * it, and restarting the operation if it is interrupted by a signal
+ * other than the wakeup signal.
+ */
+#define BLOCKING_IO_RETURN_INT(FD, FUNC) {      \
+    int ret;                                    \
+    threadEntry_t self;                         \
+    fdEntry_t *fdEntry = getFdEntry(FD);        \
+    if (fdEntry == NULL) {                      \
+        errno = EBADF;                          \
+        return -1;                              \
+    }                                           \
+    do {                                        \
+        startOp(fdEntry, &self);                \
+        ret = FUNC;                             \
+        endOp(fdEntry, &self);                  \
+    } while (ret == -1 && errno == EINTR);      \
+    return ret;                                 \
+}
+
+typedef struct threadEntry {
+    pthread_t thr;                  /* this thread */
+    struct threadEntry *next;       /* next thread */
+    int intr;                       /* interrupted */
+} threadEntry_t;
+
+typedef struct {
+    pthread_mutex_t lock;           /* fd lock */
+    threadEntry_t *threads;         /* threads blocked on fd */
+} fdEntry_t;
+
+/* Signal used to unblock threads parked in a blocking rsocket call. */
+static int sigWakeup = (__SIGRTMAX - 2);
+
+/* Flat table of fd entries for descriptors below fdTableMaxSize. */
+static fdEntry_t* fdTable = NULL;
+
+static const int fdTableMaxSize = 0x1000; /* 4K */
+
+static int fdTableLen = 0;
+
+/* Maximum number of file descriptors (from RLIMIT_NOFILE). */
+static int fdLimit = 0;
+
+/* Overflow table for descriptors >= fdTableMaxSize, allocated in slabs. */
+static fdEntry_t** fdOverflowTable = NULL;
+
+static int fdOverflowTableLen = 0;
+
+static const int fdOverflowTableSlabSize = 0x10000; /* 64k */
+
+pthread_mutex_t fdOverflowTableLock = PTHREAD_MUTEX_INITIALIZER;
+
+/* Empty handler: the signal's only job is to knock a blocked thread
+ * out of its system call with EINTR. */
+static void sig_wakeup(int sig) {
+}
+
+static void __attribute((constructor)) init() {
+    struct rlimit nbr_files;
+    sigset_t sigset;
+    struct sigaction sa;
+    int i = 0;
+
+    if (-1 == getrlimit(RLIMIT_NOFILE, &nbr_files)) {
+        fprintf(stderr, "library initialization failed - "
+                "unable to get max # of allocated fds\n");
+        abort();
+    }
+    if (nbr_files.rlim_max != RLIM_INFINITY) {
+        fdLimit = nbr_files.rlim_max;
+    } else {
+        fdLimit = INT_MAX;
+    }
+
+    fdTableLen = fdLimit < fdTableMaxSize ? fdLimit : fdTableMaxSize;
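+    /*
+     * Descriptors below fdTableMaxSize get an entry in the flat table,
+     * allocated eagerly here; higher descriptors are tracked through the
+     * overflow table, whose 64k-entry slabs are allocated lazily in
+     * getFdEntry() so that a huge fd limit costs no memory up front.
+     */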
+    fdTable = (fdEntry_t*) calloc(fdTableLen, sizeof(fdEntry_t));
+    if (fdTable == NULL) {
+        fprintf(stderr, "library initialization failed - "
+                "unable to allocate file descriptor table - out of memory");
+        abort();
+    } else {
+        for (i = 0; i < fdTableLen; i++) {
+            pthread_mutex_init(&fdTable[i].lock, NULL);
+        }
+    }
+
+    if (fdLimit > fdTableMaxSize) {
+        fdOverflowTableLen = ((fdLimit - fdTableMaxSize) / fdOverflowTableSlabSize) + 1;
+        fdOverflowTable = (fdEntry_t**) calloc(fdOverflowTableLen, sizeof(fdEntry_t*));
+        if (fdOverflowTable == NULL) {
+            fprintf(stderr, "library initialization failed - "
+                    "unable to allocate file descriptor overflow table - out of memory");
+            abort();
+        }
+    }
+
+    /* Install the wakeup handler and make sure the signal is unblocked. */
+    sa.sa_handler = sig_wakeup;
+    sa.sa_flags = 0;
+    sigemptyset(&sa.sa_mask);
+    sigaction(sigWakeup, &sa, NULL);
+
+    sigemptyset(&sigset);
+    sigaddset(&sigset, sigWakeup);
+    sigprocmask(SIG_UNBLOCK, &sigset, NULL);
+}
+
+/*
+ * Returns the fd entry for the given fd, allocating the containing
+ * overflow slab on first use. Returns NULL for negative fds.
+ */
+static inline fdEntry_t *getFdEntry(int fd)
+{
+    fdEntry_t* result = NULL;
+
+    if (fd < 0) {
+        return NULL;
+    }
+
+    /* If this fires, the assumption that RLIMIT_NOFILE is fixed at
+     * startup does not hold. */
+    assert(fd < fdLimit);
+
+    if (fd < fdTableMaxSize) {
+        assert(fd < fdTableLen);
+        result = &fdTable[fd];
+    } else {
+        const int indexInOverflowTable = fd - fdTableMaxSize;
+        const int rootindex = indexInOverflowTable / fdOverflowTableSlabSize;
+        const int slabindex = indexInOverflowTable % fdOverflowTableSlabSize;
+        fdEntry_t* slab = NULL;
+        assert(rootindex < fdOverflowTableLen);
+        assert(slabindex < fdOverflowTableSlabSize);
+        pthread_mutex_lock(&fdOverflowTableLock);
+        if (fdOverflowTable[rootindex] == NULL) {
+            fdEntry_t* const newSlab =
+                (fdEntry_t*)calloc(fdOverflowTableSlabSize, sizeof(fdEntry_t));
+            if (newSlab == NULL) {
+                fprintf(stderr, "Unable to allocate file descriptor overflow"
+                        " table slab - out of memory");
+                pthread_mutex_unlock(&fdOverflowTableLock);
+                abort();
+            } else {
+                int i;
+                for (i = 0; i < fdOverflowTableSlabSize; i++) {
+                    pthread_mutex_init(&newSlab[i].lock, NULL);
+                }
+                fdOverflowTable[rootindex] = newSlab;
+            }
+        }
+        pthread_mutex_unlock(&fdOverflowTableLock);
+        slab = fdOverflowTable[rootindex];
+        result = &slab[slabindex];
+    }
+
+    return result;
+}
+
+/*
+ * Registers the calling thread on the fd's list of blocked threads
+ * before it enters a blocking operation.
+ */
+static inline void startOp(fdEntry_t *fdEntry, threadEntry_t *self)
+{
+    self->thr = pthread_self();
+    self->intr = 0;
+
+    pthread_mutex_lock(&(fdEntry->lock));
+    {
+        self->next = fdEntry->threads;
+        fdEntry->threads = self;
+    }
+    pthread_mutex_unlock(&(fdEntry->lock));
+}
+
+/*
+ * Unregisters the calling thread after a blocking operation completes.
+ * If the fd was closed while the thread was blocked, errno is forced
+ * to EBADF so the caller sees the close rather than a spurious EINTR.
+ */
+static inline void endOp(fdEntry_t *fdEntry, threadEntry_t *self)
+{
+    int orig_errno = errno;
+    pthread_mutex_lock(&(fdEntry->lock));
+    {
+        threadEntry_t *curr, *prev = NULL;
+        curr = fdEntry->threads;
+        while (curr != NULL) {
+            if (curr == self) {
+                if (curr->intr) {
+                    orig_errno = EBADF;
+                }
+                if (prev == NULL) {
+                    fdEntry->threads = curr->next;
+                } else {
+                    prev->next = curr->next;
+                }
+                break;
+            }
+            prev = curr;
+            curr = curr->next;
+        }
+    }
+    pthread_mutex_unlock(&(fdEntry->lock));
+    errno = orig_errno;
+}
+
+/* Restarts _cmd while it fails with EINTR. */
+#define RESTARTABLE(_cmd, _result) do { \
+    do { \
+        _result = _cmd; \
+    } while ((_result == -1) && (errno == EINTR)); \
+} while (0)
+
+/*
+ * Probes for rsocket support by opening (and immediately closing) an
+ * rsocket; returns JNI_TRUE only if the open succeeds.
+ */
+int rdma_supported()
+{
+    int s = rs_socket(PF_INET, SOCK_STREAM, 0);
+    if (s < 0) {
+        return JNI_FALSE;
+    }
+    rs_close(s);
+    return JNI_TRUE;
+}
+
+int RDMA_SocketAvailable(int s, jint *pbytes) {
+    int result;
+    RESTARTABLE(ioctl(s, FIONREAD, pbytes), result);
+    return (result == -1) ? 0 : 1;
+}
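+/*
+ * Maps a Java-level socket option (java.net.SocketOptions /
+ * jdk.net.RdmaSocketOptions) to the corresponding rsocket level and
+ * option name. Returns 0 on success, -1 if the option is unsupported.
+ */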
+int
+RDMA_MapSocketOption(jint cmd, int *level, int *optname) {
+    static struct {
+        jint cmd;
+        int level;
+        int optname;
+    } const opts[] = {
+        { java_net_SocketOptions_TCP_NODELAY,   IPPROTO_TCP,    TCP_NODELAY },
+        { java_net_SocketOptions_SO_SNDBUF,     SOL_SOCKET,     SO_SNDBUF },
+        { java_net_SocketOptions_SO_RCVBUF,     SOL_SOCKET,     SO_RCVBUF },
+        { java_net_SocketOptions_SO_REUSEADDR,  SOL_SOCKET,     SO_REUSEADDR },
+        { jdk_net_RdmaSocketOptions_SQSIZE,     SOL_RDMA,       RDMA_SQSIZE },
+        { jdk_net_RdmaSocketOptions_RQSIZE,     SOL_RDMA,       RDMA_RQSIZE },
+        { jdk_net_RdmaSocketOptions_INLINE,     SOL_RDMA,       RDMA_INLINE },
+    };
+    int i;
+    for (i = 0; i < (int)(sizeof(opts) / sizeof(opts[0])); i++) {
+        if (cmd == opts[i].cmd) {
+            *level = opts[i].level;
+            *optname = opts[i].optname;
+            return 0;
+        }
+    }
+
+    /* not found */
+    return -1;
+}
+
+int
+RDMA_GetSockOpt(int fd, int level, int opt, void *result, int *len)
+{
+    int rv;
+    socklen_t socklen = *len;
+
+    rv = rs_getsockopt(fd, level, opt, result, &socklen);
+    *len = socklen;
+
+    if (rv < 0) {
+        return rv;
+    }
+
+    /*
+     * The Linux kernel doubles the value set for SO_SNDBUF/SO_RCVBUF
+     * to leave room for bookkeeping, so halve it here to report the
+     * value the caller originally set.
+     */
+    if ((level == SOL_SOCKET) && ((opt == SO_SNDBUF)
+                                  || (opt == SO_RCVBUF))) {
+        int n = *((int *)result);
+        n /= 2;
+        *((int *)result) = n;
+    }
+    return rv;
+}
+
+int
+RDMA_SetSockOpt(int fd, int level, int opt, const void *arg, int len)
+{
+    /* Clamp SO_RCVBUF to a 1K minimum. */
+    if (level == SOL_SOCKET && opt == SO_RCVBUF) {
+        int *bufsize = (int *)arg;
+        if (*bufsize < 1024) {
+            *bufsize = 1024;
+        }
+    }
+
+    return rs_setsockopt(fd, level, opt, arg, len);
+}
+
+int
+RDMA_Bind(int fd, SOCKETADDRESS *sa, int len)
+{
+    /* Reject binding to addresses of the form 127.x.x.255
+     * (same check as NET_Bind). */
+    if (sa->sa.sa_family == AF_INET) {
+        if ((ntohl(sa->sa4.sin_addr.s_addr) & 0x7f0000ff) == 0x7f0000ff) {
+            errno = EADDRNOTAVAIL;
+            return -1;
+        }
+    }
+    return rs_bind(fd, &sa->sa, len);
+}
+
+/*
+ * Waits, via rs_poll, until the socket is ready for the events in
+ * flags or the timeout (in milliseconds) expires. Returns the number
+ * of milliseconds remaining if the socket became ready, 0 if it became
+ * ready just as the timeout expired, or -1 on timeout.
+ */
+jint
+RDMA_Wait(JNIEnv *env, jint fd, jint flags, jint timeout)
+{
+    jlong prevNanoTime = JVM_NanoTime(env, 0);
+    jlong nanoTimeout = (jlong) timeout * NET_NSEC_PER_MSEC;
+    jint read_rv;
+
+    while (1) {
+        jlong newNanoTime;
+        struct pollfd pfd;
+        pfd.fd = fd;
+        pfd.events = 0;
+        if (flags & NET_WAIT_READ)
+            pfd.events |= POLLIN;
+        if (flags & NET_WAIT_WRITE)
+            pfd.events |= POLLOUT;
+        if (flags & NET_WAIT_CONNECT)
+            pfd.events |= POLLOUT;
+
+        errno = 0;
+        read_rv = RDMA_Poll(&pfd, 1, nanoTimeout / NET_NSEC_PER_MSEC);
+
+        newNanoTime = JVM_NanoTime(env, 0);
+        nanoTimeout -= (newNanoTime - prevNanoTime);
+        if (nanoTimeout < NET_NSEC_PER_MSEC) {
+            return read_rv > 0 ? 0 : -1;
+        }
+        prevNanoTime = newNanoTime;
+
+        if (read_rv > 0) {
+            break;
+        }
+    }
+    return (nanoTimeout / NET_NSEC_PER_MSEC);
+}
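+/*
+ * Closes fd2 while holding its fd-table lock, then signals every
+ * thread recorded as blocked on it so their in-flight rsocket
+ * operations fail with EBADF instead of hanging on a closed
+ * descriptor.
+ */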
+static int rdma_closefd(int fd1, int fd2) {
+    int rv = -1, orig_errno;
+    fdEntry_t *fdEntry = getFdEntry(fd2);
+    if (fdEntry == NULL) {
+        errno = EBADF;
+        return -1;
+    }
+
+    pthread_mutex_lock(&(fdEntry->lock));
+
+    {
+        threadEntry_t *curr;
+
+        do {
+            if (fd1 < 0) {
+                rv = rs_close(fd2);
+            } else {
+                /* The dup2-based preclose used by the plain socket
+                 * code is not available for rsockets, so fail the
+                 * request instead. */
+                // rv = dup2(fd1, fd2);
+                errno = EOPNOTSUPP;
+            }
+        } while (rv == -1 && errno == EINTR);
+
+        curr = fdEntry->threads;
+        while (curr != NULL) {
+            curr->intr = 1;
+            pthread_kill(curr->thr, sigWakeup);
+            curr = curr->next;
+        }
+    }
+
+    orig_errno = errno;
+    pthread_mutex_unlock(&(fdEntry->lock));
+    errno = orig_errno;
+
+    return rv;
+}
+
+int RDMA_Dup2(int fd, int fd2) {
+    if (fd < 0) {
+        errno = EBADF;
+        return -1;
+    }
+    return rdma_closefd(fd, fd2);
+}
+
+int RDMA_SocketClose(int fd) {
+    return rdma_closefd(-1, fd);
+}
+
+int RDMA_Read(int s, void* buf, size_t len) {
+    BLOCKING_IO_RETURN_INT( s, rs_recv(s, buf, len, 0) );
+}
+
+int RDMA_NonBlockingRead(int s, void* buf, size_t len) {
+    BLOCKING_IO_RETURN_INT( s, rs_recv(s, buf, len, MSG_DONTWAIT) );
+}
+
+int RDMA_ReadV(int s, const struct iovec * vector, int count) {
+    BLOCKING_IO_RETURN_INT( s, rs_readv(s, vector, count) );
+}
+
+int RDMA_RecvFrom(int s, void *buf, int len, unsigned int flags,
+                  struct sockaddr *from, socklen_t *fromlen) {
+    BLOCKING_IO_RETURN_INT( s, rs_recvfrom(s, buf, len, flags, from, fromlen) );
+}
+
+int RDMA_Send(int s, void *msg, int len, unsigned int flags) {
+    BLOCKING_IO_RETURN_INT( s, rs_send(s, msg, len, flags) );
+}
+
+int RDMA_WriteV(int s, const struct iovec * vector, int count) {
+    BLOCKING_IO_RETURN_INT( s, rs_writev(s, vector, count) );
+}
+
+int NET_RSendTo(int s, const void *msg, int len, unsigned int
+                flags, const struct sockaddr *to, int tolen) {
+    BLOCKING_IO_RETURN_INT( s, rs_sendto(s, msg, len, flags, to, tolen) );
+}
+
+int RDMA_Accept(int s, struct sockaddr *addr, socklen_t *addrlen) {
+    BLOCKING_IO_RETURN_INT( s, rs_accept(s, addr, addrlen) );
+}
+
+int RDMA_Connect(int s, struct sockaddr *addr, int addrlen) {
+    BLOCKING_IO_RETURN_INT( s, rs_connect(s, addr, addrlen) );
+}
+
+int RDMA_Poll(struct pollfd *ufds, unsigned int nfds, int timeout) {
+    BLOCKING_IO_RETURN_INT( ufds[0].fd, rs_poll(ufds, nfds, timeout) );
+}
+
+/*
+ * Waits up to timeout milliseconds for data to arrive on s, accounting
+ * for time lost to wakeup-signal interruptions. Returns the rs_poll
+ * result, or 0 if the timeout expires across interruptions.
+ */
+int RDMA_Timeout(JNIEnv *env, int s, long timeout, jlong nanoTimeStamp) {
+    jlong prevNanoTime = nanoTimeStamp;
+    jlong nanoTimeout = (jlong)timeout * NET_NSEC_PER_MSEC;
+    fdEntry_t *fdEntry = getFdEntry(s);
+
+    if (fdEntry == NULL) {
+        errno = EBADF;
+        return -1;
+    }
+
+    for (;;) {
+        struct pollfd pfd;
+        int rv;
+        threadEntry_t self;
+
+        pfd.fd = s;
+        pfd.events = POLLIN | POLLERR;
+
+        startOp(fdEntry, &self);
+        rv = rs_poll(&pfd, 1, nanoTimeout / NET_NSEC_PER_MSEC);
+        endOp(fdEntry, &self);
+
+        if (rv < 0 && errno == EINTR) {
+            jlong newNanoTime = JVM_NanoTime(env, 0);
+            nanoTimeout -= newNanoTime - prevNanoTime;
+            if (nanoTimeout < NET_NSEC_PER_MSEC) {
+                return 0;
+            }
+            prevNanoTime = newNanoTime;
+        } else {
+            return rv;
+        }
+    }
+}